From 4e1f6f2a0ee27c362d56a762c12d43a551de64da Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 7 Apr 2020 06:57:42 -0400 Subject: [PATCH] Merge branch 'amd-master' into amd-master-next Change-Id: I3094c15008093f2072bcd38aca4ea90aeae2d97b [ROCm/hip commit: 2af31479e2a758aa17638606019898946902f672] --- projects/hip/CMakeLists.txt | 26 +- projects/hip/README.md | 3 +- projects/hip/bin/hipcc | 58 ++- projects/hip/bin/hipify-perl | 28 +- projects/hip/cmake/FindHIP.cmake | 1 - .../markdown/CUSPARSE_API_supported_by_HIP.md | 48 +- projects/hip/docs/markdown/hip_debugging.md | 14 +- projects/hip/hip_prof_gen.py | 47 +- projects/hip/hipify-clang/README.md | 243 ++++----- .../src/CUDA2HIP_Driver_API_functions.cpp | 4 +- .../src/CUDA2HIP_SPARSE_API_functions.cpp | 38 +- .../src/CUDA2HIP_SPARSE_API_types.cpp | 10 +- .../include/hip/hcc_detail/device_functions.h | 8 +- .../hip/hcc_detail/device_library_decls.h | 3 +- .../hip/hcc_detail/functional_grid_launch.hpp | 31 +- .../hip/include/hip/hcc_detail/hip_atomic.h | 8 + .../hip/include/hip/hcc_detail/hip_fp16.h | 14 + .../hip/hcc_detail/hip_fp16_math_fwd.h | 2 + .../hip/include/hip/hcc_detail/hip_runtime.h | 11 +- .../include/hip/hcc_detail/hip_runtime_api.h | 2 +- .../include/hip/hcc_detail/hip_vector_types.h | 97 +++- projects/hip/include/hip/hcc_detail/hiprtc.h | 4 + .../hip/include/hip/hcc_detail/host_defines.h | 2 +- projects/hip/include/hip/hip_runtime_api.h | 1 - .../include/hip/nvcc_detail/hip_runtime_api.h | 43 +- projects/hip/lpl_ca/CMakeLists.txt | 2 +- projects/hip/lpl_ca/ca.hpp | 2 +- projects/hip/packaging/hip-base.txt | 7 +- projects/hip/packaging/hip-doc.txt | 3 +- projects/hip/packaging/hip-hcc.txt | 3 +- projects/hip/packaging/hip-nvcc.txt | 3 +- projects/hip/packaging/hip-samples.txt | 5 +- projects/hip/packaging/hip-targets.cmake | 2 - projects/hip/packaging/hip-vdi.txt | 12 +- .../0_Intro/module_api/defaultDriver.cpp | 4 +- .../0_Intro/module_api/launchKernelHcc.cpp | 4 +- .../samples/0_Intro/module_api/runKernel.cpp | 4 +- .../0_Intro/module_api_global/runKernel.cpp | 4 +- .../hip/samples/1_Utils/hipInfo/hipInfo.cpp | 3 +- projects/hip/src/code_object_bundle.cpp | 34 -- .../code_object_bundle.inl} | 30 +- projects/hip/src/hip_clang.cpp | 4 +- projects/hip/src/hip_device.cpp | 12 + projects/hip/src/hip_hcc.cpp | 39 +- projects/hip/src/hip_hcc_internal.h | 14 - projects/hip/src/hip_memory.cpp | 101 ++-- projects/hip/src/hip_module.cpp | 475 ++++++++++++------ projects/hip/src/hip_stream.cpp | 38 +- projects/hip/src/hip_texture.cpp | 26 +- projects/hip/src/hiprtc.cpp | 2 +- projects/hip/src/program_state.inl | 9 +- .../libraries/cuSPARSE/cuSPARSE_12.cu | 3 +- .../Negative/memory/hipMemcpyFromSymbol.cpp | 46 ++ .../memory/hipMemcpyFromSymbolAsync.cpp | 49 ++ .../src/Negative/memory/hipMemcpyToSymbol.cpp | 46 ++ .../memory/hipMemcpyToSymbolAsync.cpp | 49 ++ .../tests/src/Negative/memory/hipMemory.cpp | 43 ++ .../stream/hipStreamCreateWithFlags.cpp | 40 ++ .../hip/tests/src/deviceLib/hipTestHalf.cpp | 39 ++ .../hip/tests/src/deviceLib/hip_floatnTM.cpp | 239 +++++++++ .../tests/src/hiprtc/hiprtcGetLoweredName.cpp | 2 +- projects/hip/tests/src/hiprtc/saxpy.cpp | 2 +- .../memory/hipMemcpyNegetiveTests.cpp | 53 ++ .../module/hipModuleLoadDataMultThreaded.cpp | 4 +- .../runtimeApi/stream/StreamAddCallback.cpp | 145 ++++++ .../stream/hipStreamAddCallbackCatch.cpp | 409 +++++++++++++++ .../src/texture/hipTex1DFetchCheckModes.cpp | 122 +++++ 67 files changed, 2278 insertions(+), 601 deletions(-) delete mode 100644 projects/hip/src/code_object_bundle.cpp rename projects/hip/{include/hip/hcc_detail/code_object_bundle.hpp => src/code_object_bundle.inl} (86%) create mode 100644 projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbol.cpp create mode 100644 projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbolAsync.cpp create mode 100644 projects/hip/tests/src/Negative/memory/hipMemcpyToSymbol.cpp create mode 100644 projects/hip/tests/src/Negative/memory/hipMemcpyToSymbolAsync.cpp create mode 100644 projects/hip/tests/src/Negative/memory/hipMemory.cpp create mode 100644 projects/hip/tests/src/Negative/stream/hipStreamCreateWithFlags.cpp create mode 100644 projects/hip/tests/src/deviceLib/hip_floatnTM.cpp create mode 100644 projects/hip/tests/src/runtimeApi/memory/hipMemcpyNegetiveTests.cpp create mode 100644 projects/hip/tests/src/runtimeApi/stream/StreamAddCallback.cpp create mode 100644 projects/hip/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp create mode 100644 projects/hip/tests/src/texture/hipTex1DFetchCheckModes.cpp diff --git a/projects/hip/CMakeLists.txt b/projects/hip/CMakeLists.txt index d1a2b133c5..c67ed29203 100644 --- a/projects/hip/CMakeLists.txt +++ b/projects/hip/CMakeLists.txt @@ -207,19 +207,6 @@ if (NOT CPACK_SET_DESTDIR) set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package") endif (NOT CPACK_SET_DESTDIR) -# Check if we need to enable ATP marker -if(NOT DEFINED COMPILE_HIP_ATP_MARKER) - if(NOT DEFINED ENV{COMPILE_HIP_ATP_MARKER}) - set(COMPILE_HIP_ATP_MARKER 0) - else() - set(COMPILE_HIP_ATP_MARKER $ENV{COMPILE_HIP_ATP_MARKER}) - message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.") - endif() -else() - message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.") -endif() -add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) - ############################# # Profiling API support ############################# @@ -309,10 +296,6 @@ message(STATUS "\nHSA runtime in: " ${HSA_PATH}) if(HIP_PLATFORM STREQUAL "hcc") include_directories(${PROJECT_SOURCE_DIR}/include) set(HIP_HCC_BUILD_FLAGS) - if(COMPILE_HIP_ATP_MARKER) - include_directories(/opt/rocm/profiler/CXLActivityLogger/include) - set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1") - endif() # Add HIP_VERSION to CMAKE__FLAGS set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") @@ -328,7 +311,6 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") set(SOURCE_FILES_RUNTIME - src/code_object_bundle.cpp src/program_state.cpp src/hip_clang.cpp src/hip_hcc.cpp @@ -363,9 +345,6 @@ if(HIP_PLATFORM STREQUAL "hcc") set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) set (CMAKE_SKIP_BUILD_RPATH TRUE ) endif () - if(COMPILE_HIP_ATP_MARKER) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger") - endif() add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME}) @@ -376,7 +355,7 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hip_hcc PRIVATE hc_am) target_link_libraries(hip_hcc_static PRIVATE hc_am) - add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp) + add_library(hiprtc SHARED src/hiprtc.cpp) target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY) set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" ) set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" ) @@ -387,6 +366,9 @@ if(HIP_PLATFORM STREQUAL "hcc") set_target_properties(hip_hcc PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties(hip_hcc PROPERTIES VISIBILITY_INLINES_HIDDEN 1) + set_target_properties(hiprtc PROPERTIES CXX_VISIBILITY_PRESET hidden) + set_target_properties(hiprtc PROPERTIES VISIBILITY_INLINES_HIDDEN 1) + if(HIP_PLATFORM STREQUAL "hcc") find_package(amd_comgr REQUIRED CONFIG diff --git a/projects/hip/README.md b/projects/hip/README.md index 2bffd12162..c2e2a7a456 100644 --- a/projects/hip/README.md +++ b/projects/hip/README.md @@ -1,6 +1,7 @@ ## What is this repository for? ### -HIP allows developers to convert CUDA code to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs. +**HIP is a C++ Runtime API and Kernel Language that allows developers to create portable applications for AMD and NVIDIA GPUs from single source code.** + Key features include: * HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode. diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index b3db312c78..5ed781bc60 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -134,6 +134,7 @@ if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) { $HIP_VDI_HOME = $HIP_PATH; # use HIP_PATH } $HIPCXXFLAGS .= "-D__HIP_VDI__"; + $HIPCFLAGS .= "-D__HIP_VDI__"; } if (defined $HIP_VDI_HOME) { @@ -207,7 +208,8 @@ if ($HIP_PLATFORM eq "clang") { } else { $HIPCXXFLAGS .= " -std=c++11"; } - $HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH"; + $HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/.."; + $HIPCFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/.."; $HIPLDFLAGS .= " -L$HIP_LIB_PATH"; if (not $isWindows) { $HIPLDFLAGS .= " -Wl,--rpath-link=$HIP_LIB_PATH"; @@ -222,8 +224,10 @@ if ($HIP_PLATFORM eq "clang") { $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIPCXXFLAGS .= " -isystem $HSA_PATH/include"; + $HIPCFLAGS .= " -isystem $HSA_PATH/include"; if (!($HIP_RUNTIME eq "HCC")) { $HIPCXXFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api"; + $HIPCFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api"; } } elsif ($HIP_PLATFORM eq "hcc") { @@ -282,8 +286,11 @@ if ($HIP_PLATFORM eq "clang") { } $HIPCXXFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda"; + $HIPCFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda"; $HIPCXXFLAGS .= " -isystem $HSA_PATH/include"; + $HIPCFLAGS .= " -isystem $HSA_PATH/include"; $HIPCXXFLAGS .= " -Wno-deprecated-register"; + $HIPCFLAGS .= " -Wno-deprecated-register"; $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am "; # $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport "; @@ -321,6 +328,7 @@ if ($HIP_PLATFORM eq "clang") { $HIPCC="$CUDA_PATH/bin/nvcc"; $HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets "; $HIPCXXFLAGS .= " -isystem $CUDA_PATH/include"; + $HIPCFLAGS .= " -isystem $CUDA_PATH/include"; $HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L$CUDA_PATH/lib64"; } else { @@ -330,11 +338,14 @@ if ($HIP_PLATFORM eq "clang") { # Add paths to common HIP includes: $HIPCXXFLAGS .= " -isystem $HIP_INCLUDE_PATH" ; +$HIPCFLAGS .= " -isystem $HIP_INCLUDE_PATH" ; my $compileOnly = 0; my $needCXXFLAGS = 0; # need to add CXX flags to compile step +my $needCFLAGS = 0; # need to add C flags to compile step my $needLDFLAGS = 1; # need to add LDFLAGS to compile step. -my $hasC = 0; # options contain a c-style file (NVCC must force recognition as GPU file) +my $hasC = 0; # options contain a c-style file +my $hasCXX = 0; # options contain a cpp-style file (NVCC must force recognition as GPU file) my $hasCU = 0; # options contain a cu-style file (HCC must force recognition as GPU file) my $needHipHcc = ($HIP_PLATFORM eq 'hcc'); # set if we need to link hip_hcc.o from src tree. (some builds, ie cmake, provide their own) my $printHipVersion = 0; # print HIP version @@ -343,6 +354,7 @@ my $buildDeps = 0; my $linkType = 1; my $setLinkType = 0; my $coFormatv3 = 1; +my $funcSupp = 0; # enable function support my @options = (); my @inputs = (); @@ -400,7 +412,6 @@ foreach $arg (@ARGV) my $swallowArg = 0; if ($arg eq '-c' or $arg eq '--genco') { $compileOnly = 1; - $needCXXFLAGS = 1; $needLDFLAGS = 0; } @@ -463,6 +474,7 @@ foreach $arg (@ARGV) } if($trimarg eq '-use_fast_math') { $HIPCXXFLAGS .= " -DHIP_FAST_MATH "; + $HIPCFLAGS .= " -DHIP_FAST_MATH "; } if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0)) { @@ -599,20 +611,30 @@ foreach $arg (@ARGV) #if $arg eq "--hipcc_profile") { # Example argument here, hipcc # #} + if ($arg eq "--hipcc-func-supp") { + $funcSupp = 1; + } elsif ($arg eq "--hipcc-no-func-supp") { + $funcSupp = 0; + } } else { push (@options, $arg); } #print "O: <$arg>\n"; } else { # input files and libraries - if (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.c$/) or ($arg =~ /\.cc$/) ) { + if ($arg =~ /\.c$/) { $hasC = 1; + $needCFLAGS = 1; + $toolArgs .= " -x c" + } + elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) { + $hasCXX = 1; $needCXXFLAGS = 1; - if ($HIP_PLATFORM eq 'clang') { + if ($HIP_PLATFORM eq 'clang' and not $arg =~ /\.c$/) { $toolArgs .= " -x hip" } } - if (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) { + elsif (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) { $hasCU = 1; $needCXXFLAGS = 1; if ($HIP_PLATFORM eq 'clang') { @@ -657,7 +679,7 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){ my $archMacro = ' -D__HIP_ARCH_' . uc($val) . '__=1 '; # Add the arch option and macro to the compiler options. $GPU_ARCH_ARG = $GPU_ARCH_OPT . $val; - $HIPLDFLAGS .= $GPU_ARCH_ARG; + $HIPLDARCHFLAGS .= $GPU_ARCH_ARG; $HIPCXXFLAGS .= $archMacro; if ($HIP_PLATFORM eq 'clang') { $HIPCXXFLAGS .= $GPU_ARCH_ARG; @@ -685,7 +707,7 @@ if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') { $HIPCXXFLAGS .= " -mcode-object-v3"; } -if ($hasC and $HIP_PLATFORM eq 'nvcc') { +if ($hasCXX and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -x cu"; } if ($hasCU and $HIP_PLATFORM eq 'hcc') { @@ -694,6 +716,7 @@ if ($hasCU and $HIP_PLATFORM eq 'hcc') { if ($buildDeps and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -M -D__CUDACC__"; + $HIPCFLAGS .= " -M -D__CUDACC__"; } if ($buildDeps and $HIP_PLATFORM eq 'clang') { @@ -701,10 +724,14 @@ if ($buildDeps and $HIP_PLATFORM eq 'clang') { } # Add --hip-link only if there are no source files. -if (!$needCXXFLAGS and $HIP_PLATFORM eq 'clang') { +if (!$needCXXFLAGS and !$needCFLAGS and $HIP_PLATFORM eq 'clang') { $HIPLDFLAGS .= " --hip-link"; } +if (!$needCFLAGS and $HIP_PLATFORM eq 'clang') { + $HIPLDFLAGS .= $HIPLDARCHFLAGS; +} + if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc') { $HIPCXXFLAGS .= $HCC_WA_FLAGS; @@ -727,11 +754,12 @@ if ($HIP_PLATFORM eq "clang") { # Set default optimization level to -O3 for hip-clang. if ($optArg eq "") { $HIPCXXFLAGS .= " -O3"; + $HIPCFLAGS .= " -O3"; $HIPLDFLAGS .= " -O3"; } # Do not pass -mllvm on Windows since there is a clang bug causing duplicate -mllvm options in clang -cc1 on Windows. # ToDo : remove restriction for Windows after clang bug is fixed. - if ($optArg ne "-O0" and not $isWindows) { + if (!$funcSupp and $optArg ne "-O0" and not $isWindows) { $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; if ($needLDFLAGS and not $needCXXFLAGS) { $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; @@ -747,18 +775,22 @@ if ($HIP_PLATFORM eq "clang") { if ($HIPCC_COMPILE_FLAGS_APPEND) { $HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND"; + $HIPCFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND"; } if ($HIPCC_LINK_FLAGS_APPEND) { $HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND"; } my $CMD="$HIPCC"; -if ($needCXXFLAGS) { - $CMD .= " $HIPCXXFLAGS"; -} if ($needLDFLAGS and not $compileOnly) { $CMD .= " $HIPLDFLAGS"; } +if ($needCFLAGS) { + $CMD .= " $HIPCFLAGS"; +} +if ($needCXXFLAGS) { + $CMD .= " $HIPCXXFLAGS"; +} $CMD .= " $toolArgs"; if ($verbose & 0x1) { diff --git a/projects/hip/bin/hipify-perl b/projects/hip/bin/hipify-perl index 62cd8436f8..4783214836 100755 --- a/projects/hip/bin/hipify-perl +++ b/projects/hip/bin/hipify-perl @@ -341,8 +341,8 @@ sub simpleSubstitutions { $ft{'execution'} += s/\bcudaLaunchCooperativeKernelMultiDevice\b/hipLaunchCooperativeKernelMultiDevice/g; $ft{'execution'} += s/\bcudaLaunchKernel\b/hipLaunchKernel/g; $ft{'execution'} += s/\bcudaSetupArgument\b/hipSetupArgument/g; - $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g; - $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g; + $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessor/g; + $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g; $ft{'occupancy'} += s/\bcuOccupancyMaxPotentialBlockSize\b/hipOccupancyMaxPotentialBlockSize/g; $ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g; $ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g; @@ -754,6 +754,9 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseCcsrmm\b/hipsparseCcsrmm/g; $ft{'library'} += s/\bcusparseCcsrmm2\b/hipsparseCcsrmm2/g; $ft{'library'} += s/\bcusparseCcsrmv\b/hipsparseCcsrmv/g; + $ft{'library'} += s/\bcusparseCcsrsm2_analysis\b/hipsparseCcsrsm2_analysis/g; + $ft{'library'} += s/\bcusparseCcsrsm2_bufferSizeExt\b/hipsparseCcsrsm2_bufferSizeExt/g; + $ft{'library'} += s/\bcusparseCcsrsm_solve\b/hipsparseCcsrsm_solve/g; $ft{'library'} += s/\bcusparseCcsrsv2_analysis\b/hipsparseCcsrsv2_analysis/g; $ft{'library'} += s/\bcusparseCcsrsv2_bufferSize\b/hipsparseCcsrsv2_bufferSize/g; $ft{'library'} += s/\bcusparseCcsrsv2_bufferSizeExt\b/hipsparseCcsrsv2_bufferSizeExt/g; @@ -763,9 +766,11 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseCgthr\b/hipsparseCgthr/g; $ft{'library'} += s/\bcusparseCgthrz\b/hipsparseCgthrz/g; $ft{'library'} += s/\bcusparseChybmv\b/hipsparseChybmv/g; + $ft{'library'} += s/\bcusparseCnnz\b/hipsparseCnnz/g; $ft{'library'} += s/\bcusparseCreate\b/hipsparseCreate/g; $ft{'library'} += s/\bcusparseCreateCsrgemm2Info\b/hipsparseCreateCsrgemm2Info/g; $ft{'library'} += s/\bcusparseCreateCsrilu02Info\b/hipsparseCreateCsrilu02Info/g; + $ft{'library'} += s/\bcusparseCreateCsrsm2Info\b/hipsparseCreateCsrsm2Info/g; $ft{'library'} += s/\bcusparseCreateCsrsv2Info\b/hipsparseCreateCsrsv2Info/g; $ft{'library'} += s/\bcusparseCreateHybMat\b/hipsparseCreateHybMat/g; $ft{'library'} += s/\bcusparseCreateIdentityPermutation\b/hipsparseCreateIdentityPermutation/g; @@ -784,6 +789,9 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseDcsrmm\b/hipsparseDcsrmm/g; $ft{'library'} += s/\bcusparseDcsrmm2\b/hipsparseDcsrmm2/g; $ft{'library'} += s/\bcusparseDcsrmv\b/hipsparseDcsrmv/g; + $ft{'library'} += s/\bcusparseDcsrsm2_analysis\b/hipsparseDcsrsm2_analysis/g; + $ft{'library'} += s/\bcusparseDcsrsm2_bufferSizeExt\b/hipsparseDcsrsm2_bufferSizeExt/g; + $ft{'library'} += s/\bcusparseDcsrsm_solve\b/hipsparseDcsrsm_solve/g; $ft{'library'} += s/\bcusparseDcsrsv2_analysis\b/hipsparseDcsrsv2_analysis/g; $ft{'library'} += s/\bcusparseDcsrsv2_bufferSize\b/hipsparseDcsrsv2_bufferSize/g; $ft{'library'} += s/\bcusparseDcsrsv2_bufferSizeExt\b/hipsparseDcsrsv2_bufferSizeExt/g; @@ -792,12 +800,14 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseDestroy\b/hipsparseDestroy/g; $ft{'library'} += s/\bcusparseDestroyCsrgemm2Info\b/hipsparseDestroyCsrgemm2Info/g; $ft{'library'} += s/\bcusparseDestroyCsrilu02Info\b/hipsparseDestroyCsrilu02Info/g; + $ft{'library'} += s/\bcusparseDestroyCsrsm2Info\b/hipsparseDestroyCsrsm2Info/g; $ft{'library'} += s/\bcusparseDestroyCsrsv2Info\b/hipsparseDestroyCsrsv2Info/g; $ft{'library'} += s/\bcusparseDestroyHybMat\b/hipsparseDestroyHybMat/g; $ft{'library'} += s/\bcusparseDestroyMatDescr\b/hipsparseDestroyMatDescr/g; $ft{'library'} += s/\bcusparseDgthr\b/hipsparseDgthr/g; $ft{'library'} += s/\bcusparseDgthrz\b/hipsparseDgthrz/g; $ft{'library'} += s/\bcusparseDhybmv\b/hipsparseDhybmv/g; + $ft{'library'} += s/\bcusparseDnnz\b/hipsparseDnnz/g; $ft{'library'} += s/\bcusparseDroti\b/hipsparseDroti/g; $ft{'library'} += s/\bcusparseDsctr\b/hipsparseDsctr/g; $ft{'library'} += s/\bcusparseGetMatDiagType\b/hipsparseGetMatDiagType/g; @@ -820,6 +830,9 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseScsrmm\b/hipsparseScsrmm/g; $ft{'library'} += s/\bcusparseScsrmm2\b/hipsparseScsrmm2/g; $ft{'library'} += s/\bcusparseScsrmv\b/hipsparseScsrmv/g; + $ft{'library'} += s/\bcusparseScsrsm2_analysis\b/hipsparseScsrsm2_analysis/g; + $ft{'library'} += s/\bcusparseScsrsm2_bufferSizeExt\b/hipsparseScsrsm2_bufferSizeExt/g; + $ft{'library'} += s/\bcusparseScsrsm_solve\b/hipsparseScsrsm_solve/g; $ft{'library'} += s/\bcusparseScsrsv2_analysis\b/hipsparseScsrsv2_analysis/g; $ft{'library'} += s/\bcusparseScsrsv2_bufferSize\b/hipsparseScsrsv2_bufferSize/g; $ft{'library'} += s/\bcusparseScsrsv2_bufferSizeExt\b/hipsparseScsrsv2_bufferSizeExt/g; @@ -834,6 +847,7 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseSgthr\b/hipsparseSgthr/g; $ft{'library'} += s/\bcusparseSgthrz\b/hipsparseSgthrz/g; $ft{'library'} += s/\bcusparseShybmv\b/hipsparseShybmv/g; + $ft{'library'} += s/\bcusparseSnnz\b/hipsparseSnnz/g; $ft{'library'} += s/\bcusparseSroti\b/hipsparseSroti/g; $ft{'library'} += s/\bcusparseSsctr\b/hipsparseSsctr/g; $ft{'library'} += s/\bcusparseXbsrilu02_zeroPivot\b/hipsparseXbsrilu02_zeroPivot/g; @@ -847,6 +861,7 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseXcsrgemm2Nnz\b/hipsparseXcsrgemm2Nnz/g; $ft{'library'} += s/\bcusparseXcsrgemmNnz\b/hipsparseXcsrgemmNnz/g; $ft{'library'} += s/\bcusparseXcsrilu02_zeroPivot\b/hipsparseXcsrilu02_zeroPivot/g; + $ft{'library'} += s/\bcusparseXcsrsm2_zeroPivot\b/hipsparseXcsrsm2_zeroPivot/g; $ft{'library'} += s/\bcusparseXcsrsort\b/hipsparseXcsrsort/g; $ft{'library'} += s/\bcusparseXcsrsort_bufferSizeExt\b/hipsparseXcsrsort_bufferSizeExt/g; $ft{'library'} += s/\bcusparseXcsrsv2_zeroPivot\b/hipsparseXcsrsv2_zeroPivot/g; @@ -863,6 +878,9 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseZcsrmm\b/hipsparseZcsrmm/g; $ft{'library'} += s/\bcusparseZcsrmm2\b/hipsparseZcsrmm2/g; $ft{'library'} += s/\bcusparseZcsrmv\b/hipsparseZcsrmv/g; + $ft{'library'} += s/\bcusparseZcsrsm2_analysis\b/hipsparseZcsrsm2_analysis/g; + $ft{'library'} += s/\bcusparseZcsrsm2_bufferSizeExt\b/hipsparseZcsrsm2_bufferSizeExt/g; + $ft{'library'} += s/\bcusparseZcsrsm_solve\b/hipsparseZcsrsm_solve/g; $ft{'library'} += s/\bcusparseZcsrsv2_analysis\b/hipsparseZcsrsv2_analysis/g; $ft{'library'} += s/\bcusparseZcsrsv2_bufferSize\b/hipsparseZcsrsv2_bufferSize/g; $ft{'library'} += s/\bcusparseZcsrsv2_bufferSizeExt\b/hipsparseZcsrsv2_bufferSizeExt/g; @@ -872,6 +890,7 @@ sub simpleSubstitutions { $ft{'library'} += s/\bcusparseZgthr\b/hipsparseZgthr/g; $ft{'library'} += s/\bcusparseZgthrz\b/hipsparseZgthrz/g; $ft{'library'} += s/\bcusparseZhybmv\b/hipsparseZhybmv/g; + $ft{'library'} += s/\bcusparseZnnz\b/hipsparseZnnz/g; $ft{'library'} += s/\bcusparseZsctr\b/hipsparseZsctr/g; $ft{'device_library'} += s/\bcurand\b/hiprand/g; $ft{'device_library'} += s/\bcurand_discrete\b/hiprand_discrete/g; @@ -997,6 +1016,8 @@ sub simpleSubstitutions { $ft{'type'} += s/\bcsrgemm2Info\b/csrgemm2Info/g; $ft{'type'} += s/\bcsrgemm2Info_t\b/csrgemm2Info_t/g; $ft{'type'} += s/\bcsrilu02Info_t\b/csrilu02Info_t/g; + $ft{'type'} += s/\bcsrsm2Info\b/csrsm2Info/g; + $ft{'type'} += s/\bcsrsm2Info_t\b/csrsm2Info_t/g; $ft{'type'} += s/\bcsrsv2Info_t\b/csrsv2Info_t/g; $ft{'type'} += s/\bcuComplex\b/hipComplex/g; $ft{'type'} += s/\bcuDoubleComplex\b/hipDoubleComplex/g; @@ -1130,6 +1151,7 @@ sub simpleSubstitutions { $ft{'type'} += s/\bcurandStatus_t\b/hiprandStatus_t/g; $ft{'type'} += s/\bcusparseAction_t\b/hipsparseAction_t/g; $ft{'type'} += s/\bcusparseDiagType_t\b/hipsparseDiagType_t/g; + $ft{'type'} += s/\bcusparseDirection_t\b/hipsparseDirection_t/g; $ft{'type'} += s/\bcusparseFillMode_t\b/hipsparseFillMode_t/g; $ft{'type'} += s/\bcusparseHandle_t\b/hipsparseHandle_t/g; $ft{'type'} += s/\bcusparseHybMat_t\b/hipsparseHybMat_t/g; @@ -1398,6 +1420,8 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bCUSPARSE_ACTION_SYMBOLIC\b/HIPSPARSE_ACTION_SYMBOLIC/g; $ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_NON_UNIT\b/HIPSPARSE_DIAG_TYPE_NON_UNIT/g; $ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_UNIT\b/HIPSPARSE_DIAG_TYPE_UNIT/g; + $ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_COLUMN\b/HIPSPARSE_DIRECTION_COLUMN/g; + $ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_ROW\b/HIPSPARSE_DIRECTION_ROW/g; $ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_LOWER\b/HIPSPARSE_FILL_MODE_LOWER/g; $ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_UPPER\b/HIPSPARSE_FILL_MODE_UPPER/g; $ft{'numeric_literal'} += s/\bCUSPARSE_HYB_PARTITION_AUTO\b/HIPSPARSE_HYB_PARTITION_AUTO/g; diff --git a/projects/hip/cmake/FindHIP.cmake b/projects/hip/cmake/FindHIP.cmake index 7edf27f3c7..0819a0364c 100644 --- a/projects/hip/cmake/FindHIP.cmake +++ b/projects/hip/cmake/FindHIP.cmake @@ -75,7 +75,6 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN) endif() # And push it back to the cache set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE) - message("Found HIP at ${HIP_ROOT_DIR}") endif() # Find HIPCC executable diff --git a/projects/hip/docs/markdown/CUSPARSE_API_supported_by_HIP.md b/projects/hip/docs/markdown/CUSPARSE_API_supported_by_HIP.md index fc7a8ee8cd..d23b06d307 100644 --- a/projects/hip/docs/markdown/CUSPARSE_API_supported_by_HIP.md +++ b/projects/hip/docs/markdown/CUSPARSE_API_supported_by_HIP.md @@ -12,9 +12,9 @@ | enum |***`cusparseAction_t`*** | |***`hipsparseAction_t`*** | | 0 |*`CUSPARSE_ACTION_SYMBOLIC`* | |*`HIPSPARSE_ACTION_SYMBOLIC`* | | 1 |*`CUSPARSE_ACTION_NUMERIC`* | |*`HIPSPARSE_ACTION_NUMERIC`* | -| enum |***`cusparseDirection_t`*** | | | -| 0 |*`CUSPARSE_DIRECTION_ROW`* | | | -| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | | | +| enum |***`cusparseDirection_t`*** | |***`hipsparseDirection_t`*** | +| 0 |*`CUSPARSE_DIRECTION_ROW`* | |*`HIPSPARSE_DIRECTION_ROW`* | +| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | |*`HIPSPARSE_DIRECTION_COLUMN`* | | enum |***`cusparseHybPartition_t`*** | |***`hipsparseHybPartition_t`*** | | 0 |*`CUSPARSE_HYB_PARTITION_AUTO`* | |*`HIPSPARSE_HYB_PARTITION_AUTO`* | | 1 |*`CUSPARSE_HYB_PARTITION_USER`* | |*`HIPSPARSE_HYB_PARTITION_USER`* | @@ -69,8 +69,8 @@ | typedef |`cusparseSolveAnalysisInfo_t` | | | | struct |`csrsv2Info` | | | | typedef |`csrsv2Info_t` | |`csrsv2Info_t` | -| struct |`csrsm2Info` | 9.2 | | -| typedef |`csrsm2Info_t` | | | +| struct |`csrsm2Info` | 9.2 |`csrsm2Info` | +| typedef |`csrsm2Info_t` | |`csrsm2Info_t` | | struct |`bsrsv2Info` | | | | typedef |`bsrsv2Info_t` | | | | struct |`bsrsm2Info` | | | @@ -151,8 +151,8 @@ |`cusparseGetStream` |`hipsparseGetStream` | 8.0 | |`cusparseCreateCsrsv2Info` |`hipsparseCreateCsrsv2Info` | |`cusparseDestroyCsrsv2Info` |`hipsparseDestroyCsrsv2Info` | -|`cusparseCreateCsrsm2Info` | | 9.2 | -|`cusparseDestroyCsrsm2Info` | | 9.2 | +|`cusparseCreateCsrsm2Info` |`hipsparseCreateCsrsm2Info` | 9.2 | +|`cusparseDestroyCsrsm2Info` |`hipsparseDestroyCsrsm2Info` | 9.2 | |`cusparseCreateCsric02Info` | | |`cusparseDestroyCsric02Info` | | |`cusparseCreateCsrilu02Info` |`hipsparseCreateCsrilu02Info` | @@ -306,19 +306,19 @@ |`cusparseDcsrsm_solve` | | |`cusparseCcsrsm_solve` | | |`cusparseZcsrsm_solve` | | -|`cusparseScsrsm2_bufferSizeExt` | | 9.2 | -|`cusparseDcsrsm2_bufferSizeExt` | | 9.2 | -|`cusparseCcsrsm2_bufferSizeExt` | | 9.2 | -|`cusparseZcsrsm2_bufferSizeExt` | | 9.2 | -|`cusparseScsrsm2_analysis` | | 9.2 | -|`cusparseDcsrsm2_analysis` | | 9.2 | -|`cusparseCcsrsm2_analysis` | | 9.2 | -|`cusparseZcsrsm2_analysis` | | 9.2 | -|`cusparseScsrsm2_solve` | | 9.2 | -|`cusparseDcsrsm2_solve` | | 9.2 | -|`cusparseCcsrsm2_solve` | | 9.2 | -|`cusparseZcsrsm2_solve` | | 9.2 | -|`cusparseXcsrsm2_zeroPivot` | | 9.2 | +|`cusparseScsrsm2_bufferSizeExt` |`hipsparseScsrsm2_bufferSizeExt` | 9.2 | +|`cusparseDcsrsm2_bufferSizeExt` |`hipsparseDcsrsm2_bufferSizeExt` | 9.2 | +|`cusparseCcsrsm2_bufferSizeExt` |`hipsparseCcsrsm2_bufferSizeExt` | 9.2 | +|`cusparseZcsrsm2_bufferSizeExt` |`hipsparseZcsrsm2_bufferSizeExt` | 9.2 | +|`cusparseScsrsm2_analysis` |`hipsparseScsrsm2_analysis` | 9.2 | +|`cusparseDcsrsm2_analysis` |`hipsparseDcsrsm2_analysis` | 9.2 | +|`cusparseCcsrsm2_analysis` |`hipsparseCcsrsm2_analysis` | 9.2 | +|`cusparseZcsrsm2_analysis` |`hipsparseZcsrsm2_analysis` | 9.2 | +|`cusparseScsrsm2_solve` |`hipsparseScsrsm2_solve` | 9.2 | +|`cusparseDcsrsm2_solve` |`hipsparseDcsrsm2_solve` | 9.2 | +|`cusparseCcsrsm2_solve` |`hipsparseCcsrsm2_solve` | 9.2 | +|`cusparseZcsrsm2_solve` |`hipsparseZcsrsm2_solve` | 9.2 | +|`cusparseXcsrsm2_zeroPivot` |`hipsparseXcsrsm2_zeroPivot` | 9.2 | |`cusparseSbsrmm` | | |`cusparseDbsrmm` | | |`cusparseCbsrmm` | | @@ -662,10 +662,10 @@ |`cusparseDhyb2dense` | | |`cusparseChyb2dense` | | |`cusparseZhyb2dense` | | -|`cusparseSnnz` | | -|`cusparseDnnz` | | -|`cusparseCnnz` | | -|`cusparseZnnz` | | +|`cusparseSnnz` |`cusparseSnnz` | +|`cusparseDnnz` |`cusparseDnnz` | +|`cusparseCnnz` |`cusparseCnnz` | +|`cusparseZnnz` |`cusparseZnnz` | |`cusparseCreateIdentityPermutation` |`hipsparseCreateIdentityPermutation` | |`cusparseXcoosort_bufferSizeExt` |`hipsparseXcoosort_bufferSizeExt` | |`cusparseXcoosortByRow` |`hipsparseXcoosortByRow` | diff --git a/projects/hip/docs/markdown/hip_debugging.md b/projects/hip/docs/markdown/hip_debugging.md index bf877d894e..fde17d410e 100644 --- a/projects/hip/docs/markdown/hip_debugging.md +++ b/projects/hip/docs/markdown/hip_debugging.md @@ -1,13 +1,13 @@ Table of Contents ================= - * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true">\n'); + f.write('#include \n'); + f.write('// HIP API string method, method name and parameters\n') + f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n') + f.write(' std::ostringstream oss;\n') + f.write(' switch (id) {\n') + for name, args in api_map.items(): + f.write(' case HIP_API_ID_' + name + ':\n') + f.write(' oss << "' + name + '("') + for ind in range(0, len(args)): + arg_tuple = args[ind] + arg_name = arg_tuple[1] + if ind != 0: f.write(' << ","') + f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name) + f.write('\n << ")";\n') + f.write(' break;\n') + f.write(' default: oss << "unknown";\n') + f.write(' };\n') + f.write(' return strdup(oss.str().c_str());\n') + f.write('};\n') + f.write('#endif // ENABLE_HIP_API_STRING\n') f.write('#endif // _HIP_PROF_STR_H\n'); diff --git a/projects/hip/hipify-clang/README.md b/projects/hip/hipify-clang/README.md index 88d7a72ccd..8cdeceace0 100644 --- a/projects/hip/hipify-clang/README.md +++ b/projects/hip/hipify-clang/README.md @@ -42,10 +42,9 @@ After applying all the matchers, the output HIP source is produced. `hipify-clang` requires: -1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3). +1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**10.0.0**](http://releases.llvm.org/download.html#10.0.0). -2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base). -To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3). +2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.2**](https://developer.nvidia.com/cuda-downloads). | **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** | |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:| @@ -66,15 +65,15 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download | [8.0.0](http://releases.llvm.org/download.html#8.0.0) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | -
not working due to
the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811)
+
[patch](patches/patch_for_clang_8.0.0_bug_38811.zip)*
| + | | [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | -
not working due to
the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811)
+
[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*
| + | | [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + | -| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | +
**LATEST STABLE RELEASE** | +
**LATEST STABLE RELEASE** | -| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | +| [9.0.1](http://releases.llvm.org/download.html#9.0.1) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + | +| [**10.0.0**](http://releases.llvm.org/download.html#10.0.0)| [**10.2**](https://developer.nvidia.com/cuda-downloads) | +
**LATEST STABLE RELEASE** | +
**LATEST STABLE RELEASE** | `*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed. In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager. Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set -[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist` +[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=d:\LLVM\10.0.0\dist` ### hipify-clang: usage @@ -83,14 +82,14 @@ To process a file, `hipify-clang` needs access to the same headers that would be For example: ```shell -./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc +./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.2 -I /usr/local/cuda-10.2/samples/common/inc ``` `hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you were compiling the input file. For example: ```bash -./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17 +./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.2 -- -std=c++17 ``` The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful. @@ -158,7 +157,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro **LLVM 10.0.0 or newer:** -1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources; +1. download [`LLVM project`](https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.0/llvm-project-10.0.0.tar.xz) sources; 2. build [`LLVM project`](http://llvm.org/docs/CMake.html): **Linux**: @@ -193,19 +192,19 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro * Having multiple CUDA installations to choose a particular version the `DCUDA_TOOLKIT_ROOT_DIR` option should be specified: - - ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1` + - ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2` - - ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1"` + - ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2"` - `-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1"` + `-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2"` 4. Ensure [`cuDNN`](https://developer.nvidia.com/rdp/cudnn-archive) of the version corresponding to CUDA's version is installed. * Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option: - - ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32` + - ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32` - - ***Windows***: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32` + - ***Windows***: `-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32` 5. Ensure [`CUB`](https://github.com/NVlabs/cub) of the version corresponding to CUDA's version is installed. @@ -213,7 +212,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro - ***Linux***: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB` - - ***Windows***: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub` + - ***Windows***: `-DCUDA_CUB_ROOT_DIR=d:/GIT/cub` 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed. @@ -221,21 +220,21 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro * Install `lit` into `python`: - - ***Linux***: `python /srv/git/LLVM/9.0.1/llvm/utils/lit/setup.py install` + - ***Linux***: `python /srv/git/LLVM/10.0.0/llvm/utils/lit/setup.py install` - - ***Windows***: `python f:/LLVM/9.0.1/llvm/utils/lit/setup.py install` + - ***Windows***: `python d:/LLVM/10.0.0/llvm/utils/lit/setup.py install` * Starting with LLVM 6.0.1 path to `llvm-lit` python script should be specified by the `LLVM_EXTERNAL_LIT` option: - - ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit` + - ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit` - - ***Windows***: `-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py` + - ***Windows***: `-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py` * `FileCheck`: - - ***Linux***: copy from `/srv/git/LLVM/9.0.1/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin` + - ***Linux***: copy from `/srv/git/LLVM/10.0.0/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin` - - ***Windows***: copy from `f:/LLVM/9.0.1/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin` + - ***Windows***: copy from `d:/LLVM/10.0.0/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin` - Or specify the path to `FileCheck` in `CMAKE_INSTALL_PREFIX` option @@ -249,7 +248,7 @@ On Linux the following configurations are tested: Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32 -Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 +Ubuntu 16-18: LLVM 8.0.0 - 10.0.0, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 Minimum build system requirements for the above configurations: @@ -262,11 +261,11 @@ cmake -DHIPIFY_CLANG_TESTS=1 \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=../dist \ - -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.1/dist \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \ - -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32 \ + -DCMAKE_PREFIX_PATH=/srv/git/LLVM/10.0.0/dist \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2 \ + -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32 \ -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \ - -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit \ + -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit \ .. ``` *A corresponding successful output:* @@ -285,14 +284,14 @@ cmake -- Detecting CXX compiler ABI info - done -- Detecting CXX compile features -- Detecting CXX compile features - done --- Found LLVM 9.0.1: --- - CMake module path: /srv/git/LLVM/9.0.1/dist/lib/cmake/llvm --- - Include path : /srv/git/LLVM/9.0.1/dist/include --- - Binary path : /srv/git/LLVM/9.0.1/dist/bin +-- Found LLVM 10.0.0: +-- - CMake module path: /srv/git/LLVM/10.0.0/dist/lib/cmake/llvm +-- - Include path : /srv/git/LLVM/10.0.0/dist/include +-- - Binary path : /srv/git/LLVM/10.0.0/dist/bin -- Linker detection: GNU ld -- Found PythonInterp: /usr/bin/python2.7 (found suitable version "2.7.12", minimum required is "2.7") -- Found lit: /usr/local/bin/lit --- Found FileCheck: /srv/git/LLVM/9.0.1/dist/bin/FileCheck +-- Found FileCheck: /srv/git/LLVM/10.0.0/dist/bin/FileCheck -- Looking for pthread.h -- Looking for pthread.h - found -- Looking for pthread_create @@ -302,7 +301,7 @@ cmake -- Looking for pthread_create in pthread -- Looking for pthread_create in pthread - found -- Found Threads: TRUE --- Found CUDA: /usr/local/cuda-10.1 (found version "10.1") +-- Found CUDA: /usr/local/cuda-10.2 (found version "10.2") -- Configuring done -- Generating done -- Build files have been written to: /srv/git/HIP/hipify-clang/build @@ -314,83 +313,85 @@ make test-hipify ```shell Running HIPify regression tests ======================================== -CUDA 10.1 - will be used for testing -LLVM 9.0.1 - will be used for testing +CUDA 10.2 - will be used for testing +LLVM 10.0.0 - will be used for testing x86_64 - Platform architecture Linux 5.2.0 - Platform OS 64 - hipify-clang binary bitness 64 - python 2.7.12 binary bitness ======================================== --- Testing: 67 tests, 12 threads -- -PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 67) -PASS: hipify :: unit_tests/device/math_functions.cu (2 of 67) -PASS: hipify :: unit_tests/device/atomics.cu (3 of 67) -PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 67) -PASS: hipify :: unit_tests/headers/headers_test_01.cu (5 of 67) -PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 67) -PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 67) -PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 67) -PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 67) -PASS: hipify :: unit_tests/headers/headers_test_06.cu (10 of 67) -PASS: hipify :: unit_tests/headers/headers_test_07.cu (11 of 67) -PASS: hipify :: unit_tests/headers/headers_test_10.cu (12 of 67) -PASS: hipify :: unit_tests/headers/headers_test_11.cu (13 of 67) -PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 67) -PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 67) -PASS: hipify :: unit_tests/headers/headers_test_09.cu (16 of 67) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (17 of 67) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 67) -PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 67) -PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 67) -PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (24 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (25 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 67) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 67) -PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 67) -PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (29 of 67) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (30 of 67) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 67) -PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (33 of 67) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (34 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 67) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (40 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (41 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 67) -PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (45 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (46 of 67) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 67) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 67) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (50 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (51 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (52 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (53 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (54 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (55 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (56 of 67) -PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (57 of 67) -PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 67) -PASS: hipify :: unit_tests/samples/allocators.cu (59 of 67) -PASS: hipify :: unit_tests/samples/coalescing.cu (60 of 67) -PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (61 of 67) -PASS: hipify :: unit_tests/samples/axpy.cu (62 of 67) -PASS: hipify :: unit_tests/samples/intro.cu (63 of 67) -PASS: hipify :: unit_tests/samples/cudaRegister.cu (64 of 67) -PASS: hipify :: unit_tests/samples/square.cu (65 of 67) -PASS: hipify :: unit_tests/samples/static_shared_memory.cu (66 of 67) -PASS: hipify :: unit_tests/samples/vec_add.cu (67 of 67) -Testing Time: 3.07s - Expected Passes : 67 +-- Testing: 69 tests, 12 threads -- +PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 69) +PASS: hipify :: unit_tests/device/math_functions.cu (2 of 69) +PASS: hipify :: unit_tests/device/atomics.cu (3 of 69) +PASS: hipify :: unit_tests/headers/headers_test_01.cu (4 of 69) +PASS: hipify :: unit_tests/device/device_symbols.cu (5 of 69) +PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 69) +PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 69) +PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 69) +PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 69) +PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 69) +PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 69) +PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 69) +PASS: hipify :: unit_tests/headers/headers_test_10.cu (13 of 69) +PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 69) +PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 69) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 69) +PASS: hipify :: unit_tests/headers/headers_test_09.cu (17 of 69) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 69) +PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 69) +PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 69) +PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (24 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (25 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 69) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 69) +PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 69) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (29 of 69) +PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (30 of 69) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 69) +PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 69) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (33 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (34 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 69) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (40 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (41 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (44 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (45 of 69) +PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (46 of 69) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 69) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 69) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 69) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01_LLVM_10.cu (50 of 69) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_LLVM_10.cu (51 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (52 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (53 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (54 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (55 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (56 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (57 of 69) +PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (59 of 69) +PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (60 of 69) +PASS: hipify :: unit_tests/samples/allocators.cu (61 of 69) +PASS: hipify :: unit_tests/samples/coalescing.cu (62 of 69) +PASS: hipify :: unit_tests/samples/axpy.cu (63 of 69) +PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (64 of 69) +PASS: hipify :: unit_tests/samples/cudaRegister.cu (65 of 69) +PASS: hipify :: unit_tests/samples/intro.cu (66 of 69) +PASS: hipify :: unit_tests/samples/square.cu (67 of 69) +PASS: hipify :: unit_tests/samples/static_shared_memory.cu (68 of 69) +PASS: hipify :: unit_tests/samples/vec_add.cu (69 of 69) +Testing Time: 3.23s + Expected Passes : 69 [100%] Built target test-hipify ``` ### hipify-clang: Windows @@ -404,8 +405,8 @@ Testing Time: 3.07s | 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 | | 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 | | 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 | -| 10.0.0-rc1-rc3 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | -| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.5 | 3.8.2 | +| 10.0.0 | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 | +| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 | *Building with testing support on `Windows 10` by `Visual Studio 16 2019`:* @@ -416,28 +417,28 @@ cmake -DHIPIFY_CLANG_TESTS=1 \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=../dist \ - -DCMAKE_PREFIX_PATH=f:/LLVM/9.0.1/dist \ - -DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1" \ - -DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1" \ - -DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32 \ - -DCUDA_CUB_ROOT_DIR=f:/GIT/cub \ - -DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py \ + -DCMAKE_PREFIX_PATH=d:/LLVM/10.0.0/dist \ + -DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2" \ + -DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2" \ + -DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32 \ + -DCUDA_CUB_ROOT_DIR=d:/GIT/cub \ + -DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py \ -Thost=x64 .. ``` *A corresponding successful output:* ```shell --- Found LLVM 9.0.1: --- - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm --- - Include path : F:/LLVM/9.0.1/dist/include --- - Binary path : F:/LLVM/9.0.1/dist/bin --- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6") --- Found lit: C:/Program Files/Python38/Scripts/lit.exe --- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe --- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1") +-- Found LLVM 10.0.0: +-- - CMake module path: d:/LLVM/10.0.0/dist/lib/cmake/llvm +-- - Include path : d:/LLVM/10.0.0/dist/include +-- - Binary path : d:/LLVM/10.0.0/dist/bin +-- Found PythonInterp: c:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6") +-- Found lit: c:/Program Files/Python38/Scripts/lit.exe +-- Found FileCheck: d:/LLVM/10.0.0/dist/bin/FileCheck.exe +-- Found CUDA: c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2 (found version "10.2") -- Configuring done -- Generating done --- Build files have been written to: f:/HIP/hipify-clang/build +-- Build files have been written to: d:/HIP/hipify-clang/build ``` Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`. diff --git a/projects/hip/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp b/projects/hip/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp index ab07a10e93..7be0fd0f3d 100644 --- a/projects/hip/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp +++ b/projects/hip/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp @@ -545,9 +545,9 @@ const std::map CUDA_DRIVER_FUNCTION_MAP{ // 5.21. Occupancy // cudaOccupancyMaxActiveBlocksPerMultiprocessor - {"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}}, + {"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}}, + {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags","", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxPotentialBlockSize {"cuOccupancyMaxPotentialBlockSize", {"hipOccupancyMaxPotentialBlockSize", "", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxPotentialBlockSizeWithFlags diff --git a/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp b/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp index 1df1bb9cba..0f3997145e 100644 --- a/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp +++ b/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_functions.cpp @@ -49,8 +49,8 @@ const std::map CUDA_SPARSE_FUNCTION_MAP{ {"cusparseGetStream", {"hipsparseGetStream", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseCreateCsrsv2Info", {"hipsparseCreateCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseDestroyCsrsv2Info", {"hipsparseDestroyCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}}, - {"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseCreateCsric02Info", {"hipsparseCreateCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseDestroyCsric02Info", {"hipsparseDestroyCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseCreateCsrilu02Info", {"hipsparseCreateCsrilu02Info", "", CONV_LIB_FUNC, API_SPARSE}}, @@ -218,27 +218,27 @@ const std::map CUDA_SPARSE_FUNCTION_MAP{ {"cusparseCcsrsm_analysis", {"hipsparseCcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseZcsrsm_analysis", {"hipsparseZcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}}, - {"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE,}}, + {"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}}, - {"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseScsrsm2_solve", {"hipsparseScsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseDcsrsm2_solve", {"hipsparseDcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseCcsrsm2_solve", {"hipsparseCcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseZcsrsm2_solve", {"hipsparseZcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseSbsrmm", {"hipsparseSbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseDbsrmm", {"hipsparseDbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, @@ -619,10 +619,10 @@ const std::map CUDA_SPARSE_FUNCTION_MAP{ {"cusparseChyb2dense", {"hipsparseChyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, {"cusparseZhyb2dense", {"hipsparseZhyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, - {"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE}}, + {"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE}}, {"cusparseCreateIdentityPermutation", {"hipsparseCreateIdentityPermutation", "", CONV_LIB_FUNC, API_SPARSE}}, diff --git a/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_types.cpp b/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_types.cpp index 1d3fe28c62..aae85a50d3 100644 --- a/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_types.cpp +++ b/projects/hip/hipify-clang/src/CUDA2HIP_SPARSE_API_types.cpp @@ -41,8 +41,8 @@ const std::map CUDA_SPARSE_TYPE_NAME_MAP{ {"csrsv2Info", {"csrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, {"csrsv2Info_t", {"csrsv2Info_t", "", CONV_TYPE, API_SPARSE}}, - {"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, - {"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, + {"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE}}, + {"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE}}, {"bsrsv2Info", {"bsrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, {"bsrsv2Info_t", {"bsrsv2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, @@ -88,9 +88,9 @@ const std::map CUDA_SPARSE_TYPE_NAME_MAP{ {"CUSPARSE_ACTION_SYMBOLIC", {"HIPSPARSE_ACTION_SYMBOLIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}}, {"CUSPARSE_ACTION_NUMERIC", {"HIPSPARSE_ACTION_NUMERIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}}, - {"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}}, - {"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}}, - {"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}}, + {"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE}}, + {"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE}}, + {"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE}}, {"cusparseHybPartition_t", {"hipsparseHybPartition_t", "", CONV_TYPE, API_SPARSE}}, {"CUSPARSE_HYB_PARTITION_AUTO", {"HIPSPARSE_HYB_PARTITION_AUTO", "", CONV_NUMERIC_LITERAL, API_SPARSE}}, diff --git a/projects/hip/include/hip/hcc_detail/device_functions.h b/projects/hip/include/hip/hcc_detail/device_functions.h index e6549dde0d..0a775df275 100644 --- a/projects/hip/include/hip/hcc_detail/device_functions.h +++ b/projects/hip/include/hip/hcc_detail/device_functions.h @@ -128,7 +128,7 @@ __device__ static int __mul24(int x, int y); __device__ static long long int __mul64hi(long long int x, long long int y); __device__ static int __mulhi(int x, int y); __device__ static int __rhadd(int x, int y); -__device__ static unsigned int __sad(int x, int y, int z); +__device__ static unsigned int __sad(int x, int y,unsigned int z); __device__ static unsigned int __uhadd(unsigned int x, unsigned int y); __device__ static int __umul24(unsigned int x, unsigned int y); __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y); @@ -199,7 +199,7 @@ __device__ static inline int __rhadd(int x, int y) { int value = z & 0x7FFFFFFF; return ((value) >> 1 || sign); } -__device__ static inline unsigned int __sad(int x, int y, int z) { +__device__ static inline unsigned int __sad(int x, int y, unsigned int z) { return x > y ? x - y + z : y - x + z; } __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) { @@ -230,7 +230,7 @@ __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) { return (x + y + 1) >> 1; } __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) { - return __ockl_sad_u32(x, y, z); + return __ockl_sadd_u32(x, y, z); } __device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); } @@ -563,7 +563,7 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize) return tmp1; #else static_assert(sizeof(long) == sizeof(int), ""); - return static_cast(__shfl_down(static_cast(var), lane_mask, width)); + return static_cast(__shfl_xor(static_cast(var), lane_mask, width)); #endif } __device__ diff --git a/projects/hip/include/hip/hcc_detail/device_library_decls.h b/projects/hip/include/hip/hcc_detail/device_library_decls.h index ac35823cd2..2eadb86774 100644 --- a/projects/hip/include/hip/hcc_detail/device_library_decls.h +++ b/projects/hip/include/hip/hcc_detail/device_library_decls.h @@ -44,7 +44,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint); extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int); extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint); extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int); -extern "C" __device__ __attribute__((const)) uint __ockl_sad_u32(uint, uint, uint); +extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint); extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar); extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort); @@ -72,6 +72,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank( extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); +extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float); // Introduce local address space #define __local __attribute__((address_space(3))) diff --git a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp index cf4422070f..a2ee601e3e 100644 --- a/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/projects/hip/include/hip/hcc_detail/functional_grid_launch.hpp @@ -37,14 +37,15 @@ THE SOFTWARE. hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, hip_impl::program_state& ps); -hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, - unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps); - -hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, - unsigned int flags, hip_impl::program_state& ps); - - +hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, + dim3 blockDim, void** args, + size_t sharedMem, hipStream_t stream, + hip_impl::program_state& ps); +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, + unsigned int flags, + hip_impl::program_state& ps); #pragma GCC visibility push(hidden) @@ -192,22 +193,24 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, template inline __attribute__((visibility("hidden"))) -hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams, - unsigned int sharedMemBytes, hipStream_t stream) { - +hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim, + void** args, size_t sharedMem, + hipStream_t stream) { hip_impl::hip_init(); auto& ps = hip_impl::get_program_state(); - return ihipLaunchCooperativeKernel(reinterpret_cast(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps); + return hipLaunchCooperativeKernel(reinterpret_cast(f), gridDim, + blockDim, args, sharedMem, stream, ps); } inline __attribute__((visibility("hidden"))) -hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, - unsigned int flags) { +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, + unsigned int flags) { hip_impl::hip_init(); auto& ps = hip_impl::get_program_state(); - return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps); + return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps); } #pragma GCC visibility pop diff --git a/projects/hip/include/hip/hcc_detail/hip_atomic.h b/projects/hip/include/hip/hcc_detail/hip_atomic.h index 263f639e96..d00ebcdabb 100644 --- a/projects/hip/include/hip/hcc_detail/hip_atomic.h +++ b/projects/hip/include/hip/hcc_detail/hip_atomic.h @@ -73,6 +73,14 @@ float atomicAdd(float* address, float val) return __uint_as_float(r); } + +__device__ +inline +void atomicAddNoRet(float* address, float val) +{ + __ockl_atomic_add_noret_f32(address, val); +} + __device__ inline double atomicAdd(double* address, double val) diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16.h b/projects/hip/include/hip/hcc_detail/hip_fp16.h index 3fa18dae2e..6fa86e94b9 100644 --- a/projects/hip/include/hip/hcc_detail/hip_fp16.h +++ b/projects/hip/include/hip/hcc_detail/hip_fp16.h @@ -1268,6 +1268,13 @@ THE SOFTWARE. static_cast<__half_raw>(x).data + static_cast<__half_raw>(y).data}; } + inline + __device__ + __half __habs(__half x) + { + return __half_raw{ + __ocml_fabs_f16(static_cast<__half_raw>(x).data)}; + } inline __device__ __half __hsub(__half x, __half y) @@ -1334,6 +1341,13 @@ THE SOFTWARE. static_cast<__half2_raw>(x).data + static_cast<__half2_raw>(y).data}; } + inline + __device__ + __half2 __habs2(__half2 x) + { + return __half2_raw{ + __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)}; + } inline __device__ __half2 __hsub2(__half2 x, __half2 y) diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16_math_fwd.h b/projects/hip/include/hip/hcc_detail/hip_fp16_math_fwd.h index eeb617c40b..95403e6ca8 100644 --- a/projects/hip/include/hip/hcc_detail/hip_fp16_math_fwd.h +++ b/projects/hip/include/hip/hcc_detail/hip_fp16_math_fwd.h @@ -38,6 +38,7 @@ extern "C" __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16); + __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16); __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16); __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16); __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); @@ -58,6 +59,7 @@ extern "C" #endif __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); + __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16); __device__ __2f16 __ocml_cos_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index a3a0963ba9..fdb61e70d3 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -504,9 +504,14 @@ hc_get_workitem_absolute_id(int dim) #define __CUDA__ #include <__clang_cuda_math_forward_declares.h> #include <__clang_cuda_complex_builtins.h> -#include -#include -#include +// Workaround for using libc++ with HIP-Clang. +// The following headers requires clang include path before standard C++ include path. +// However libc++ include path requires to be before clang include path. +// To workaround this, we pass -isystem with the parent directory of clang include +// path instead of the clang include path itself. +#include +#include +#include #undef __CUDA__ #pragma pop_macro("__CUDA__") #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 67bd2486d0..206a2e5835 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -55,7 +55,7 @@ THE SOFTWARE. #define DEPRECATED(msg) __attribute__ ((deprecated(msg))) #endif // !defined(_MSC_VER) -#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases.For more details please refer https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list" +#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md" #if defined(__HCC__) && (__hcc_workweek__ < 16155) #error("This version of HIP requires a newer version of HCC."); diff --git a/projects/hip/include/hip/hcc_detail/hip_vector_types.h b/projects/hip/include/hip/hcc_detail/hip_vector_types.h index 39457795ae..19259a3657 100644 --- a/projects/hip/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/hip/include/hip/hcc_detail/hip_vector_types.h @@ -34,7 +34,7 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#if !defined(_MSC_VER) || __clang__ +#if defined(__has_attribute) #if __has_attribute(ext_vector_type) #define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n))) #else @@ -694,7 +694,7 @@ THE SOFTWARE. typename U = T, typename std::enable_if{}>::type* = nullptr> inline __host__ __device__ - HIP_vector_type operator-() noexcept + HIP_vector_type operator-() const noexcept { auto tmp(*this); tmp.data = -tmp.data; @@ -705,7 +705,7 @@ THE SOFTWARE. typename U = T, typename std::enable_if{}>::type* = nullptr> inline __host__ __device__ - HIP_vector_type operator~() noexcept + HIP_vector_type operator~() const noexcept { HIP_vector_type r{*this}; r.data = ~r.data; @@ -1241,7 +1241,9 @@ DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1); DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2); DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3); DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4); -#else // defined(_MSC_VER) +#else // !defined(__has_attribute) + +#if defined(_MSC_VER) #include #include #include @@ -1347,5 +1349,92 @@ typedef union { double4 data; } double3; typedef union { __m256d data[2]; } double8; typedef union { __m256d data[4]; } double16; +#else // !defined(_MSC_VER) + +typedef union { char data; } char1; +typedef union { char data[2]; } char2; +typedef union { char data[4]; } char4; +typedef union { char data[8]; } char8; +typedef union { char data[16]; } char16; +typedef union { char4 data; } char3; + +typedef union { unsigned char data; } uchar1; +typedef union { unsigned char data[2]; } uchar2; +typedef union { unsigned char data[4]; } uchar4; +typedef union { unsigned char data[8]; } uchar8; +typedef union { unsigned char data[16]; } uchar16; +typedef union { uchar4 data; } uchar3; + +typedef union { short data; } short1; +typedef union { short data[2]; } short2; +typedef union { short data[4]; } short4; +typedef union { short data[8]; } short8; +typedef union { short data[16]; } short16; +typedef union { short4 data; } short3; + +typedef union { unsigned short data; } ushort1; +typedef union { unsigned short data[2]; } ushort2; +typedef union { unsigned short data[4]; } ushort4; +typedef union { unsigned short data[8]; } ushort8; +typedef union { unsigned short data[16]; } ushort16; +typedef union { ushort4 data; } ushort3; + +typedef union { int data; } int1; +typedef union { int data[2]; } int2; +typedef union { int data[4]; } int4; +typedef union { int data[8]; } int8; +typedef union { int data[16]; } int16; +typedef union { int4 data; } int3; + +typedef union { unsigned int data; } uint1; +typedef union { unsigned int data[2]; } uint2; +typedef union { unsigned int data[4]; } uint4; +typedef union { unsigned int data[8]; } uint8; +typedef union { unsigned int data[16]; } uint16; +typedef union { uint4 data; } uint3; + +typedef union { long data; } long1; +typedef union { long data[2]; } long2; +typedef union { long data[4]; } long4; +typedef union { long data[8]; } long8; +typedef union { long data[16]; } long16; +typedef union { long4 data; } long3; + +typedef union { unsigned long data; } ulong1; +typedef union { unsigned long data[2]; } ulong2; +typedef union { unsigned long data[4]; } ulong4; +typedef union { unsigned long data[8]; } ulong8; +typedef union { unsigned long data[16]; } ulong16; +typedef union { ulong4 data; } ulong3; + +typedef union { long long data; } longlong1; +typedef union { long long data[2]; } longlong2; +typedef union { long long data[4]; } longlong4; +typedef union { long long data[8]; } longlong8; +typedef union { long long data[16]; } longlong16; +typedef union { longlong4 data; } longlong3; + +typedef union { unsigned long long data; } ulonglong1; +typedef union { unsigned long long data[2]; } ulonglong2; +typedef union { unsigned long long data[4]; } ulonglong4; +typedef union { unsigned long long data[8]; } ulonglong8; +typedef union { unsigned long long data[16]; } ulonglong16; +typedef union { ulonglong4 data; } ulonglong3; + +typedef union { float data; } float1; +typedef union { float data[2]; } float2; +typedef union { float data[4]; } float4; +typedef union { float data[8]; } float8; +typedef union { float data[16]; } float16; +typedef union { float4 data; } float3; + +typedef union { double data; } double1; +typedef union { double data[2]; } double2; +typedef union { double data[4]; } double4; +typedef union { double data[8]; } double8; +typedef union { double data[16]; } double16; +typedef union { double4 data; } double3; + #endif // defined(_MSC_VER) +#endif // defined(__has_attribute) #endif diff --git a/projects/hip/include/hip/hcc_detail/hiprtc.h b/projects/hip/include/hip/hcc_detail/hiprtc.h index 624f1ea157..ec9c85716a 100644 --- a/projects/hip/include/hip/hcc_detail/hiprtc.h +++ b/projects/hip/include/hip/hcc_detail/hiprtc.h @@ -28,6 +28,8 @@ extern "C" { #include +#pragma GCC visibility push (default) + enum hiprtcResult { HIPRTC_SUCCESS = 0, HIPRTC_ERROR_OUT_OF_MEMORY = 1, @@ -79,6 +81,8 @@ hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code); hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet); +#pragma GCC visibility pop + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/projects/hip/include/hip/hcc_detail/host_defines.h b/projects/hip/include/hip/hcc_detail/host_defines.h index 11bd577f08..b21946e99f 100644 --- a/projects/hip/include/hip/hcc_detail/host_defines.h +++ b/projects/hip/include/hip/hcc_detail/host_defines.h @@ -60,7 +60,7 @@ THE SOFTWARE. */ // _restrict is supported by the compiler #define __shared__ tile_static -#define __constant__ __attribute__((hc)) +#define __constant__ __attribute__((hc, annotate("__HIP_constant__"))) #elif defined(__clang__) && defined(__HIP__) diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index cf6a64ad65..b0974aeef6 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -321,7 +321,6 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIntegrated, ///< iGPU hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 6e0d02d0c0..d9eb3e4146 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -186,6 +186,7 @@ typedef struct cudaArray hipArray; typedef struct cudaArray* hipArray_t; typedef struct cudaArray* hipArray_const_t; typedef struct cudaFuncAttributes hipFuncAttributes; +typedef struct cudaLaunchParams hipLaunchParams; #define hipFunction_attribute CUfunction_attribute #define hip_Memcpy2D CUDA_MEMCPY2D #define hipMemcpy3DParms cudaMemcpy3DParms @@ -860,7 +861,7 @@ inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, } -inline hipError_t hipMemcpyWithStream(void* dst, const void* src, +inline static hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind, hipStream_t stream) { cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, @@ -1134,6 +1135,10 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev p_prop->integrated = cdprop.integrated; p_prop->cooperativeLaunch = cdprop.cooperativeLaunch; p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch; + p_prop->cooperativeMultiDeviceUnmatchedFunc = 0; + p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0; + p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0; + p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0; p_prop->maxTexture1D = cdprop.maxTexture1D; p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0]; @@ -1271,6 +1276,12 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeEccEnabled: cdattr = cudaDevAttrEccEnabled; break; + case hipDeviceAttributeCooperativeLaunch: + cdattr = cudaDevAttrCooperativeLaunch; + break; + case hipDeviceAttributeCooperativeMultiDeviceLaunch: + cdattr = cudaDevAttrCooperativeMultiDeviceLaunch; + break; default: return hipCUDAErrorTohipError(cudaErrorInvalidValue); } @@ -1679,6 +1690,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_ return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array)); } +inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim, + void** kernelParams, unsigned int sharedMemBytes, + hipStream_t stream) { + return hipCUDAErrorTohipError( + cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream)); +} + +inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, unsigned int flags) { + return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags)); +} #ifdef __cplusplus } @@ -1686,6 +1708,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_ #ifdef __CUDACC__ +template +inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize) { + cudaError_t cerror; + cerror = + cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); + return hipCUDAErrorTohipError(cerror); +} + template inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func, size_t dynamicSMemSize = 0, @@ -1742,6 +1775,14 @@ template inline static hipChannelFormatDesc hipCreateChannelDesc() { return cudaCreateChannelDesc(); } + +template +inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, + void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { + return hipCUDAErrorTohipError( + cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream)); +} + #endif //__CUDACC__ #endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H diff --git a/projects/hip/lpl_ca/CMakeLists.txt b/projects/hip/lpl_ca/CMakeLists.txt index ac01a6a0ab..f626b88d89 100644 --- a/projects/hip/lpl_ca/CMakeLists.txt +++ b/projects/hip/lpl_ca/CMakeLists.txt @@ -14,7 +14,7 @@ install(TARGETS lpl RUNTIME DESTINATION bin) #-------------------------------------LPL--------------------------------------# #-------------------------------------CA---------------------------------------# -add_executable(ca ca.cpp ${PROJECT_SOURCE_DIR}/src/code_object_bundle.cpp) +add_executable(ca ca.cpp) set_target_properties( ca PROPERTIES CXX_STANDARD 11 diff --git a/projects/hip/lpl_ca/ca.hpp b/projects/hip/lpl_ca/ca.hpp index db63f02498..2d691cd38a 100644 --- a/projects/hip/lpl_ca/ca.hpp +++ b/projects/hip/lpl_ca/ca.hpp @@ -2,7 +2,7 @@ #include "common.hpp" -#include "../include/hip/hcc_detail/code_object_bundle.hpp" +#include "../src/code_object_bundle.inl" #include "clara/clara.hpp" diff --git a/projects/hip/packaging/hip-base.txt b/projects/hip/packaging/hip-base.txt index 4ff936dba4..fc8becf84f 100644 --- a/projects/hip/packaging/hip-base.txt +++ b/projects/hip/packaging/hip-base.txt @@ -25,16 +25,15 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR} set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0), llvm-amdgpu") -set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_base") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)") +set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base") -set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_base") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") -set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0, llvm-amdgpu") +set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_base") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_base") set(CPACK_BINARY_RPM "ON") diff --git a/projects/hip/packaging/hip-doc.txt b/projects/hip/packaging/hip-doc.txt index d97ddc7d3a..41db246d31 100644 --- a/projects/hip/packaging/hip-doc.txt +++ b/projects/hip/packaging/hip-doc.txt @@ -32,9 +32,8 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR} set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") -set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_doc") +set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc") -set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_doc") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") diff --git a/projects/hip/packaging/hip-hcc.txt b/projects/hip/packaging/hip-hcc.txt index 21e138e1ed..6a04ebffbd 100644 --- a/projects/hip/packaging/hip-hcc.txt +++ b/projects/hip/packaging/hip-hcc.txt @@ -37,9 +37,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") -set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc") +set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc") -set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") diff --git a/projects/hip/packaging/hip-nvcc.txt b/projects/hip/packaging/hip-nvcc.txt index dc36b628c7..4b11939609 100644 --- a/projects/hip/packaging/hip-nvcc.txt +++ b/projects/hip/packaging/hip-nvcc.txt @@ -19,9 +19,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)") -set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_nvcc") +set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc") -set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_nvcc") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") diff --git a/projects/hip/packaging/hip-samples.txt b/projects/hip/packaging/hip-samples.txt index 737f048d8f..1f3e088153 100644 --- a/projects/hip/packaging/hip-samples.txt +++ b/projects/hip/packaging/hip-samples.txt @@ -19,10 +19,9 @@ set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-vdi (= ${CPACK_PACKAGE_VERSION})") -set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_samples") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})") +set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples") -set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_samples") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") diff --git a/projects/hip/packaging/hip-targets.cmake b/projects/hip/packaging/hip-targets.cmake index ac72419f52..6f6957f4d6 100644 --- a/projects/hip/packaging/hip-targets.cmake +++ b/projects/hip/packaging/hip-targets.cmake @@ -114,8 +114,6 @@ set_target_properties(hip::device PROPERTIES else() set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host" - INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include" - INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include" ) endif() diff --git a/projects/hip/packaging/hip-vdi.txt b/projects/hip/packaging/hip-vdi.txt index c80e4aed6f..eefdcf69fb 100644 --- a/projects/hip/packaging/hip-vdi.txt +++ b/projects/hip/packaging/hip-vdi.txt @@ -27,11 +27,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR} set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), rocm-profiler, comgr (>= 1.1)") -else() - set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1)") -endif() +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_vdi, hip-hcc (= ${CPACK_PACKAGE_VERSION})") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_vdi") set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_vdi") @@ -41,11 +37,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, rocm-profiler, comgr >= 1.1") -else() - set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1") -endif() +set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") set(CPACK_RPM_PACKAGE_PROVIDES "hip_vdi, hip-hcc = ${HIP_BASE_VERSION}") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_vdi") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_vdi") diff --git a/projects/hip/samples/0_Intro/module_api/defaultDriver.cpp b/projects/hip/samples/0_Intro/module_api/defaultDriver.cpp index ea36aabcf4..af8b413ac2 100644 --- a/projects/hip/samples/0_Intro/module_api/defaultDriver.cpp +++ b/projects/hip/samples/0_Intro/module_api/defaultDriver.cpp @@ -80,8 +80,8 @@ int main() { hipFree(Ad); hipFree(Bd); - delete A; - delete B; + delete[] A; + delete[] B; hipCtxDestroy(context); return 0; } diff --git a/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp b/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp index 38cf0d414c..90e569c5bc 100644 --- a/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp +++ b/projects/hip/samples/0_Intro/module_api/launchKernelHcc.cpp @@ -107,8 +107,8 @@ int main() { hipFree(Ad); hipFree(Bd); - delete A; - delete B; + delete[] A; + delete[] B; hipCtxDestroy(context); return 0; } diff --git a/projects/hip/samples/0_Intro/module_api/runKernel.cpp b/projects/hip/samples/0_Intro/module_api/runKernel.cpp index a011b42666..1093b0dd54 100644 --- a/projects/hip/samples/0_Intro/module_api/runKernel.cpp +++ b/projects/hip/samples/0_Intro/module_api/runKernel.cpp @@ -99,8 +99,8 @@ int main() { hipFree(Ad); hipFree(Bd); - delete A; - delete B; + delete[] A; + delete[] B; hipCtxDestroy(context); return 0; } diff --git a/projects/hip/samples/0_Intro/module_api_global/runKernel.cpp b/projects/hip/samples/0_Intro/module_api_global/runKernel.cpp index 3a2804b7a2..4a2d49144c 100644 --- a/projects/hip/samples/0_Intro/module_api_global/runKernel.cpp +++ b/projects/hip/samples/0_Intro/module_api_global/runKernel.cpp @@ -154,8 +154,8 @@ int main() { hipFree(Ad); hipFree(Bd); - delete A; - delete B; + delete[] A; + delete[] B; hipCtxDestroy(context); return 0; } diff --git a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp index e17f19675a..14faa7671b 100644 --- a/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/projects/hip/samples/1_Utils/hipInfo/hipInfo.cpp @@ -56,6 +56,7 @@ void printCompilerInfo() { #endif } +double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } #define printLimit(w1, limit, units) \ @@ -97,7 +98,7 @@ void printDeviceProp(int deviceId) { cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) - << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl; cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; diff --git a/projects/hip/src/code_object_bundle.cpp b/projects/hip/src/code_object_bundle.cpp deleted file mode 100644 index feef90a61a..0000000000 --- a/projects/hip/src/code_object_bundle.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "../include/hip/hcc_detail/code_object_bundle.hpp" - -#include - -#include -#include -#include -#include -#include - -using namespace std; - -// CREATORS -hip_impl::Bundled_code_header::Bundled_code_header(const vector& x) - : Bundled_code_header{x.cbegin(), x.cend()} {} - -hip_impl::Bundled_code_header::Bundled_code_header( - const void* p) { // This is a pretty terrible interface, useful only because - // hipLoadModuleData is so poorly specified (for no fault of its own). - if (!p) return; - - if (!valid(*static_cast(p))) return; - auto ph = static_cast(p); - - size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header); - auto pb = static_cast(p) + sizeof(Header_); - auto n = ph->bundle_cnt_; - while (n--) { - sz += reinterpret_cast(pb)->bundle_sz; - pb += sizeof(Bundled_code::Header); - } - - read(static_cast(p), static_cast(p) + sz, *this); -} diff --git a/projects/hip/include/hip/hcc_detail/code_object_bundle.hpp b/projects/hip/src/code_object_bundle.inl similarity index 86% rename from projects/hip/include/hip/hcc_detail/code_object_bundle.hpp rename to projects/hip/src/code_object_bundle.inl index 77e0d706d6..596ac60661 100644 --- a/projects/hip/include/hip/hcc_detail/code_object_bundle.hpp +++ b/projects/hip/src/code_object_bundle.inl @@ -92,10 +92,6 @@ struct Bundled_code { #define magic_string_ "__CLANG_OFFLOAD_BUNDLE__" -#ifdef __GNUC__ -#pragma GCC visibility push (default) -#endif - class Bundled_code_header { // DATA - STATICS static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1; @@ -167,8 +163,26 @@ class Bundled_code_header { Bundled_code_header() = default; template Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l); - explicit Bundled_code_header(const std::vector& blob); - explicit Bundled_code_header(const void* maybe_blob); + explicit Bundled_code_header(const std::vector& blob) + : Bundled_code_header{blob.cbegin(), blob.cend()} {} + explicit Bundled_code_header(const void* maybe_blob) { + // This is a pretty terrible interface, useful only because + // hipLoadModuleData is so poorly specified (for no fault of its own). + if (!maybe_blob) return; + + if (!valid(*static_cast(maybe_blob))) return; + auto ph = static_cast(maybe_blob); + + size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header); + auto pb = static_cast(maybe_blob) + sizeof(Header_); + auto n = ph->bundle_cnt_; + while (n--) { + sz += reinterpret_cast(pb)->bundle_sz; + pb += sizeof(Bundled_code::Header); + } + + read(static_cast(maybe_blob), static_cast(maybe_blob) + sz, *this); + } Bundled_code_header(const Bundled_code_header&) = default; Bundled_code_header(Bundled_code_header&&) = default; ~Bundled_code_header() = default; @@ -180,10 +194,6 @@ class Bundled_code_header { size_t bundled_code_size = 0; }; -#ifdef __GNUC__ -#pragma GCC visibility pop -#endif - // CREATORS template Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l) diff --git a/projects/hip/src/hip_clang.cpp b/projects/hip/src/hip_clang.cpp index e8f3e86881..75c13038a5 100644 --- a/projects/hip/src/hip_clang.cpp +++ b/projects/hip/src/hip_clang.cpp @@ -51,7 +51,7 @@ __hipRegisterFatBinary(const void* data) return nullptr; } - auto modules = new std::vector{g_deviceCnt}; + auto modules = new std::vector(g_deviceCnt); if (!modules) { return nullptr; } @@ -136,7 +136,7 @@ extern "C" void __hipRegisterFunction( int* wSize) { HIP_INIT_API(NONE, modules, hostFunction, deviceFunction, deviceName); - std::vector functions{g_deviceCnt}; + std::vector functions(g_deviceCnt); assert(modules && modules->size() >= g_deviceCnt); for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) { diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp index 1bbdb10bbc..e5797727ae 100644 --- a/projects/hip/src/hip_device.cpp +++ b/projects/hip/src/hip_device.cpp @@ -310,6 +310,18 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device case hipDeviceAttributeCooperativeMultiDeviceLaunch: *pi = prop->cooperativeMultiDeviceLaunch; break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: + *pi = prop->cooperativeMultiDeviceUnmatchedFunc; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: + *pi = prop->cooperativeMultiDeviceUnmatchedGridDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: + *pi = prop->cooperativeMultiDeviceUnmatchedBlockDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: + *pi = prop->cooperativeMultiDeviceUnmatchedSharedMem; + break; case hipDeviceAttributeMaxPitch: *pi = prop->memPitch; break; diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index c9688408c8..807dcc7391 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -677,7 +677,7 @@ hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) { break; case HSA_REGION_SEGMENT_GROUP: err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &(p_prop->sharedMemPerBlock)); + &(p_prop->maxSharedMemoryPerMultiProcessor)); break; default: break; @@ -835,10 +835,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) { hsa_region_t* am_region = static_cast(_acc.get_hsa_am_region()); err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem); DeviceErrorCheck(err); - // maxSharedMemoryPerMultiProcessor should be as the same as group memory size. - // Group memory will not be paged out, so, the physical memory size is the total shared memory - // size, and also equal to the group pool size. - prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; + // Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal. + prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor; // Get Max memory clock frequency err = @@ -897,9 +895,16 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) { prop->integrated = 1; } - // Enable the cooperative group for gfx9+ - prop->cooperativeLaunch = (prop->gcnArch < 900) ? 0 : 1; - prop->cooperativeMultiDeviceLaunch = (prop->gcnArch < 900) ? 0 : 1; + // Enable the cooperative group for GPUs that support all the required features + err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES, + &prop->cooperativeLaunch); + DeviceErrorCheck(err); + prop->cooperativeMultiDeviceLaunch = prop->cooperativeLaunch; + + prop->cooperativeMultiDeviceUnmatchedFunc = prop->cooperativeMultiDeviceLaunch; + prop->cooperativeMultiDeviceUnmatchedGridDim = prop->cooperativeMultiDeviceLaunch; + prop->cooperativeMultiDeviceUnmatchedBlockDim = prop->cooperativeMultiDeviceLaunch; + prop->cooperativeMultiDeviceUnmatchedSharedMem = prop->cooperativeMultiDeviceLaunch; err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS, &prop->maxTexture1D); @@ -1515,20 +1520,6 @@ hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) { return e; } -void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) { - hipError_t e = hipSuccess; - - // Synchronize stream - tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n", - ToString(cb->_stream).c_str()); - GET_TLS(); - e = ihipStreamSynchronize(tls, cb->_stream); - - // Call registered callback function - cb->_callback(cb->_stream, e, cb->_userData); - delete cb; -} - //--- // Get the stream to use for a command submission. // @@ -1619,7 +1610,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired) { - stream = ihipSyncAndResolveStream(stream, lockAcquired); + if (stream == nullptr || stream != stream->getCtx()->_defaultStream) { + stream = ihipSyncAndResolveStream(stream, lockAcquired); + } lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 104fd910a8..993dc31dfc 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -654,19 +654,6 @@ class ihipStream_t { }; -//---- -// Internal structure for stream callback handler -class ihipStreamCallback_t { - public: - ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData) - : _stream(stream), _callback(callback), _userData(userData) { - }; - hipStream_t _stream; - hipStreamCallback_t _callback; - void* _userData; -}; - - //---- // Internal event structure: enum hipEventStatus_t { @@ -980,7 +967,6 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod, hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0); hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream); -void ihipStreamCallbackHandler(ihipStreamCallback_t* cb); /** * @brief Copies the memory address and size of symbol @p symbolName diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 8159f22a97..e7bc348951 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -309,31 +309,52 @@ void generic_copy(void* __restrict dst, const void* __restrict src, size_t n, if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si); if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di); - throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner, - nullptr, - di.agentBaseAddress), - __FILE__, __func__, __LINE__); - - return do_copy(dst, src, n, di.agentOwner, si.agentOwner); + hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner, + nullptr, di.agentBaseAddress); + if (res == HSA_STATUS_SUCCESS){ + return do_copy(dst, src, n, di.agentOwner, si.agentOwner); + } + // If devices do not have access then fallback mechanism will be used + // copy will be slower + throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__); } inline void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n, hipMemcpyKind k) { + auto si{info(src)}; + auto di{info(dst)}; + + if (!is_large_BAR){ + // Pointer info takes presidence over hipMemcpyKind + // if there is mismatch b/w Memcpy kind and dst/src pointer + // E.g. dst(host pointer),src(device pointer) and hipMemcpyKind set as hipMemcpyHostToDevice + if (di.size == is_cpu_owned && si.size == is_cpu_owned) + k = hipMemcpyHostToHost; + else if (si.size == is_cpu_owned && di.size != is_cpu_owned) + k = hipMemcpyHostToDevice; + else if (di.size == is_cpu_owned && si.size != is_cpu_owned) + k = hipMemcpyDeviceToHost; + else + k = hipMemcpyDeviceToDevice; + } switch (k) { case hipMemcpyHostToHost: std::memcpy(dst, src, n); break; - case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst)); - case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src)); + case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, di); + case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, si); case hipMemcpyDeviceToDevice: { - const auto di{info(dst)}; - const auto si{info(src)}; - throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner, - nullptr, - di.agentBaseAddress), - __FILE__, __func__, __LINE__); - return do_copy(dst, src, n, di.agentOwner, si.agentOwner); + hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner, + nullptr, di.agentBaseAddress); + if (res == HSA_STATUS_SUCCESS){ + return do_copy(dst, src, n, di.agentOwner, si.agentOwner); + } + + // If devices do not have access then fallback mechanism will be used + // copy will be slower + throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__); + break; } - default: return generic_copy(dst, src, n, info(dst), info(src)); + default: return generic_copy(dst, src, n, di, si); } } @@ -478,6 +499,10 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { hipError_t hip_status = hipSuccess; + if (sizeBytes == 0) { + return hipSuccess; + } + if (HIP_SYNC_HOST_ALLOC) { hipDeviceSynchronize(); } @@ -485,10 +510,6 @@ hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned i auto ctx = ihipGetTlsDefaultCtx(); if ((ctx == nullptr) || (ptr == nullptr)) { hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - hip_status = hipSuccess; - // TODO - should size of 0 return err or be siliently ignored? } else { unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { @@ -673,14 +694,15 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) { HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; + if (sizeBytes == 0) { + if (ptr) *ptr = NULL; + return ihipLogStatus(hipSuccess); + } + auto ctx = ihipGetTlsDefaultCtx(); // return NULL pointer when malloc size is 0 if ( nullptr == ctx || nullptr == ptr) { hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - *ptr = NULL; - hip_status = hipSuccess; } else { auto device = ctx->getWriteableDevice(); *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/, @@ -700,14 +722,15 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag HIP_SET_DEVICE(); #if (__hcc_workweek__ >= 19115) + if (sizeBytes == 0) { + if (ptr) *ptr = NULL; + return ihipLogStatus(hipSuccess); + } + hipError_t hip_status = hipSuccess; auto ctx = ihipGetTlsDefaultCtx(); - // return NULL pointer when malloc size is 0 - if (sizeBytes == 0) { - *ptr = NULL; - hip_status = hipSuccess; - } else if ((ctx == nullptr) || (ptr == nullptr)) { + if ((ctx == nullptr) || (ptr == nullptr)) { hip_status = hipErrorInvalidValue; } else { unsigned amFlags = 0; @@ -736,6 +759,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); + if (sizeBytes == 0) { + return ihipLogStatus(hipSuccess); + } hipError_t hip_status = hipSuccess; hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); return ihipLogStatus(hip_status); @@ -744,6 +770,9 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { HIP_INIT_SPECIAL_API(hipMallocManaged, (TRACE_MEM), devPtr, size, flags); HIP_SET_DEVICE(); + if (size == 0) { + return ihipLogStatus(hipSuccess); + } hipError_t hip_status = hipSuccess; if(flags != hipMemAttachGlobal) hip_status = hipErrorInvalidValue; @@ -1224,6 +1253,7 @@ hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count, tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); + if (count == 0) return ihipLogStatus(hipSuccess); if (dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } @@ -1246,6 +1276,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count, tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); + if (count == 0) return ihipLogStatus(hipSuccess); if (src == nullptr || dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } @@ -1269,6 +1300,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count, tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); + if (count == 0) return ihipLogStatus(hipSuccess); if (dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } @@ -1301,6 +1333,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count, tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src); + if (count == 0) return ihipLogStatus(hipSuccess); if (src == nullptr || dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } @@ -1592,6 +1625,7 @@ hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bo srcXoffset = p->srcPos.x; srcYoffset = p->srcPos.y; srcZoffset = p->srcPos.z; + if (copyWidth == 0) return hipSuccess; if (p->dstArray != nullptr) { if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){ return hipErrorInvalidValue; @@ -1933,6 +1967,7 @@ hipError_t getLockedPointer(void *hostPtr, size_t dataLen, void **devicePtrPtr) // TODO - review and optimize hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { + if (height == 0 || width == 0) return hipSuccess; if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue; hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -1989,6 +2024,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t ihipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + if (height == 0 || width == 0) return hipSuccess; if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue; hipError_t e = hipSuccess; int isLockedOrD2D = 0; @@ -2043,6 +2079,7 @@ hipError_t ihip2dOffsetMemcpy(void* dst, size_t dpitch, const void* src, size_t size_t height, size_t srcXOffsetInBytes, size_t srcYOffset, size_t dstXOffsetInBytes, size_t dstYOffset,hipMemcpyKind kind, hipStream_t stream, bool isAsync) { + if (height == 0 || width == 0) return hipSuccess; if((spitch < width + srcXOffsetInBytes) || (srcYOffset >= height)){ return hipErrorInvalidValue; } else if((dpitch < width + dstXOffsetInBytes) || (dstYOffset >= height)){ @@ -2061,6 +2098,7 @@ hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, hipStream_t stream, bool if (pCopy == nullptr) { return hipErrorInvalidValue; } + if (pCopy->Height == 0 || pCopy->WidthInBytes == 0) return hipSuccess; void* dst; const void* src; size_t spitch = pCopy->srcPitch; size_t dpitch = pCopy->dstPitch; @@ -2140,6 +2178,7 @@ hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src, hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream ){ HIP_INIT_SPECIAL_API(hipMemcpy2DFromArrayAsync, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind, stream); size_t byteSize; + if (height == 0 || width == 0) return ihipLogStatus(hipSuccess); if(src) { switch (src->desc.f) { case hipChannelFormatKindSigned: @@ -2239,8 +2278,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) { auto device = ctx->getWriteableDevice(); if (total) { *total = device->_props.totalGlobalMem; - } else { - e = hipErrorInvalidValue; } if (free) { @@ -2263,8 +2300,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) { } else { return ihipLogStatus(hipErrorInvalidValue); } - } else { - e = hipErrorInvalidValue; } } else { diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index 0a7348a3a2..1e467899c6 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -50,7 +50,7 @@ THE SOFTWARE. #include #include #include -#include "../include/hip/hcc_detail/code_object_bundle.hpp" +#include "code_object_bundle.inl" #include "hip_fatbin.h" // TODO Use Pool APIs from HCC to get memory regions. @@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, - void** impCoopParams = 0) { + void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -192,8 +192,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global if (impCoopParams) { const auto p{static_cast(*impCoopParams)}; // The sixth index is for multi-grid synchronization - kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, - p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); + copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT, + (kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT); } /* @@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global hc::completion_future cf; + if (coopAV) { + lp.av = coopAV; + } + lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(), (startEvent || stopEvent) ? &cf : nullptr #if (__hcc_workweek__ > 17312) @@ -326,22 +330,18 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList return hipErrorInvalidValue; } - hipFunction_t* kds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); - if (kds == nullptr) { - return hipErrorNotInitialized; - } + std::vector kds(numDevices,0); // prepare all kernel descriptors for each device as all streams will be locked in the next loop for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; if (lp.stream == nullptr) { - free(kds); return hipErrorNotInitialized; } kds[i] = ps.kernel_descriptor(reinterpret_cast(lp.func), hip_impl::target_agent(lp.stream)); + if (kds[i] == nullptr) { - free(kds); return hipErrorInvalidValue; } if (!kds[i]->_kernarg_layout.empty()) continue; @@ -396,8 +396,6 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList #endif } - free(kds); - return result; } @@ -409,6 +407,90 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps); } +void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) +{ + if (f->_is_code_object_v3) { + const auto header = reinterpret_cast(f->_header); + // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) + *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; + // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 + // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) + *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; + *usedLDS = header->group_segment_fixed_size; + } + else { + const auto header = f->_header; + // VGPRs granularity is 4 + *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; + // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 + *usedSGPRS = header->wavefront_sgpr_count + 2; + *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; + *usedLDS = header->workgroup_group_segment_byte_size; + } +} + +static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( + TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) +{ + using namespace hip_impl; + + auto ctx = ihipGetTlsDefaultCtx(); + if (ctx == nullptr) { + return hipErrorInvalidDevice; + } + if (numBlocks == nullptr) { + return hipErrorInvalidValue; + } + + hipDeviceProp_t prop{}; + ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); + + if (blockSize > prop.maxThreadsPerBlock) { + *numBlocks = 0; + return hipSuccess; + } + + prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; + + size_t usedVGPRS = 0; + size_t usedSGPRS = 0; + size_t usedLDS = 0; + getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); + + // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 + size_t wavefrontSize = prop.warpSize; + size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); + + const size_t simdPerCU = 4; + const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; + + size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; + + size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); + + // Calculate blocks occupancy per CU based on VGPR usage + *numBlocks = vgprs_alu_occupancy / numWavefronts; + + const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); + + // Calculate blocks occupancy per CU based on SGPR usage + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + + size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + if (total_used_lds != 0) { + // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) + size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; + *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + } + + return hipSuccess; +} + namespace { // kernel for initializing GWS // nwm1 is the total number of work groups minus 1 @@ -417,25 +499,28 @@ __global__ void init_gws(uint nwm1) { } } -__attribute__((visibility("default"))) hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, - dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, + dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps) { +#if (__hcc_workweek__ >= 20093) hipError_t result; - if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) { + if (f == nullptr || kernelParams == nullptr) { return hipErrorNotInitialized; } - if (!stream->getDevice()->_props.cooperativeLaunch) { + stream = ihipSyncAndResolveStream(stream); + + if (!stream->getDevice()->_props.cooperativeLaunch || + blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) { return hipErrorInvalidConfiguration; } - size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x; - size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y; - size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z; + size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x; + size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y; + size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z; if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) { return hipErrorInvalidConfiguration; @@ -469,28 +554,49 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, kd->_kernarg_layout = *reinterpret_cast>*>(kargs.getHandle()); + GET_TLS(); + uint32_t numBlocksPerSm = 0; + result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd, + blockDim.x * blockDim.y * blockDim.z, sharedMemBytes); + if (result != hipSuccess) { + return hipErrorLaunchFailure; + } + int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount; + + //check to see if the workload fits on the GPU + if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) { + return hipErrorCooperativeLaunchTooLarge; + } void *gwsKernelParam[1]; // calculate total number of work groups minus 1 for the main kernel uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1; gwsKernelParam[0] = &nwm1; - LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false); -#if (__hcc_workweek__ >= 19213) - streamCrit->_av.acquire_locked_hsa_queue(); -#endif + hc::accelerator acc = stream->getDevice()->_acc; + // create a cooperative accelerated view for launching gws and main kernels + hc::accelerator_view coopAV = acc.create_cooperative_view(); - GET_TLS(); - // launch the init_gws kernel to initialize the GWS + LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false); + + // the cooperative queue will wait until this stream completes its operations + hc::completion_future streamCF; + if (!streamCrit->_av.get_is_empty()) { + streamCF = streamCrit->_av.create_marker(hc::accelerator_scope); + coopAV.create_blocking_marker(streamCF, hc::accelerator_scope); + } + + streamCrit->_av.acquire_locked_hsa_queue(); + coopAV.acquire_locked_hsa_queue(); + + // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1, - 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true); + 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV); if (result != hipSuccess) { stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) stream->criticalData()._av.release_locked_hsa_queue(); -#endif - + coopAV.release_locked_hsa_queue(); return hipErrorLaunchFailure; } @@ -498,60 +604,106 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, void* impCoopParams[1]; impCoopParams[0] = &impCoopArg; - // launch the main kernel + // launch the main kernel in the cooperative queue result = ihipModuleLaunchKernel(tls, kd, - gridDim.x * blockDimX.x, - gridDim.y * blockDimX.y, - gridDim.z * blockDimX.z, - blockDimX.x, blockDimX.y, blockDimX.z, + gridDim.x * blockDim.x, + gridDim.y * blockDim.y, + gridDim.z * blockDim.z, + blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true, impCoopParams); + nullptr, 0, true, impCoopParams, &coopAV); + + + coopAV.release_locked_hsa_queue(); + stream->criticalData()._av.release_locked_hsa_queue(); + + // this stream will wait until the cooperative queue completes its operations + hc::completion_future cooperativeCF; + if (!coopAV.get_is_empty()) { + cooperativeCF = coopAV.create_marker(hc::accelerator_scope); + streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope); + } stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) - stream->criticalData()._av.release_locked_hsa_queue(); -#endif return result; +#else + return hipErrorInvalidConfiguration; +#endif + } __attribute__((visibility("default"))) +hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim, + dim3 blockDim, void** args, + size_t sharedMem, hipStream_t stream, + hip_impl::program_state& ps) { + + // Skipping passing in ps, because the logging function does not like it + HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args, + sharedMem, stream); + + return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim, + args, sharedMem, stream, ps)); +} + + hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags, hip_impl::program_state& ps) { +#if (__hcc_workweek__ >= 20093) hipError_t result; if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return hipErrorInvalidValue; } + vector streams; + vector deviceIDs; + // check to see if we have valid distinct streams/devices, if cooperative multi device + // launch is supported and if grid/block dimensions are valid for (int i = 0; i < numDevices; ++i) { - if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) { + const hipLaunchParams& lp = launchParamsList[i]; + + if (lp.stream == nullptr){ + return hipErrorInvalidResourceHandle; + } + + if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) { + streams.push_back(lp.stream); + } else { + return hipErrorInvalidDevice; + } + + const ihipDevice_t* currentDevice = lp.stream->getDevice(); + if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) { + deviceIDs.push_back(currentDevice->_deviceId); + } else { + return hipErrorInvalidDevice; + } + + if (!currentDevice->_props.cooperativeMultiDeviceLaunch) { + return hipErrorInvalidConfiguration; + } + + if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 || + lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 || + lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){ return hipErrorInvalidConfiguration; } } - hipFunction_t* gwsKds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); - hipFunction_t* kds = reinterpret_cast(malloc(sizeof(hipFunction_t) * numDevices)); - - if (kds == nullptr || gwsKds == nullptr) { - return hipErrorNotInitialized; - } + vector gwsKds; + vector kds; + GET_TLS(); // prepare all kernel descriptors for initializing the GWS and the main kernels per device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; - if (lp.stream == nullptr) { - free(gwsKds); - free(kds); - return hipErrorNotInitialized; - } - gwsKds[i] = ps.kernel_descriptor(reinterpret_cast(&init_gws), - hip_impl::target_agent(lp.stream)); + gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast(&init_gws), + hip_impl::target_agent(lp.stream))); if (gwsKds[i] == nullptr) { - free(gwsKds); - free(kds); return hipErrorInvalidValue; } hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( @@ -560,23 +712,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL gwsKargs.getHandle()); - kds[i] = ps.kernel_descriptor(reinterpret_cast(lp.func), - hip_impl::target_agent(lp.stream)); + kds.push_back(ps.kernel_descriptor(reinterpret_cast(lp.func), + hip_impl::target_agent(lp.stream))); if (kds[i] == nullptr) { - free(gwsKds); - free(kds); return hipErrorInvalidValue; } hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align( reinterpret_cast(lp.func)); kds[i]->_kernarg_layout = *reinterpret_cast>*>( kargs.getHandle()); + + uint32_t numBlocksPerSm = 0; + result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i], + lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem); + if (result != hipSuccess) { + return hipErrorLaunchFailure; + } + int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount; + + //check to see if the workload fits on the GPU + if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) { + return hipErrorCooperativeLaunchTooLarge; + } + } + + vector coopAVs; + + // create cooperative accelerated views for launching gws and main kernels on each device + for (int i = 0; i < numDevices; ++i) { + hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc; + coopAVs.push_back(acc.create_cooperative_view()); } mg_sync *mg_sync_ptr = 0; - mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + vector mg_info_ptr; + - GET_TLS(); result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); if (result != hipSuccess) { return hipErrorInvalidValue; @@ -586,7 +757,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint all_sum = 0; for (int i = 0; i < numDevices; ++i) { - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + mg_info *mg_info_temp = nullptr; + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault); if (result != hipSuccess) { hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < i; ++j) { @@ -594,6 +766,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL } return hipErrorInvalidValue; } + mg_info_ptr.push_back(mg_info_temp); // calculate the sum of sizes of all grids const hipLaunchParams& lp = launchParamsList[i]; all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * @@ -603,9 +776,15 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); -#if (__hcc_workweek__ >= 19213) + + hc::completion_future streamCF; + if (!streamCrit->_av.get_is_empty()) { + streamCF = streamCrit->_av.create_marker(hc::accelerator_scope); + coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope); + } + streamCrit->_av.acquire_locked_hsa_queue(); -#endif + coopAVs[i].acquire_locked_hsa_queue(); } // launch the init_gws kernel to initialize the GWS for each device @@ -617,14 +796,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL gwsKernelParam[0] = &nwm1; result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1, - 0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true); + 0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]); if (result != hipSuccess) { for (int j = 0; j < numDevices; ++j) { launchParamsList[j].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAVs[i].release_locked_hsa_queue(); } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -670,14 +848,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true, impCoopParams); + true, impCoopParams, &coopAVs[i]); if (result != hipSuccess) { for (int j = 0; j < numDevices; ++j) { launchParamsList[j].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); -#endif + coopAVs[i].release_locked_hsa_queue(); } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -691,14 +868,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL // unlock all streams for (int i = 0; i < numDevices; ++i) { - launchParamsList[i].stream->criticalData().unlock(); -#if (__hcc_workweek__ >= 19213) + coopAVs[i].release_locked_hsa_queue(); launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); -#endif - } - free(gwsKds); - free(kds); + hc::completion_future cooperativeCF; + if (!coopAVs[i].get_is_empty()) { + cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope); + launchParamsList[i].stream->criticalData()._av.create_blocking_marker( + cooperativeCF, hc::accelerator_scope); + } + + launchParamsList[i].stream->criticalData().unlock(); + } hip_internal::ihipHostFree(tls, mg_sync_ptr); for (int j = 0; j < numDevices; ++j) { @@ -706,6 +887,24 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL } return result; +#else + return hipErrorInvalidConfiguration; +#endif +} + +__attribute__((visibility("default"))) +hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, + int numDevices, + unsigned int flags, + hip_impl::program_state& ps) { + + // Skipping passing in ps, because the logging function does not like it + HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, + numDevices, flags); + + return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList, + numDevices, + flags, ps)); } namespace hip_impl { @@ -1120,7 +1319,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) { return reinterpret_cast(kd._header); } -hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) { +hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) { hipFuncAttributes r{}; hipDeviceProp_t prop{}; @@ -1130,23 +1329,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_ prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; if (kd._is_code_object_v3) { - r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; - r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; - r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2; r.binaryVersion = 0; // FIXME: should it be the ISA version or code // object format version? + r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; + r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; } else { r.localSizeBytes = kd._header->workitem_private_segment_byte_size; r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; - r.numRegs = kd._header->workitem_vgpr_count; r.binaryVersion = kd._header->amd_machine_version_major * 10 + kd._header->amd_machine_version_minor; } r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - r.maxThreadsPerBlock = r.numRegs ? - std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : - prop.maxThreadsPerBlock; + + size_t usedVGPRS = 0; + size_t usedSGPRS = 0; + size_t usedLDS = 0; + getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS); + + r.numRegs = usedVGPRS; + + size_t wavefrontSize = prop.warpSize; + size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize; + size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); + const size_t numSIMD = 4; + const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD; + size_t maxWaves = 0; + for (int i = 0; i < maxWavefrontsPerBlock; i++) { + size_t wavefronts = i + 1; + + if (usedVGPRS > 0) { + size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD); + size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + + // Calculate blocks occupancy per CU based on VGPR usage + if (vgprs_alu_occupancy < wavefronts) + break; + } + + if (usedSGPRS > 0) { + const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; + size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); + + // Calculate blocks occupancy per CU based on SGPR usage + if (sgprs_alu_occupancy < wavefronts) + break; + } + maxWaves = wavefronts; + } + + r.maxThreadsPerBlock = maxWaves * wavefrontSize; r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. return r; @@ -1294,29 +1527,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const return ihipLogStatus(hipSuccess); } -void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) -{ - if (f->_is_code_object_v3) { - const auto header = reinterpret_cast(f->_header); - // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) - *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) - *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; - *usedLDS = header->group_segment_fixed_size; - } - else { - const auto header = f->_header; - // VGPRs granularity is 4 - *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; - // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 - *usedSGPRS = header->wavefront_sgpr_count + 2; - *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; - *usedLDS = header->workgroup_group_segment_byte_size; - } -} - hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) @@ -1439,59 +1649,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); } -hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) -{ - using namespace hip_impl; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx == nullptr) { - return hipErrorInvalidDevice; - } - - hipDeviceProp_t prop{}; - ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); - - prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - - size_t usedVGPRS = 0; - size_t usedSGPRS = 0; - size_t usedLDS = 0; - getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); - - // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 - size_t wavefrontSize = prop.warpSize; - size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); - - const size_t simdPerCU = 4; - const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; - - size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; - - size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); - - // Calculate blocks occupancy per CU based on VGPR usage - *numBlocks = vgprs_alu_occupancy / numWavefronts; - - const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); - - // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); - - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; - if (total_used_lds != 0) { - // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) - size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); - } - - return hipSuccess; -} - hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) { diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index 2add6a77c4..63551d1204 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -257,11 +257,39 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) { hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, unsigned int flags) { HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags); - hipError_t e = hipSuccess; - // Create a thread in detached mode to handle callback - ihipStreamCallback_t* cb = new ihipStreamCallback_t(stream, callback, userData); - std::thread(ihipStreamCallbackHandler, cb).detach(); + auto stream_original{stream}; + stream = ihipSyncAndResolveStream(stream); - return ihipLogStatus(e); + if (!stream) return hipErrorInvalidValue; + + LockedAccessor_StreamCrit_t cs{stream->criticalData()}; + + // create first marker + auto cf = cs->_av.create_marker(hc::no_scope); + // get its signal + auto signal = *reinterpret_cast(cf.get_native_handle()); + // increment its signal value + hsa_signal_add_relaxed(signal, 1); + + // create callback that can be passed to hsa_amd_signal_async_handler + // this function will call the user's callback, then sets first packet's signal to 0 to indicate completion + auto t{new std::function{[=]() { + callback(stream_original, hipSuccess, userData); + hsa_signal_store_relaxed(signal, 0); + }}}; + + // register above callback with HSA runtime to be called when first packet's signal + // is decremented from 2 to 1 by CP (or it is already at 1) + hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1, + [](hsa_signal_value_t x, void* p) { + (*static_cast(p))(); + delete static_cast(p); + return false; + }, t); + + // create additional marker that blocks on the first one + cs->_av.create_blocking_marker(cf, hc::no_scope); + + return ihipLogStatus(hipSuccess); } diff --git a/projects/hip/src/hip_texture.cpp b/projects/hip/src/hip_texture.cpp index 27cf321fbc..29f0465dc1 100644 --- a/projects/hip/src/hip_texture.cpp +++ b/projects/hip/src/hip_texture.cpp @@ -301,7 +301,12 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou hsa_ext_sampler_descriptor_t samplerDescriptor; fillSamplerDescriptor(samplerDescriptor, pTexDesc->addressMode[0], pTexDesc->filterMode, pTexDesc->normalizedCoords); - + if(hipResourceTypeLinear == pResDesc->resType) { + samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST; + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; + } else if(!pTexDesc->normalizedCoords) { + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + } hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; if(hipResourceTypePitch2D != pResDesc->resType) @@ -312,6 +317,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) || HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { + free(pTexture); return ihipLogStatus(hipErrorRuntimeOther); } @@ -438,7 +444,13 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r imageDescriptor.format.channel_type = channelType; hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords); + samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST; + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; + if (normalizedCoords) { + samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED; + } else { + samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; + } hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; @@ -449,6 +461,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) || HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { + free(pTexture); return hipErrorRuntimeOther; } getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler); @@ -514,7 +527,9 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode hsa_ext_sampler_descriptor_t samplerDescriptor; fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords); - + if(!normalizedCoords) { + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + } hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; if( 0 == pitch) @@ -525,6 +540,7 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) || HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { + free(pTexture); return hipErrorRuntimeOther; } getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler); @@ -620,7 +636,9 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls_, int dim, enum hipTextureRea hsa_ext_sampler_descriptor_t samplerDescriptor; fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords); - + if(!normalizedCoords) { + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + } hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT); diff --git a/projects/hip/src/hiprtc.cpp b/projects/hip/src/hiprtc.cpp index e9a516c339..a11207f337 100644 --- a/projects/hip/src/hiprtc.cpp +++ b/projects/hip/src/hiprtc.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ #include "../include/hip/hiprtc.h" -#include "../include/hip/hcc_detail/code_object_bundle.hpp" +#include "code_object_bundle.inl" #include "../include/hip/hcc_detail/elfio/elfio.hpp" #include "../include/hip/hcc_detail/program_state.hpp" diff --git a/projects/hip/src/program_state.inl b/projects/hip/src/program_state.inl index 548a56795f..aef195804e 100644 --- a/projects/hip/src/program_state.inl +++ b/projects/hip/src/program_state.inl @@ -1,6 +1,6 @@ #include "../include/hip/hcc_detail/program_state.hpp" -#include "../include/hip/hcc_detail/code_object_bundle.hpp" +#include "code_object_bundle.inl" #include "../include/hip/hcc_detail/hsa_helpers.hpp" #if !defined(__cpp_exceptions) @@ -357,8 +357,11 @@ public: const auto it1 = get_symbol_addresses().find(x); if (it1 == get_symbol_addresses().cend()) { - hip_throw(std::runtime_error{ - "Global symbol: " + x + " is undefined."}); + // For a unknown symbol, initialize it with a magic poison + hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), + reinterpret_cast(0xDEADBEEFDEADBEEFull)); + continue; } hsa_status_t status; diff --git a/projects/hip/tests/hipify-clang/unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu b/projects/hip/tests/hipify-clang/unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu index c6d62c0007..e6a2178053 100644 --- a/projects/hip/tests/hipify-clang/unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu +++ b/projects/hip/tests/hipify-clang/unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu @@ -385,8 +385,7 @@ double compute_BSR(BCRSArrays& bcsr, double *x , double *y){ cudaEventCreate(&startTime); cudaEventCreate(&stopTime); cudaEventRecord(startTime, bcsr.streamId); - // NOTE: cusparseDbsrmv and CUSPARSE_DIRECTION_COLUMN (of type cusparseDirection_t) are yet unsupported by HIP - // CHECK: cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE, + // CHECK: cusparseDbsrmv(bcsr.cusparseHandle, HIPSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE, cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE, bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &alpha, descr, bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize, diff --git a/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbol.cpp b/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbol.cpp new file mode 100644 index 0000000000..10f8c51a6d --- /dev/null +++ b/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbol.cpp @@ -0,0 +1,46 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#define SIZE 1024 + +int main(){ + + void *Sd; + hipError_t e; + char S[SIZE]="This is not a device symbol"; + + HIPCHECK(hipMalloc(&Sd,SIZE)); + + e = hipMemcpyFromSymbol(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost); + HIPASSERT(e==hipErrorInvalidSymbol); + + e = hipMemcpyFromSymbol(S, NULL, SIZE, 0, hipMemcpyDeviceToHost); + HIPASSERT(e==hipErrorInvalidSymbol); + + HIPCHECK(hipFree(Sd)); + + passed(); +} diff --git a/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbolAsync.cpp b/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbolAsync.cpp new file mode 100644 index 0000000000..fa341c6cea --- /dev/null +++ b/projects/hip/tests/src/Negative/memory/hipMemcpyFromSymbolAsync.cpp @@ -0,0 +1,49 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#define SIZE 1024 + +int main(){ + + void *Sd; + hipError_t e; + char S[SIZE]="This is not a device symbol"; + + HIPCHECK(hipMalloc(&Sd,SIZE)); + + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + e = hipMemcpyFromSymbolAsync(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost, stream); + HIPASSERT(e==hipErrorInvalidSymbol); + + e = hipMemcpyFromSymbolAsync(S, NULL, SIZE, 0, hipMemcpyDeviceToHost, stream); + HIPASSERT(e==hipErrorInvalidSymbol); + + HIPCHECK(hipFree(Sd)); + + passed(); +} diff --git a/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbol.cpp b/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbol.cpp new file mode 100644 index 0000000000..8626c2c34f --- /dev/null +++ b/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbol.cpp @@ -0,0 +1,46 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#define SIZE 1024 + +int main(){ + + void *Sd; + hipError_t e; + char S[SIZE]="This is not a device symbol"; + + HIPCHECK(hipMalloc(&Sd,SIZE)); + + e = hipMemcpyToSymbol(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice); + HIPASSERT(e==hipErrorInvalidSymbol); + + e = hipMemcpyToSymbol(NULL, S, SIZE, 0, hipMemcpyHostToDevice); + HIPASSERT(e==hipErrorInvalidSymbol); + + HIPCHECK(hipFree(Sd)); + + passed(); +} diff --git a/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbolAsync.cpp b/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbolAsync.cpp new file mode 100644 index 0000000000..832e4336be --- /dev/null +++ b/projects/hip/tests/src/Negative/memory/hipMemcpyToSymbolAsync.cpp @@ -0,0 +1,49 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#define SIZE 100 + +int main(){ + + void *Sd; + hipError_t e; + char S[SIZE]="This is not a device symbol"; + + HIPCHECK(hipMalloc(&Sd,SIZE)); + + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + e = hipMemcpyToSymbolAsync(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice, stream); + HIPASSERT(e==hipErrorInvalidSymbol); + + e = hipMemcpyToSymbolAsync(NULL, S, SIZE, 0, hipMemcpyHostToDevice, stream); + HIPASSERT(e==hipErrorInvalidSymbol); + + HIPCHECK(hipFree(Sd)); + + passed(); +} diff --git a/projects/hip/tests/src/Negative/memory/hipMemory.cpp b/projects/hip/tests/src/Negative/memory/hipMemory.cpp new file mode 100644 index 0000000000..a71ee948f5 --- /dev/null +++ b/projects/hip/tests/src/Negative/memory/hipMemory.cpp @@ -0,0 +1,43 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi + * TEST: %t + * HIT_END + */ + +#include "test_common.h" +#define SIZE 100 + +int main(){ + hipError_t e; + char str[SIZE]="Hi, I am Ellesemere. What is ur name?"; + + e = hipMemcpy(0, str, SIZE, hipMemcpyHostToDevice); + HIPASSERT(e==hipErrorInvalidValue); + + e = hipMemcpy(NULL, str, SIZE, hipMemcpyHostToDevice); + HIPASSERT(e==hipErrorInvalidValue); + + e = hipMemset(0,99,80); + HIPASSERT(e==hipErrorInvalidValue); + + passed(); +} diff --git a/projects/hip/tests/src/Negative/stream/hipStreamCreateWithFlags.cpp b/projects/hip/tests/src/Negative/stream/hipStreamCreateWithFlags.cpp new file mode 100644 index 0000000000..6f0662b82d --- /dev/null +++ b/projects/hip/tests/src/Negative/stream/hipStreamCreateWithFlags.cpp @@ -0,0 +1,40 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi + * TEST: %t + * HIT_END + */ + +#include "test_common.h" + +int main(){ + + hipError_t e; + hipStream_t stream; + + e = hipStreamCreateWithFlags(&stream, -1); + HIPASSERT(e==hipErrorInvalidValue); + + e = hipStreamCreateWithFlags(&stream, 2); + HIPASSERT(e==hipErrorInvalidValue); + + passed(); +} diff --git a/projects/hip/tests/src/deviceLib/hipTestHalf.cpp b/projects/hip/tests/src/deviceLib/hipTestHalf.cpp index 751d44e242..64a9f7fa63 100644 --- a/projects/hip/tests/src/deviceLib/hipTestHalf.cpp +++ b/projects/hip/tests/src/deviceLib/hipTestHalf.cpp @@ -96,6 +96,18 @@ void kernel_hisinf(__half* input, int* output) { output[tx] = __hisinf(input[tx]); } +__global__ void testHalfAbs(float* p) { + auto a = __float2half(*p); + a = __habs(a); + *p = __half2float(a); +} + +__global__ void testHalf2Abs(float2* p) { + auto a = __float22half2_rn(*p); + a = __habs2(a); + *p = __half22float2(a); +} + #endif @@ -237,6 +249,31 @@ void checkFunctional() { return; } +void checkHalfAbs() { + { + float *p; + hipMalloc(&p, sizeof(float)); + float pp = -2.1f; + hipMemcpy(p, &pp, sizeof(float), hipMemcpyDefault); + hipLaunchKernelGGL(testHalfAbs, 1, 1, 0, 0, p); + hipMemcpy(&pp, p, sizeof(float), hipMemcpyDefault); + hipFree(p); + if(pp < 0.0f) { failed("Half Abs failed"); } + } + { + float2 *p; + hipMalloc(&p, sizeof(float2)); + float2 pp; + pp.x = -2.1f; + pp.y = -1.1f; + hipMemcpy(p, &pp, sizeof(float2), hipMemcpyDefault); + hipLaunchKernelGGL(testHalf2Abs, 1, 1, 0, 0, p); + hipMemcpy(&pp, p, sizeof(float2), hipMemcpyDefault); + hipFree(p); + if(pp.x < 0.0f || pp.y < 0.0f) { failed("Half2 Abs Test Failed"); } + } +} + int main() { bool* result{nullptr}; hipMemAllocHost((void**)&result, sizeof(result)); @@ -260,5 +297,7 @@ int main() { // run some functional checks checkFunctional(); + checkHalfAbs(); + passed(); } diff --git a/projects/hip/tests/src/deviceLib/hip_floatnTM.cpp b/projects/hip/tests/src/deviceLib/hip_floatnTM.cpp new file mode 100644 index 0000000000..921933636f --- /dev/null +++ b/projects/hip/tests/src/deviceLib/hip_floatnTM.cpp @@ -0,0 +1,239 @@ +/* +Copyright (c) 2015-2019 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc HIPCC_OPTIONS -std=c++14 + * TEST: %t + * HIT_END + */ + +#include +#include +#include +#include "test_common.h" + +static std::random_device dev; +static std::mt19937 rng(dev()); + +template +__host__ __device__ inline constexpr int count() { + return sizeof(T) / sizeof(M); +} + +inline float getRandomFloat(float min = 10, float max = 100) { + std::uniform_real_distribution gen(min, max); + return gen(rng); +} + +template +void fillMatrix(T* a, int size) { + for (int i = 0; i < size; i++) { + T t; + t.x = getRandomFloat(); + if constexpr (count() >= 2) t.y = getRandomFloat(); + if constexpr (count() >= 3) t.z = getRandomFloat(); + if constexpr (count() >= 4) t.w = getRandomFloat(); + + a[i] = t; + } +} + +// Test operations +template +__host__ __device__ void testOperations(T& a, T& b) { + a.x += b.x; + a.x++; + b.x++; + if constexpr (count() >= 2) { + a.y = b.x; + a.x = b.y; + } + if constexpr (count() >= 3) { + if (a.x > 0) b.x /= a.x; + a.x *= b.z; + a.y--; + } + if constexpr (count() >= 4) { + b.w = a.x; + a.w += (-b.y); + } +} + +template +__global__ void testOperationsGPU(T* d_a, T* d_b, int size) { + int id = threadIdx.x; + if (id > size) return; + T &a = d_a[id]; + T &b = d_b[id]; + + testOperations(a, b); +} + + +template +void dcopy(T* a, T* b, int size) { + for (int i = 0; i < size; i++) { + a[i] = b[i]; + } +} + +template +bool isEqual(T* a, T* b, int size) { + for (int i = 0; i < size; i++) { + if (a[i] != b[i]) { + return false; + } + } + return true; +} + +// Main function that tests type +// T = what you want to test +// D = pack of 1 i.e. float1 int1 +template +void testType(int msize) { + T *fa, *fb, *fc, *h_fa, *h_fb; + fa = new T[msize]; + fb = new T[msize]; + fc = new T[msize]; + h_fa = new T[msize]; + h_fb = new T[msize]; + + T *d_fa, *d_fb; + + constexpr int c = count(); + + if (c <= 0 || c >= 5) { + failed("Invalid Size\n"); + } + + fillMatrix(fa, msize); + dcopy(fb, fa, msize); + dcopy(h_fa, fa, msize); + dcopy(h_fb, fa, msize); + for (int i = 0; i < msize; i++) testOperations(h_fa[i], h_fb[i]); + + hipMalloc(&d_fa, sizeof(T) * msize); + hipMalloc(&d_fb, sizeof(T) * msize); + + hipMemcpy(d_fa, fa, sizeof(T) * msize, hipMemcpyHostToDevice); + hipMemcpy(d_fb, fb, sizeof(T) * msize, hipMemcpyHostToDevice); + + auto kernel = testOperationsGPU; + hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize); + + hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost); + + bool pass = true; + if (!isEqual(h_fa, fc, msize)) { + pass = false; + } + + delete[] fa; + delete[] fb; + delete[] fc; + delete[] h_fa; + delete[] h_fb; + hipFree(d_fa); + hipFree(d_fb); + + if (!pass) { + failed("Failed"); + } +} + +int main() { + const int msize = 100; + // double + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // floats + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // ints + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // chars + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // long + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // longlong + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // short + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // uints + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // uchars + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // ulong + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // ulonglong + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + // ushort + testType(msize); + testType(msize); + testType(msize); + testType(msize); + + passed(); +} diff --git a/projects/hip/tests/src/hiprtc/hiprtcGetLoweredName.cpp b/projects/hip/tests/src/hiprtc/hiprtcGetLoweredName.cpp index e7b88d26d2..a63e13af64 100644 --- a/projects/hip/tests/src/hiprtc/hiprtcGetLoweredName.cpp +++ b/projects/hip/tests/src/hiprtc/hiprtcGetLoweredName.cpp @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/hiprtc/saxpy.cpp b/projects/hip/tests/src/hiprtc/saxpy.cpp index d063578757..a08c1c2399 100755 --- a/projects/hip/tests/src/hiprtc/saxpy.cpp +++ b/projects/hip/tests/src/hiprtc/saxpy.cpp @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi * TEST: %t * HIT_END */ diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyNegetiveTests.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyNegetiveTests.cpp new file mode 100644 index 0000000000..febc664f7d --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyNegetiveTests.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR + * IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc vdi + * TEST: %t + * HIT_END + */ + + +#include "test_common.h" + +int main() { + int* A; + int* Ad; + int* Bd; + + // Allocation + HIPCHECK(hipMalloc((void**)&Ad, sizeof(int))); + HIPCHECK(hipMalloc((void**)&Bd, sizeof(int))); + HIPCHECK(hipHostMalloc((void**)&A,sizeof(int))); + + // Kind should be ignored and test should pass even for incorrect kind + HIPCHECK(hipMemcpy(Ad, A, sizeof(int), hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(A, Ad, sizeof(int), hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(Ad, Bd, sizeof(int), hipMemcpyHostToHost)); + HIPCHECK(hipMemcpy(A, A, sizeof(int), hipMemcpyDeviceToDevice)); + + // nullptr passed as source or destination pointer + HIPASSERT(hipSuccess != hipMemcpy(nullptr, A, sizeof(int), hipMemcpyHostToDevice)); + HIPASSERT(hipSuccess != hipMemcpy(Ad, nullptr, sizeof(int), hipMemcpyHostToDevice)); + + HIPCHECK(hipFree(Ad)); + HIPCHECK(hipFree(Bd)); + HIPCHECK(hipFree(A)); + passed(); +} diff --git a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 4f180829fa..11bd6e7d50 100644 --- a/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/projects/hip/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -107,8 +107,8 @@ void run(const std::vector& buffer) { hipFree(Ad); hipFree(Bd); - delete A; - delete B; + delete[] A; + delete[] B; hipCtxDestroy(context); } diff --git a/projects/hip/tests/src/runtimeApi/stream/StreamAddCallback.cpp b/projects/hip/tests/src/runtimeApi/stream/StreamAddCallback.cpp new file mode 100644 index 0000000000..e6492c7ce2 --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/stream/StreamAddCallback.cpp @@ -0,0 +1,145 @@ +#include +#include +#include +#include "test_common.h" +#include + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * TEST: %t + * HIT_END + */ + +enum class ExecState +{ + EXEC_NOT_STARTED, + EXEC_STARTED, + EXEC_CB_STARTED, + EXEC_CB_FINISHED, + EXEC_FINISHED +}; + +struct UserData +{ + size_t size; + int* ptr; +}; + +// Global variable to check exection order +std::atomic gData(ExecState::EXEC_NOT_STARTED); + + +void myCallback(hipStream_t stream, hipError_t status, void* user_data) +{ + if(gData.load() != ExecState::EXEC_STARTED) + return; // Error hence return early + + gData.store(ExecState::EXEC_CB_STARTED); + + UserData* data = reinterpret_cast(user_data); + printf("Callback started\n"); + + sleep(1); + + printf("Callback ending.\n"); + gData.store(ExecState::EXEC_CB_FINISHED); +} + +bool test(int count) +{ + printf("\n============ Test iteration %d =============\n",count); + // Stream + hipStream_t stream; + bool result = true; + + gData.store(ExecState::EXEC_STARTED); + + HIPCHECK(hipStreamCreate(&stream)); + + // Array size + size_t size = 10000; + + // Device array + int *data = NULL; + HIPCHECK(hipMalloc((void**)&data, sizeof(int) * size)); + + // Initialize device array to -1 + HIPCHECK(hipMemset(data, -1, sizeof(int) * size)); + + // Host array + int *host = NULL; + HIPCHECK(hipHostMalloc((void**)&host, sizeof(int) * size)); + + // Print host ptr address + printf("In main thread\n"); + + // Initialize user_data for callback + UserData arg; + arg.size = size; + arg.ptr = host; + + // Synchronize device + HIPCHECK(hipDeviceSynchronize()); + + // Asynchronous copy from device to host + HIPCHECK(hipMemcpyAsync(host, data, sizeof(int) * size, hipMemcpyDeviceToHost, stream)); + + // Asynchronous memset on device + HIPCHECK(hipMemsetAsync(data, 0, sizeof(int) * size, stream)); + + // Add callback - should happen after hipMemsetAsync() + HIPCHECK(hipStreamAddCallback(stream, myCallback, &arg, 0)); + + printf("Will wait in main thread until callback completes\n"); + + //This should synchronize the stream (including the callback) + HIPCHECK(hipStreamSynchronize(stream)); + + if(gData.load() != ExecState::EXEC_CB_FINISHED) + { + std::cout<<"Callback is not finished\n"; + return false; + } + printf("Callback completed will resume main thread execution\n"); + + if(host[size/2] != -1) + { + // Print some host data that just got copied + printf("Pseudo host data printing (should be -1): %d\n", host[size/2]); + result = false; + } + + HIPCHECK(hipMemcpy(host, data, sizeof(int)*size, hipMemcpyDeviceToHost)); + + if(host[size-1] != 0) + { + printf("Pseudo host data printing (should be 0): %d\n", host[size-1]); + result = false; + } + + HIPCHECK(hipFree(data)); + HIPCHECK(hipHostFree(host)); + HIPCHECK(hipStreamDestroy(stream)); + + gData.store(ExecState::EXEC_FINISHED); + return result; +} + +int main() +{ + // Test involves multithreading hence running multiple times + // to make sure consitency in the behavior + bool status = true; + + for(int i=0; i < 10; i++){ + status = test(i+1); + if(status == false) + { + failed("Test Failed!\n"); + break; + } + } + + if(status == true) passed(); + return 0; +} diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp new file mode 100644 index 0000000000..c22b390ecc --- /dev/null +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamAddCallbackCatch.cpp @@ -0,0 +1,409 @@ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include "test_common.h" + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM vdi + * TEST: %t + * HIT_END + */ + +#define WORKAROUND 0 // Enable (1) this to make stream thread-safe by a workaround + +template // = queue blocks, until task is finished in enqueue(queue,task) +class QueueHipRt; + +// Queue types used in the tests +using TestQueues = std::tuple, QueueHipRt>; + + +// --- Implementation + +#define HIP_ASSERT(x) (assert((x)==hipSuccess)) +#define HIP_ASSERT_IGNORE(x,ign) auto err=x; HIP_ASSERT(err==ign ? hipSuccess : err) + +#ifdef __HIP_PLATFORM_HCC__ + #define HIPRT_CB +#endif + +template +static auto currentThreadWaitFor(QueueHipRt const & queue) -> void; + +template +class QueueHipRt +{ +public: + static constexpr bool isBlocking = IsBlocking; + //----------------------------------------------------------------------------- + QueueHipRt( + int dev) : + m_dev(dev), + m_HipQueue() + { + HIP_ASSERT( + hipSetDevice( + m_dev)); + HIP_ASSERT( + hipStreamCreateWithFlags( + &m_HipQueue, + hipStreamNonBlocking)); + } + //----------------------------------------------------------------------------- + QueueHipRt(QueueHipRt const &) = delete; + //----------------------------------------------------------------------------- + QueueHipRt(QueueHipRt &&) = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueHipRt const &) -> QueueHipRt & = delete; + //----------------------------------------------------------------------------- + auto operator=(QueueHipRt &&) -> QueueHipRt & = delete; + //----------------------------------------------------------------------------- + ~QueueHipRt() + { + if(isBlocking) { +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + // we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed + currentThreadWaitFor(*this); +#endif + } + HIP_ASSERT( + hipSetDevice( + m_dev)); + HIP_ASSERT( + hipStreamDestroy( + m_HipQueue)); + } + +public: + int m_dev; //!< The device this queue is bound to. + hipStream_t m_HipQueue; + +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + int m_callees = 0; + std::mutex m_mutex; +#endif +}; + +template +struct Enqueue +{ + //############################################################################# + enum class CallbackState + { + enqueued, + notified, + finished, + }; + + //############################################################################# + struct CallbackSynchronizationData : public std::enable_shared_from_this + { + std::mutex m_mutex; + std::condition_variable m_event; + CallbackState state = CallbackState::enqueued; + }; + + //----------------------------------------------------------------------------- + static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg) + { + // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished. + const auto pCallbackSynchronizationData = reinterpret_cast(arg)->shared_from_this(); + + // Notify the executing thread. + { + std::unique_lock lock(pCallbackSynchronizationData->m_mutex); + pCallbackSynchronizationData->state = CallbackState::notified; + } + pCallbackSynchronizationData->m_event.notify_one(); + + // Wait for the executing thread to finish the task if it has not already finished. + std::unique_lock lock(pCallbackSynchronizationData->m_mutex); + if(pCallbackSynchronizationData->state != CallbackState::finished) + { + pCallbackSynchronizationData->m_event.wait( + lock, + [pCallbackSynchronizationData](){ + return pCallbackSynchronizationData->state == CallbackState::finished; + } + ); + } + } + + //----------------------------------------------------------------------------- + template + static auto enqueue( + QueueHipRt & queue, + TTask const & task) + -> void + { + +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + { + // thread-safe callee incrementing + std::lock_guard guard(queue.m_mutex); + queue.m_callees += 1; + } +#endif + auto pCallbackSynchronizationData = std::make_shared(); + // test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp + HIP_ASSERT(hipStreamAddCallback( + queue.m_HipQueue, + hipRtCallback, + pCallbackSynchronizationData.get(), + 0u)); + + // We start a new std::thread which stores the task to be executed. + // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread. + // The HIP thread signals the std::thread when it is ready to execute the task. + // The HIP thread is waiting for the std::thread to signal that it is finished executing the task + // before it executes the next task in the queue (HIP stream). + std::thread t( + [pCallbackSynchronizationData, + task +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + ,&queue // requires queue's destructor to wait for all tasks +#endif + ](){ + +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + // thread-safe task execution and callee decrementing + std::lock_guard guard(queue.m_mutex); +#endif + + // If the callback has not yet been called, we wait for it. + { + std::unique_lock lock(pCallbackSynchronizationData->m_mutex); + if(pCallbackSynchronizationData->state != CallbackState::notified) + { + pCallbackSynchronizationData->m_event.wait( + lock, + [pCallbackSynchronizationData](){ + return pCallbackSynchronizationData->state == CallbackState::notified; + } + ); + } + + task(); + + // Notify the waiting HIP thread. + pCallbackSynchronizationData->state = CallbackState::finished; + } + pCallbackSynchronizationData->m_event.notify_one(); +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + queue.m_callees -= 1; +#endif + } + ); + if(isBlocking) + t.join(); // => waiting for task completion + else + t.detach(); // => do not wait for task completion + } +}; +//############################################################################# +//! The HIP RT non-blocking queue test trait specialization. +struct Empty +{ + //----------------------------------------------------------------------------- + template + static auto empty( + QueueHipRt const & queue) + -> bool + { + +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + return (queue.m_callees==0); +#else + + // Query is allowed even for queues on non current device. + hipError_t ret = hipSuccess; + HIP_ASSERT_IGNORE( + ret = hipStreamQuery( + queue.m_HipQueue), + hipErrorNotReady); + return (ret == hipSuccess); +#endif + } +}; + +template +auto currentThreadWaitFor(QueueHipRt const & queue) -> void +{ +#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + while(queue.m_callees>0) { + std::this_thread::sleep_for(std::chrono::milliseconds(10u)); + } +#else + // Sync is allowed even for queues on non current device. + HIP_ASSERT( hipStreamSynchronize( + queue.m_HipQueue)); +#endif +} + + + + +// --- Tests + +#define TEMPLATE_LIST_TEST_CASE(TestName) \ +template static void TestName (std::atomic &check); \ +static int TestName##Runner () { \ + std::atomic check{0}; \ + TestName< QueueHipRt >(check); \ + fprintf(stderr, "After " #TestName " < QueueHipRt > errors=%d\n", check.load()); \ + TestName< QueueHipRt >(check); \ + fprintf(stderr, "After " #TestName " < QueueHipRt > errors=%d\n", check.load()); \ + return check.load(); \ +} \ +template static void TestName (std::atomic &check) + +// add 1 if a check fails +#define CHECK(result) do{int arg=(!(result)); fprintf(stderr, "Checking " #result " %d\n", arg); check.fetch_add(arg);}while(false) + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( queueIsInitiallyEmpty ) +{ + TestType queue{0}; + CHECK(Empty::empty(queue)); +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( queueCallbackIsWorking ) +{ + std::promise promise; + auto task = [&](){ promise.set_value(true); }; + TestType queue{0}; + Enqueue enqueue; + enqueue.enqueue( + queue, + task + ); + + CHECK(promise.get_future().get()); +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( queueWaitShouldWork ) +{ + bool CallbackFinished = false; + auto task = + [&CallbackFinished]() noexcept + { + std::this_thread::sleep_for(std::chrono::milliseconds(100u)); + CallbackFinished = true; + }; + TestType queue{0}; + Enqueue enqueue; + enqueue.enqueue( + queue, + task + ); + + currentThreadWaitFor(queue); + CHECK(CallbackFinished); +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished ) +{ + bool CallbackFinished = false; + TestType queue{0}; + auto task = [&queue, &CallbackFinished, &check]() noexcept + { + CHECK(!Empty::empty(queue)); + std::this_thread::sleep_for(std::chrono::milliseconds(100u)); + CallbackFinished = true; + }; + Enqueue enqueue; + enqueue.enqueue( + queue, + task + ); + // A non-blocking queue will always stay empty because the task has been executed immediately. + if(!TestType::isBlocking) + { + currentThreadWaitFor(queue); + } + + CHECK(Empty::empty(queue)); + CHECK(CallbackFinished); +} + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( queueShouldNotExecuteTasksInParallel ) +{ + std::atomic taskIsExecuting(false); + std::promise firstTaskFinished; + std::future firstTaskFinishedFuture = firstTaskFinished.get_future(); + std::promise secondTaskFinished; + std::future secondTaskFinishedFuture = secondTaskFinished.get_future(); + + TestType queue{0}; + + std::thread thread1( + [&queue, &taskIsExecuting, &firstTaskFinished, &check]() + { + auto task1 = [&taskIsExecuting, &firstTaskFinished, &check]() noexcept + { + CHECK(!taskIsExecuting.exchange(true)); + std::this_thread::sleep_for(std::chrono::milliseconds(100u)); + CHECK(taskIsExecuting.exchange(false)); + firstTaskFinished.set_value(); + }; + Enqueue enqueue; + enqueue.enqueue( + queue, + task1 + ); + }); + + std::thread thread2( + [&queue, &taskIsExecuting, &secondTaskFinished, &check]() + { + auto task2 = [&taskIsExecuting, &secondTaskFinished, &check]() noexcept + { + CHECK(!taskIsExecuting.exchange(true)); + std::this_thread::sleep_for(std::chrono::milliseconds(100u)); + CHECK(taskIsExecuting.exchange(false)); + secondTaskFinished.set_value(); + }; + + Enqueue enqueue; + enqueue.enqueue( + queue, + task2 + ); + }); + + // Both tasks have to be enqueued + thread1.join(); + thread2.join(); + + currentThreadWaitFor(queue); + + firstTaskFinishedFuture.get(); + secondTaskFinishedFuture.get(); +} + +#define TESTER(name) do { \ + int result = name (); \ + fprintf(stderr, #name " %s\n", result?"Errors":"No Errors"); \ + if (result) { failed(#name " failed\n"); } \ +} while (false) + +int main() +{ + TESTER(queueIsInitiallyEmptyRunner); + TESTER(queueCallbackIsWorkingRunner); + TESTER(queueWaitShouldWorkRunner); + TESTER(queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinishedRunner); + TESTER(queueShouldNotExecuteTasksInParallelRunner); + passed(); +} diff --git a/projects/hip/tests/src/texture/hipTex1DFetchCheckModes.cpp b/projects/hip/tests/src/texture/hipTex1DFetchCheckModes.cpp new file mode 100644 index 0000000000..381d07280c --- /dev/null +++ b/projects/hip/tests/src/texture/hipTex1DFetchCheckModes.cpp @@ -0,0 +1,122 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi + * TEST: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "../test_common.h" + +#define N 16 +#define offset 3 +__global__ void tex1dKernel(float *val, hipTextureObject_t obj) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k < N) + val[k] = tex1Dfetch(obj, k+offset); +} + +int runTest(hipTextureAddressMode, hipTextureFilterMode); + +int main(int argc, char **argv) { + int testResult = runTest(hipAddressModeClamp,hipFilterModePoint); + testResult = runTest(hipAddressModeClamp,hipFilterModeLinear); + testResult = runTest(hipAddressModeWrap,hipFilterModePoint); + testResult = runTest(hipAddressModeWrap,hipFilterModeLinear); + if(testResult) { + passed(); + } else { + exit(EXIT_FAILURE); + } +} + +int runTest(hipTextureAddressMode addressMode, hipTextureFilterMode filterMode) { + + int testResult = 1; + + hipCtx_t HipContext; + hipDevice_t HipDevice; + int deviceID = 0; + hipDeviceGet(&HipDevice, deviceID); + hipCtxCreate(&HipContext, 0, HipDevice); + + // Allocating the required buffer on gpu device + float *texBuf, *texBufOut; + float val[N], output[N]; + + for (int i = 0; i < N; i++) { + val[i] = i+1; + output[i] = 0.0; + } + + HIPCHECK(hipMalloc(&texBuf, N * sizeof(float))); + HIPCHECK(hipMalloc(&texBufOut, N * sizeof(float))); + HIPCHECK(hipMemcpy(texBuf, val, N * sizeof(float), hipMemcpyHostToDevice)); + HIPCHECK(hipMemset(texBufOut, 0, N * sizeof(float))); + hipResourceDesc resDescLinear; + + memset(&resDescLinear, 0, sizeof(resDescLinear)); + resDescLinear.resType = hipResourceTypeLinear; + resDescLinear.res.linear.devPtr = texBuf; + resDescLinear.res.linear.desc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); + resDescLinear.res.linear.sizeInBytes = N * sizeof(float); + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = hipReadModeElementType; + + texDesc.addressMode[0] = addressMode; + texDesc.addressMode[1] = addressMode; + texDesc.filterMode = filterMode; + texDesc.normalizedCoords = false; + + // Creating texture object + hipTextureObject_t texObj = 0; + HIPCHECK(hipCreateTextureObject(&texObj, &resDescLinear, &texDesc, NULL)); + + dim3 dimBlock(1, 1, 1); + dim3 dimGrid(N , 1, 1); + + hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, + texBufOut, texObj); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipMemcpy(output, texBufOut, N * sizeof(float), hipMemcpyDeviceToHost)); + + for (int i = offset; i < N; i++) { + if (output[i-offset] != val[i]) { + testResult = 0; + break; + } + } + if(testResult){ + for(int i = N-offset; i < N; i++){ + if (output[i] != 0){ + testResult = 0; + break; + } + } + } + HIPCHECK(hipDestroyTextureObject(texObj)); + HIPCHECK(hipFree(texBuf)); + HIPCHECK(hipFree(texBufOut)); + return testResult; +}