Merge branch 'amd-master' into amd-master-next
Change-Id: I3094c15008093f2072bcd38aca4ea90aeae2d97b
[ROCm/hip commit: 2af31479e2]
이 커밋은 다음에 포함됨:
+4
-22
@@ -207,19 +207,6 @@ if (NOT CPACK_SET_DESTDIR)
|
||||
set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package")
|
||||
endif (NOT CPACK_SET_DESTDIR)
|
||||
|
||||
# Check if we need to enable ATP marker
|
||||
if(NOT DEFINED COMPILE_HIP_ATP_MARKER)
|
||||
if(NOT DEFINED ENV{COMPILE_HIP_ATP_MARKER})
|
||||
set(COMPILE_HIP_ATP_MARKER 0)
|
||||
else()
|
||||
set(COMPILE_HIP_ATP_MARKER $ENV{COMPILE_HIP_ATP_MARKER})
|
||||
message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
|
||||
endif()
|
||||
add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER)
|
||||
|
||||
#############################
|
||||
# Profiling API support
|
||||
#############################
|
||||
@@ -309,10 +296,6 @@ message(STATUS "\nHSA runtime in: " ${HSA_PATH})
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||
set(HIP_HCC_BUILD_FLAGS)
|
||||
if(COMPILE_HIP_ATP_MARKER)
|
||||
include_directories(/opt/rocm/profiler/CXLActivityLogger/include)
|
||||
set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1")
|
||||
endif()
|
||||
|
||||
# Add HIP_VERSION to CMAKE_<LANG>_FLAGS
|
||||
set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}")
|
||||
@@ -328,7 +311,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
|
||||
|
||||
set(SOURCE_FILES_RUNTIME
|
||||
src/code_object_bundle.cpp
|
||||
src/program_state.cpp
|
||||
src/hip_clang.cpp
|
||||
src/hip_hcc.cpp
|
||||
@@ -363,9 +345,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
|
||||
set (CMAKE_SKIP_BUILD_RPATH TRUE )
|
||||
endif ()
|
||||
if(COMPILE_HIP_ATP_MARKER)
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger")
|
||||
endif()
|
||||
add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
|
||||
add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})
|
||||
|
||||
@@ -376,7 +355,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
target_link_libraries(hip_hcc PRIVATE hc_am)
|
||||
target_link_libraries(hip_hcc_static PRIVATE hc_am)
|
||||
|
||||
add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
|
||||
add_library(hiprtc SHARED src/hiprtc.cpp)
|
||||
target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY)
|
||||
set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" )
|
||||
set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" )
|
||||
@@ -387,6 +366,9 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
|
||||
set_target_properties(hip_hcc PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
set_target_properties(hip_hcc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
|
||||
set_target_properties(hiprtc PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
set_target_properties(hiprtc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
|
||||
|
||||
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
find_package(amd_comgr REQUIRED CONFIG
|
||||
|
||||
+2
-1
@@ -1,6 +1,7 @@
|
||||
## What is this repository for? ###
|
||||
|
||||
HIP allows developers to convert CUDA code to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.
|
||||
**HIP is a C++ Runtime API and Kernel Language that allows developers to create portable applications for AMD and NVIDIA GPUs from single source code.**
|
||||
|
||||
Key features include:
|
||||
|
||||
* HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
|
||||
|
||||
+45
-13
@@ -134,6 +134,7 @@ if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) {
|
||||
$HIP_VDI_HOME = $HIP_PATH; # use HIP_PATH
|
||||
}
|
||||
$HIPCXXFLAGS .= "-D__HIP_VDI__";
|
||||
$HIPCFLAGS .= "-D__HIP_VDI__";
|
||||
}
|
||||
|
||||
if (defined $HIP_VDI_HOME) {
|
||||
@@ -207,7 +208,8 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
} else {
|
||||
$HIPCXXFLAGS .= " -std=c++11";
|
||||
}
|
||||
$HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH";
|
||||
$HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
|
||||
$HIPCFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
|
||||
$HIPLDFLAGS .= " -L$HIP_LIB_PATH";
|
||||
if (not $isWindows) {
|
||||
$HIPLDFLAGS .= " -Wl,--rpath-link=$HIP_LIB_PATH";
|
||||
@@ -222,8 +224,10 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
|
||||
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
|
||||
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
|
||||
$HIPCFLAGS .= " -isystem $HSA_PATH/include";
|
||||
if (!($HIP_RUNTIME eq "HCC")) {
|
||||
$HIPCXXFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
|
||||
$HIPCFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
|
||||
}
|
||||
|
||||
} elsif ($HIP_PLATFORM eq "hcc") {
|
||||
@@ -282,8 +286,11 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
}
|
||||
|
||||
$HIPCXXFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
|
||||
$HIPCFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
|
||||
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
|
||||
$HIPCFLAGS .= " -isystem $HSA_PATH/include";
|
||||
$HIPCXXFLAGS .= " -Wno-deprecated-register";
|
||||
$HIPCFLAGS .= " -Wno-deprecated-register";
|
||||
|
||||
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am ";
|
||||
# $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport ";
|
||||
@@ -321,6 +328,7 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
$HIPCC="$CUDA_PATH/bin/nvcc";
|
||||
$HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets ";
|
||||
$HIPCXXFLAGS .= " -isystem $CUDA_PATH/include";
|
||||
$HIPCFLAGS .= " -isystem $CUDA_PATH/include";
|
||||
|
||||
$HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L$CUDA_PATH/lib64";
|
||||
} else {
|
||||
@@ -330,11 +338,14 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
|
||||
# Add paths to common HIP includes:
|
||||
$HIPCXXFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;
|
||||
$HIPCFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;
|
||||
|
||||
my $compileOnly = 0;
|
||||
my $needCXXFLAGS = 0; # need to add CXX flags to compile step
|
||||
my $needCFLAGS = 0; # need to add C flags to compile step
|
||||
my $needLDFLAGS = 1; # need to add LDFLAGS to compile step.
|
||||
my $hasC = 0; # options contain a c-style file (NVCC must force recognition as GPU file)
|
||||
my $hasC = 0; # options contain a c-style file
|
||||
my $hasCXX = 0; # options contain a cpp-style file (NVCC must force recognition as GPU file)
|
||||
my $hasCU = 0; # options contain a cu-style file (HCC must force recognition as GPU file)
|
||||
my $needHipHcc = ($HIP_PLATFORM eq 'hcc'); # set if we need to link hip_hcc.o from src tree. (some builds, ie cmake, provide their own)
|
||||
my $printHipVersion = 0; # print HIP version
|
||||
@@ -343,6 +354,7 @@ my $buildDeps = 0;
|
||||
my $linkType = 1;
|
||||
my $setLinkType = 0;
|
||||
my $coFormatv3 = 1;
|
||||
my $funcSupp = 0; # enable function support
|
||||
|
||||
my @options = ();
|
||||
my @inputs = ();
|
||||
@@ -400,7 +412,6 @@ foreach $arg (@ARGV)
|
||||
my $swallowArg = 0;
|
||||
if ($arg eq '-c' or $arg eq '--genco') {
|
||||
$compileOnly = 1;
|
||||
$needCXXFLAGS = 1;
|
||||
$needLDFLAGS = 0;
|
||||
}
|
||||
|
||||
@@ -463,6 +474,7 @@ foreach $arg (@ARGV)
|
||||
}
|
||||
if($trimarg eq '-use_fast_math') {
|
||||
$HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
|
||||
$HIPCFLAGS .= " -DHIP_FAST_MATH ";
|
||||
}
|
||||
if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
|
||||
{
|
||||
@@ -599,20 +611,30 @@ foreach $arg (@ARGV)
|
||||
#if $arg eq "--hipcc_profile") { # Example argument here, hipcc
|
||||
#
|
||||
#}
|
||||
if ($arg eq "--hipcc-func-supp") {
|
||||
$funcSupp = 1;
|
||||
} elsif ($arg eq "--hipcc-no-func-supp") {
|
||||
$funcSupp = 0;
|
||||
}
|
||||
} else {
|
||||
push (@options, $arg);
|
||||
}
|
||||
#print "O: <$arg>\n";
|
||||
} else {
|
||||
# input files and libraries
|
||||
if (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.c$/) or ($arg =~ /\.cc$/) ) {
|
||||
if ($arg =~ /\.c$/) {
|
||||
$hasC = 1;
|
||||
$needCFLAGS = 1;
|
||||
$toolArgs .= " -x c"
|
||||
}
|
||||
elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) {
|
||||
$hasCXX = 1;
|
||||
$needCXXFLAGS = 1;
|
||||
if ($HIP_PLATFORM eq 'clang') {
|
||||
if ($HIP_PLATFORM eq 'clang' and not $arg =~ /\.c$/) {
|
||||
$toolArgs .= " -x hip"
|
||||
}
|
||||
}
|
||||
if (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
|
||||
elsif (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
|
||||
$hasCU = 1;
|
||||
$needCXXFLAGS = 1;
|
||||
if ($HIP_PLATFORM eq 'clang') {
|
||||
@@ -657,7 +679,7 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
|
||||
my $archMacro = ' -D__HIP_ARCH_' . uc($val) . '__=1 ';
|
||||
# Add the arch option and macro to the compiler options.
|
||||
$GPU_ARCH_ARG = $GPU_ARCH_OPT . $val;
|
||||
$HIPLDFLAGS .= $GPU_ARCH_ARG;
|
||||
$HIPLDARCHFLAGS .= $GPU_ARCH_ARG;
|
||||
$HIPCXXFLAGS .= $archMacro;
|
||||
if ($HIP_PLATFORM eq 'clang') {
|
||||
$HIPCXXFLAGS .= $GPU_ARCH_ARG;
|
||||
@@ -685,7 +707,7 @@ if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
|
||||
$HIPCXXFLAGS .= " -mcode-object-v3";
|
||||
}
|
||||
|
||||
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
|
||||
if ($hasCXX and $HIP_PLATFORM eq 'nvcc') {
|
||||
$HIPCXXFLAGS .= " -x cu";
|
||||
}
|
||||
if ($hasCU and $HIP_PLATFORM eq 'hcc') {
|
||||
@@ -694,6 +716,7 @@ if ($hasCU and $HIP_PLATFORM eq 'hcc') {
|
||||
|
||||
if ($buildDeps and $HIP_PLATFORM eq 'nvcc') {
|
||||
$HIPCXXFLAGS .= " -M -D__CUDACC__";
|
||||
$HIPCFLAGS .= " -M -D__CUDACC__";
|
||||
}
|
||||
|
||||
if ($buildDeps and $HIP_PLATFORM eq 'clang') {
|
||||
@@ -701,10 +724,14 @@ if ($buildDeps and $HIP_PLATFORM eq 'clang') {
|
||||
}
|
||||
|
||||
# Add --hip-link only if there are no source files.
|
||||
if (!$needCXXFLAGS and $HIP_PLATFORM eq 'clang') {
|
||||
if (!$needCXXFLAGS and !$needCFLAGS and $HIP_PLATFORM eq 'clang') {
|
||||
$HIPLDFLAGS .= " --hip-link";
|
||||
}
|
||||
|
||||
if (!$needCFLAGS and $HIP_PLATFORM eq 'clang') {
|
||||
$HIPLDFLAGS .= $HIPLDARCHFLAGS;
|
||||
}
|
||||
|
||||
if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc')
|
||||
{
|
||||
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
|
||||
@@ -727,11 +754,12 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
# Set default optimization level to -O3 for hip-clang.
|
||||
if ($optArg eq "") {
|
||||
$HIPCXXFLAGS .= " -O3";
|
||||
$HIPCFLAGS .= " -O3";
|
||||
$HIPLDFLAGS .= " -O3";
|
||||
}
|
||||
# Do not pass -mllvm on Windows since there is a clang bug causing duplicate -mllvm options in clang -cc1 on Windows.
|
||||
# ToDo : remove restriction for Windows after clang bug is fixed.
|
||||
if ($optArg ne "-O0" and not $isWindows) {
|
||||
if (!$funcSupp and $optArg ne "-O0" and not $isWindows) {
|
||||
$HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
|
||||
if ($needLDFLAGS and not $needCXXFLAGS) {
|
||||
$HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
|
||||
@@ -747,18 +775,22 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
|
||||
if ($HIPCC_COMPILE_FLAGS_APPEND) {
|
||||
$HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
|
||||
$HIPCFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
|
||||
}
|
||||
if ($HIPCC_LINK_FLAGS_APPEND) {
|
||||
$HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
|
||||
}
|
||||
|
||||
my $CMD="$HIPCC";
|
||||
if ($needCXXFLAGS) {
|
||||
$CMD .= " $HIPCXXFLAGS";
|
||||
}
|
||||
if ($needLDFLAGS and not $compileOnly) {
|
||||
$CMD .= " $HIPLDFLAGS";
|
||||
}
|
||||
if ($needCFLAGS) {
|
||||
$CMD .= " $HIPCFLAGS";
|
||||
}
|
||||
if ($needCXXFLAGS) {
|
||||
$CMD .= " $HIPCXXFLAGS";
|
||||
}
|
||||
$CMD .= " $toolArgs";
|
||||
|
||||
if ($verbose & 0x1) {
|
||||
|
||||
@@ -341,8 +341,8 @@ sub simpleSubstitutions {
|
||||
$ft{'execution'} += s/\bcudaLaunchCooperativeKernelMultiDevice\b/hipLaunchCooperativeKernelMultiDevice/g;
|
||||
$ft{'execution'} += s/\bcudaLaunchKernel\b/hipLaunchKernel/g;
|
||||
$ft{'execution'} += s/\bcudaSetupArgument\b/hipSetupArgument/g;
|
||||
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
|
||||
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
|
||||
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessor/g;
|
||||
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
|
||||
$ft{'occupancy'} += s/\bcuOccupancyMaxPotentialBlockSize\b/hipOccupancyMaxPotentialBlockSize/g;
|
||||
$ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
|
||||
$ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
|
||||
@@ -754,6 +754,9 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseCcsrmm\b/hipsparseCcsrmm/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrmm2\b/hipsparseCcsrmm2/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrmv\b/hipsparseCcsrmv/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsm2_analysis\b/hipsparseCcsrsm2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsm2_bufferSizeExt\b/hipsparseCcsrsm2_bufferSizeExt/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsm_solve\b/hipsparseCcsrsm_solve/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsv2_analysis\b/hipsparseCcsrsv2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsv2_bufferSize\b/hipsparseCcsrsv2_bufferSize/g;
|
||||
$ft{'library'} += s/\bcusparseCcsrsv2_bufferSizeExt\b/hipsparseCcsrsv2_bufferSizeExt/g;
|
||||
@@ -763,9 +766,11 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseCgthr\b/hipsparseCgthr/g;
|
||||
$ft{'library'} += s/\bcusparseCgthrz\b/hipsparseCgthrz/g;
|
||||
$ft{'library'} += s/\bcusparseChybmv\b/hipsparseChybmv/g;
|
||||
$ft{'library'} += s/\bcusparseCnnz\b/hipsparseCnnz/g;
|
||||
$ft{'library'} += s/\bcusparseCreate\b/hipsparseCreate/g;
|
||||
$ft{'library'} += s/\bcusparseCreateCsrgemm2Info\b/hipsparseCreateCsrgemm2Info/g;
|
||||
$ft{'library'} += s/\bcusparseCreateCsrilu02Info\b/hipsparseCreateCsrilu02Info/g;
|
||||
$ft{'library'} += s/\bcusparseCreateCsrsm2Info\b/hipsparseCreateCsrsm2Info/g;
|
||||
$ft{'library'} += s/\bcusparseCreateCsrsv2Info\b/hipsparseCreateCsrsv2Info/g;
|
||||
$ft{'library'} += s/\bcusparseCreateHybMat\b/hipsparseCreateHybMat/g;
|
||||
$ft{'library'} += s/\bcusparseCreateIdentityPermutation\b/hipsparseCreateIdentityPermutation/g;
|
||||
@@ -784,6 +789,9 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseDcsrmm\b/hipsparseDcsrmm/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrmm2\b/hipsparseDcsrmm2/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrmv\b/hipsparseDcsrmv/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsm2_analysis\b/hipsparseDcsrsm2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsm2_bufferSizeExt\b/hipsparseDcsrsm2_bufferSizeExt/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsm_solve\b/hipsparseDcsrsm_solve/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsv2_analysis\b/hipsparseDcsrsv2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsv2_bufferSize\b/hipsparseDcsrsv2_bufferSize/g;
|
||||
$ft{'library'} += s/\bcusparseDcsrsv2_bufferSizeExt\b/hipsparseDcsrsv2_bufferSizeExt/g;
|
||||
@@ -792,12 +800,14 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseDestroy\b/hipsparseDestroy/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyCsrgemm2Info\b/hipsparseDestroyCsrgemm2Info/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyCsrilu02Info\b/hipsparseDestroyCsrilu02Info/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyCsrsm2Info\b/hipsparseDestroyCsrsm2Info/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyCsrsv2Info\b/hipsparseDestroyCsrsv2Info/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyHybMat\b/hipsparseDestroyHybMat/g;
|
||||
$ft{'library'} += s/\bcusparseDestroyMatDescr\b/hipsparseDestroyMatDescr/g;
|
||||
$ft{'library'} += s/\bcusparseDgthr\b/hipsparseDgthr/g;
|
||||
$ft{'library'} += s/\bcusparseDgthrz\b/hipsparseDgthrz/g;
|
||||
$ft{'library'} += s/\bcusparseDhybmv\b/hipsparseDhybmv/g;
|
||||
$ft{'library'} += s/\bcusparseDnnz\b/hipsparseDnnz/g;
|
||||
$ft{'library'} += s/\bcusparseDroti\b/hipsparseDroti/g;
|
||||
$ft{'library'} += s/\bcusparseDsctr\b/hipsparseDsctr/g;
|
||||
$ft{'library'} += s/\bcusparseGetMatDiagType\b/hipsparseGetMatDiagType/g;
|
||||
@@ -820,6 +830,9 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseScsrmm\b/hipsparseScsrmm/g;
|
||||
$ft{'library'} += s/\bcusparseScsrmm2\b/hipsparseScsrmm2/g;
|
||||
$ft{'library'} += s/\bcusparseScsrmv\b/hipsparseScsrmv/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsm2_analysis\b/hipsparseScsrsm2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsm2_bufferSizeExt\b/hipsparseScsrsm2_bufferSizeExt/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsm_solve\b/hipsparseScsrsm_solve/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsv2_analysis\b/hipsparseScsrsv2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsv2_bufferSize\b/hipsparseScsrsv2_bufferSize/g;
|
||||
$ft{'library'} += s/\bcusparseScsrsv2_bufferSizeExt\b/hipsparseScsrsv2_bufferSizeExt/g;
|
||||
@@ -834,6 +847,7 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseSgthr\b/hipsparseSgthr/g;
|
||||
$ft{'library'} += s/\bcusparseSgthrz\b/hipsparseSgthrz/g;
|
||||
$ft{'library'} += s/\bcusparseShybmv\b/hipsparseShybmv/g;
|
||||
$ft{'library'} += s/\bcusparseSnnz\b/hipsparseSnnz/g;
|
||||
$ft{'library'} += s/\bcusparseSroti\b/hipsparseSroti/g;
|
||||
$ft{'library'} += s/\bcusparseSsctr\b/hipsparseSsctr/g;
|
||||
$ft{'library'} += s/\bcusparseXbsrilu02_zeroPivot\b/hipsparseXbsrilu02_zeroPivot/g;
|
||||
@@ -847,6 +861,7 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseXcsrgemm2Nnz\b/hipsparseXcsrgemm2Nnz/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrgemmNnz\b/hipsparseXcsrgemmNnz/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrilu02_zeroPivot\b/hipsparseXcsrilu02_zeroPivot/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrsm2_zeroPivot\b/hipsparseXcsrsm2_zeroPivot/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrsort\b/hipsparseXcsrsort/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrsort_bufferSizeExt\b/hipsparseXcsrsort_bufferSizeExt/g;
|
||||
$ft{'library'} += s/\bcusparseXcsrsv2_zeroPivot\b/hipsparseXcsrsv2_zeroPivot/g;
|
||||
@@ -863,6 +878,9 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseZcsrmm\b/hipsparseZcsrmm/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrmm2\b/hipsparseZcsrmm2/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrmv\b/hipsparseZcsrmv/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsm2_analysis\b/hipsparseZcsrsm2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsm2_bufferSizeExt\b/hipsparseZcsrsm2_bufferSizeExt/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsm_solve\b/hipsparseZcsrsm_solve/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsv2_analysis\b/hipsparseZcsrsv2_analysis/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsv2_bufferSize\b/hipsparseZcsrsv2_bufferSize/g;
|
||||
$ft{'library'} += s/\bcusparseZcsrsv2_bufferSizeExt\b/hipsparseZcsrsv2_bufferSizeExt/g;
|
||||
@@ -872,6 +890,7 @@ sub simpleSubstitutions {
|
||||
$ft{'library'} += s/\bcusparseZgthr\b/hipsparseZgthr/g;
|
||||
$ft{'library'} += s/\bcusparseZgthrz\b/hipsparseZgthrz/g;
|
||||
$ft{'library'} += s/\bcusparseZhybmv\b/hipsparseZhybmv/g;
|
||||
$ft{'library'} += s/\bcusparseZnnz\b/hipsparseZnnz/g;
|
||||
$ft{'library'} += s/\bcusparseZsctr\b/hipsparseZsctr/g;
|
||||
$ft{'device_library'} += s/\bcurand\b/hiprand/g;
|
||||
$ft{'device_library'} += s/\bcurand_discrete\b/hiprand_discrete/g;
|
||||
@@ -997,6 +1016,8 @@ sub simpleSubstitutions {
|
||||
$ft{'type'} += s/\bcsrgemm2Info\b/csrgemm2Info/g;
|
||||
$ft{'type'} += s/\bcsrgemm2Info_t\b/csrgemm2Info_t/g;
|
||||
$ft{'type'} += s/\bcsrilu02Info_t\b/csrilu02Info_t/g;
|
||||
$ft{'type'} += s/\bcsrsm2Info\b/csrsm2Info/g;
|
||||
$ft{'type'} += s/\bcsrsm2Info_t\b/csrsm2Info_t/g;
|
||||
$ft{'type'} += s/\bcsrsv2Info_t\b/csrsv2Info_t/g;
|
||||
$ft{'type'} += s/\bcuComplex\b/hipComplex/g;
|
||||
$ft{'type'} += s/\bcuDoubleComplex\b/hipDoubleComplex/g;
|
||||
@@ -1130,6 +1151,7 @@ sub simpleSubstitutions {
|
||||
$ft{'type'} += s/\bcurandStatus_t\b/hiprandStatus_t/g;
|
||||
$ft{'type'} += s/\bcusparseAction_t\b/hipsparseAction_t/g;
|
||||
$ft{'type'} += s/\bcusparseDiagType_t\b/hipsparseDiagType_t/g;
|
||||
$ft{'type'} += s/\bcusparseDirection_t\b/hipsparseDirection_t/g;
|
||||
$ft{'type'} += s/\bcusparseFillMode_t\b/hipsparseFillMode_t/g;
|
||||
$ft{'type'} += s/\bcusparseHandle_t\b/hipsparseHandle_t/g;
|
||||
$ft{'type'} += s/\bcusparseHybMat_t\b/hipsparseHybMat_t/g;
|
||||
@@ -1398,6 +1420,8 @@ sub simpleSubstitutions {
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_ACTION_SYMBOLIC\b/HIPSPARSE_ACTION_SYMBOLIC/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_NON_UNIT\b/HIPSPARSE_DIAG_TYPE_NON_UNIT/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_UNIT\b/HIPSPARSE_DIAG_TYPE_UNIT/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_COLUMN\b/HIPSPARSE_DIRECTION_COLUMN/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_ROW\b/HIPSPARSE_DIRECTION_ROW/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_LOWER\b/HIPSPARSE_FILL_MODE_LOWER/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_UPPER\b/HIPSPARSE_FILL_MODE_UPPER/g;
|
||||
$ft{'numeric_literal'} += s/\bCUSPARSE_HYB_PARTITION_AUTO\b/HIPSPARSE_HYB_PARTITION_AUTO/g;
|
||||
|
||||
@@ -75,7 +75,6 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN)
|
||||
endif()
|
||||
# And push it back to the cache
|
||||
set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
|
||||
message("Found HIP at ${HIP_ROOT_DIR}")
|
||||
endif()
|
||||
|
||||
# Find HIPCC executable
|
||||
|
||||
@@ -12,9 +12,9 @@
|
||||
| enum |***`cusparseAction_t`*** | |***`hipsparseAction_t`*** |
|
||||
| 0 |*`CUSPARSE_ACTION_SYMBOLIC`* | |*`HIPSPARSE_ACTION_SYMBOLIC`* |
|
||||
| 1 |*`CUSPARSE_ACTION_NUMERIC`* | |*`HIPSPARSE_ACTION_NUMERIC`* |
|
||||
| enum |***`cusparseDirection_t`*** | | |
|
||||
| 0 |*`CUSPARSE_DIRECTION_ROW`* | | |
|
||||
| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | | |
|
||||
| enum |***`cusparseDirection_t`*** | |***`hipsparseDirection_t`*** |
|
||||
| 0 |*`CUSPARSE_DIRECTION_ROW`* | |*`HIPSPARSE_DIRECTION_ROW`* |
|
||||
| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | |*`HIPSPARSE_DIRECTION_COLUMN`* |
|
||||
| enum |***`cusparseHybPartition_t`*** | |***`hipsparseHybPartition_t`*** |
|
||||
| 0 |*`CUSPARSE_HYB_PARTITION_AUTO`* | |*`HIPSPARSE_HYB_PARTITION_AUTO`* |
|
||||
| 1 |*`CUSPARSE_HYB_PARTITION_USER`* | |*`HIPSPARSE_HYB_PARTITION_USER`* |
|
||||
@@ -69,8 +69,8 @@
|
||||
| typedef |`cusparseSolveAnalysisInfo_t` | | |
|
||||
| struct |`csrsv2Info` | | |
|
||||
| typedef |`csrsv2Info_t` | |`csrsv2Info_t` |
|
||||
| struct |`csrsm2Info` | 9.2 | |
|
||||
| typedef |`csrsm2Info_t` | | |
|
||||
| struct |`csrsm2Info` | 9.2 |`csrsm2Info` |
|
||||
| typedef |`csrsm2Info_t` | |`csrsm2Info_t` |
|
||||
| struct |`bsrsv2Info` | | |
|
||||
| typedef |`bsrsv2Info_t` | | |
|
||||
| struct |`bsrsm2Info` | | |
|
||||
@@ -151,8 +151,8 @@
|
||||
|`cusparseGetStream` |`hipsparseGetStream` | 8.0 |
|
||||
|`cusparseCreateCsrsv2Info` |`hipsparseCreateCsrsv2Info` |
|
||||
|`cusparseDestroyCsrsv2Info` |`hipsparseDestroyCsrsv2Info` |
|
||||
|`cusparseCreateCsrsm2Info` | | 9.2 |
|
||||
|`cusparseDestroyCsrsm2Info` | | 9.2 |
|
||||
|`cusparseCreateCsrsm2Info` |`hipsparseCreateCsrsm2Info` | 9.2 |
|
||||
|`cusparseDestroyCsrsm2Info` |`hipsparseDestroyCsrsm2Info` | 9.2 |
|
||||
|`cusparseCreateCsric02Info` | |
|
||||
|`cusparseDestroyCsric02Info` | |
|
||||
|`cusparseCreateCsrilu02Info` |`hipsparseCreateCsrilu02Info` |
|
||||
@@ -306,19 +306,19 @@
|
||||
|`cusparseDcsrsm_solve` | |
|
||||
|`cusparseCcsrsm_solve` | |
|
||||
|`cusparseZcsrsm_solve` | |
|
||||
|`cusparseScsrsm2_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseDcsrsm2_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseCcsrsm2_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseZcsrsm2_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseScsrsm2_analysis` | | 9.2 |
|
||||
|`cusparseDcsrsm2_analysis` | | 9.2 |
|
||||
|`cusparseCcsrsm2_analysis` | | 9.2 |
|
||||
|`cusparseZcsrsm2_analysis` | | 9.2 |
|
||||
|`cusparseScsrsm2_solve` | | 9.2 |
|
||||
|`cusparseDcsrsm2_solve` | | 9.2 |
|
||||
|`cusparseCcsrsm2_solve` | | 9.2 |
|
||||
|`cusparseZcsrsm2_solve` | | 9.2 |
|
||||
|`cusparseXcsrsm2_zeroPivot` | | 9.2 |
|
||||
|`cusparseScsrsm2_bufferSizeExt` |`hipsparseScsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseDcsrsm2_bufferSizeExt` |`hipsparseDcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseCcsrsm2_bufferSizeExt` |`hipsparseCcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseZcsrsm2_bufferSizeExt` |`hipsparseZcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseScsrsm2_analysis` |`hipsparseScsrsm2_analysis` | 9.2 |
|
||||
|`cusparseDcsrsm2_analysis` |`hipsparseDcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseCcsrsm2_analysis` |`hipsparseCcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseZcsrsm2_analysis` |`hipsparseZcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseScsrsm2_solve` |`hipsparseScsrsm2_solve` | 9.2 |
|
||||
|`cusparseDcsrsm2_solve` |`hipsparseDcsrsm2_solve` | 9.2 |
|
||||
|`cusparseCcsrsm2_solve` |`hipsparseCcsrsm2_solve` | 9.2 |
|
||||
|`cusparseZcsrsm2_solve` |`hipsparseZcsrsm2_solve` | 9.2 |
|
||||
|`cusparseXcsrsm2_zeroPivot` |`hipsparseXcsrsm2_zeroPivot` | 9.2 |
|
||||
|`cusparseSbsrmm` | |
|
||||
|`cusparseDbsrmm` | |
|
||||
|`cusparseCbsrmm` | |
|
||||
@@ -662,10 +662,10 @@
|
||||
|`cusparseDhyb2dense` | |
|
||||
|`cusparseChyb2dense` | |
|
||||
|`cusparseZhyb2dense` | |
|
||||
|`cusparseSnnz` | |
|
||||
|`cusparseDnnz` | |
|
||||
|`cusparseCnnz` | |
|
||||
|`cusparseZnnz` | |
|
||||
|`cusparseSnnz` |`hipsparseSnnz` |
|
||||
|`cusparseDnnz` |`hipsparseDnnz` |
|
||||
|`cusparseCnnz` |`hipsparseCnnz` |
|
||||
|`cusparseZnnz` |`hipsparseZnnz` |
|
||||
|`cusparseCreateIdentityPermutation` |`hipsparseCreateIdentityPermutation` |
|
||||
|`cusparseXcoosort_bufferSizeExt` |`hipsparseXcoosort_bufferSizeExt` |
|
||||
|`cusparseXcoosortByRow` |`hipsparseXcoosortByRow` |
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
Table of Contents
|
||||
=================
|
||||
|
||||
* [Profiling HIP Code](#profiling-hip-code" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Using HIP_DB](#using-hip_db" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Using ltrace](#using-ltrace" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Chicken bits](#chicken-bits" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Debugging HIP Applications](#debugging-hip-applications" aria-hidden="true"><span aria-hidden="true)
|
||||
* [General Debugging Tips](#general-debugging-tips" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Print env var state](#print-env-var-state" aria-hidden="true"><span aria-hidden="true)
|
||||
* [Profiling HIP Code](#profiling-hip-code)
|
||||
* [Using HIP_DB](#using-hip_db)
|
||||
* [Using ltrace](#using-ltrace)
|
||||
* [Chicken bits](#chicken-bits)
|
||||
* [Debugging HIP Applications](#debugging-hip-applications)
|
||||
* [General Debugging Tips](#general-debugging-tips)
|
||||
* [Print env var state](#print-env-var-state)
|
||||
|
||||
### Using HIP_DB
|
||||
|
||||
|
||||
+23
-24
@@ -388,30 +388,29 @@ def generate_prof_header(f, api_map, opts_map):
|
||||
f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')
|
||||
|
||||
# Generating the method for the API string, name and parameters
|
||||
if False:
|
||||
f.write('\n')
|
||||
f.write('#if 0\n')
|
||||
f.write('#include <sstream>\n');
|
||||
f.write('#include <string>\n');
|
||||
f.write('// HIP API string method, method name and parameters\n')
|
||||
f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
|
||||
f.write(' std::ostringstream oss;\n')
|
||||
f.write(' switch (id) {\n')
|
||||
for name, args in api_map.items():
|
||||
f.write(' case HIP_API_ID_' + name + ':\n')
|
||||
f.write(' oss << "' + name + '("')
|
||||
for ind in range(0, len(args)):
|
||||
arg_tuple = args[ind]
|
||||
arg_name = arg_tuple[1]
|
||||
if ind != 0: f.write(' << ","')
|
||||
f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name)
|
||||
f.write('\n << ")";\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' default: oss << "unknown";\n')
|
||||
f.write(' };\n')
|
||||
f.write(' return strdup(oss.str().c_str());\n')
|
||||
f.write('};\n')
|
||||
f.write('#endif\n')
|
||||
f.write('\n')
|
||||
f.write('#if ENABLE_HIP_API_STRING\n')
|
||||
f.write('#include <sstream>\n');
|
||||
f.write('#include <string>\n');
|
||||
f.write('// HIP API string method, method name and parameters\n')
|
||||
f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
|
||||
f.write(' std::ostringstream oss;\n')
|
||||
f.write(' switch (id) {\n')
|
||||
for name, args in api_map.items():
|
||||
f.write(' case HIP_API_ID_' + name + ':\n')
|
||||
f.write(' oss << "' + name + '("')
|
||||
for ind in range(0, len(args)):
|
||||
arg_tuple = args[ind]
|
||||
arg_name = arg_tuple[1]
|
||||
if ind != 0: f.write(' << ","')
|
||||
f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name)
|
||||
f.write('\n << ")";\n')
|
||||
f.write(' break;\n')
|
||||
f.write(' default: oss << "unknown";\n')
|
||||
f.write(' };\n')
|
||||
f.write(' return strdup(oss.str().c_str());\n')
|
||||
f.write('};\n')
|
||||
f.write('#endif // ENABLE_HIP_API_STRING\n')
|
||||
|
||||
f.write('#endif // _HIP_PROF_STR_H\n');
|
||||
|
||||
|
||||
+122
-121
@@ -42,10 +42,9 @@ After applying all the matchers, the output HIP source is produced.
|
||||
|
||||
`hipify-clang` requires:
|
||||
|
||||
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
|
||||
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**10.0.0**](http://releases.llvm.org/download.html#10.0.0).
|
||||
|
||||
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
|
||||
To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
|
||||
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.2**](https://developer.nvidia.com/cuda-downloads).
|
||||
|
||||
| **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** |
|
||||
|:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
|
||||
@@ -66,15 +65,15 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
|
||||
| [8.0.0](http://releases.llvm.org/download.html#8.0.0) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.0_bug_38811.zip)*</br> | + |
|
||||
| [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
|
||||
| [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
|
||||
| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
|
||||
| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + |
|
||||
| [9.0.1](http://releases.llvm.org/download.html#9.0.1) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
|
||||
| [**10.0.0**](http://releases.llvm.org/download.html#10.0.0)| [**10.2**](https://developer.nvidia.com/cuda-downloads) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
|
||||
|
||||
`*` Download the patch and unpack it into your `LLVM` distribution directory; a few header files will be overwritten, but rebuilding `LLVM` is not needed.
|
||||
|
||||
In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.
|
||||
|
||||
Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
|
||||
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
|
||||
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=d:\LLVM\10.0.0\dist`
|
||||
|
||||
### <a name="hipify-clang-usage"></a> hipify-clang: usage
|
||||
|
||||
@@ -83,14 +82,14 @@ To process a file, `hipify-clang` needs access to the same headers that would be
|
||||
For example:
|
||||
|
||||
```shell
|
||||
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
|
||||
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.2 -I /usr/local/cuda-10.2/samples/common/inc
|
||||
```
|
||||
|
||||
`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
|
||||
were compiling the input file. For example:
|
||||
|
||||
```bash
|
||||
./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
|
||||
./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.2 -- -std=c++17
|
||||
```
|
||||
|
||||
The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
|
||||
@@ -158,7 +157,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
|
||||
|
||||
**LLVM 10.0.0 or newer:**
|
||||
|
||||
1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources;
|
||||
1. download [`LLVM project`](https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.0/llvm-project-10.0.0.tar.xz) sources;
|
||||
2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
|
||||
|
||||
**Linux**:
|
||||
@@ -193,19 +192,19 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
|
||||
|
||||
* When multiple CUDA installations are present, select a particular version by specifying the `CUDA_TOOLKIT_ROOT_DIR` option:
|
||||
|
||||
- ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1`
|
||||
- ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2`
|
||||
|
||||
- ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1"`
|
||||
- ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2"`
|
||||
|
||||
`-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1"`
|
||||
`-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2"`
|
||||
|
||||
4. Ensure [`cuDNN`](https://developer.nvidia.com/rdp/cudnn-archive) of the version corresponding to CUDA's version is installed.
|
||||
|
||||
* Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option:
|
||||
|
||||
- ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32`
|
||||
- ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32`
|
||||
|
||||
- ***Windows***: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32`
|
||||
- ***Windows***: `-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32`
|
||||
|
||||
5. Ensure [`CUB`](https://github.com/NVlabs/cub) of the version corresponding to CUDA's version is installed.
|
||||
|
||||
@@ -213,7 +212,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
|
||||
|
||||
- ***Linux***: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`
|
||||
|
||||
- ***Windows***: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
|
||||
- ***Windows***: `-DCUDA_CUB_ROOT_DIR=d:/GIT/cub`
|
||||
|
||||
5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
|
||||
|
||||
@@ -221,21 +220,21 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
|
||||
|
||||
* Install `lit` into `python`:
|
||||
|
||||
- ***Linux***: `python /srv/git/LLVM/9.0.1/llvm/utils/lit/setup.py install`
|
||||
- ***Linux***: `python /srv/git/LLVM/10.0.0/llvm/utils/lit/setup.py install`
|
||||
|
||||
- ***Windows***: `python f:/LLVM/9.0.1/llvm/utils/lit/setup.py install`
|
||||
- ***Windows***: `python d:/LLVM/10.0.0/llvm/utils/lit/setup.py install`
|
||||
|
||||
* Starting with LLVM 6.0.1, the path to the `llvm-lit` Python script should be specified by the `LLVM_EXTERNAL_LIT` option:
|
||||
|
||||
- ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit`
|
||||
- ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit`
|
||||
|
||||
- ***Windows***: `-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py`
|
||||
- ***Windows***: `-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py`
|
||||
|
||||
* `FileCheck`:
|
||||
|
||||
- ***Linux***: copy from `/srv/git/LLVM/9.0.1/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`
|
||||
- ***Linux***: copy from `/srv/git/LLVM/10.0.0/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`
|
||||
|
||||
- ***Windows***: copy from `f:/LLVM/9.0.1/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`
|
||||
- ***Windows***: copy from `d:/LLVM/10.0.0/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`
|
||||
|
||||
- Or specify the path to `FileCheck` in the `CMAKE_INSTALL_PREFIX` option
|
||||
|
||||
@@ -249,7 +248,7 @@ On Linux the following configurations are tested:
|
||||
|
||||
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32
|
||||
|
||||
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
|
||||
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
|
||||
|
||||
Minimum build system requirements for the above configurations:
|
||||
|
||||
@@ -262,11 +261,11 @@ cmake
|
||||
-DHIPIFY_CLANG_TESTS=1 \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=../dist \
|
||||
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.1/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
|
||||
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32 \
|
||||
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/10.0.0/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2 \
|
||||
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32 \
|
||||
-DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
|
||||
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit \
|
||||
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit \
|
||||
..
|
||||
```
|
||||
*A corresponding successful output:*
|
||||
@@ -285,14 +284,14 @@ cmake
|
||||
-- Detecting CXX compiler ABI info - done
|
||||
-- Detecting CXX compile features
|
||||
-- Detecting CXX compile features - done
|
||||
-- Found LLVM 9.0.1:
|
||||
-- - CMake module path: /srv/git/LLVM/9.0.1/dist/lib/cmake/llvm
|
||||
-- - Include path : /srv/git/LLVM/9.0.1/dist/include
|
||||
-- - Binary path : /srv/git/LLVM/9.0.1/dist/bin
|
||||
-- Found LLVM 10.0.0:
|
||||
-- - CMake module path: /srv/git/LLVM/10.0.0/dist/lib/cmake/llvm
|
||||
-- - Include path : /srv/git/LLVM/10.0.0/dist/include
|
||||
-- - Binary path : /srv/git/LLVM/10.0.0/dist/bin
|
||||
-- Linker detection: GNU ld
|
||||
-- Found PythonInterp: /usr/bin/python2.7 (found suitable version "2.7.12", minimum required is "2.7")
|
||||
-- Found lit: /usr/local/bin/lit
|
||||
-- Found FileCheck: /srv/git/LLVM/9.0.1/dist/bin/FileCheck
|
||||
-- Found FileCheck: /srv/git/LLVM/10.0.0/dist/bin/FileCheck
|
||||
-- Looking for pthread.h
|
||||
-- Looking for pthread.h - found
|
||||
-- Looking for pthread_create
|
||||
@@ -302,7 +301,7 @@ cmake
|
||||
-- Looking for pthread_create in pthread
|
||||
-- Looking for pthread_create in pthread - found
|
||||
-- Found Threads: TRUE
|
||||
-- Found CUDA: /usr/local/cuda-10.1 (found version "10.1")
|
||||
-- Found CUDA: /usr/local/cuda-10.2 (found version "10.2")
|
||||
-- Configuring done
|
||||
-- Generating done
|
||||
-- Build files have been written to: /srv/git/HIP/hipify-clang/build
|
||||
@@ -314,83 +313,85 @@ make test-hipify
|
||||
```shell
|
||||
Running HIPify regression tests
|
||||
========================================
|
||||
CUDA 10.1 - will be used for testing
|
||||
LLVM 9.0.1 - will be used for testing
|
||||
CUDA 10.2 - will be used for testing
|
||||
LLVM 10.0.0 - will be used for testing
|
||||
x86_64 - Platform architecture
|
||||
Linux 5.2.0 - Platform OS
|
||||
64 - hipify-clang binary bitness
|
||||
64 - python 2.7.12 binary bitness
|
||||
========================================
|
||||
-- Testing: 67 tests, 12 threads --
|
||||
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 67)
|
||||
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 67)
|
||||
PASS: hipify :: unit_tests/device/atomics.cu (3 of 67)
|
||||
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_01.cu (5 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_06.cu (10 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_07.cu (11 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_10.cu (12 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_11.cu (13 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 67)
|
||||
PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 67)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_09.cu (16 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (17 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (24 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (25 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (29 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (30 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (33 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (34 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (40 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (41 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 67)
|
||||
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (45 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (46 of 67)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 67)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 67)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (50 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (51 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (52 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (53 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (54 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (55 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (56 of 67)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (57 of 67)
|
||||
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 67)
|
||||
PASS: hipify :: unit_tests/samples/allocators.cu (59 of 67)
|
||||
PASS: hipify :: unit_tests/samples/coalescing.cu (60 of 67)
|
||||
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (61 of 67)
|
||||
PASS: hipify :: unit_tests/samples/axpy.cu (62 of 67)
|
||||
PASS: hipify :: unit_tests/samples/intro.cu (63 of 67)
|
||||
PASS: hipify :: unit_tests/samples/cudaRegister.cu (64 of 67)
|
||||
PASS: hipify :: unit_tests/samples/square.cu (65 of 67)
|
||||
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (66 of 67)
|
||||
PASS: hipify :: unit_tests/samples/vec_add.cu (67 of 67)
|
||||
Testing Time: 3.07s
|
||||
Expected Passes : 67
|
||||
-- Testing: 69 tests, 12 threads --
|
||||
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 69)
|
||||
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 69)
|
||||
PASS: hipify :: unit_tests/device/atomics.cu (3 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_01.cu (4 of 69)
|
||||
PASS: hipify :: unit_tests/device/device_symbols.cu (5 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_10.cu (13 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 69)
|
||||
PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 69)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_09.cu (17 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (24 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (25 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (29 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (30 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (33 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (34 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (40 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (41 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (44 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (45 of 69)
|
||||
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (46 of 69)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 69)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 69)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 69)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01_LLVM_10.cu (50 of 69)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_LLVM_10.cu (51 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (52 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (53 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (54 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (55 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (56 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (57 of 69)
|
||||
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (59 of 69)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (60 of 69)
|
||||
PASS: hipify :: unit_tests/samples/allocators.cu (61 of 69)
|
||||
PASS: hipify :: unit_tests/samples/coalescing.cu (62 of 69)
|
||||
PASS: hipify :: unit_tests/samples/axpy.cu (63 of 69)
|
||||
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (64 of 69)
|
||||
PASS: hipify :: unit_tests/samples/cudaRegister.cu (65 of 69)
|
||||
PASS: hipify :: unit_tests/samples/intro.cu (66 of 69)
|
||||
PASS: hipify :: unit_tests/samples/square.cu (67 of 69)
|
||||
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (68 of 69)
|
||||
PASS: hipify :: unit_tests/samples/vec_add.cu (69 of 69)
|
||||
Testing Time: 3.23s
|
||||
Expected Passes : 69
|
||||
[100%] Built target test-hipify
|
||||
```
|
||||
### <a name="windows"></a > hipify-clang: Windows
|
||||
@@ -404,8 +405,8 @@ Testing Time: 3.07s
|
||||
| 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 |
|
||||
| 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 |
|
||||
| 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 |
|
||||
| 10.0.0-rc1-rc3 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 |
|
||||
| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.5 | 3.8.2 |
|
||||
| 10.0.0 | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 |
|
||||
| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 |
|
||||
|
||||
*Building with testing support on `Windows 10` by `Visual Studio 16 2019`:*
|
||||
|
||||
@@ -416,28 +417,28 @@ cmake
|
||||
-DHIPIFY_CLANG_TESTS=1 \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=../dist \
|
||||
-DCMAKE_PREFIX_PATH=f:/LLVM/9.0.1/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1" \
|
||||
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1" \
|
||||
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32 \
|
||||
-DCUDA_CUB_ROOT_DIR=f:/GIT/cub \
|
||||
-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py \
|
||||
-DCMAKE_PREFIX_PATH=d:/LLVM/10.0.0/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2" \
|
||||
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2" \
|
||||
-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32 \
|
||||
-DCUDA_CUB_ROOT_DIR=d:/GIT/cub \
|
||||
-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py \
|
||||
-Thost=x64
|
||||
..
|
||||
```
|
||||
*A corresponding successful output:*
|
||||
```shell
|
||||
-- Found LLVM 9.0.1:
|
||||
-- - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm
|
||||
-- - Include path : F:/LLVM/9.0.1/dist/include
|
||||
-- - Binary path : F:/LLVM/9.0.1/dist/bin
|
||||
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
|
||||
-- Found lit: C:/Program Files/Python38/Scripts/lit.exe
|
||||
-- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe
|
||||
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
|
||||
-- Found LLVM 10.0.0:
|
||||
-- - CMake module path: d:/LLVM/10.0.0/dist/lib/cmake/llvm
|
||||
-- - Include path : d:/LLVM/10.0.0/dist/include
|
||||
-- - Binary path : d:/LLVM/10.0.0/dist/bin
|
||||
-- Found PythonInterp: c:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
|
||||
-- Found lit: c:/Program Files/Python38/Scripts/lit.exe
|
||||
-- Found FileCheck: d:/LLVM/10.0.0/dist/bin/FileCheck.exe
|
||||
-- Found CUDA: c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2 (found version "10.2")
|
||||
-- Configuring done
|
||||
-- Generating done
|
||||
-- Build files have been written to: f:/HIP/hipify-clang/build
|
||||
-- Build files have been written to: d:/HIP/hipify-clang/build
|
||||
```
|
||||
|
||||
Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
|
||||
|
||||
@@ -545,9 +545,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{
|
||||
|
||||
// 5.21. Occupancy
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
|
||||
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
||||
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}},
|
||||
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags","", CONV_OCCUPANCY, API_DRIVER}},
|
||||
// cudaOccupancyMaxPotentialBlockSize
|
||||
{"cuOccupancyMaxPotentialBlockSize", {"hipOccupancyMaxPotentialBlockSize", "", CONV_OCCUPANCY, API_DRIVER}},
|
||||
// cudaOccupancyMaxPotentialBlockSizeWithFlags
|
||||
|
||||
@@ -49,8 +49,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
|
||||
{"cusparseGetStream", {"hipsparseGetStream", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCreateCsrsv2Info", {"hipsparseCreateCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDestroyCsrsv2Info", {"hipsparseDestroyCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCreateCsric02Info", {"hipsparseCreateCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDestroyCsric02Info", {"hipsparseDestroyCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCreateCsrilu02Info", {"hipsparseCreateCsrilu02Info", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
@@ -218,27 +218,27 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
|
||||
{"cusparseCcsrsm_analysis", {"hipsparseCcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZcsrsm_analysis", {"hipsparseZcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
|
||||
{"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
{"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE,}},
|
||||
{"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
{"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
{"cusparseScsrsm2_solve", {"hipsparseScsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDcsrsm2_solve", {"hipsparseDcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCcsrsm2_solve", {"hipsparseCcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZcsrsm2_solve", {"hipsparseZcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
|
||||
{"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
{"cusparseSbsrmm", {"hipsparseSbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDbsrmm", {"hipsparseDbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
@@ -619,10 +619,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
|
||||
{"cusparseChyb2dense", {"hipsparseChyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZhyb2dense", {"hipsparseZhyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
|
||||
{"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
{"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
{"cusparseCreateIdentityPermutation", {"hipsparseCreateIdentityPermutation", "", CONV_LIB_FUNC, API_SPARSE}},
|
||||
|
||||
|
||||
@@ -41,8 +41,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
|
||||
{"csrsv2Info", {"csrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"csrsv2Info_t", {"csrsv2Info_t", "", CONV_TYPE, API_SPARSE}},
|
||||
|
||||
{"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE}},
|
||||
{"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE}},
|
||||
|
||||
{"bsrsv2Info", {"bsrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"bsrsv2Info_t", {"bsrsv2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
@@ -88,9 +88,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
|
||||
{"CUSPARSE_ACTION_SYMBOLIC", {"HIPSPARSE_ACTION_SYMBOLIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
|
||||
{"CUSPARSE_ACTION_NUMERIC", {"HIPSPARSE_ACTION_NUMERIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
|
||||
|
||||
{"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
|
||||
{"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE}},
|
||||
{"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
|
||||
{"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
|
||||
|
||||
{"cusparseHybPartition_t", {"hipsparseHybPartition_t", "", CONV_TYPE, API_SPARSE}},
|
||||
{"CUSPARSE_HYB_PARTITION_AUTO", {"HIPSPARSE_HYB_PARTITION_AUTO", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
|
||||
|
||||
@@ -128,7 +128,7 @@ __device__ static int __mul24(int x, int y);
|
||||
__device__ static long long int __mul64hi(long long int x, long long int y);
|
||||
__device__ static int __mulhi(int x, int y);
|
||||
__device__ static int __rhadd(int x, int y);
|
||||
__device__ static unsigned int __sad(int x, int y, int z);
|
||||
__device__ static unsigned int __sad(int x, int y,unsigned int z);
|
||||
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
|
||||
__device__ static int __umul24(unsigned int x, unsigned int y);
|
||||
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
|
||||
@@ -199,7 +199,7 @@ __device__ static inline int __rhadd(int x, int y) {
|
||||
int value = z & 0x7FFFFFFF;
|
||||
return ((value) >> 1 || sign);
|
||||
}
|
||||
__device__ static inline unsigned int __sad(int x, int y, int z) {
|
||||
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
|
||||
return x > y ? x - y + z : y - x + z;
|
||||
}
|
||||
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
|
||||
@@ -230,7 +230,7 @@ __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
|
||||
return (x + y + 1) >> 1;
|
||||
}
|
||||
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
|
||||
return __ockl_sad_u32(x, y, z);
|
||||
return __ockl_sadd_u32(x, y, z);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
|
||||
@@ -563,7 +563,7 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(long) == sizeof(int), "");
|
||||
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_mask, width));
|
||||
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
|
||||
@@ -44,7 +44,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_sad_u32(uint, uint, uint);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
|
||||
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
|
||||
@@ -72,6 +72,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
|
||||
|
||||
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
|
||||
|
||||
// Introduce local address space
|
||||
#define __local __attribute__((address_space(3)))
|
||||
|
||||
@@ -37,14 +37,15 @@ THE SOFTWARE.
|
||||
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
|
||||
unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps);
|
||||
|
||||
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
dim3 blockDim, void** args,
|
||||
size_t sharedMem, hipStream_t stream,
|
||||
hip_impl::program_state& ps);
|
||||
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices,
|
||||
unsigned int flags,
|
||||
hip_impl::program_state& ps);
|
||||
|
||||
#pragma GCC visibility push(hidden)
|
||||
|
||||
@@ -192,22 +193,24 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
template <typename F>
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
|
||||
unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMem,
|
||||
hipStream_t stream) {
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return ihipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps);
|
||||
return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
|
||||
blockDim, args, sharedMem, stream, ps);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags) {
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices,
|
||||
unsigned int flags) {
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
}
|
||||
|
||||
#pragma GCC visibility pop
|
||||
|
||||
@@ -73,6 +73,14 @@ float atomicAdd(float* address, float val)
|
||||
|
||||
return __uint_as_float(r);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
void atomicAddNoRet(float* address, float val)
|
||||
{
|
||||
__ockl_atomic_add_noret_f32(address, val);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicAdd(double* address, double val)
|
||||
|
||||
@@ -1268,6 +1268,13 @@ THE SOFTWARE.
|
||||
static_cast<__half_raw>(x).data +
|
||||
static_cast<__half_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__device__
|
||||
__half __habs(__half x)
|
||||
{
|
||||
return __half_raw{
|
||||
__ocml_fabs_f16(static_cast<__half_raw>(x).data)};
|
||||
}
|
||||
inline
|
||||
__device__
|
||||
__half __hsub(__half x, __half y)
|
||||
@@ -1334,6 +1341,13 @@ THE SOFTWARE.
|
||||
static_cast<__half2_raw>(x).data +
|
||||
static_cast<__half2_raw>(y).data};
|
||||
}
|
||||
inline
|
||||
__device__
|
||||
__half2 __habs2(__half2 x)
|
||||
{
|
||||
return __half2_raw{
|
||||
__ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
|
||||
}
|
||||
inline
|
||||
__device__
|
||||
__half2 __hsub2(__half2 x, __half2 y)
|
||||
|
||||
@@ -38,6 +38,7 @@ extern "C"
|
||||
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
||||
__device__ __attribute__((const))
|
||||
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
||||
@@ -58,6 +59,7 @@ extern "C"
|
||||
#endif
|
||||
|
||||
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_cos_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
||||
|
||||
@@ -504,9 +504,14 @@ hc_get_workitem_absolute_id(int dim)
|
||||
#define __CUDA__
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
#include <cuda_wrappers/algorithm>
|
||||
#include <cuda_wrappers/complex>
|
||||
#include <cuda_wrappers/new>
|
||||
// Workaround for using libc++ with HIP-Clang.
|
||||
// The following headers requires clang include path before standard C++ include path.
|
||||
// However libc++ include path requires to be before clang include path.
|
||||
// To workaround this, we pass -isystem with the parent directory of clang include
|
||||
// path instead of the clang include path itself.
|
||||
#include <include/cuda_wrappers/algorithm>
|
||||
#include <include/cuda_wrappers/complex>
|
||||
#include <include/cuda_wrappers/new>
|
||||
#undef __CUDA__
|
||||
#pragma pop_macro("__CUDA__")
|
||||
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
|
||||
@@ -55,7 +55,7 @@ THE SOFTWARE.
|
||||
#define DEPRECATED(msg) __attribute__ ((deprecated(msg)))
|
||||
#endif // !defined(_MSC_VER)
|
||||
|
||||
#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases.For more details please refer https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list"
|
||||
#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md"
|
||||
|
||||
#if defined(__HCC__) && (__hcc_workweek__ < 16155)
|
||||
#error("This version of HIP requires a newer version of HCC.");
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "hip/hcc_detail/host_defines.h"
|
||||
|
||||
#if !defined(_MSC_VER) || __clang__
|
||||
#if defined(__has_attribute)
|
||||
#if __has_attribute(ext_vector_type)
|
||||
#define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
|
||||
#else
|
||||
@@ -694,7 +694,7 @@ THE SOFTWARE.
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
|
||||
inline __host__ __device__
|
||||
HIP_vector_type operator-() noexcept
|
||||
HIP_vector_type operator-() const noexcept
|
||||
{
|
||||
auto tmp(*this);
|
||||
tmp.data = -tmp.data;
|
||||
@@ -705,7 +705,7 @@ THE SOFTWARE.
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
inline __host__ __device__
|
||||
HIP_vector_type operator~() noexcept
|
||||
HIP_vector_type operator~() const noexcept
|
||||
{
|
||||
HIP_vector_type r{*this};
|
||||
r.data = ~r.data;
|
||||
@@ -1241,7 +1241,9 @@ DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);
|
||||
DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
|
||||
DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
|
||||
DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);
|
||||
#else // defined(_MSC_VER)
|
||||
#else // !defined(__has_attribute)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
@@ -1347,5 +1349,92 @@ typedef union { double4 data; } double3;
|
||||
typedef union { __m256d data[2]; } double8;
|
||||
typedef union { __m256d data[4]; } double16;
|
||||
|
||||
#else // !defined(_MSC_VER)
|
||||
|
||||
typedef union { char data; } char1;
|
||||
typedef union { char data[2]; } char2;
|
||||
typedef union { char data[4]; } char4;
|
||||
typedef union { char data[8]; } char8;
|
||||
typedef union { char data[16]; } char16;
|
||||
typedef union { char4 data; } char3;
|
||||
|
||||
typedef union { unsigned char data; } uchar1;
|
||||
typedef union { unsigned char data[2]; } uchar2;
|
||||
typedef union { unsigned char data[4]; } uchar4;
|
||||
typedef union { unsigned char data[8]; } uchar8;
|
||||
typedef union { unsigned char data[16]; } uchar16;
|
||||
typedef union { uchar4 data; } uchar3;
|
||||
|
||||
typedef union { short data; } short1;
|
||||
typedef union { short data[2]; } short2;
|
||||
typedef union { short data[4]; } short4;
|
||||
typedef union { short data[8]; } short8;
|
||||
typedef union { short data[16]; } short16;
|
||||
typedef union { short4 data; } short3;
|
||||
|
||||
typedef union { unsigned short data; } ushort1;
|
||||
typedef union { unsigned short data[2]; } ushort2;
|
||||
typedef union { unsigned short data[4]; } ushort4;
|
||||
typedef union { unsigned short data[8]; } ushort8;
|
||||
typedef union { unsigned short data[16]; } ushort16;
|
||||
typedef union { ushort4 data; } ushort3;
|
||||
|
||||
typedef union { int data; } int1;
|
||||
typedef union { int data[2]; } int2;
|
||||
typedef union { int data[4]; } int4;
|
||||
typedef union { int data[8]; } int8;
|
||||
typedef union { int data[16]; } int16;
|
||||
typedef union { int4 data; } int3;
|
||||
|
||||
typedef union { unsigned int data; } uint1;
|
||||
typedef union { unsigned int data[2]; } uint2;
|
||||
typedef union { unsigned int data[4]; } uint4;
|
||||
typedef union { unsigned int data[8]; } uint8;
|
||||
typedef union { unsigned int data[16]; } uint16;
|
||||
typedef union { uint4 data; } uint3;
|
||||
|
||||
typedef union { long data; } long1;
|
||||
typedef union { long data[2]; } long2;
|
||||
typedef union { long data[4]; } long4;
|
||||
typedef union { long data[8]; } long8;
|
||||
typedef union { long data[16]; } long16;
|
||||
typedef union { long4 data; } long3;
|
||||
|
||||
typedef union { unsigned long data; } ulong1;
|
||||
typedef union { unsigned long data[2]; } ulong2;
|
||||
typedef union { unsigned long data[4]; } ulong4;
|
||||
typedef union { unsigned long data[8]; } ulong8;
|
||||
typedef union { unsigned long data[16]; } ulong16;
|
||||
typedef union { ulong4 data; } ulong3;
|
||||
|
||||
typedef union { long long data; } longlong1;
|
||||
typedef union { long long data[2]; } longlong2;
|
||||
typedef union { long long data[4]; } longlong4;
|
||||
typedef union { long long data[8]; } longlong8;
|
||||
typedef union { long long data[16]; } longlong16;
|
||||
typedef union { longlong4 data; } longlong3;
|
||||
|
||||
typedef union { unsigned long long data; } ulonglong1;
|
||||
typedef union { unsigned long long data[2]; } ulonglong2;
|
||||
typedef union { unsigned long long data[4]; } ulonglong4;
|
||||
typedef union { unsigned long long data[8]; } ulonglong8;
|
||||
typedef union { unsigned long long data[16]; } ulonglong16;
|
||||
typedef union { ulonglong4 data; } ulonglong3;
|
||||
|
||||
typedef union { float data; } float1;
|
||||
typedef union { float data[2]; } float2;
|
||||
typedef union { float data[4]; } float4;
|
||||
typedef union { float data[8]; } float8;
|
||||
typedef union { float data[16]; } float16;
|
||||
typedef union { float4 data; } float3;
|
||||
|
||||
typedef union { double data; } double1;
|
||||
typedef union { double data[2]; } double2;
|
||||
typedef union { double data[4]; } double4;
|
||||
typedef union { double data[8]; } double8;
|
||||
typedef union { double data[16]; } double16;
|
||||
typedef union { double4 data; } double3;
|
||||
|
||||
#endif // defined(_MSC_VER)
|
||||
#endif // defined(__has_attribute)
|
||||
#endif
|
||||
|
||||
@@ -28,6 +28,8 @@ extern "C" {
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#pragma GCC visibility push (default)
|
||||
|
||||
enum hiprtcResult {
|
||||
HIPRTC_SUCCESS = 0,
|
||||
HIPRTC_ERROR_OUT_OF_MEMORY = 1,
|
||||
@@ -79,6 +81,8 @@ hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
|
||||
|
||||
hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
|
||||
|
||||
#pragma GCC visibility pop
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -60,7 +60,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
// _restrict is supported by the compiler
|
||||
#define __shared__ tile_static
|
||||
#define __constant__ __attribute__((hc))
|
||||
#define __constant__ __attribute__((hc, annotate("__HIP_constant__")))
|
||||
|
||||
#elif defined(__clang__) && defined(__HIP__)
|
||||
|
||||
|
||||
@@ -321,7 +321,6 @@ typedef enum hipDeviceAttribute_t {
|
||||
hipDeviceAttributeIntegrated, ///< iGPU
|
||||
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
|
||||
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
|
||||
|
||||
hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
|
||||
hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
|
||||
hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
|
||||
|
||||
@@ -186,6 +186,7 @@ typedef struct cudaArray hipArray;
|
||||
typedef struct cudaArray* hipArray_t;
|
||||
typedef struct cudaArray* hipArray_const_t;
|
||||
typedef struct cudaFuncAttributes hipFuncAttributes;
|
||||
typedef struct cudaLaunchParams hipLaunchParams;
|
||||
#define hipFunction_attribute CUfunction_attribute
|
||||
#define hip_Memcpy2D CUDA_MEMCPY2D
|
||||
#define hipMemcpy3DParms cudaMemcpy3DParms
|
||||
@@ -860,7 +861,7 @@ inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
|
||||
}
|
||||
|
||||
|
||||
inline hipError_t hipMemcpyWithStream(void* dst, const void* src,
|
||||
inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
|
||||
size_t sizeBytes, hipMemcpyKind copyKind,
|
||||
hipStream_t stream) {
|
||||
cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes,
|
||||
@@ -1134,6 +1135,10 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
|
||||
p_prop->integrated = cdprop.integrated;
|
||||
p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
|
||||
p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
|
||||
p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
|
||||
p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
|
||||
p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
|
||||
p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
|
||||
|
||||
p_prop->maxTexture1D = cdprop.maxTexture1D;
|
||||
p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
|
||||
@@ -1271,6 +1276,12 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
|
||||
case hipDeviceAttributeEccEnabled:
|
||||
cdattr = cudaDevAttrEccEnabled;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeLaunch:
|
||||
cdattr = cudaDevAttrCooperativeLaunch;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
|
||||
cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
|
||||
break;
|
||||
default:
|
||||
return hipCUDAErrorTohipError(cudaErrorInvalidValue);
|
||||
}
|
||||
@@ -1679,6 +1690,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_
|
||||
return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
|
||||
}
|
||||
|
||||
inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, unsigned int sharedMemBytes,
|
||||
hipStream_t stream) {
|
||||
return hipCUDAErrorTohipError(
|
||||
cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
|
||||
}
|
||||
|
||||
inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags) {
|
||||
return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
@@ -1686,6 +1708,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
template<class T>
|
||||
inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
|
||||
T func,
|
||||
int blockSize,
|
||||
size_t dynamicSMemSize) {
|
||||
cudaError_t cerror;
|
||||
cerror =
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
|
||||
return hipCUDAErrorTohipError(cerror);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
|
||||
size_t dynamicSMemSize = 0,
|
||||
@@ -1742,6 +1775,14 @@ template <class T>
|
||||
inline static hipChannelFormatDesc hipCreateChannelDesc() {
|
||||
return cudaCreateChannelDesc<T>();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
return hipCUDAErrorTohipError(
|
||||
cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
|
||||
}
|
||||
|
||||
#endif //__CUDACC__
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
|
||||
|
||||
@@ -14,7 +14,7 @@ install(TARGETS lpl RUNTIME DESTINATION bin)
|
||||
#-------------------------------------LPL--------------------------------------#
|
||||
|
||||
#-------------------------------------CA---------------------------------------#
|
||||
add_executable(ca ca.cpp ${PROJECT_SOURCE_DIR}/src/code_object_bundle.cpp)
|
||||
add_executable(ca ca.cpp)
|
||||
set_target_properties(
|
||||
ca PROPERTIES
|
||||
CXX_STANDARD 11
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
#include "../src/code_object_bundle.inl"
|
||||
|
||||
#include "clara/clara.hpp"
|
||||
|
||||
|
||||
@@ -25,16 +25,15 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0), llvm-amdgpu")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_base")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_base")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0, llvm-amdgpu")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_base")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_base")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
|
||||
@@ -32,9 +32,8 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_doc")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_doc")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
|
||||
@@ -37,9 +37,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
|
||||
@@ -19,9 +19,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_nvcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_nvcc")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
|
||||
@@ -19,10 +19,9 @@ set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-vdi (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_samples")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_samples")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
|
||||
@@ -114,8 +114,6 @@ set_target_properties(hip::device PROPERTIES
|
||||
else()
|
||||
set_target_properties(hip::device PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "hip::host"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
|
||||
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -27,11 +27,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
if(@COMPILE_HIP_ATP_MARKER@)
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), rocm-profiler, comgr (>= 1.1)")
|
||||
else()
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1)")
|
||||
endif()
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu")
|
||||
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_vdi, hip-hcc (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_vdi")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_vdi")
|
||||
@@ -41,11 +37,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
|
||||
if(@COMPILE_HIP_ATP_MARKER@)
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, rocm-profiler, comgr >= 1.1")
|
||||
else()
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1")
|
||||
endif()
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu")
|
||||
set(CPACK_RPM_PACKAGE_PROVIDES "hip_vdi, hip-hcc = ${HIP_BASE_VERSION}")
|
||||
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_vdi")
|
||||
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_vdi")
|
||||
|
||||
@@ -80,8 +80,8 @@ int main() {
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
delete A;
|
||||
delete B;
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
hipCtxDestroy(context);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -107,8 +107,8 @@ int main() {
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
delete A;
|
||||
delete B;
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
hipCtxDestroy(context);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -99,8 +99,8 @@ int main() {
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
delete A;
|
||||
delete B;
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
hipCtxDestroy(context);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -154,8 +154,8 @@ int main() {
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
delete A;
|
||||
delete B;
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
hipCtxDestroy(context);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -56,6 +56,7 @@ void printCompilerInfo() {
|
||||
#endif
|
||||
}
|
||||
|
||||
double bytesToKB(size_t s) { return (double)s / (1024.0); }
|
||||
double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); }
|
||||
|
||||
#define printLimit(w1, limit, units) \
|
||||
@@ -97,7 +98,7 @@ void printDeviceProp(int deviceId) {
|
||||
cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2)
|
||||
<< bytesToGB(props.totalGlobalMem) << " GB" << endl;
|
||||
cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2)
|
||||
<< bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
|
||||
<< bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl;
|
||||
cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl;
|
||||
cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB"
|
||||
<< endl;
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// CREATORS
|
||||
// Builds the header from an in-memory blob by delegating to the iterator-range constructor.
hip_impl::Bundled_code_header::Bundled_code_header(const vector<char>& blob)
    : Bundled_code_header{blob.cbegin(), blob.cend()}
{
}
|
||||
|
||||
// Builds the header from a raw pointer that may or may not point at a bundle.
// This is a pretty terrible interface, useful only because hipLoadModuleData
// is so poorly specified (for no fault of its own).
//
// The object is left default-constructed when `p` is null or when `valid()`
// rejects the data it points at. Otherwise the total blob size is derived from
// the bundle count and each bundle header's `bundle_sz`, and the resulting
// byte range is handed to `read()`.
hip_impl::Bundled_code_header::Bundled_code_header(const void* p) {
    if (!p || !valid(*static_cast<const Bundled_code_header*>(p))) return;

    const auto base = static_cast<const char*>(p);
    const auto fileHeader = static_cast<const Header_*>(p);

    // Fixed part: the file header plus one Bundled_code::Header per bundle.
    size_t total = sizeof(Header_) + fileHeader->bundle_cnt_ * sizeof(Bundled_code::Header);

    // Variable part: add the payload size recorded in every bundle header.
    auto cursor = base + sizeof(Header_);
    for (auto remaining = fileHeader->bundle_cnt_; remaining != 0; --remaining) {
        total += reinterpret_cast<const Bundled_code::Header*>(cursor)->bundle_sz;
        cursor += sizeof(Bundled_code::Header);
    }

    read(base, base + total, *this);
}
|
||||
+20
-10
@@ -92,10 +92,6 @@ struct Bundled_code {
|
||||
|
||||
#define magic_string_ "__CLANG_OFFLOAD_BUNDLE__"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push (default)
|
||||
#endif
|
||||
|
||||
class Bundled_code_header {
|
||||
// DATA - STATICS
|
||||
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
|
||||
@@ -167,8 +163,26 @@ class Bundled_code_header {
|
||||
Bundled_code_header() = default;
|
||||
template <typename RandomAccessIterator>
|
||||
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
|
||||
explicit Bundled_code_header(const std::vector<char>& blob);
|
||||
explicit Bundled_code_header(const void* maybe_blob);
|
||||
explicit Bundled_code_header(const std::vector<char>& blob)
|
||||
: Bundled_code_header{blob.cbegin(), blob.cend()} {}
|
||||
explicit Bundled_code_header(const void* maybe_blob) {
|
||||
// This is a pretty terrible interface, useful only because
|
||||
// hipLoadModuleData is so poorly specified (for no fault of its own).
|
||||
if (!maybe_blob) return;
|
||||
|
||||
if (!valid(*static_cast<const Bundled_code_header*>(maybe_blob))) return;
|
||||
auto ph = static_cast<const Header_*>(maybe_blob);
|
||||
|
||||
size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header);
|
||||
auto pb = static_cast<const char*>(maybe_blob) + sizeof(Header_);
|
||||
auto n = ph->bundle_cnt_;
|
||||
while (n--) {
|
||||
sz += reinterpret_cast<const Bundled_code::Header*>(pb)->bundle_sz;
|
||||
pb += sizeof(Bundled_code::Header);
|
||||
}
|
||||
|
||||
read(static_cast<const char*>(maybe_blob), static_cast<const char*>(maybe_blob) + sz, *this);
|
||||
}
|
||||
Bundled_code_header(const Bundled_code_header&) = default;
|
||||
Bundled_code_header(Bundled_code_header&&) = default;
|
||||
~Bundled_code_header() = default;
|
||||
@@ -180,10 +194,6 @@ class Bundled_code_header {
|
||||
size_t bundled_code_size = 0;
|
||||
};
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
// CREATORS
|
||||
template <typename RandomAccessIterator>
|
||||
Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l)
|
||||
@@ -51,7 +51,7 @@ __hipRegisterFatBinary(const void* data)
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto modules = new std::vector<hipModule_t>{g_deviceCnt};
|
||||
auto modules = new std::vector<hipModule_t>(g_deviceCnt);
|
||||
if (!modules) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -136,7 +136,7 @@ extern "C" void __hipRegisterFunction(
|
||||
int* wSize)
|
||||
{
|
||||
HIP_INIT_API(NONE, modules, hostFunction, deviceFunction, deviceName);
|
||||
std::vector<hipFunction_t> functions{g_deviceCnt};
|
||||
std::vector<hipFunction_t> functions(g_deviceCnt);
|
||||
|
||||
assert(modules && modules->size() >= g_deviceCnt);
|
||||
for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) {
|
||||
|
||||
@@ -310,6 +310,18 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
|
||||
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
|
||||
*pi = prop->cooperativeMultiDeviceLaunch;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc:
|
||||
*pi = prop->cooperativeMultiDeviceUnmatchedFunc;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim:
|
||||
*pi = prop->cooperativeMultiDeviceUnmatchedGridDim;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim:
|
||||
*pi = prop->cooperativeMultiDeviceUnmatchedBlockDim;
|
||||
break;
|
||||
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
|
||||
*pi = prop->cooperativeMultiDeviceUnmatchedSharedMem;
|
||||
break;
|
||||
case hipDeviceAttributeMaxPitch:
|
||||
*pi = prop->memPitch;
|
||||
break;
|
||||
|
||||
+16
-23
@@ -677,7 +677,7 @@ hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
|
||||
break;
|
||||
case HSA_REGION_SEGMENT_GROUP:
|
||||
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&(p_prop->sharedMemPerBlock));
|
||||
&(p_prop->maxSharedMemoryPerMultiProcessor));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -835,10 +835,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
|
||||
hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
|
||||
err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
|
||||
DeviceErrorCheck(err);
|
||||
// maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
|
||||
// Group memory will not be paged out, so, the physical memory size is the total shared memory
|
||||
// size, and also equal to the group pool size.
|
||||
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
|
||||
// Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal.
|
||||
prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor;
|
||||
|
||||
// Get Max memory clock frequency
|
||||
err =
|
||||
@@ -897,9 +895,16 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
|
||||
prop->integrated = 1;
|
||||
}
|
||||
|
||||
// Enable the cooperative group for gfx9+
|
||||
prop->cooperativeLaunch = (prop->gcnArch < 900) ? 0 : 1;
|
||||
prop->cooperativeMultiDeviceLaunch = (prop->gcnArch < 900) ? 0 : 1;
|
||||
// Enable the cooperative group for GPUs that support all the required features
|
||||
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES,
|
||||
&prop->cooperativeLaunch);
|
||||
DeviceErrorCheck(err);
|
||||
prop->cooperativeMultiDeviceLaunch = prop->cooperativeLaunch;
|
||||
|
||||
prop->cooperativeMultiDeviceUnmatchedFunc = prop->cooperativeMultiDeviceLaunch;
|
||||
prop->cooperativeMultiDeviceUnmatchedGridDim = prop->cooperativeMultiDeviceLaunch;
|
||||
prop->cooperativeMultiDeviceUnmatchedBlockDim = prop->cooperativeMultiDeviceLaunch;
|
||||
prop->cooperativeMultiDeviceUnmatchedSharedMem = prop->cooperativeMultiDeviceLaunch;
|
||||
|
||||
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS,
|
||||
&prop->maxTexture1D);
|
||||
@@ -1515,20 +1520,6 @@ hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) {
|
||||
return e;
|
||||
}
|
||||
|
||||
// Runs a registered stream callback: waits for the callback's stream to drain,
// invokes the user function with the synchronization status, then frees the
// callback record (this function takes ownership of `cb`).
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
    // Synchronize stream
    tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n",
            ToString(cb->_stream).c_str());
    GET_TLS();
    const hipError_t syncStatus = ihipStreamSynchronize(tls, cb->_stream);

    // Call registered callback function
    cb->_callback(cb->_stream, syncStatus, cb->_userData);

    delete cb;
}
|
||||
|
||||
//---
|
||||
// Get the stream to use for a command submission.
|
||||
//
|
||||
@@ -1619,7 +1610,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
|
||||
// Allows runtime to track some information about the stream.
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
|
||||
const char* kernelNameStr, bool lockAcquired) {
|
||||
stream = ihipSyncAndResolveStream(stream, lockAcquired);
|
||||
if (stream == nullptr || stream != stream->getCtx()->_defaultStream) {
|
||||
stream = ihipSyncAndResolveStream(stream, lockAcquired);
|
||||
}
|
||||
lp->grid_dim.x = grid.x;
|
||||
lp->grid_dim.y = grid.y;
|
||||
lp->grid_dim.z = grid.z;
|
||||
|
||||
@@ -654,19 +654,6 @@ class ihipStream_t {
|
||||
};
|
||||
|
||||
|
||||
//----
|
||||
// Internal structure for stream callback handler
|
||||
class ihipStreamCallback_t {
|
||||
public:
|
||||
ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
|
||||
: _stream(stream), _callback(callback), _userData(userData) {
|
||||
};
|
||||
hipStream_t _stream;
|
||||
hipStreamCallback_t _callback;
|
||||
void* _userData;
|
||||
};
|
||||
|
||||
|
||||
//----
|
||||
// Internal event structure:
|
||||
enum hipEventStatus_t {
|
||||
@@ -980,7 +967,6 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
|
||||
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
|
||||
hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
|
||||
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
|
||||
|
||||
/**
|
||||
* @brief Copies the memory address and size of symbol @p symbolName
|
||||
|
||||
+68
-33
@@ -309,31 +309,52 @@ void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
|
||||
if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si);
|
||||
if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di);
|
||||
|
||||
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
|
||||
nullptr,
|
||||
di.agentBaseAddress),
|
||||
__FILE__, __func__, __LINE__);
|
||||
|
||||
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
|
||||
hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
|
||||
nullptr, di.agentBaseAddress);
|
||||
if (res == HSA_STATUS_SUCCESS){
|
||||
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
|
||||
}
|
||||
// If devices do not have access then fallback mechanism will be used
|
||||
// copy will be slower
|
||||
throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
|
||||
}
|
||||
|
||||
inline
|
||||
void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
|
||||
hipMemcpyKind k) {
|
||||
auto si{info(src)};
|
||||
auto di{info(dst)};
|
||||
|
||||
if (!is_large_BAR){
|
||||
// Pointer info takes precedence over hipMemcpyKind
|
||||
// if there is mismatch b/w Memcpy kind and dst/src pointer
|
||||
// E.g. dst(host pointer),src(device pointer) and hipMemcpyKind set as hipMemcpyHostToDevice
|
||||
if (di.size == is_cpu_owned && si.size == is_cpu_owned)
|
||||
k = hipMemcpyHostToHost;
|
||||
else if (si.size == is_cpu_owned && di.size != is_cpu_owned)
|
||||
k = hipMemcpyHostToDevice;
|
||||
else if (di.size == is_cpu_owned && si.size != is_cpu_owned)
|
||||
k = hipMemcpyDeviceToHost;
|
||||
else
|
||||
k = hipMemcpyDeviceToDevice;
|
||||
}
|
||||
switch (k) {
|
||||
case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
|
||||
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
|
||||
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
|
||||
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, di);
|
||||
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, si);
|
||||
case hipMemcpyDeviceToDevice: {
|
||||
const auto di{info(dst)};
|
||||
const auto si{info(src)};
|
||||
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
|
||||
nullptr,
|
||||
di.agentBaseAddress),
|
||||
__FILE__, __func__, __LINE__);
|
||||
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
|
||||
hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
|
||||
nullptr, di.agentBaseAddress);
|
||||
if (res == HSA_STATUS_SUCCESS){
|
||||
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
|
||||
}
|
||||
|
||||
// If devices do not have access then fallback mechanism will be used
|
||||
// copy will be slower
|
||||
throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
|
||||
break;
|
||||
}
|
||||
default: return generic_copy(dst, src, n, info(dst), info(src));
|
||||
default: return generic_copy(dst, src, n, di, si);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -478,6 +499,10 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
|
||||
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
if (sizeBytes == 0) {
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
@@ -485,10 +510,6 @@ hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned i
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if ((ctx == nullptr) || (ptr == nullptr)) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
}
|
||||
else if (sizeBytes == 0) {
|
||||
hip_status = hipSuccess;
|
||||
// TODO - should size of 0 return err or be silently ignored?
|
||||
} else {
|
||||
unsigned trueFlags = flags;
|
||||
if (flags == hipHostMallocDefault) {
|
||||
@@ -673,14 +694,15 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) {
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
if (sizeBytes == 0) {
|
||||
if (ptr) *ptr = NULL;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
// return NULL pointer when malloc size is 0
|
||||
if ( nullptr == ctx || nullptr == ptr) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
}
|
||||
else if (sizeBytes == 0) {
|
||||
*ptr = NULL;
|
||||
hip_status = hipSuccess;
|
||||
} else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/,
|
||||
@@ -700,14 +722,15 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
|
||||
HIP_SET_DEVICE();
|
||||
|
||||
#if (__hcc_workweek__ >= 19115)
|
||||
if (sizeBytes == 0) {
|
||||
if (ptr) *ptr = NULL;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
// return NULL pointer when malloc size is 0
|
||||
if (sizeBytes == 0) {
|
||||
*ptr = NULL;
|
||||
hip_status = hipSuccess;
|
||||
} else if ((ctx == nullptr) || (ptr == nullptr)) {
|
||||
if ((ctx == nullptr) || (ptr == nullptr)) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
} else {
|
||||
unsigned amFlags = 0;
|
||||
@@ -736,6 +759,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
|
||||
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
|
||||
HIP_SET_DEVICE();
|
||||
if (sizeBytes == 0) {
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
hipError_t hip_status = hipSuccess;
|
||||
hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
|
||||
return ihipLogStatus(hip_status);
|
||||
@@ -744,6 +770,9 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
|
||||
HIP_INIT_SPECIAL_API(hipMallocManaged, (TRACE_MEM), devPtr, size, flags);
|
||||
HIP_SET_DEVICE();
|
||||
if (size == 0) {
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
hipError_t hip_status = hipSuccess;
|
||||
if(flags != hipMemAttachGlobal)
|
||||
hip_status = hipErrorInvalidValue;
|
||||
@@ -1224,6 +1253,7 @@ hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count,
|
||||
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (count == 0) return ihipLogStatus(hipSuccess);
|
||||
if (dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
@@ -1246,6 +1276,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count,
|
||||
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (count == 0) return ihipLogStatus(hipSuccess);
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
@@ -1269,6 +1300,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
|
||||
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
|
||||
|
||||
if (count == 0) return ihipLogStatus(hipSuccess);
|
||||
if (dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
@@ -1301,6 +1333,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
|
||||
|
||||
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src);
|
||||
|
||||
if (count == 0) return ihipLogStatus(hipSuccess);
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
return ihipLogStatus(hipErrorInvalidSymbol);
|
||||
}
|
||||
@@ -1592,6 +1625,7 @@ hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bo
|
||||
srcXoffset = p->srcPos.x;
|
||||
srcYoffset = p->srcPos.y;
|
||||
srcZoffset = p->srcPos.z;
|
||||
if (copyWidth == 0) return hipSuccess;
|
||||
if (p->dstArray != nullptr) {
|
||||
if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){
|
||||
return hipErrorInvalidValue;
|
||||
@@ -1933,6 +1967,7 @@ hipError_t getLockedPointer(void *hostPtr, size_t dataLen, void **devicePtrPtr)
|
||||
// TODO - review and optimize
|
||||
hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
|
||||
size_t height, hipMemcpyKind kind) {
|
||||
if (height == 0 || width == 0) return hipSuccess;
|
||||
if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
@@ -1989,6 +2024,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
|
||||
|
||||
hipError_t ihipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
|
||||
size_t height, hipMemcpyKind kind, hipStream_t stream) {
|
||||
if (height == 0 || width == 0) return hipSuccess;
|
||||
if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;
|
||||
hipError_t e = hipSuccess;
|
||||
int isLockedOrD2D = 0;
|
||||
@@ -2043,6 +2079,7 @@ hipError_t ihip2dOffsetMemcpy(void* dst, size_t dpitch, const void* src, size_t
|
||||
size_t height, size_t srcXOffsetInBytes, size_t srcYOffset,
|
||||
size_t dstXOffsetInBytes, size_t dstYOffset,hipMemcpyKind kind,
|
||||
hipStream_t stream, bool isAsync) {
|
||||
if (height == 0 || width == 0) return hipSuccess;
|
||||
if((spitch < width + srcXOffsetInBytes) || (srcYOffset >= height)){
|
||||
return hipErrorInvalidValue;
|
||||
} else if((dpitch < width + dstXOffsetInBytes) || (dstYOffset >= height)){
|
||||
@@ -2061,6 +2098,7 @@ hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, hipStream_t stream, bool
|
||||
if (pCopy == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (pCopy->Height == 0 || pCopy->WidthInBytes == 0) return hipSuccess;
|
||||
void* dst; const void* src;
|
||||
size_t spitch = pCopy->srcPitch;
|
||||
size_t dpitch = pCopy->dstPitch;
|
||||
@@ -2140,6 +2178,7 @@ hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src,
|
||||
hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream ){
|
||||
HIP_INIT_SPECIAL_API(hipMemcpy2DFromArrayAsync, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind, stream);
|
||||
size_t byteSize;
|
||||
if (height == 0 || width == 0) return ihipLogStatus(hipSuccess);
|
||||
if(src) {
|
||||
switch (src->desc.f) {
|
||||
case hipChannelFormatKindSigned:
|
||||
@@ -2239,8 +2278,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
if (total) {
|
||||
*total = device->_props.totalGlobalMem;
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (free) {
|
||||
@@ -2263,8 +2300,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
|
||||
} else {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
+316
-159
@@ -50,7 +50,7 @@ THE SOFTWARE.
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
#include "code_object_bundle.inl"
|
||||
#include "hip_fatbin.h"
|
||||
// TODO Use Pool APIs from HCC to get memory regions.
|
||||
|
||||
@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
|
||||
void** impCoopParams = 0) {
|
||||
void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -192,8 +192,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
if (impCoopParams) {
|
||||
const auto p{static_cast<const char*>(*impCoopParams)};
|
||||
// The sixth index is for multi-grid synchronization
|
||||
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
|
||||
copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
(kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
|
||||
hc::completion_future cf;
|
||||
|
||||
if (coopAV) {
|
||||
lp.av = coopAV;
|
||||
}
|
||||
|
||||
lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
|
||||
(startEvent || stopEvent) ? &cf : nullptr
|
||||
#if (__hcc_workweek__ > 17312)
|
||||
@@ -326,22 +330,18 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
if (kds == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
std::vector<hipFunction_t> kds(numDevices,0);
|
||||
|
||||
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
if (lp.stream == nullptr) {
|
||||
free(kds);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
|
||||
if (kds[i] == nullptr) {
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (!kds[i]->_kernarg_layout.empty()) continue;
|
||||
@@ -396,8 +396,6 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
|
||||
#endif
|
||||
}
|
||||
|
||||
free(kds);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -409,6 +407,90 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
}
|
||||
|
||||
// Reports the register and LDS footprint of kernel `f`.
//
// Writes the VGPR count, the SGPR count and the statically allocated LDS size
// through the three output pointers. Register counts are rounded up to the
// allocation granularity (4 for VGPRs, 16 for SGPRs). The values are decoded
// either from the legacy kernel header or, for code object v3, from
// COMPUTE_PGM_RSRC1 in the kernel descriptor.
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
    if (!f->_is_code_object_v3) {
        const auto legacy = f->_header;

        // VGPRs granularity is 4
        *usedVGPRS = ((legacy->workitem_vgpr_count + 3) >> 2) << 2;

        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
        const size_t sgprsWithVcc = legacy->wavefront_sgpr_count + 2;
        *usedSGPRS = ((sgprsWithVcc + 15) >> 4) << 4;

        *usedLDS = legacy->workgroup_group_segment_byte_size;
        return;
    }

    const auto descriptor = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
    const auto rsrc1 = descriptor->compute_pgm_rsrc1;

    // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
    // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
    const auto granulatedVgprs = rsrc1 & 0x3F;
    *usedVGPRS = (granulatedVgprs + 1) << 2;

    // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
    // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
    const auto granulatedSgprs = (rsrc1 & 0x3C0) >> 6;
    *usedSGPRS = ((granulatedSgprs >> 1) + 1) << 4;

    *usedLDS = descriptor->group_segment_fixed_size;
}
|
||||
|
||||
// Estimates how many blocks of kernel `f` can be resident on one compute unit
// (CU) at the same time.
//
// The result is the minimum of three limits: VGPR usage, SGPR usage, and LDS
// usage (static LDS of the kernel plus `dynSharedMemPerBlk`).
//
// Parameters:
//   tls                 thread-local state (not used by the computation itself)
//   numBlocks           [out] number of blocks that fit on one CU
//   f                   kernel whose resource usage is inspected
//   blockSize           threads per block
//   dynSharedMemPerBlk  dynamically allocated LDS per block, in bytes
//
// Returns:
//   hipErrorInvalidDevice  no default context, or the device reports a zero
//                          wavefront size
//   hipErrorInvalidValue   `numBlocks` or `f` is null, or `blockSize` is 0
//   hipSuccess             otherwise; `*numBlocks` is 0 when `blockSize`
//                          exceeds the device's maxThreadsPerBlock
static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
    TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
    if (ctx == nullptr) {
        return hipErrorInvalidDevice;
    }
    // `f` is dereferenced by getGprsLdsUsage() below, so reject null up front.
    if (numBlocks == nullptr || f == nullptr) {
        return hipErrorInvalidValue;
    }
    // A block of zero threads occupies zero wavefronts; the occupancy divisions
    // below would then divide by zero.
    if (blockSize == 0) {
        *numBlocks = 0;
        return hipErrorInvalidValue;
    }

    hipDeviceProp_t prop{};
    ihipGetDeviceProperties(&prop, ctx->getDevice()->_deviceId);

    if (blockSize > prop.maxThreadsPerBlock) {
        *numBlocks = 0;
        return hipSuccess;
    }

    // Fall back to 64K registers when the device does not report a value.
    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    size_t usedVGPRS = 0;
    size_t usedSGPRS = 0;
    size_t usedLDS = 0;
    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);

    const size_t wavefrontSize = prop.warpSize;
    if (wavefrontSize == 0) {
        // Device properties are unusable; avoid dividing by zero.
        return hipErrorInvalidDevice;
    }

    // Due to SPI and private memory limitations, the max number of wavefronts per CU is 32
    const size_t maxWavefrontsPerCU =
        std::min<size_t>(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);

    const size_t simdPerCU = 4;
    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;

    // Wavefronts needed by one block (rounded up); >= 1 because blockSize > 0.
    const size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;

    const size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
    const size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));

    // Calculate blocks occupancy per CU based on VGPR usage
    *numBlocks = vgprs_alu_occupancy / numWavefronts;

    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
    const size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

    // Calculate blocks occupancy per CU based on SGPR usage
    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));

    const size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
    if (total_used_lds != 0) {
        // Calculate LDS occupancy per CU: lds_per_cu / (static_lds + dynamic_lds)
        const size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
        *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
    }

    return hipSuccess;
}
|
||||
|
||||
namespace {
|
||||
// kernel for initializing GWS
|
||||
// nwm1 is the total number of work groups minus 1
|
||||
@@ -417,25 +499,28 @@ __global__ void init_gws(uint nwm1) {
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
|
||||
dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes,
|
||||
hipStream_t stream, hip_impl::program_state& ps) {
|
||||
|
||||
#if (__hcc_workweek__ >= 20093)
|
||||
hipError_t result;
|
||||
|
||||
|
||||
if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
|
||||
if (f == nullptr || kernelParams == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
if (!stream->getDevice()->_props.cooperativeLaunch) {
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
if (!stream->getDevice()->_props.cooperativeLaunch ||
|
||||
blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
|
||||
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
|
||||
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
|
||||
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x;
|
||||
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y;
|
||||
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z;
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
@@ -469,28 +554,49 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
kd->_kernarg_layout = *reinterpret_cast<const std::vector<
|
||||
std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());
|
||||
|
||||
GET_TLS();
|
||||
uint32_t numBlocksPerSm = 0;
|
||||
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
|
||||
blockDim.x * blockDim.y * blockDim.z, sharedMemBytes);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
|
||||
|
||||
//check to see if the workload fits on the GPU
|
||||
if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) {
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
|
||||
void *gwsKernelParam[1];
|
||||
// calculate total number of work groups minus 1 for the main kernel
|
||||
uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
|
||||
gwsKernelParam[0] = &nwm1;
|
||||
|
||||
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
hc::accelerator acc = stream->getDevice()->_acc;
|
||||
// create a cooperative accelerated view for launching gws and main kernels
|
||||
hc::accelerator_view coopAV = acc.create_cooperative_view();
|
||||
|
||||
GET_TLS();
|
||||
// launch the init_gws kernel to initialize the GWS
|
||||
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
|
||||
|
||||
// the cooperative queue will wait until this stream completes its operations
|
||||
hc::completion_future streamCF;
|
||||
if (!streamCrit->_av.get_is_empty()) {
|
||||
streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
|
||||
coopAV.create_blocking_marker(streamCF, hc::accelerator_scope);
|
||||
}
|
||||
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
coopAV.acquire_locked_hsa_queue();
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
|
||||
result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
|
||||
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
|
||||
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
coopAV.release_locked_hsa_queue();
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
|
||||
@@ -498,60 +604,106 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
void* impCoopParams[1];
|
||||
impCoopParams[0] = &impCoopArg;
|
||||
|
||||
// launch the main kernel
|
||||
// launch the main kernel in the cooperative queue
|
||||
result = ihipModuleLaunchKernel(tls, kd,
|
||||
gridDim.x * blockDimX.x,
|
||||
gridDim.y * blockDimX.y,
|
||||
gridDim.z * blockDimX.z,
|
||||
blockDimX.x, blockDimX.y, blockDimX.z,
|
||||
gridDim.x * blockDim.x,
|
||||
gridDim.y * blockDim.y,
|
||||
gridDim.z * blockDim.z,
|
||||
blockDim.x, blockDim.y, blockDim.z,
|
||||
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, true, impCoopParams);
|
||||
nullptr, 0, true, impCoopParams, &coopAV);
|
||||
|
||||
|
||||
coopAV.release_locked_hsa_queue();
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
|
||||
// this stream will wait until the cooperative queue completes its operations
|
||||
hc::completion_future cooperativeCF;
|
||||
if (!coopAV.get_is_empty()) {
|
||||
cooperativeCF = coopAV.create_marker(hc::accelerator_scope);
|
||||
streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope);
|
||||
}
|
||||
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
|
||||
return result;
|
||||
#else
|
||||
return hipErrorInvalidConfiguration;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim,
|
||||
dim3 blockDim, void** args,
|
||||
size_t sharedMem, hipStream_t stream,
|
||||
hip_impl::program_state& ps) {
|
||||
|
||||
// Skipping passing in ps, because the logging function does not like it
|
||||
HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args,
|
||||
sharedMem, stream);
|
||||
|
||||
return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim,
|
||||
args, sharedMem, stream, ps));
|
||||
}
|
||||
|
||||
|
||||
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
|
||||
|
||||
#if (__hcc_workweek__ >= 20093)
|
||||
hipError_t result;
|
||||
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
vector<hipStream_t> streams;
|
||||
vector<uint64_t> deviceIDs;
|
||||
// check to see if we have valid distinct streams/devices, if cooperative multi device
|
||||
// launch is supported and if grid/block dimensions are valid
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
if (lp.stream == nullptr){
|
||||
return hipErrorInvalidResourceHandle;
|
||||
}
|
||||
|
||||
if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) {
|
||||
streams.push_back(lp.stream);
|
||||
} else {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
const ihipDevice_t* currentDevice = lp.stream->getDevice();
|
||||
if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) {
|
||||
deviceIDs.push_back(currentDevice->_deviceId);
|
||||
} else {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
|
||||
lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 ||
|
||||
lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
}
|
||||
|
||||
hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
|
||||
|
||||
if (kds == nullptr || gwsKds == nullptr) {
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
vector<hipFunction_t> gwsKds;
|
||||
vector<hipFunction_t> kds;
|
||||
|
||||
GET_TLS();
|
||||
// prepare all kernel descriptors for initializing the GWS and the main kernels per device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
if (lp.stream == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
hip_impl::target_agent(lp.stream)));
|
||||
if (gwsKds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
|
||||
@@ -560,23 +712,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
gwsKargs.getHandle());
|
||||
|
||||
|
||||
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream));
|
||||
kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
|
||||
hip_impl::target_agent(lp.stream)));
|
||||
if (kds[i] == nullptr) {
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(lp.func));
|
||||
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
|
||||
kargs.getHandle());
|
||||
|
||||
uint32_t numBlocksPerSm = 0;
|
||||
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
|
||||
lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorLaunchFailure;
|
||||
}
|
||||
int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
|
||||
|
||||
//check to see if the workload fits on the GPU
|
||||
if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) {
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
}
|
||||
|
||||
vector<hc::accelerator_view> coopAVs;
|
||||
|
||||
// create cooperative accelerated views for launching gws and main kernels on each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
|
||||
coopAVs.push_back(acc.create_cooperative_view());
|
||||
}
|
||||
|
||||
mg_sync *mg_sync_ptr = 0;
|
||||
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
|
||||
vector<mg_info *> mg_info_ptr;
|
||||
|
||||
|
||||
GET_TLS();
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -586,7 +757,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
uint all_sum = 0;
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
|
||||
mg_info *mg_info_temp = nullptr;
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < i; ++j) {
|
||||
@@ -594,6 +766,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
}
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
mg_info_ptr.push_back(mg_info_temp);
|
||||
// calculate the sum of sizes of all grids
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
@@ -603,9 +776,15 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
|
||||
hc::completion_future streamCF;
|
||||
if (!streamCrit->_av.get_is_empty()) {
|
||||
streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
|
||||
coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope);
|
||||
}
|
||||
|
||||
streamCrit->_av.acquire_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].acquire_locked_hsa_queue();
|
||||
}
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS for each device
|
||||
@@ -617,14 +796,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
gwsKernelParam[0] = &nwm1;
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
|
||||
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
|
||||
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -670,14 +848,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
true, impCoopParams);
|
||||
true, impCoopParams, &coopAVs[i]);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -691,14 +868,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
// unlock all streams
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
launchParamsList[i].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
coopAVs[i].release_locked_hsa_queue();
|
||||
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
hc::completion_future cooperativeCF;
|
||||
if (!coopAVs[i].get_is_empty()) {
|
||||
cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope);
|
||||
launchParamsList[i].stream->criticalData()._av.create_blocking_marker(
|
||||
cooperativeCF, hc::accelerator_scope);
|
||||
}
|
||||
|
||||
launchParamsList[i].stream->criticalData().unlock();
|
||||
}
|
||||
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
@@ -706,6 +887,24 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
}
|
||||
|
||||
return result;
|
||||
#else
|
||||
return hipErrorInvalidConfiguration;
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((visibility("default")))
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices,
|
||||
unsigned int flags,
|
||||
hip_impl::program_state& ps) {
|
||||
|
||||
// Skipping passing in ps, because the logging function does not like it
|
||||
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList,
|
||||
numDevices, flags);
|
||||
|
||||
return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList,
|
||||
numDevices,
|
||||
flags, ps));
|
||||
}
|
||||
|
||||
namespace hip_impl {
|
||||
@@ -1120,7 +1319,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
|
||||
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
|
||||
}
|
||||
|
||||
hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
|
||||
hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
|
||||
hipFuncAttributes r{};
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
@@ -1130,23 +1329,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
if (kd._is_code_object_v3) {
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
|
||||
// object format version?
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
} else {
|
||||
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
|
||||
r.numRegs = kd._header->workitem_vgpr_count;
|
||||
r.binaryVersion =
|
||||
kd._header->amd_machine_version_major * 10 +
|
||||
kd._header->amd_machine_version_minor;
|
||||
}
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
r.numRegs = usedVGPRS;
|
||||
|
||||
size_t wavefrontSize = prop.warpSize;
|
||||
size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
|
||||
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
|
||||
const size_t numSIMD = 4;
|
||||
const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
|
||||
size_t maxWaves = 0;
|
||||
for (int i = 0; i < maxWavefrontsPerBlock; i++) {
|
||||
size_t wavefronts = i + 1;
|
||||
|
||||
if (usedVGPRS > 0) {
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
|
||||
size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
if (vgprs_alu_occupancy < wavefronts)
|
||||
break;
|
||||
}
|
||||
|
||||
if (usedSGPRS > 0) {
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
if (sgprs_alu_occupancy < wavefronts)
|
||||
break;
|
||||
}
|
||||
maxWaves = wavefronts;
|
||||
}
|
||||
|
||||
r.maxThreadsPerBlock = maxWaves * wavefrontSize;
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
@@ -1294,29 +1527,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
|
||||
{
|
||||
if (f->_is_code_object_v3) {
|
||||
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
|
||||
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
|
||||
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
|
||||
*usedLDS = header->group_segment_fixed_size;
|
||||
}
|
||||
else {
|
||||
const auto header = f->_header;
|
||||
// VGPRs granularity is 4
|
||||
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
|
||||
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
|
||||
*usedSGPRS = header->wavefront_sgpr_count + 2;
|
||||
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
|
||||
*usedLDS = header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
@@ -1439,59 +1649,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if (ctx == nullptr) {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
|
||||
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
size_t usedVGPRS = 0;
|
||||
size_t usedSGPRS = 0;
|
||||
size_t usedLDS = 0;
|
||||
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
|
||||
|
||||
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
|
||||
size_t wavefrontSize = prop.warpSize;
|
||||
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
|
||||
|
||||
const size_t simdPerCU = 4;
|
||||
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
|
||||
|
||||
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
|
||||
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
|
||||
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
*numBlocks = vgprs_alu_occupancy / numWavefronts;
|
||||
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
|
||||
|
||||
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
|
||||
@@ -257,11 +257,39 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
|
||||
hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags);
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
// Create a thread in detached mode to handle callback
|
||||
ihipStreamCallback_t* cb = new ihipStreamCallback_t(stream, callback, userData);
|
||||
std::thread(ihipStreamCallbackHandler, cb).detach();
|
||||
auto stream_original{stream};
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
|
||||
return ihipLogStatus(e);
|
||||
if (!stream) return hipErrorInvalidValue;
|
||||
|
||||
LockedAccessor_StreamCrit_t cs{stream->criticalData()};
|
||||
|
||||
// create first marker
|
||||
auto cf = cs->_av.create_marker(hc::no_scope);
|
||||
// get its signal
|
||||
auto signal = *reinterpret_cast<hsa_signal_t*>(cf.get_native_handle());
|
||||
// increment its signal value
|
||||
hsa_signal_add_relaxed(signal, 1);
|
||||
|
||||
// create callback that can be passed to hsa_amd_signal_async_handler
|
||||
// this function will call the user's callback, then sets first packet's signal to 0 to indicate completion
|
||||
auto t{new std::function<void()>{[=]() {
|
||||
callback(stream_original, hipSuccess, userData);
|
||||
hsa_signal_store_relaxed(signal, 0);
|
||||
}}};
|
||||
|
||||
// register above callback with HSA runtime to be called when first packet's signal
|
||||
// is decremented from 2 to 1 by CP (or it is already at 1)
|
||||
hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1,
|
||||
[](hsa_signal_value_t x, void* p) {
|
||||
(*static_cast<decltype(t)>(p))();
|
||||
delete static_cast<decltype(t)>(p);
|
||||
return false;
|
||||
}, t);
|
||||
|
||||
// create additional marker that blocks on the first one
|
||||
cs->_av.create_blocking_marker(cf, hc::no_scope);
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
@@ -301,7 +301,12 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSamplerDescriptor(samplerDescriptor, pTexDesc->addressMode[0], pTexDesc->filterMode,
|
||||
pTexDesc->normalizedCoords);
|
||||
|
||||
if(hipResourceTypeLinear == pResDesc->resType) {
|
||||
samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
|
||||
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
|
||||
} else if(!pTexDesc->normalizedCoords) {
|
||||
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
|
||||
}
|
||||
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
|
||||
|
||||
if(hipResourceTypePitch2D != pResDesc->resType)
|
||||
@@ -312,6 +317,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
|
||||
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
|
||||
HSA_STATUS_SUCCESS !=
|
||||
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
|
||||
free(pTexture);
|
||||
return ihipLogStatus(hipErrorRuntimeOther);
|
||||
}
|
||||
|
||||
@@ -438,7 +444,13 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
|
||||
imageDescriptor.format.channel_type = channelType;
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
|
||||
samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
|
||||
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
|
||||
if (normalizedCoords) {
|
||||
samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED;
|
||||
} else {
|
||||
samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
|
||||
}
|
||||
|
||||
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
|
||||
|
||||
@@ -449,6 +461,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
|
||||
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) ||
|
||||
HSA_STATUS_SUCCESS !=
|
||||
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
|
||||
free(pTexture);
|
||||
return hipErrorRuntimeOther;
|
||||
}
|
||||
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
|
||||
@@ -514,7 +527,9 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
|
||||
|
||||
if(!normalizedCoords) {
|
||||
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
|
||||
}
|
||||
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
|
||||
|
||||
if( 0 == pitch)
|
||||
@@ -525,6 +540,7 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode
|
||||
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
|
||||
HSA_STATUS_SUCCESS !=
|
||||
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
|
||||
free(pTexture);
|
||||
return hipErrorRuntimeOther;
|
||||
}
|
||||
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
|
||||
@@ -620,7 +636,9 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls_, int dim, enum hipTextureRea
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
|
||||
|
||||
if(!normalizedCoords) {
|
||||
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
|
||||
}
|
||||
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
|
||||
|
||||
size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT);
|
||||
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "../include/hip/hiprtc.h"
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
#include "code_object_bundle.inl"
|
||||
#include "../include/hip/hcc_detail/elfio/elfio.hpp"
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
#include "code_object_bundle.inl"
|
||||
#include "../include/hip/hcc_detail/hsa_helpers.hpp"
|
||||
|
||||
#if !defined(__cpp_exceptions)
|
||||
@@ -357,8 +357,11 @@ public:
|
||||
|
||||
const auto it1 = get_symbol_addresses().find(x);
|
||||
if (it1 == get_symbol_addresses().cend()) {
|
||||
hip_throw(std::runtime_error{
|
||||
"Global symbol: " + x + " is undefined."});
|
||||
// For a unknown symbol, initialize it with a magic poison
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(),
|
||||
reinterpret_cast<void*>(0xDEADBEEFDEADBEEFull));
|
||||
continue;
|
||||
}
|
||||
|
||||
hsa_status_t status;
|
||||
|
||||
@@ -385,8 +385,7 @@ double compute_BSR(BCRSArrays& bcsr, double *x , double *y){
|
||||
cudaEventCreate(&startTime);
|
||||
cudaEventCreate(&stopTime);
|
||||
cudaEventRecord(startTime, bcsr.streamId);
|
||||
// NOTE: cusparseDbsrmv and CUSPARSE_DIRECTION_COLUMN (of type cusparseDirection_t) are yet unsupported by HIP
|
||||
// CHECK: cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
// CHECK: cusparseDbsrmv(bcsr.cusparseHandle, HIPSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &alpha, descr,
|
||||
bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize,
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define SIZE 1024
|
||||
|
||||
int main(){
|
||||
|
||||
void *Sd;
|
||||
hipError_t e;
|
||||
char S[SIZE]="This is not a device symbol";
|
||||
|
||||
HIPCHECK(hipMalloc(&Sd,SIZE));
|
||||
|
||||
e = hipMemcpyFromSymbol(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
e = hipMemcpyFromSymbol(S, NULL, SIZE, 0, hipMemcpyDeviceToHost);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
HIPCHECK(hipFree(Sd));
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define SIZE 1024
|
||||
|
||||
int main(){
|
||||
|
||||
void *Sd;
|
||||
hipError_t e;
|
||||
char S[SIZE]="This is not a device symbol";
|
||||
|
||||
HIPCHECK(hipMalloc(&Sd,SIZE));
|
||||
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
|
||||
e = hipMemcpyFromSymbolAsync(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost, stream);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
e = hipMemcpyFromSymbolAsync(S, NULL, SIZE, 0, hipMemcpyDeviceToHost, stream);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
HIPCHECK(hipFree(Sd));
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define SIZE 1024
|
||||
|
||||
int main(){
|
||||
|
||||
void *Sd;
|
||||
hipError_t e;
|
||||
char S[SIZE]="This is not a device symbol";
|
||||
|
||||
HIPCHECK(hipMalloc(&Sd,SIZE));
|
||||
|
||||
e = hipMemcpyToSymbol(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
e = hipMemcpyToSymbol(NULL, S, SIZE, 0, hipMemcpyHostToDevice);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
HIPCHECK(hipFree(Sd));
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define SIZE 100
|
||||
|
||||
int main(){
|
||||
|
||||
void *Sd;
|
||||
hipError_t e;
|
||||
char S[SIZE]="This is not a device symbol";
|
||||
|
||||
HIPCHECK(hipMalloc(&Sd,SIZE));
|
||||
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
|
||||
e = hipMemcpyToSymbolAsync(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice, stream);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
e = hipMemcpyToSymbolAsync(NULL, S, SIZE, 0, hipMemcpyHostToDevice, stream);
|
||||
HIPASSERT(e==hipErrorInvalidSymbol);
|
||||
|
||||
HIPCHECK(hipFree(Sd));
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#define SIZE 100
|
||||
|
||||
// Negative test: hipMemcpy and hipMemset with a null destination are expected
// to return hipErrorInvalidValue.
int main() {
    char str[SIZE] = "Hi, I am Ellesemere. What is ur name?";
    hipError_t e;

    // Null destination spelled as literal 0.
    e = hipMemcpy(0, str, SIZE, hipMemcpyHostToDevice);
    HIPASSERT(e==hipErrorInvalidValue);

    // Null destination spelled as NULL.
    e = hipMemcpy(NULL, str, SIZE, hipMemcpyHostToDevice);
    HIPASSERT(e==hipErrorInvalidValue);

    // Memset on a null destination.
    e = hipMemset(0,99,80);
    HIPASSERT(e==hipErrorInvalidValue);

    passed();
}
|
||||
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
|
||||
// Negative test: hipStreamCreateWithFlags is expected to reject the flag
// values -1 and 2 with hipErrorInvalidValue.
int main() {
    hipStream_t stream;
    hipError_t e;

    // Flag values the test expects to be refused, tried in this order.
    const int invalidFlags[] = {-1, 2};
    const int numInvalidFlags = sizeof(invalidFlags) / sizeof(invalidFlags[0]);

    for (int idx = 0; idx < numInvalidFlags; ++idx) {
        e = hipStreamCreateWithFlags(&stream, invalidFlags[idx]);
        HIPASSERT(e==hipErrorInvalidValue);
    }

    passed();
}
|
||||
@@ -96,6 +96,18 @@ void kernel_hisinf(__half* input, int* output) {
|
||||
output[tx] = __hisinf(input[tx]);
|
||||
}
|
||||
|
||||
// Kernel: converts *p to half, applies __habs, and writes the result back to
// *p as float.
__global__ void testHalfAbs(float* p) {
    const auto magnitude = __habs(__float2half(*p));
    *p = __half2float(magnitude);
}
|
||||
|
||||
// Kernel: converts *p to a half2 pair, applies __habs2, and writes the result
// back to *p as float2.
__global__ void testHalf2Abs(float2* p) {
    const auto magnitudes = __habs2(__float22half2_rn(*p));
    *p = __half22float2(magnitudes);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -237,6 +249,31 @@ void checkFunctional() {
|
||||
return;
|
||||
}
|
||||
|
||||
void checkHalfAbs() {
|
||||
{
|
||||
float *p;
|
||||
hipMalloc(&p, sizeof(float));
|
||||
float pp = -2.1f;
|
||||
hipMemcpy(p, &pp, sizeof(float), hipMemcpyDefault);
|
||||
hipLaunchKernelGGL(testHalfAbs, 1, 1, 0, 0, p);
|
||||
hipMemcpy(&pp, p, sizeof(float), hipMemcpyDefault);
|
||||
hipFree(p);
|
||||
if(pp < 0.0f) { failed("Half Abs failed"); }
|
||||
}
|
||||
{
|
||||
float2 *p;
|
||||
hipMalloc(&p, sizeof(float2));
|
||||
float2 pp;
|
||||
pp.x = -2.1f;
|
||||
pp.y = -1.1f;
|
||||
hipMemcpy(p, &pp, sizeof(float2), hipMemcpyDefault);
|
||||
hipLaunchKernelGGL(testHalf2Abs, 1, 1, 0, 0, p);
|
||||
hipMemcpy(&pp, p, sizeof(float2), hipMemcpyDefault);
|
||||
hipFree(p);
|
||||
if(pp.x < 0.0f || pp.y < 0.0f) { failed("Half2 Abs Test Failed"); }
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
bool* result{nullptr};
|
||||
hipMemAllocHost((void**)&result, sizeof(result));
|
||||
@@ -260,5 +297,7 @@ int main() {
|
||||
// run some functional checks
|
||||
checkFunctional();
|
||||
|
||||
checkHalfAbs();
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
Copyright (c) 2015-2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc HIPCC_OPTIONS -std=c++14
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <type_traits>
|
||||
#include <random>
|
||||
#include "test_common.h"
|
||||
|
||||
static std::random_device dev;
|
||||
static std::mt19937 rng(dev());
|
||||
|
||||
template <typename T, typename M>
|
||||
__host__ __device__ inline constexpr int count() {
|
||||
return sizeof(T) / sizeof(M);
|
||||
}
|
||||
|
||||
inline float getRandomFloat(float min = 10, float max = 100) {
|
||||
std::uniform_real_distribution<float> gen(min, max);
|
||||
return gen(rng);
|
||||
}
|
||||
|
||||
template <typename T, typename B>
|
||||
void fillMatrix(T* a, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
T t;
|
||||
t.x = getRandomFloat();
|
||||
if constexpr (count<T, B>() >= 2) t.y = getRandomFloat();
|
||||
if constexpr (count<T, B>() >= 3) t.z = getRandomFloat();
|
||||
if constexpr (count<T, B>() >= 4) t.w = getRandomFloat();
|
||||
|
||||
a[i] = t;
|
||||
}
|
||||
}
|
||||
|
||||
// Test operations
|
||||
template <typename T, typename B>
|
||||
__host__ __device__ void testOperations(T& a, T& b) {
|
||||
a.x += b.x;
|
||||
a.x++;
|
||||
b.x++;
|
||||
if constexpr (count<T, B>() >= 2) {
|
||||
a.y = b.x;
|
||||
a.x = b.y;
|
||||
}
|
||||
if constexpr (count<T, B>() >= 3) {
|
||||
if (a.x > 0) b.x /= a.x;
|
||||
a.x *= b.z;
|
||||
a.y--;
|
||||
}
|
||||
if constexpr (count<T, B>() >= 4) {
|
||||
b.w = a.x;
|
||||
a.w += (-b.y);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename B>
|
||||
__global__ void testOperationsGPU(T* d_a, T* d_b, int size) {
|
||||
int id = threadIdx.x;
|
||||
if (id > size) return;
|
||||
T &a = d_a[id];
|
||||
T &b = d_b[id];
|
||||
|
||||
testOperations<T, B>(a, b);
|
||||
}
|
||||
|
||||
|
||||
// Copies the first `size` elements of b into a using element assignment.
// Does nothing when size is zero or negative.
template <typename T>
void dcopy(T* a, T* b, int size) {
    int idx = 0;
    while (idx < size) {
        a[idx] = b[idx];
        ++idx;
    }
}
|
||||
|
||||
// Returns true when the first `size` elements of a and b compare equal
// (no pair satisfies a[i] != b[i]); returns true for size <= 0.
template <typename T>
bool isEqual(T* a, T* b, int size) {
    int idx = 0;
    // Advance past the matching prefix; stop at the first mismatch.
    while (idx < size && !(a[idx] != b[idx])) {
        ++idx;
    }
    // The arrays are equal only if no mismatch stopped the scan early.
    return idx >= size;
}
|
||||
|
||||
// Main function that tests type
|
||||
// T = what you want to test
|
||||
// D = pack of 1 i.e. float1 int1
|
||||
template <typename T, typename D>
|
||||
void testType(int msize) {
|
||||
T *fa, *fb, *fc, *h_fa, *h_fb;
|
||||
fa = new T[msize];
|
||||
fb = new T[msize];
|
||||
fc = new T[msize];
|
||||
h_fa = new T[msize];
|
||||
h_fb = new T[msize];
|
||||
|
||||
T *d_fa, *d_fb;
|
||||
|
||||
constexpr int c = count<T, D>();
|
||||
|
||||
if (c <= 0 || c >= 5) {
|
||||
failed("Invalid Size\n");
|
||||
}
|
||||
|
||||
fillMatrix<T, D>(fa, msize);
|
||||
dcopy(fb, fa, msize);
|
||||
dcopy(h_fa, fa, msize);
|
||||
dcopy(h_fb, fa, msize);
|
||||
for (int i = 0; i < msize; i++) testOperations<T, D>(h_fa[i], h_fb[i]);
|
||||
|
||||
hipMalloc(&d_fa, sizeof(T) * msize);
|
||||
hipMalloc(&d_fb, sizeof(T) * msize);
|
||||
|
||||
hipMemcpy(d_fa, fa, sizeof(T) * msize, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_fb, fb, sizeof(T) * msize, hipMemcpyHostToDevice);
|
||||
|
||||
auto kernel = testOperationsGPU<T, D>;
|
||||
hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize);
|
||||
|
||||
hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost);
|
||||
|
||||
bool pass = true;
|
||||
if (!isEqual<T>(h_fa, fc, msize)) {
|
||||
pass = false;
|
||||
}
|
||||
|
||||
delete[] fa;
|
||||
delete[] fb;
|
||||
delete[] fc;
|
||||
delete[] h_fa;
|
||||
delete[] h_fb;
|
||||
hipFree(d_fa);
|
||||
hipFree(d_fb);
|
||||
|
||||
if (!pass) {
|
||||
failed("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
const int msize = 100;
|
||||
// double
|
||||
testType<double1, double1>(msize);
|
||||
testType<double2, double1>(msize);
|
||||
testType<double3, double1>(msize);
|
||||
testType<double4, double1>(msize);
|
||||
|
||||
// floats
|
||||
testType<float1, float1>(msize);
|
||||
testType<float2, float1>(msize);
|
||||
testType<float3, float1>(msize);
|
||||
testType<float4, float1>(msize);
|
||||
|
||||
// ints
|
||||
testType<int1, int1>(msize);
|
||||
testType<int2, int1>(msize);
|
||||
testType<int3, int1>(msize);
|
||||
testType<int4, int1>(msize);
|
||||
|
||||
// chars
|
||||
testType<char1, char1>(msize);
|
||||
testType<char2, char1>(msize);
|
||||
testType<char3, char1>(msize);
|
||||
testType<char4, char1>(msize);
|
||||
|
||||
// long
|
||||
testType<long1, long1>(msize);
|
||||
testType<long2, long1>(msize);
|
||||
testType<long3, long1>(msize);
|
||||
testType<long4, long1>(msize);
|
||||
|
||||
// longlong
|
||||
testType<longlong1, longlong1>(msize);
|
||||
testType<longlong2, longlong1>(msize);
|
||||
testType<longlong3, longlong1>(msize);
|
||||
testType<longlong4, longlong1>(msize);
|
||||
|
||||
// short
|
||||
testType<short1, short1>(msize);
|
||||
testType<short2, short1>(msize);
|
||||
testType<short3, short1>(msize);
|
||||
testType<short4, short1>(msize);
|
||||
|
||||
// uints
|
||||
testType<uint1, uint1>(msize);
|
||||
testType<uint2, uint1>(msize);
|
||||
testType<uint3, uint1>(msize);
|
||||
testType<uint4, uint1>(msize);
|
||||
|
||||
// uchars
|
||||
testType<uchar1, uchar1>(msize);
|
||||
testType<uchar2, uchar1>(msize);
|
||||
testType<uchar3, uchar1>(msize);
|
||||
testType<uchar4, uchar1>(msize);
|
||||
|
||||
// ulong
|
||||
testType<ulong1, ulong1>(msize);
|
||||
testType<ulong2, ulong1>(msize);
|
||||
testType<ulong3, ulong1>(msize);
|
||||
testType<ulong4, ulong1>(msize);
|
||||
|
||||
// ulonglong
|
||||
testType<ulonglong1, ulonglong1>(msize);
|
||||
testType<ulonglong2, ulonglong1>(msize);
|
||||
testType<ulonglong3, ulonglong1>(msize);
|
||||
testType<ulonglong4, ulonglong1>(msize);
|
||||
|
||||
// ushort
|
||||
testType<ushort1, ushort1>(msize);
|
||||
testType<ushort2, ushort1>(msize);
|
||||
testType<ushort3, ushort1>(msize);
|
||||
testType<ushort4, ushort1>(msize);
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
|
||||
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
* */
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
#include "test_common.h"
|
||||
|
||||
int main() {
|
||||
int* A;
|
||||
int* Ad;
|
||||
int* Bd;
|
||||
|
||||
// Allocation
|
||||
HIPCHECK(hipMalloc((void**)&Ad, sizeof(int)));
|
||||
HIPCHECK(hipMalloc((void**)&Bd, sizeof(int)));
|
||||
HIPCHECK(hipHostMalloc((void**)&A,sizeof(int)));
|
||||
|
||||
// Kind should be ignored and test should pass even for incorrect kind
|
||||
HIPCHECK(hipMemcpy(Ad, A, sizeof(int), hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(A, Ad, sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Ad, Bd, sizeof(int), hipMemcpyHostToHost));
|
||||
HIPCHECK(hipMemcpy(A, A, sizeof(int), hipMemcpyDeviceToDevice));
|
||||
|
||||
// nullptr passed as source or destination pointer
|
||||
HIPASSERT(hipSuccess != hipMemcpy(nullptr, A, sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPASSERT(hipSuccess != hipMemcpy(Ad, nullptr, sizeof(int), hipMemcpyHostToDevice));
|
||||
|
||||
HIPCHECK(hipFree(Ad));
|
||||
HIPCHECK(hipFree(Bd));
|
||||
HIPCHECK(hipFree(A));
|
||||
passed();
|
||||
}
|
||||
@@ -107,8 +107,8 @@ void run(const std::vector<char>& buffer) {
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
delete A;
|
||||
delete B;
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
hipCtxDestroy(context);
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,145 @@
|
||||
#include <stdio.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <unistd.h>
|
||||
#include "test_common.h"
|
||||
#include <atomic>
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
// Phases of one test iteration, stored in the global gData so that the main
// thread and the stream callback can verify they ran in the expected order.
enum class ExecState
|
||||
{
|
||||
EXEC_NOT_STARTED,   // initial value of gData
|
||||
EXEC_STARTED,       // set by test() before it creates the stream
|
||||
EXEC_CB_STARTED,    // set by myCallback on entry
|
||||
EXEC_CB_FINISHED,   // set by myCallback just before it returns
|
||||
EXEC_FINISHED       // set by test() after it has released its resources
|
||||
};
|
||||
|
||||
// Payload handed to the stream callback through its user_data pointer.
// test() fills it with the element count and the host buffer address.
struct UserData
|
||||
{
|
||||
size_t size;   // number of int elements in the buffer
|
||||
int* ptr;      // host buffer that receives the asynchronous copy
|
||||
};
|
||||
|
||||
// Global variable used to check the execution order
|
||||
std::atomic<ExecState> gData(ExecState::EXEC_NOT_STARTED);
|
||||
|
||||
|
||||
// Stream callback registered by test(). It moves gData from EXEC_STARTED to
// EXEC_CB_STARTED, sleeps for one second, then stores EXEC_CB_FINISHED, so the
// main thread can tell whether hipStreamSynchronize really waited for it.
// The stream, status and user_data arguments are not used.
void myCallback(hipStream_t stream, hipError_t status, void* user_data)
{
    (void)stream;
    (void)status;
    (void)user_data;

    if(gData.load() != ExecState::EXEC_STARTED)
        return; // Error hence return early

    gData.store(ExecState::EXEC_CB_STARTED);

    printf("Callback started\n");

    // Stay inside the callback long enough for a missing synchronization in
    // the main thread to become visible.
    sleep(1);

    printf("Callback ending.\n");
    gData.store(ExecState::EXEC_CB_FINISHED);
}
|
||||
|
||||
bool test(int count)
|
||||
{
|
||||
printf("\n============ Test iteration %d =============\n",count);
|
||||
// Stream
|
||||
hipStream_t stream;
|
||||
bool result = true;
|
||||
|
||||
gData.store(ExecState::EXEC_STARTED);
|
||||
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
|
||||
// Array size
|
||||
size_t size = 10000;
|
||||
|
||||
// Device array
|
||||
int *data = NULL;
|
||||
HIPCHECK(hipMalloc((void**)&data, sizeof(int) * size));
|
||||
|
||||
// Initialize device array to -1
|
||||
HIPCHECK(hipMemset(data, -1, sizeof(int) * size));
|
||||
|
||||
// Host array
|
||||
int *host = NULL;
|
||||
HIPCHECK(hipHostMalloc((void**)&host, sizeof(int) * size));
|
||||
|
||||
// Print host ptr address
|
||||
printf("In main thread\n");
|
||||
|
||||
// Initialize user_data for callback
|
||||
UserData arg;
|
||||
arg.size = size;
|
||||
arg.ptr = host;
|
||||
|
||||
// Synchronize device
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
// Asynchronous copy from device to host
|
||||
HIPCHECK(hipMemcpyAsync(host, data, sizeof(int) * size, hipMemcpyDeviceToHost, stream));
|
||||
|
||||
// Asynchronous memset on device
|
||||
HIPCHECK(hipMemsetAsync(data, 0, sizeof(int) * size, stream));
|
||||
|
||||
// Add callback - should happen after hipMemsetAsync()
|
||||
HIPCHECK(hipStreamAddCallback(stream, myCallback, &arg, 0));
|
||||
|
||||
printf("Will wait in main thread until callback completes\n");
|
||||
|
||||
//This should synchronize the stream (including the callback)
|
||||
HIPCHECK(hipStreamSynchronize(stream));
|
||||
|
||||
if(gData.load() != ExecState::EXEC_CB_FINISHED)
|
||||
{
|
||||
std::cout<<"Callback is not finished\n";
|
||||
return false;
|
||||
}
|
||||
printf("Callback completed will resume main thread execution\n");
|
||||
|
||||
if(host[size/2] != -1)
|
||||
{
|
||||
// Print some host data that just got copied
|
||||
printf("Pseudo host data printing (should be -1): %d\n", host[size/2]);
|
||||
result = false;
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemcpy(host, data, sizeof(int)*size, hipMemcpyDeviceToHost));
|
||||
|
||||
if(host[size-1] != 0)
|
||||
{
|
||||
printf("Pseudo host data printing (should be 0): %d\n", host[size-1]);
|
||||
result = false;
|
||||
}
|
||||
|
||||
HIPCHECK(hipFree(data));
|
||||
HIPCHECK(hipHostFree(host));
|
||||
HIPCHECK(hipStreamDestroy(stream));
|
||||
|
||||
gData.store(ExecState::EXEC_FINISHED);
|
||||
return result;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// Test involves multithreading hence running multiple times
|
||||
// to make sure consitency in the behavior
|
||||
bool status = true;
|
||||
|
||||
for(int i=0; i < 10; i++){
|
||||
status = test(i+1);
|
||||
if(status == false)
|
||||
{
|
||||
failed("Test Failed!\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(status == true) passed();
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,409 @@
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <stdexcept>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <thread>
|
||||
#include <future>
|
||||
#include "test_common.h"
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#define WORKAROUND 0 // Enable (1) this to make stream thread-safe by a workaround
|
||||
|
||||
template<bool IsBlocking> // <true> = queue blocks, until task is finished in enqueue(queue,task)
|
||||
class QueueHipRt;
|
||||
|
||||
// Queue types used in the tests
|
||||
using TestQueues = std::tuple<QueueHipRt<true>, QueueHipRt<false>>;
|
||||
|
||||
|
||||
// --- Implementation
|
||||
|
||||
// Evaluates the HIP call `x` exactly once and asserts that it returned
// hipSuccess. The call is deliberately kept outside assert(): assert()
// discards its argument when NDEBUG is defined, which would otherwise skip the
// HIP call itself. The macro remains a single expression.
#define HIP_ASSERT(x) ((void)(((x)==hipSuccess) || (assert(false && #x), false)))
|
||||
#define HIP_ASSERT_IGNORE(x,ign) auto err=x; HIP_ASSERT(err==ign ? hipSuccess : err)
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define HIPRT_CB
|
||||
#endif
|
||||
|
||||
template<bool isBlocking>
|
||||
static auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue) -> void;
|
||||
|
||||
template<bool IsBlocking>
|
||||
class QueueHipRt
|
||||
{
|
||||
public:
|
||||
static constexpr bool isBlocking = IsBlocking;
|
||||
//-----------------------------------------------------------------------------
|
||||
QueueHipRt(
|
||||
int dev) :
|
||||
m_dev(dev),
|
||||
m_HipQueue()
|
||||
{
|
||||
HIP_ASSERT(
|
||||
hipSetDevice(
|
||||
m_dev));
|
||||
HIP_ASSERT(
|
||||
hipStreamCreateWithFlags(
|
||||
&m_HipQueue,
|
||||
hipStreamNonBlocking));
|
||||
}
|
||||
//-----------------------------------------------------------------------------
|
||||
QueueHipRt(QueueHipRt const &) = delete;
|
||||
//-----------------------------------------------------------------------------
|
||||
QueueHipRt(QueueHipRt &&) = delete;
|
||||
//-----------------------------------------------------------------------------
|
||||
auto operator=(QueueHipRt const &) -> QueueHipRt & = delete;
|
||||
//-----------------------------------------------------------------------------
|
||||
auto operator=(QueueHipRt &&) -> QueueHipRt & = delete;
|
||||
//-----------------------------------------------------------------------------
|
||||
~QueueHipRt()
|
||||
{
|
||||
if(isBlocking) {
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
// we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed
|
||||
currentThreadWaitFor(*this);
|
||||
#endif
|
||||
}
|
||||
HIP_ASSERT(
|
||||
hipSetDevice(
|
||||
m_dev));
|
||||
HIP_ASSERT(
|
||||
hipStreamDestroy(
|
||||
m_HipQueue));
|
||||
}
|
||||
|
||||
public:
|
||||
int m_dev; //!< The device this queue is bound to.
|
||||
hipStream_t m_HipQueue;
|
||||
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
int m_callees = 0;
|
||||
std::mutex m_mutex;
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename TTask>
|
||||
struct Enqueue
|
||||
{
|
||||
//#############################################################################
|
||||
enum class CallbackState
|
||||
{
|
||||
enqueued,
|
||||
notified,
|
||||
finished,
|
||||
};
|
||||
|
||||
//#############################################################################
|
||||
struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
|
||||
{
|
||||
std::mutex m_mutex;
|
||||
std::condition_variable m_event;
|
||||
CallbackState state = CallbackState::enqueued;
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
|
||||
{
|
||||
// explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
|
||||
const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
|
||||
|
||||
// Notify the executing thread.
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
|
||||
pCallbackSynchronizationData->state = CallbackState::notified;
|
||||
}
|
||||
pCallbackSynchronizationData->m_event.notify_one();
|
||||
|
||||
// Wait for the executing thread to finish the task if it has not already finished.
|
||||
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
|
||||
if(pCallbackSynchronizationData->state != CallbackState::finished)
|
||||
{
|
||||
pCallbackSynchronizationData->m_event.wait(
|
||||
lock,
|
||||
[pCallbackSynchronizationData](){
|
||||
return pCallbackSynchronizationData->state == CallbackState::finished;
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
template<bool isBlocking>
|
||||
static auto enqueue(
|
||||
QueueHipRt<isBlocking> & queue,
|
||||
TTask const & task)
|
||||
-> void
|
||||
{
|
||||
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
{
|
||||
// thread-safe callee incrementing
|
||||
std::lock_guard<std::mutex> guard(queue.m_mutex);
|
||||
queue.m_callees += 1;
|
||||
}
|
||||
#endif
|
||||
auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
|
||||
// test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
|
||||
HIP_ASSERT(hipStreamAddCallback(
|
||||
queue.m_HipQueue,
|
||||
hipRtCallback,
|
||||
pCallbackSynchronizationData.get(),
|
||||
0u));
|
||||
|
||||
// We start a new std::thread which stores the task to be executed.
|
||||
// This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
|
||||
// The HIP thread signals the std::thread when it is ready to execute the task.
|
||||
// The HIP thread is waiting for the std::thread to signal that it is finished executing the task
|
||||
// before it executes the next task in the queue (HIP stream).
|
||||
std::thread t(
|
||||
[pCallbackSynchronizationData,
|
||||
task
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
,&queue // requires queue's destructor to wait for all tasks
|
||||
#endif
|
||||
](){
|
||||
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
// thread-safe task execution and callee decrementing
|
||||
std::lock_guard<std::mutex> guard(queue.m_mutex);
|
||||
#endif
|
||||
|
||||
// If the callback has not yet been called, we wait for it.
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
|
||||
if(pCallbackSynchronizationData->state != CallbackState::notified)
|
||||
{
|
||||
pCallbackSynchronizationData->m_event.wait(
|
||||
lock,
|
||||
[pCallbackSynchronizationData](){
|
||||
return pCallbackSynchronizationData->state == CallbackState::notified;
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
task();
|
||||
|
||||
// Notify the waiting HIP thread.
|
||||
pCallbackSynchronizationData->state = CallbackState::finished;
|
||||
}
|
||||
pCallbackSynchronizationData->m_event.notify_one();
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
queue.m_callees -= 1;
|
||||
#endif
|
||||
}
|
||||
);
|
||||
if(isBlocking)
|
||||
t.join(); // => waiting for task completion
|
||||
else
|
||||
t.detach(); // => do not wait for task completion
|
||||
}
|
||||
};
|
||||
//#############################################################################
|
||||
//! The HIP RT non-blocking queue test trait specialization.
|
||||
struct Empty
|
||||
{
|
||||
//-----------------------------------------------------------------------------
|
||||
template<bool isBlocking>
|
||||
static auto empty(
|
||||
QueueHipRt<isBlocking> const & queue)
|
||||
-> bool
|
||||
{
|
||||
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
return (queue.m_callees==0);
|
||||
#else
|
||||
|
||||
// Query is allowed even for queues on non current device.
|
||||
hipError_t ret = hipSuccess;
|
||||
HIP_ASSERT_IGNORE(
|
||||
ret = hipStreamQuery(
|
||||
queue.m_HipQueue),
|
||||
hipErrorNotReady);
|
||||
return (ret == hipSuccess);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<bool isBlocking>
|
||||
auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue) -> void
|
||||
{
|
||||
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
|
||||
while(queue.m_callees>0) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10u));
|
||||
}
|
||||
#else
|
||||
// Sync is allowed even for queues on non current device.
|
||||
HIP_ASSERT( hipStreamSynchronize(
|
||||
queue.m_HipQueue));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// --- Tests
|
||||
|
||||
// Declares a test body templated on the queue type together with a driver named
// <TestName>Runner. The driver runs the body first with QueueHipRt<true>, then with
// QueueHipRt<false>, logs the accumulated error count after each run and returns
// the final count. The macro must be followed by the braced test body, in which the
// error counter is available as `check`.
#define TEMPLATE_LIST_TEST_CASE(TestName)                                                         \
    template<typename TestType>                                                                   \
    static void TestName(std::atomic<int> & check);                                               \
    static int TestName##Runner()                                                                 \
    {                                                                                             \
        std::atomic<int> failures{0};                                                             \
        TestName< QueueHipRt<true> >(failures);                                                   \
        fprintf(stderr, "After " #TestName " < QueueHipRt<true> > errors=%d\n", failures.load()); \
        TestName< QueueHipRt<false> >(failures);                                                  \
        fprintf(stderr, "After " #TestName " < QueueHipRt<false> > errors=%d\n", failures.load()); \
        return failures.load();                                                                   \
    }                                                                                             \
    template<typename TestType>                                                                   \
    static void TestName(std::atomic<int> & check)
|
||||
|
||||
// Evaluates `result` exactly once, logs the checked expression together with its
// failure flag (0 = passed, 1 = failed) to stderr and adds that flag to the counter
// named `check`, which must be in scope and provide fetch_add().
// The expression text is passed as a "%s" argument rather than being pasted into the
// format string, so expressions containing '%' cannot be misread as conversions.
#define CHECK(result)                                          \
    do                                                         \
    {                                                          \
        const int arg = (!(result));                           \
        fprintf(stderr, "Checking %s %d\n", #result, arg);     \
        check.fetch_add(arg);                                  \
    } while(false)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// A queue that has never been given any work must report itself as empty.
TEMPLATE_LIST_TEST_CASE( queueIsInitiallyEmpty )
{
    // Construct the queue with the argument 0 and do not enqueue anything.
    TestType queue{0};

    CHECK(Empty::empty(queue));
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// An enqueued task must actually be invoked: the task fulfils a promise and the
// test blocks on the matching future until that has happened.
TEMPLATE_LIST_TEST_CASE( queueCallbackIsWorking )
{
    std::promise<bool> promise;

    // The only thing the task does is publish `true` through the promise.
    auto task = [&promise]()
    {
        promise.set_value(true);
    };

    TestType queue{0};

    Enqueue<decltype(task)> enqueue;
    enqueue.enqueue(queue, task);

    // get() blocks until the task has set the value.
    CHECK(promise.get_future().get());
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Waiting for a queue must not return before the enqueued task has completed.
TEMPLATE_LIST_TEST_CASE( queueWaitShouldWork )
{
    bool CallbackFinished = false;

    // The task sleeps first so that a wait returning too early would still observe `false`.
    auto task = [&CallbackFinished]() noexcept
    {
        auto const taskDuration = std::chrono::milliseconds(100u);
        std::this_thread::sleep_for(taskDuration);
        CallbackFinished = true;
    };

    TestType queue{0};

    Enqueue<decltype(task)> enqueue;
    enqueue.enqueue(queue, task);

    currentThreadWaitFor(queue);

    CHECK(CallbackFinished);
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// While its last task is still running the queue must report non-empty; once
// processing has finished it must report empty again.
TEMPLATE_LIST_TEST_CASE( queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished )
{
    bool CallbackFinished = false;
    TestType queue{0};

    // The emptiness query is issued from inside the running task itself.
    auto task = [&queue, &CallbackFinished, &check]() noexcept
    {
        CHECK(!Empty::empty(queue));
        auto const taskDuration = std::chrono::milliseconds(100u);
        std::this_thread::sleep_for(taskDuration);
        CallbackFinished = true;
    };

    Enqueue<decltype(task)> enqueue;
    enqueue.enqueue(queue, task);

    // Only a non-blocking queue is waited for explicitly here; for a blocking queue
    // no additional wait is issued before the final checks.
    if(!TestType::isBlocking)
    {
        currentThreadWaitFor(queue);
    }

    CHECK(Empty::empty(queue));
    CHECK(CallbackFinished);
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Two host threads each enqueue one task into the same queue. Every task flips a
// shared atomic flag on entry and on exit; seeing the flag in the wrong state
// means the two tasks overlapped in time.
TEMPLATE_LIST_TEST_CASE( queueShouldNotExecuteTasksInParallel )
{
    std::atomic<bool> taskIsExecuting(false);
    std::promise<void> firstTaskFinished;
    std::future<void> firstTaskFinishedFuture = firstTaskFinished.get_future();
    std::promise<void> secondTaskFinished;
    std::future<void> secondTaskFinishedFuture = secondTaskFinished.get_future();

    TestType queue{0};

    // First enqueuing thread.
    std::thread firstEnqueuer(
        [&queue, &taskIsExecuting, &firstTaskFinished, &check]()
        {
            auto firstTask = [&taskIsExecuting, &firstTaskFinished, &check]() noexcept
            {
                // The flag must be clear on entry and still set on exit.
                CHECK(!taskIsExecuting.exchange(true));
                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
                CHECK(taskIsExecuting.exchange(false));
                firstTaskFinished.set_value();
            };

            Enqueue<decltype(firstTask)> enqueuer;
            enqueuer.enqueue(queue, firstTask);
        });

    // Second enqueuing thread, identical apart from the promise it fulfils.
    std::thread secondEnqueuer(
        [&queue, &taskIsExecuting, &secondTaskFinished, &check]()
        {
            auto secondTask = [&taskIsExecuting, &secondTaskFinished, &check]() noexcept
            {
                CHECK(!taskIsExecuting.exchange(true));
                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
                CHECK(taskIsExecuting.exchange(false));
                secondTaskFinished.set_value();
            };

            Enqueue<decltype(secondTask)> enqueuer;
            enqueuer.enqueue(queue, secondTask);
        });

    // Both tasks have to be enqueued before the queue is waited for.
    firstEnqueuer.join();
    secondEnqueuer.join();

    currentThreadWaitFor(queue);

    // Block until each task has signalled completion.
    firstTaskFinishedFuture.get();
    secondTaskFinishedFuture.get();
}
|
||||
|
||||
// Invokes the runner function `name`, logs to stderr whether it returned any errors
// and calls failed() with a message when the returned error count is non-zero.
#define TESTER(name)                                                               \
    do                                                                             \
    {                                                                              \
        const int errorCount = name();                                             \
        fprintf(stderr, #name " %s\n", (errorCount != 0) ? "Errors" : "No Errors"); \
        if(errorCount != 0)                                                        \
        {                                                                          \
            failed(#name " failed\n");                                             \
        }                                                                          \
    } while(false)
|
||||
|
||||
int main()
|
||||
{
|
||||
TESTER(queueIsInitiallyEmptyRunner);
|
||||
TESTER(queueCallbackIsWorkingRunner);
|
||||
TESTER(queueWaitShouldWorkRunner);
|
||||
TESTER(queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinishedRunner);
|
||||
TESTER(queueShouldNotExecuteTasksInParallelRunner);
|
||||
passed();
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "../test_common.h"
|
||||
|
||||
#define N 16
|
||||
#define offset 3
|
||||
// Each thread with a global index below N fetches one float from the texture
// object at (index + offset) and stores it into val[index].
__global__ void tex1dKernel(float *val, hipTextureObject_t obj) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the output buffer have nothing to do.
    if (idx >= N) {
        return;
    }

    val[idx] = tex1Dfetch<float>(obj, idx + offset);
}
|
||||
|
||||
int runTest(hipTextureAddressMode, hipTextureFilterMode);
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int testResult = runTest(hipAddressModeClamp,hipFilterModePoint);
|
||||
testResult = runTest(hipAddressModeClamp,hipFilterModeLinear);
|
||||
testResult = runTest(hipAddressModeWrap,hipFilterModePoint);
|
||||
testResult = runTest(hipAddressModeWrap,hipFilterModeLinear);
|
||||
if(testResult) {
|
||||
passed();
|
||||
} else {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a texture object over a linear device buffer of N floats (values 1..N),
// launches tex1dKernel to fetch element k+offset for every k < N, and verifies the
// result on the host.
//
// addressMode  Address mode stored in both addressMode slots of the texture descriptor.
// filterMode   Filter mode stored in the texture descriptor.
//
// Returns 1 if the first N-offset outputs equal val[offset..N) and the last
// `offset` outputs are 0, otherwise 0. Any failing HIP call is reported through
// HIPCHECK.
int runTest(hipTextureAddressMode addressMode, hipTextureFilterMode filterMode) {
    int testResult = 1;

    // Create a dedicated context on device 0. The return codes are checked and the
    // context is destroyed again at the end, so repeated calls do not accumulate
    // contexts.
    hipCtx_t HipContext;
    hipDevice_t HipDevice;
    int deviceID = 0;
    HIPCHECK(hipDeviceGet(&HipDevice, deviceID));
    HIPCHECK(hipCtxCreate(&HipContext, 0, HipDevice));

    // Host-side input (1..N) and zero-initialised output.
    float *texBuf, *texBufOut;
    float val[N], output[N];

    for (int i = 0; i < N; i++) {
        val[i] = i + 1;
        output[i] = 0.0;
    }

    // Allocate the device buffers, upload the input and clear the output.
    HIPCHECK(hipMalloc(&texBuf, N * sizeof(float)));
    HIPCHECK(hipMalloc(&texBufOut, N * sizeof(float)));
    HIPCHECK(hipMemcpy(texBuf, val, N * sizeof(float), hipMemcpyHostToDevice));
    HIPCHECK(hipMemset(texBufOut, 0, N * sizeof(float)));

    // Describe the input buffer as a linear resource of single-channel 32-bit floats.
    hipResourceDesc resDescLinear;
    memset(&resDescLinear, 0, sizeof(resDescLinear));
    resDescLinear.resType = hipResourceTypeLinear;
    resDescLinear.res.linear.devPtr = texBuf;
    resDescLinear.res.linear.desc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
    resDescLinear.res.linear.sizeInBytes = N * sizeof(float);

    // Texture sampling parameters under test.
    hipTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = hipReadModeElementType;
    texDesc.addressMode[0] = addressMode;
    texDesc.addressMode[1] = addressMode;
    texDesc.filterMode = filterMode;
    texDesc.normalizedCoords = false;

    // Creating texture object
    hipTextureObject_t texObj = 0;
    HIPCHECK(hipCreateTextureObject(&texObj, &resDescLinear, &texDesc, NULL));

    // One thread per block, N blocks: one thread per output element.
    dim3 dimBlock(1, 1, 1);
    dim3 dimGrid(N, 1, 1);

    hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
                       texBufOut, texObj);
    HIPCHECK(hipDeviceSynchronize());

    HIPCHECK(hipMemcpy(output, texBufOut, N * sizeof(float), hipMemcpyDeviceToHost));

    // Fetches that stayed inside the buffer must return the input shifted by `offset`.
    for (int i = offset; i < N; i++) {
        if (output[i - offset] != val[i]) {
            testResult = 0;
            break;
        }
    }

    // The last `offset` fetches addressed elements past the end of the buffer and
    // are expected to read back as 0.
    if (testResult) {
        for (int i = N - offset; i < N; i++) {
            if (output[i] != 0) {
                testResult = 0;
                break;
            }
        }
    }

    // Release all resources, including the context created above.
    HIPCHECK(hipDestroyTextureObject(texObj));
    HIPCHECK(hipFree(texBuf));
    HIPCHECK(hipFree(texBufOut));
    HIPCHECK(hipCtxDestroy(HipContext));

    return testResult;
}
|
||||
새 이슈에서 참조
사용자 차단