Merge branch 'amd-master' into amd-master-next

Change-Id: I3094c15008093f2072bcd38aca4ea90aeae2d97b


[ROCm/hip commit: 2af31479e2]
이 커밋은 다음에 포함됨:
Maneesh Gupta
2020-04-07 06:57:42 -04:00
부모 c34599e77c
커밋 4e1f6f2a0e
67개의 변경된 파일2278개의 추가작업 그리고 601개의 파일을 삭제
+4 -22
파일 보기
@@ -207,19 +207,6 @@ if (NOT CPACK_SET_DESTDIR)
set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package")
endif (NOT CPACK_SET_DESTDIR)
# Check if we need to enable ATP marker
if(NOT DEFINED COMPILE_HIP_ATP_MARKER)
if(NOT DEFINED ENV{COMPILE_HIP_ATP_MARKER})
set(COMPILE_HIP_ATP_MARKER 0)
else()
set(COMPILE_HIP_ATP_MARKER $ENV{COMPILE_HIP_ATP_MARKER})
message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
endif()
else()
message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
endif()
add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER)
#############################
# Profiling API support
#############################
@@ -309,10 +296,6 @@ message(STATUS "\nHSA runtime in: " ${HSA_PATH})
if(HIP_PLATFORM STREQUAL "hcc")
include_directories(${PROJECT_SOURCE_DIR}/include)
set(HIP_HCC_BUILD_FLAGS)
if(COMPILE_HIP_ATP_MARKER)
include_directories(/opt/rocm/profiler/CXLActivityLogger/include)
set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1")
endif()
# Add HIP_VERSION to CMAKE_<LANG>_FLAGS
set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}")
@@ -328,7 +311,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
set(SOURCE_FILES_RUNTIME
src/code_object_bundle.cpp
src/program_state.cpp
src/hip_clang.cpp
src/hip_hcc.cpp
@@ -363,9 +345,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
set (CMAKE_SKIP_BUILD_RPATH TRUE )
endif ()
if(COMPILE_HIP_ATP_MARKER)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger")
endif()
add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})
@@ -376,7 +355,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
target_link_libraries(hip_hcc PRIVATE hc_am)
target_link_libraries(hip_hcc_static PRIVATE hc_am)
add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
add_library(hiprtc SHARED src/hiprtc.cpp)
target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY)
set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" )
set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" )
@@ -387,6 +366,9 @@ if(HIP_PLATFORM STREQUAL "hcc")
set_target_properties(hip_hcc PROPERTIES CXX_VISIBILITY_PRESET hidden)
set_target_properties(hip_hcc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
set_target_properties(hiprtc PROPERTIES CXX_VISIBILITY_PRESET hidden)
set_target_properties(hiprtc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
if(HIP_PLATFORM STREQUAL "hcc")
find_package(amd_comgr REQUIRED CONFIG
+2 -1
파일 보기
@@ -1,6 +1,7 @@
## What is this repository for? ###
HIP allows developers to convert CUDA code to portable C++. The same source code can be compiled to run on NVIDIA or AMD GPUs.
**HIP is a C++ Runtime API and Kernel Language that allows developers to create portable applications for AMD and NVIDIA GPUs from single source code.**
Key features include:
* HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
+45 -13
파일 보기
@@ -134,6 +134,7 @@ if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) {
$HIP_VDI_HOME = $HIP_PATH; # use HIP_PATH
}
$HIPCXXFLAGS .= "-D__HIP_VDI__";
$HIPCFLAGS .= "-D__HIP_VDI__";
}
if (defined $HIP_VDI_HOME) {
@@ -207,7 +208,8 @@ if ($HIP_PLATFORM eq "clang") {
} else {
$HIPCXXFLAGS .= " -std=c++11";
}
$HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH";
$HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
$HIPCFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
$HIPLDFLAGS .= " -L$HIP_LIB_PATH";
if (not $isWindows) {
$HIPLDFLAGS .= " -Wl,--rpath-link=$HIP_LIB_PATH";
@@ -222,8 +224,10 @@ if ($HIP_PLATFORM eq "clang") {
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
$HIPCFLAGS .= " -isystem $HSA_PATH/include";
if (!($HIP_RUNTIME eq "HCC")) {
$HIPCXXFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
$HIPCFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
}
} elsif ($HIP_PLATFORM eq "hcc") {
@@ -282,8 +286,11 @@ if ($HIP_PLATFORM eq "clang") {
}
$HIPCXXFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
$HIPCFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
$HIPCFLAGS .= " -isystem $HSA_PATH/include";
$HIPCXXFLAGS .= " -Wno-deprecated-register";
$HIPCFLAGS .= " -Wno-deprecated-register";
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am ";
# $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport ";
@@ -321,6 +328,7 @@ if ($HIP_PLATFORM eq "clang") {
$HIPCC="$CUDA_PATH/bin/nvcc";
$HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets ";
$HIPCXXFLAGS .= " -isystem $CUDA_PATH/include";
$HIPCFLAGS .= " -isystem $CUDA_PATH/include";
$HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L$CUDA_PATH/lib64";
} else {
@@ -330,11 +338,14 @@ if ($HIP_PLATFORM eq "clang") {
# Add paths to common HIP includes:
$HIPCXXFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;
$HIPCFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;
my $compileOnly = 0;
my $needCXXFLAGS = 0; # need to add CXX flags to compile step
my $needCFLAGS = 0; # need to add C flags to compile step
my $needLDFLAGS = 1; # need to add LDFLAGS to compile step.
my $hasC = 0; # options contain a c-style file (NVCC must force recognition as GPU file)
my $hasC = 0; # options contain a c-style file
my $hasCXX = 0; # options contain a cpp-style file (NVCC must force recognition as GPU file)
my $hasCU = 0; # options contain a cu-style file (HCC must force recognition as GPU file)
my $needHipHcc = ($HIP_PLATFORM eq 'hcc'); # set if we need to link hip_hcc.o from src tree. (some builds, ie cmake, provide their own)
my $printHipVersion = 0; # print HIP version
@@ -343,6 +354,7 @@ my $buildDeps = 0;
my $linkType = 1;
my $setLinkType = 0;
my $coFormatv3 = 1;
my $funcSupp = 0; # enable function support
my @options = ();
my @inputs = ();
@@ -400,7 +412,6 @@ foreach $arg (@ARGV)
my $swallowArg = 0;
if ($arg eq '-c' or $arg eq '--genco') {
$compileOnly = 1;
$needCXXFLAGS = 1;
$needLDFLAGS = 0;
}
@@ -463,6 +474,7 @@ foreach $arg (@ARGV)
}
if($trimarg eq '-use_fast_math') {
$HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
$HIPCFLAGS .= " -DHIP_FAST_MATH ";
}
if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
{
@@ -599,20 +611,30 @@ foreach $arg (@ARGV)
#if $arg eq "--hipcc_profile") { # Example argument here, hipcc
#
#}
if ($arg eq "--hipcc-func-supp") {
$funcSupp = 1;
} elsif ($arg eq "--hipcc-no-func-supp") {
$funcSupp = 0;
}
} else {
push (@options, $arg);
}
#print "O: <$arg>\n";
} else {
# input files and libraries
if (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.c$/) or ($arg =~ /\.cc$/) ) {
if ($arg =~ /\.c$/) {
$hasC = 1;
$needCFLAGS = 1;
$toolArgs .= " -x c"
}
elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) {
$hasCXX = 1;
$needCXXFLAGS = 1;
if ($HIP_PLATFORM eq 'clang') {
if ($HIP_PLATFORM eq 'clang' and not $arg =~ /\.c$/) {
$toolArgs .= " -x hip"
}
}
if (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
elsif (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
$hasCU = 1;
$needCXXFLAGS = 1;
if ($HIP_PLATFORM eq 'clang') {
@@ -657,7 +679,7 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
my $archMacro = ' -D__HIP_ARCH_' . uc($val) . '__=1 ';
# Add the arch option and macro to the compiler options.
$GPU_ARCH_ARG = $GPU_ARCH_OPT . $val;
$HIPLDFLAGS .= $GPU_ARCH_ARG;
$HIPLDARCHFLAGS .= $GPU_ARCH_ARG;
$HIPCXXFLAGS .= $archMacro;
if ($HIP_PLATFORM eq 'clang') {
$HIPCXXFLAGS .= $GPU_ARCH_ARG;
@@ -685,7 +707,7 @@ if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
$HIPCXXFLAGS .= " -mcode-object-v3";
}
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
if ($hasCXX and $HIP_PLATFORM eq 'nvcc') {
$HIPCXXFLAGS .= " -x cu";
}
if ($hasCU and $HIP_PLATFORM eq 'hcc') {
@@ -694,6 +716,7 @@ if ($hasCU and $HIP_PLATFORM eq 'hcc') {
if ($buildDeps and $HIP_PLATFORM eq 'nvcc') {
$HIPCXXFLAGS .= " -M -D__CUDACC__";
$HIPCFLAGS .= " -M -D__CUDACC__";
}
if ($buildDeps and $HIP_PLATFORM eq 'clang') {
@@ -701,10 +724,14 @@ if ($buildDeps and $HIP_PLATFORM eq 'clang') {
}
# Add --hip-link only if there are no source files.
if (!$needCXXFLAGS and $HIP_PLATFORM eq 'clang') {
if (!$needCXXFLAGS and !$needCFLAGS and $HIP_PLATFORM eq 'clang') {
$HIPLDFLAGS .= " --hip-link";
}
if (!$needCFLAGS and $HIP_PLATFORM eq 'clang') {
$HIPLDFLAGS .= $HIPLDARCHFLAGS;
}
if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc')
{
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
@@ -727,11 +754,12 @@ if ($HIP_PLATFORM eq "clang") {
# Set default optimization level to -O3 for hip-clang.
if ($optArg eq "") {
$HIPCXXFLAGS .= " -O3";
$HIPCFLAGS .= " -O3";
$HIPLDFLAGS .= " -O3";
}
# Do not pass -mllvm on Windows since there is a clang bug causing duplicate -mllvm options in clang -cc1 on Windows.
# ToDo : remove restriction for Windows after clang bug is fixed.
if ($optArg ne "-O0" and not $isWindows) {
if (!$funcSupp and $optArg ne "-O0" and not $isWindows) {
$HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
if ($needLDFLAGS and not $needCXXFLAGS) {
$HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
@@ -747,18 +775,22 @@ if ($HIP_PLATFORM eq "clang") {
if ($HIPCC_COMPILE_FLAGS_APPEND) {
$HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
$HIPCFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
}
if ($HIPCC_LINK_FLAGS_APPEND) {
$HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
}
my $CMD="$HIPCC";
if ($needCXXFLAGS) {
$CMD .= " $HIPCXXFLAGS";
}
if ($needLDFLAGS and not $compileOnly) {
$CMD .= " $HIPLDFLAGS";
}
if ($needCFLAGS) {
$CMD .= " $HIPCFLAGS";
}
if ($needCXXFLAGS) {
$CMD .= " $HIPCXXFLAGS";
}
$CMD .= " $toolArgs";
if ($verbose & 0x1) {
+26 -2
파일 보기
@@ -341,8 +341,8 @@ sub simpleSubstitutions {
$ft{'execution'} += s/\bcudaLaunchCooperativeKernelMultiDevice\b/hipLaunchCooperativeKernelMultiDevice/g;
$ft{'execution'} += s/\bcudaLaunchKernel\b/hipLaunchKernel/g;
$ft{'execution'} += s/\bcudaSetupArgument\b/hipSetupArgument/g;
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessor/g;
$ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
$ft{'occupancy'} += s/\bcuOccupancyMaxPotentialBlockSize\b/hipOccupancyMaxPotentialBlockSize/g;
$ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
$ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
@@ -754,6 +754,9 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseCcsrmm\b/hipsparseCcsrmm/g;
$ft{'library'} += s/\bcusparseCcsrmm2\b/hipsparseCcsrmm2/g;
$ft{'library'} += s/\bcusparseCcsrmv\b/hipsparseCcsrmv/g;
$ft{'library'} += s/\bcusparseCcsrsm2_analysis\b/hipsparseCcsrsm2_analysis/g;
$ft{'library'} += s/\bcusparseCcsrsm2_bufferSizeExt\b/hipsparseCcsrsm2_bufferSizeExt/g;
$ft{'library'} += s/\bcusparseCcsrsm_solve\b/hipsparseCcsrsm_solve/g;
$ft{'library'} += s/\bcusparseCcsrsv2_analysis\b/hipsparseCcsrsv2_analysis/g;
$ft{'library'} += s/\bcusparseCcsrsv2_bufferSize\b/hipsparseCcsrsv2_bufferSize/g;
$ft{'library'} += s/\bcusparseCcsrsv2_bufferSizeExt\b/hipsparseCcsrsv2_bufferSizeExt/g;
@@ -763,9 +766,11 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseCgthr\b/hipsparseCgthr/g;
$ft{'library'} += s/\bcusparseCgthrz\b/hipsparseCgthrz/g;
$ft{'library'} += s/\bcusparseChybmv\b/hipsparseChybmv/g;
$ft{'library'} += s/\bcusparseCnnz\b/hipsparseCnnz/g;
$ft{'library'} += s/\bcusparseCreate\b/hipsparseCreate/g;
$ft{'library'} += s/\bcusparseCreateCsrgemm2Info\b/hipsparseCreateCsrgemm2Info/g;
$ft{'library'} += s/\bcusparseCreateCsrilu02Info\b/hipsparseCreateCsrilu02Info/g;
$ft{'library'} += s/\bcusparseCreateCsrsm2Info\b/hipsparseCreateCsrsm2Info/g;
$ft{'library'} += s/\bcusparseCreateCsrsv2Info\b/hipsparseCreateCsrsv2Info/g;
$ft{'library'} += s/\bcusparseCreateHybMat\b/hipsparseCreateHybMat/g;
$ft{'library'} += s/\bcusparseCreateIdentityPermutation\b/hipsparseCreateIdentityPermutation/g;
@@ -784,6 +789,9 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseDcsrmm\b/hipsparseDcsrmm/g;
$ft{'library'} += s/\bcusparseDcsrmm2\b/hipsparseDcsrmm2/g;
$ft{'library'} += s/\bcusparseDcsrmv\b/hipsparseDcsrmv/g;
$ft{'library'} += s/\bcusparseDcsrsm2_analysis\b/hipsparseDcsrsm2_analysis/g;
$ft{'library'} += s/\bcusparseDcsrsm2_bufferSizeExt\b/hipsparseDcsrsm2_bufferSizeExt/g;
$ft{'library'} += s/\bcusparseDcsrsm_solve\b/hipsparseDcsrsm_solve/g;
$ft{'library'} += s/\bcusparseDcsrsv2_analysis\b/hipsparseDcsrsv2_analysis/g;
$ft{'library'} += s/\bcusparseDcsrsv2_bufferSize\b/hipsparseDcsrsv2_bufferSize/g;
$ft{'library'} += s/\bcusparseDcsrsv2_bufferSizeExt\b/hipsparseDcsrsv2_bufferSizeExt/g;
@@ -792,12 +800,14 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseDestroy\b/hipsparseDestroy/g;
$ft{'library'} += s/\bcusparseDestroyCsrgemm2Info\b/hipsparseDestroyCsrgemm2Info/g;
$ft{'library'} += s/\bcusparseDestroyCsrilu02Info\b/hipsparseDestroyCsrilu02Info/g;
$ft{'library'} += s/\bcusparseDestroyCsrsm2Info\b/hipsparseDestroyCsrsm2Info/g;
$ft{'library'} += s/\bcusparseDestroyCsrsv2Info\b/hipsparseDestroyCsrsv2Info/g;
$ft{'library'} += s/\bcusparseDestroyHybMat\b/hipsparseDestroyHybMat/g;
$ft{'library'} += s/\bcusparseDestroyMatDescr\b/hipsparseDestroyMatDescr/g;
$ft{'library'} += s/\bcusparseDgthr\b/hipsparseDgthr/g;
$ft{'library'} += s/\bcusparseDgthrz\b/hipsparseDgthrz/g;
$ft{'library'} += s/\bcusparseDhybmv\b/hipsparseDhybmv/g;
$ft{'library'} += s/\bcusparseDnnz\b/hipsparseDnnz/g;
$ft{'library'} += s/\bcusparseDroti\b/hipsparseDroti/g;
$ft{'library'} += s/\bcusparseDsctr\b/hipsparseDsctr/g;
$ft{'library'} += s/\bcusparseGetMatDiagType\b/hipsparseGetMatDiagType/g;
@@ -820,6 +830,9 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseScsrmm\b/hipsparseScsrmm/g;
$ft{'library'} += s/\bcusparseScsrmm2\b/hipsparseScsrmm2/g;
$ft{'library'} += s/\bcusparseScsrmv\b/hipsparseScsrmv/g;
$ft{'library'} += s/\bcusparseScsrsm2_analysis\b/hipsparseScsrsm2_analysis/g;
$ft{'library'} += s/\bcusparseScsrsm2_bufferSizeExt\b/hipsparseScsrsm2_bufferSizeExt/g;
$ft{'library'} += s/\bcusparseScsrsm_solve\b/hipsparseScsrsm_solve/g;
$ft{'library'} += s/\bcusparseScsrsv2_analysis\b/hipsparseScsrsv2_analysis/g;
$ft{'library'} += s/\bcusparseScsrsv2_bufferSize\b/hipsparseScsrsv2_bufferSize/g;
$ft{'library'} += s/\bcusparseScsrsv2_bufferSizeExt\b/hipsparseScsrsv2_bufferSizeExt/g;
@@ -834,6 +847,7 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseSgthr\b/hipsparseSgthr/g;
$ft{'library'} += s/\bcusparseSgthrz\b/hipsparseSgthrz/g;
$ft{'library'} += s/\bcusparseShybmv\b/hipsparseShybmv/g;
$ft{'library'} += s/\bcusparseSnnz\b/hipsparseSnnz/g;
$ft{'library'} += s/\bcusparseSroti\b/hipsparseSroti/g;
$ft{'library'} += s/\bcusparseSsctr\b/hipsparseSsctr/g;
$ft{'library'} += s/\bcusparseXbsrilu02_zeroPivot\b/hipsparseXbsrilu02_zeroPivot/g;
@@ -847,6 +861,7 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseXcsrgemm2Nnz\b/hipsparseXcsrgemm2Nnz/g;
$ft{'library'} += s/\bcusparseXcsrgemmNnz\b/hipsparseXcsrgemmNnz/g;
$ft{'library'} += s/\bcusparseXcsrilu02_zeroPivot\b/hipsparseXcsrilu02_zeroPivot/g;
$ft{'library'} += s/\bcusparseXcsrsm2_zeroPivot\b/hipsparseXcsrsm2_zeroPivot/g;
$ft{'library'} += s/\bcusparseXcsrsort\b/hipsparseXcsrsort/g;
$ft{'library'} += s/\bcusparseXcsrsort_bufferSizeExt\b/hipsparseXcsrsort_bufferSizeExt/g;
$ft{'library'} += s/\bcusparseXcsrsv2_zeroPivot\b/hipsparseXcsrsv2_zeroPivot/g;
@@ -863,6 +878,9 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseZcsrmm\b/hipsparseZcsrmm/g;
$ft{'library'} += s/\bcusparseZcsrmm2\b/hipsparseZcsrmm2/g;
$ft{'library'} += s/\bcusparseZcsrmv\b/hipsparseZcsrmv/g;
$ft{'library'} += s/\bcusparseZcsrsm2_analysis\b/hipsparseZcsrsm2_analysis/g;
$ft{'library'} += s/\bcusparseZcsrsm2_bufferSizeExt\b/hipsparseZcsrsm2_bufferSizeExt/g;
$ft{'library'} += s/\bcusparseZcsrsm_solve\b/hipsparseZcsrsm_solve/g;
$ft{'library'} += s/\bcusparseZcsrsv2_analysis\b/hipsparseZcsrsv2_analysis/g;
$ft{'library'} += s/\bcusparseZcsrsv2_bufferSize\b/hipsparseZcsrsv2_bufferSize/g;
$ft{'library'} += s/\bcusparseZcsrsv2_bufferSizeExt\b/hipsparseZcsrsv2_bufferSizeExt/g;
@@ -872,6 +890,7 @@ sub simpleSubstitutions {
$ft{'library'} += s/\bcusparseZgthr\b/hipsparseZgthr/g;
$ft{'library'} += s/\bcusparseZgthrz\b/hipsparseZgthrz/g;
$ft{'library'} += s/\bcusparseZhybmv\b/hipsparseZhybmv/g;
$ft{'library'} += s/\bcusparseZnnz\b/hipsparseZnnz/g;
$ft{'library'} += s/\bcusparseZsctr\b/hipsparseZsctr/g;
$ft{'device_library'} += s/\bcurand\b/hiprand/g;
$ft{'device_library'} += s/\bcurand_discrete\b/hiprand_discrete/g;
@@ -997,6 +1016,8 @@ sub simpleSubstitutions {
$ft{'type'} += s/\bcsrgemm2Info\b/csrgemm2Info/g;
$ft{'type'} += s/\bcsrgemm2Info_t\b/csrgemm2Info_t/g;
$ft{'type'} += s/\bcsrilu02Info_t\b/csrilu02Info_t/g;
$ft{'type'} += s/\bcsrsm2Info\b/csrsm2Info/g;
$ft{'type'} += s/\bcsrsm2Info_t\b/csrsm2Info_t/g;
$ft{'type'} += s/\bcsrsv2Info_t\b/csrsv2Info_t/g;
$ft{'type'} += s/\bcuComplex\b/hipComplex/g;
$ft{'type'} += s/\bcuDoubleComplex\b/hipDoubleComplex/g;
@@ -1130,6 +1151,7 @@ sub simpleSubstitutions {
$ft{'type'} += s/\bcurandStatus_t\b/hiprandStatus_t/g;
$ft{'type'} += s/\bcusparseAction_t\b/hipsparseAction_t/g;
$ft{'type'} += s/\bcusparseDiagType_t\b/hipsparseDiagType_t/g;
$ft{'type'} += s/\bcusparseDirection_t\b/hipsparseDirection_t/g;
$ft{'type'} += s/\bcusparseFillMode_t\b/hipsparseFillMode_t/g;
$ft{'type'} += s/\bcusparseHandle_t\b/hipsparseHandle_t/g;
$ft{'type'} += s/\bcusparseHybMat_t\b/hipsparseHybMat_t/g;
@@ -1398,6 +1420,8 @@ sub simpleSubstitutions {
$ft{'numeric_literal'} += s/\bCUSPARSE_ACTION_SYMBOLIC\b/HIPSPARSE_ACTION_SYMBOLIC/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_NON_UNIT\b/HIPSPARSE_DIAG_TYPE_NON_UNIT/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_UNIT\b/HIPSPARSE_DIAG_TYPE_UNIT/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_COLUMN\b/HIPSPARSE_DIRECTION_COLUMN/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_ROW\b/HIPSPARSE_DIRECTION_ROW/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_LOWER\b/HIPSPARSE_FILL_MODE_LOWER/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_UPPER\b/HIPSPARSE_FILL_MODE_UPPER/g;
$ft{'numeric_literal'} += s/\bCUSPARSE_HYB_PARTITION_AUTO\b/HIPSPARSE_HYB_PARTITION_AUTO/g;
-1
파일 보기
@@ -75,7 +75,6 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN)
endif()
# And push it back to the cache
set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
message("Found HIP at ${HIP_ROOT_DIR}")
endif()
# Find HIPCC executable
+24 -24
파일 보기
@@ -12,9 +12,9 @@
| enum |***`cusparseAction_t`*** | |***`hipsparseAction_t`*** |
| 0 |*`CUSPARSE_ACTION_SYMBOLIC`* | |*`HIPSPARSE_ACTION_SYMBOLIC`* |
| 1 |*`CUSPARSE_ACTION_NUMERIC`* | |*`HIPSPARSE_ACTION_NUMERIC`* |
| enum |***`cusparseDirection_t`*** | | |
| 0 |*`CUSPARSE_DIRECTION_ROW`* | | |
| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | | |
| enum |***`cusparseDirection_t`*** | |***`hipsparseDirection_t`*** |
| 0 |*`CUSPARSE_DIRECTION_ROW`* | |*`HIPSPARSE_DIRECTION_ROW`* |
| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | |*`HIPSPARSE_DIRECTION_COLUMN`* |
| enum |***`cusparseHybPartition_t`*** | |***`hipsparseHybPartition_t`*** |
| 0 |*`CUSPARSE_HYB_PARTITION_AUTO`* | |*`HIPSPARSE_HYB_PARTITION_AUTO`* |
| 1 |*`CUSPARSE_HYB_PARTITION_USER`* | |*`HIPSPARSE_HYB_PARTITION_USER`* |
@@ -69,8 +69,8 @@
| typedef |`cusparseSolveAnalysisInfo_t` | | |
| struct |`csrsv2Info` | | |
| typedef |`csrsv2Info_t` | |`csrsv2Info_t` |
| struct |`csrsm2Info` | 9.2 | |
| typedef |`csrsm2Info_t` | | |
| struct |`csrsm2Info` | 9.2 |`csrsm2Info` |
| typedef |`csrsm2Info_t` | |`csrsm2Info_t` |
| struct |`bsrsv2Info` | | |
| typedef |`bsrsv2Info_t` | | |
| struct |`bsrsm2Info` | | |
@@ -151,8 +151,8 @@
|`cusparseGetStream` |`hipsparseGetStream` | 8.0 |
|`cusparseCreateCsrsv2Info` |`hipsparseCreateCsrsv2Info` |
|`cusparseDestroyCsrsv2Info` |`hipsparseDestroyCsrsv2Info` |
|`cusparseCreateCsrsm2Info` | | 9.2 |
|`cusparseDestroyCsrsm2Info` | | 9.2 |
|`cusparseCreateCsrsm2Info` |`hipsparseCreateCsrsm2Info` | 9.2 |
|`cusparseDestroyCsrsm2Info` |`hipsparseDestroyCsrsm2Info` | 9.2 |
|`cusparseCreateCsric02Info` | |
|`cusparseDestroyCsric02Info` | |
|`cusparseCreateCsrilu02Info` |`hipsparseCreateCsrilu02Info` |
@@ -306,19 +306,19 @@
|`cusparseDcsrsm_solve` | |
|`cusparseCcsrsm_solve` | |
|`cusparseZcsrsm_solve` | |
|`cusparseScsrsm2_bufferSizeExt` | | 9.2 |
|`cusparseDcsrsm2_bufferSizeExt` | | 9.2 |
|`cusparseCcsrsm2_bufferSizeExt` | | 9.2 |
|`cusparseZcsrsm2_bufferSizeExt` | | 9.2 |
|`cusparseScsrsm2_analysis` | | 9.2 |
|`cusparseDcsrsm2_analysis` | | 9.2 |
|`cusparseCcsrsm2_analysis` | | 9.2 |
|`cusparseZcsrsm2_analysis` | | 9.2 |
|`cusparseScsrsm2_solve` | | 9.2 |
|`cusparseDcsrsm2_solve` | | 9.2 |
|`cusparseCcsrsm2_solve` | | 9.2 |
|`cusparseZcsrsm2_solve` | | 9.2 |
|`cusparseXcsrsm2_zeroPivot` | | 9.2 |
|`cusparseScsrsm2_bufferSizeExt` |`hipsparseScsrsm2_bufferSizeExt` | 9.2 |
|`cusparseDcsrsm2_bufferSizeExt` |`hipsparseDcsrsm2_bufferSizeExt` | 9.2 |
|`cusparseCcsrsm2_bufferSizeExt` |`hipsparseCcsrsm2_bufferSizeExt` | 9.2 |
|`cusparseZcsrsm2_bufferSizeExt` |`hipsparseZcsrsm2_bufferSizeExt` | 9.2 |
|`cusparseScsrsm2_analysis` |`hipsparseScsrsm2_analysis` | 9.2 |
|`cusparseDcsrsm2_analysis` |`hipsparseDcsrsm2_analysis` | 9.2 |
|`cusparseCcsrsm2_analysis` |`hipsparseCcsrsm2_analysis` | 9.2 |
|`cusparseZcsrsm2_analysis` |`hipsparseZcsrsm2_analysis` | 9.2 |
|`cusparseScsrsm2_solve` |`hipsparseScsrsm2_solve` | 9.2 |
|`cusparseDcsrsm2_solve` |`hipsparseDcsrsm2_solve` | 9.2 |
|`cusparseCcsrsm2_solve` |`hipsparseCcsrsm2_solve` | 9.2 |
|`cusparseZcsrsm2_solve` |`hipsparseZcsrsm2_solve` | 9.2 |
|`cusparseXcsrsm2_zeroPivot` |`hipsparseXcsrsm2_zeroPivot` | 9.2 |
|`cusparseSbsrmm` | |
|`cusparseDbsrmm` | |
|`cusparseCbsrmm` | |
@@ -662,10 +662,10 @@
|`cusparseDhyb2dense` | |
|`cusparseChyb2dense` | |
|`cusparseZhyb2dense` | |
|`cusparseSnnz` | |
|`cusparseDnnz` | |
|`cusparseCnnz` | |
|`cusparseZnnz` | |
|`cusparseSnnz` |`cusparseSnnz` |
|`cusparseDnnz` |`cusparseDnnz` |
|`cusparseCnnz` |`cusparseCnnz` |
|`cusparseZnnz` |`cusparseZnnz` |
|`cusparseCreateIdentityPermutation` |`hipsparseCreateIdentityPermutation` |
|`cusparseXcoosort_bufferSizeExt` |`hipsparseXcoosort_bufferSizeExt` |
|`cusparseXcoosortByRow` |`hipsparseXcoosortByRow` |
+7 -7
파일 보기
@@ -1,13 +1,13 @@
Table of Contents
=================
* [Profiling HIP Code](#profiling-hip-code" aria-hidden="true"><span aria-hidden="true)
* [Using HIP_DB](#using-hip_db" aria-hidden="true"><span aria-hidden="true)
* [Using ltrace](#using-ltrace" aria-hidden="true"><span aria-hidden="true)
* [Chicken bits](#chicken-bits" aria-hidden="true"><span aria-hidden="true)
* [Debugging HIP Applications](#debugging-hip-applications" aria-hidden="true"><span aria-hidden="true)
* [General Debugging Tips](#general-debugging-tips" aria-hidden="true"><span aria-hidden="true)
* [Print env var state](#print-env-var-state" aria-hidden="true"><span aria-hidden="true)
* [Profiling HIP Code](#profiling-hip-code)
* [Using HIP_DB](#using-hip_db)
* [Using ltrace](#using-ltrace)
* [Chicken bits](#chicken-bits)
* [Debugging HIP Applications](#debugging-hip-applications)
* [General Debugging Tips](#general-debugging-tips)
* [Print env var state](#print-env-var-state)
### Using HIP_DB
+23 -24
파일 보기
@@ -388,30 +388,29 @@ def generate_prof_header(f, api_map, opts_map):
f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')
# Generating the method for the API string, name and parameters
if False:
f.write('\n')
f.write('#if 0\n')
f.write('#include <sstream>\n');
f.write('#include <string>\n');
f.write('// HIP API string method, method name and parameters\n')
f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
f.write(' std::ostringstream oss;\n')
f.write(' switch (id) {\n')
for name, args in api_map.items():
f.write(' case HIP_API_ID_' + name + ':\n')
f.write(' oss << "' + name + '("')
for ind in range(0, len(args)):
arg_tuple = args[ind]
arg_name = arg_tuple[1]
if ind != 0: f.write(' << ","')
f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name)
f.write('\n << ")";\n')
f.write(' break;\n')
f.write(' default: oss << "unknown";\n')
f.write(' };\n')
f.write(' return strdup(oss.str().c_str());\n')
f.write('};\n')
f.write('#endif\n')
f.write('\n')
f.write('#if ENABLE_HIP_API_STRING\n')
f.write('#include <sstream>\n');
f.write('#include <string>\n');
f.write('// HIP API string method, method name and parameters\n')
f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
f.write(' std::ostringstream oss;\n')
f.write(' switch (id) {\n')
for name, args in api_map.items():
f.write(' case HIP_API_ID_' + name + ':\n')
f.write(' oss << "' + name + '("')
for ind in range(0, len(args)):
arg_tuple = args[ind]
arg_name = arg_tuple[1]
if ind != 0: f.write(' << ","')
f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name)
f.write('\n << ")";\n')
f.write(' break;\n')
f.write(' default: oss << "unknown";\n')
f.write(' };\n')
f.write(' return strdup(oss.str().c_str());\n')
f.write('};\n')
f.write('#endif // ENABLE_HIP_API_STRING\n')
f.write('#endif // _HIP_PROF_STR_H\n');
+122 -121
파일 보기
@@ -42,10 +42,9 @@ After applying all the matchers, the output HIP source is produced.
`hipify-clang` requires:
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**10.0.0**](http://releases.llvm.org/download.html#10.0.0).
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.2**](https://developer.nvidia.com/cuda-downloads).
| **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** |
|:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -66,15 +65,15 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
| [8.0.0](http://releases.llvm.org/download.html#8.0.0) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.0_bug_38811.zip)*</br> | + |
| [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
| [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + |
| [9.0.1](http://releases.llvm.org/download.html#9.0.1) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + |
| [**10.0.0**](http://releases.llvm.org/download.html#10.0.0)| [**10.2**](https://developer.nvidia.com/cuda-downloads) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.
In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.
Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=d:\LLVM\10.0.0\dist`
### <a name="hipify-clang-usage"></a> hipify-clang: usage
@@ -83,14 +82,14 @@ To process a file, `hipify-clang` needs access to the same headers that would be
For example:
```shell
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.2 -I /usr/local/cuda-10.2/samples/common/inc
```
`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
were compiling the input file. For example:
```bash
./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.2 -- -std=c++17
```
The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
@@ -158,7 +157,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
**LLVM 10.0.0 or newer:**
1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources;
1. download [`LLVM project`](https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.0/llvm-project-10.0.0.tar.xz) sources;
2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
**Linux**:
@@ -193,19 +192,19 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
* Having multiple CUDA installations to choose a particular version the `DCUDA_TOOLKIT_ROOT_DIR` option should be specified:
- ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1`
- ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2`
- ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1"`
- ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2"`
`-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1"`
`-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2"`
4. Ensure [`cuDNN`](https://developer.nvidia.com/rdp/cudnn-archive) of the version corresponding to CUDA's version is installed.
* Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option:
- ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32`
- ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32`
- ***Windows***: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32`
- ***Windows***: `-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32`
5. Ensure [`CUB`](https://github.com/NVlabs/cub) of the version corresponding to CUDA's version is installed.
@@ -213,7 +212,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
- ***Linux***: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`
- ***Windows***: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
- ***Windows***: `-DCUDA_CUB_ROOT_DIR=d:/GIT/cub`
5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.
@@ -221,21 +220,21 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
* Install `lit` into `python`:
- ***Linux***: `python /srv/git/LLVM/9.0.1/llvm/utils/lit/setup.py install`
- ***Linux***: `python /srv/git/LLVM/10.0.0/llvm/utils/lit/setup.py install`
- ***Windows***: `python f:/LLVM/9.0.1/llvm/utils/lit/setup.py install`
- ***Windows***: `python d:/LLVM/10.0.0/llvm/utils/lit/setup.py install`
* Starting with LLVM 6.0.1 path to `llvm-lit` python script should be specified by the `LLVM_EXTERNAL_LIT` option:
- ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit`
- ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit`
- ***Windows***: `-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py`
- ***Windows***: `-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py`
* `FileCheck`:
- ***Linux***: copy from `/srv/git/LLVM/9.0.1/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`
- ***Linux***: copy from `/srv/git/LLVM/10.0.0/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`
- ***Windows***: copy from `f:/LLVM/9.0.1/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`
- ***Windows***: copy from `d:/LLVM/10.0.0/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`
- Or specify the path to `FileCheck` in `CMAKE_INSTALL_PREFIX` option
@@ -249,7 +248,7 @@ On Linux the following configurations are tested:
Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
Ubuntu 16-18: LLVM 8.0.0 - 10.0.0, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
Minimum build system requirements for the above configurations:
@@ -262,11 +261,11 @@ cmake
-DHIPIFY_CLANG_TESTS=1 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=../dist \
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.1/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32 \
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/10.0.0/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32 \
-DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit \
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit \
..
```
*A corresponding successful output:*
@@ -285,14 +284,14 @@ cmake
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found LLVM 9.0.1:
-- - CMake module path: /srv/git/LLVM/9.0.1/dist/lib/cmake/llvm
-- - Include path : /srv/git/LLVM/9.0.1/dist/include
-- - Binary path : /srv/git/LLVM/9.0.1/dist/bin
-- Found LLVM 10.0.0:
-- - CMake module path: /srv/git/LLVM/10.0.0/dist/lib/cmake/llvm
-- - Include path : /srv/git/LLVM/10.0.0/dist/include
-- - Binary path : /srv/git/LLVM/10.0.0/dist/bin
-- Linker detection: GNU ld
-- Found PythonInterp: /usr/bin/python2.7 (found suitable version "2.7.12", minimum required is "2.7")
-- Found lit: /usr/local/bin/lit
-- Found FileCheck: /srv/git/LLVM/9.0.1/dist/bin/FileCheck
-- Found FileCheck: /srv/git/LLVM/10.0.0/dist/bin/FileCheck
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Looking for pthread_create
@@ -302,7 +301,7 @@ cmake
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE
-- Found CUDA: /usr/local/cuda-10.1 (found version "10.1")
-- Found CUDA: /usr/local/cuda-10.2 (found version "10.2")
-- Configuring done
-- Generating done
-- Build files have been written to: /srv/git/HIP/hipify-clang/build
@@ -314,83 +313,85 @@ make test-hipify
```shell
Running HIPify regression tests
========================================
CUDA 10.1 - will be used for testing
LLVM 9.0.1 - will be used for testing
CUDA 10.2 - will be used for testing
LLVM 10.0.0 - will be used for testing
x86_64 - Platform architecture
Linux 5.2.0 - Platform OS
64 - hipify-clang binary bitness
64 - python 2.7.12 binary bitness
========================================
-- Testing: 67 tests, 12 threads --
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 67)
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 67)
PASS: hipify :: unit_tests/device/atomics.cu (3 of 67)
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 67)
PASS: hipify :: unit_tests/headers/headers_test_01.cu (5 of 67)
PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 67)
PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 67)
PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 67)
PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 67)
PASS: hipify :: unit_tests/headers/headers_test_06.cu (10 of 67)
PASS: hipify :: unit_tests/headers/headers_test_07.cu (11 of 67)
PASS: hipify :: unit_tests/headers/headers_test_10.cu (12 of 67)
PASS: hipify :: unit_tests/headers/headers_test_11.cu (13 of 67)
PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 67)
PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 67)
PASS: hipify :: unit_tests/headers/headers_test_09.cu (16 of 67)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (17 of 67)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 67)
PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 67)
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 67)
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (24 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (25 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 67)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 67)
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 67)
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (29 of 67)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (30 of 67)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 67)
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (33 of 67)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (34 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 67)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (40 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (41 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 67)
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (45 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (46 of 67)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 67)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 67)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (50 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (51 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (52 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (53 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (54 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (55 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (56 of 67)
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (57 of 67)
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 67)
PASS: hipify :: unit_tests/samples/allocators.cu (59 of 67)
PASS: hipify :: unit_tests/samples/coalescing.cu (60 of 67)
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (61 of 67)
PASS: hipify :: unit_tests/samples/axpy.cu (62 of 67)
PASS: hipify :: unit_tests/samples/intro.cu (63 of 67)
PASS: hipify :: unit_tests/samples/cudaRegister.cu (64 of 67)
PASS: hipify :: unit_tests/samples/square.cu (65 of 67)
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (66 of 67)
PASS: hipify :: unit_tests/samples/vec_add.cu (67 of 67)
Testing Time: 3.07s
Expected Passes : 67
-- Testing: 69 tests, 12 threads --
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 69)
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 69)
PASS: hipify :: unit_tests/device/atomics.cu (3 of 69)
PASS: hipify :: unit_tests/headers/headers_test_01.cu (4 of 69)
PASS: hipify :: unit_tests/device/device_symbols.cu (5 of 69)
PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 69)
PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 69)
PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 69)
PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 69)
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 69)
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 69)
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 69)
PASS: hipify :: unit_tests/headers/headers_test_10.cu (13 of 69)
PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 69)
PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 69)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 69)
PASS: hipify :: unit_tests/headers/headers_test_09.cu (17 of 69)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 69)
PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 69)
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 69)
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (24 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (25 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 69)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 69)
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 69)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (29 of 69)
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (30 of 69)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 69)
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 69)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (33 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (34 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 69)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (40 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (41 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (44 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (45 of 69)
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (46 of 69)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 69)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 69)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 69)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01_LLVM_10.cu (50 of 69)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_LLVM_10.cu (51 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (52 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (53 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (54 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (55 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (56 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (57 of 69)
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (59 of 69)
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (60 of 69)
PASS: hipify :: unit_tests/samples/allocators.cu (61 of 69)
PASS: hipify :: unit_tests/samples/coalescing.cu (62 of 69)
PASS: hipify :: unit_tests/samples/axpy.cu (63 of 69)
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (64 of 69)
PASS: hipify :: unit_tests/samples/cudaRegister.cu (65 of 69)
PASS: hipify :: unit_tests/samples/intro.cu (66 of 69)
PASS: hipify :: unit_tests/samples/square.cu (67 of 69)
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (68 of 69)
PASS: hipify :: unit_tests/samples/vec_add.cu (69 of 69)
Testing Time: 3.23s
Expected Passes : 69
[100%] Built target test-hipify
```
### <a name="windows"></a > hipify-clang: Windows
@@ -404,8 +405,8 @@ Testing Time: 3.07s
| 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 |
| 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 |
| 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 |
| 10.0.0-rc1-rc3 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 |
| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.5 | 3.8.2 |
| 10.0.0 | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 |
| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.21, 2019.16.5.1 | 3.17.0 | 3.8.2 |
*Building with testing support on `Windows 10` by `Visual Studio 16 2019`:*
@@ -416,28 +417,28 @@ cmake
-DHIPIFY_CLANG_TESTS=1 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=../dist \
-DCMAKE_PREFIX_PATH=f:/LLVM/9.0.1/dist \
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1" \
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1" \
-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32 \
-DCUDA_CUB_ROOT_DIR=f:/GIT/cub \
-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py \
-DCMAKE_PREFIX_PATH=d:/LLVM/10.0.0/dist \
-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2" \
-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2" \
-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32 \
-DCUDA_CUB_ROOT_DIR=d:/GIT/cub \
-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py \
-Thost=x64
..
```
*A corresponding successful output:*
```shell
-- Found LLVM 9.0.1:
-- - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm
-- - Include path : F:/LLVM/9.0.1/dist/include
-- - Binary path : F:/LLVM/9.0.1/dist/bin
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
-- Found lit: C:/Program Files/Python38/Scripts/lit.exe
-- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
-- Found LLVM 10.0.0:
-- - CMake module path: d:/LLVM/10.0.0/dist/lib/cmake/llvm
-- - Include path : d:/LLVM/10.0.0/dist/include
-- - Binary path : d:/LLVM/10.0.0/dist/bin
-- Found PythonInterp: c:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
-- Found lit: c:/Program Files/Python38/Scripts/lit.exe
-- Found FileCheck: d:/LLVM/10.0.0/dist/bin/FileCheck.exe
-- Found CUDA: c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2 (found version "10.2")
-- Configuring done
-- Generating done
-- Build files have been written to: f:/HIP/hipify-clang/build
-- Build files have been written to: d:/HIP/hipify-clang/build
```
Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
+2 -2
파일 보기
@@ -545,9 +545,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{
// 5.21. Occupancy
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
{"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}},
{"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags","", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxPotentialBlockSize
{"cuOccupancyMaxPotentialBlockSize", {"hipOccupancyMaxPotentialBlockSize", "", CONV_OCCUPANCY, API_DRIVER}},
// cudaOccupancyMaxPotentialBlockSizeWithFlags
+19 -19
파일 보기
@@ -49,8 +49,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
{"cusparseGetStream", {"hipsparseGetStream", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCreateCsrsv2Info", {"hipsparseCreateCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDestroyCsrsv2Info", {"hipsparseDestroyCsrsv2Info", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCreateCsrsm2Info", {"hipsparseCreateCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDestroyCsrsm2Info", {"hipsparseDestroyCsrsm2Info", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCreateCsric02Info", {"hipsparseCreateCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDestroyCsric02Info", {"hipsparseDestroyCsric02Info", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCreateCsrilu02Info", {"hipsparseCreateCsrilu02Info", "", CONV_LIB_FUNC, API_SPARSE}},
@@ -218,27 +218,27 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
{"cusparseCcsrsm_analysis", {"hipsparseCcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZcsrsm_analysis", {"hipsparseZcsrsm_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseScsrsm_solve", {"hipsparseScsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDcsrsm_solve", {"hipsparseDcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCcsrsm_solve", {"hipsparseCcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseZcsrsm_solve", {"hipsparseZcsrsm_solve", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseScsrsm2_bufferSizeExt", {"hipsparseScsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDcsrsm2_bufferSizeExt", {"hipsparseDcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE,}},
{"cusparseCcsrsm2_bufferSizeExt", {"hipsparseCcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseZcsrsm2_bufferSizeExt", {"hipsparseZcsrsm2_bufferSizeExt", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseScsrsm2_analysis", {"hipsparseScsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDcsrsm2_analysis", {"hipsparseDcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCcsrsm2_analysis", {"hipsparseCcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseZcsrsm2_analysis", {"hipsparseZcsrsm2_analysis", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseScsrsm2_solve", {"hipsparseScsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDcsrsm2_solve", {"hipsparseDcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCcsrsm2_solve", {"hipsparseCcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZcsrsm2_solve", {"hipsparseZcsrsm2_solve", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseXcsrsm2_zeroPivot", {"hipsparseXcsrsm2_zeroPivot", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseSbsrmm", {"hipsparseSbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDbsrmm", {"hipsparseDbsrmm", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
@@ -619,10 +619,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
{"cusparseChyb2dense", {"hipsparseChyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZhyb2dense", {"hipsparseZhyb2dense", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseSnnz", {"hipsparseSnnz", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseDnnz", {"hipsparseDnnz", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCnnz", {"hipsparseCnnz", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseZnnz", {"hipsparseZnnz", "", CONV_LIB_FUNC, API_SPARSE}},
{"cusparseCreateIdentityPermutation", {"hipsparseCreateIdentityPermutation", "", CONV_LIB_FUNC, API_SPARSE}},
+5 -5
파일 보기
@@ -41,8 +41,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
{"csrsv2Info", {"csrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
{"csrsv2Info_t", {"csrsv2Info_t", "", CONV_TYPE, API_SPARSE}},
{"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
{"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
{"csrsm2Info", {"csrsm2Info", "", CONV_TYPE, API_SPARSE}},
{"csrsm2Info_t", {"csrsm2Info_t", "", CONV_TYPE, API_SPARSE}},
{"bsrsv2Info", {"bsrsv2Info", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
{"bsrsv2Info_t", {"bsrsv2Info_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
@@ -88,9 +88,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
{"CUSPARSE_ACTION_SYMBOLIC", {"HIPSPARSE_ACTION_SYMBOLIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
{"CUSPARSE_ACTION_NUMERIC", {"HIPSPARSE_ACTION_NUMERIC", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
{"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
{"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
{"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
{"cusparseDirection_t", {"hipsparseDirection_t", "", CONV_TYPE, API_SPARSE}},
{"CUSPARSE_DIRECTION_ROW", {"HIPSPARSE_DIRECTION_ROW", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
{"CUSPARSE_DIRECTION_COLUMN", {"HIPSPARSE_DIRECTION_COLUMN", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
{"cusparseHybPartition_t", {"hipsparseHybPartition_t", "", CONV_TYPE, API_SPARSE}},
{"CUSPARSE_HYB_PARTITION_AUTO", {"HIPSPARSE_HYB_PARTITION_AUTO", "", CONV_NUMERIC_LITERAL, API_SPARSE}},
+4 -4
파일 보기
@@ -128,7 +128,7 @@ __device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, int z);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
@@ -199,7 +199,7 @@ __device__ static inline int __rhadd(int x, int y) {
int value = z & 0x7FFFFFFF;
return ((value) >> 1 || sign);
}
__device__ static inline unsigned int __sad(int x, int y, int z) {
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
@@ -230,7 +230,7 @@ __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
return (x + y + 1) >> 1;
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
return __ockl_sad_u32(x, y, z);
return __ockl_sadd_u32(x, y, z);
}
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
@@ -563,7 +563,7 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_mask, width));
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
#endif
}
__device__
+2 -1
파일 보기
@@ -44,7 +44,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_sad_u32(uint, uint, uint);
extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
@@ -72,6 +72,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
// Introduce local address space
#define __local __attribute__((address_space(3)))
+17 -14
파일 보기
@@ -37,14 +37,15 @@ THE SOFTWARE.
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
unsigned int flags, hip_impl::program_state& ps);
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps);
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
unsigned int flags, hip_impl::program_state& ps);
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
dim3 blockDim, void** args,
size_t sharedMem, hipStream_t stream,
hip_impl::program_state& ps);
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices,
unsigned int flags,
hip_impl::program_state& ps);
#pragma GCC visibility push(hidden)
@@ -192,22 +193,24 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
template <typename F>
inline
__attribute__((visibility("hidden")))
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
unsigned int sharedMemBytes, hipStream_t stream) {
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
void** args, size_t sharedMem,
hipStream_t stream) {
hip_impl::hip_init();
auto& ps = hip_impl::get_program_state();
return ihipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps);
return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
blockDim, args, sharedMem, stream, ps);
}
inline
__attribute__((visibility("hidden")))
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
unsigned int flags) {
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices,
unsigned int flags) {
hip_impl::hip_init();
auto& ps = hip_impl::get_program_state();
return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
}
#pragma GCC visibility pop
+8
파일 보기
@@ -73,6 +73,14 @@ float atomicAdd(float* address, float val)
return __uint_as_float(r);
}
__device__
inline
void atomicAddNoRet(float* address, float val)
{
__ockl_atomic_add_noret_f32(address, val);
}
__device__
inline
double atomicAdd(double* address, double val)
+14
파일 보기
@@ -1268,6 +1268,13 @@ THE SOFTWARE.
static_cast<__half_raw>(x).data +
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __habs(__half x)
{
return __half_raw{
__ocml_fabs_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half __hsub(__half x, __half y)
@@ -1334,6 +1341,13 @@ THE SOFTWARE.
static_cast<__half2_raw>(x).data +
static_cast<__half2_raw>(y).data};
}
inline
__device__
__half2 __habs2(__half2 x)
{
return __half2_raw{
__ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
}
inline
__device__
__half2 __hsub2(__half2 x, __half2 y)
+2
파일 보기
@@ -38,6 +38,7 @@ extern "C"
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const))
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
@@ -58,6 +59,7 @@ extern "C"
#endif
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+8 -3
파일 보기
@@ -504,9 +504,14 @@ hc_get_workitem_absolute_id(int dim)
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
#include <cuda_wrappers/algorithm>
#include <cuda_wrappers/complex>
#include <cuda_wrappers/new>
// Workaround for using libc++ with HIP-Clang.
// The following headers requires clang include path before standard C++ include path.
// However libc++ include path requires to be before clang include path.
// To workaround this, we pass -isystem with the parent directory of clang include
// path instead of the clang include path itself.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+1 -1
파일 보기
@@ -55,7 +55,7 @@ THE SOFTWARE.
#define DEPRECATED(msg) __attribute__ ((deprecated(msg)))
#endif // !defined(_MSC_VER)
#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases.For more details please refer https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list"
#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md"
#if defined(__HCC__) && (__hcc_workweek__ < 16155)
#error("This version of HIP requires a newer version of HCC.");
+93 -4
파일 보기
@@ -34,7 +34,7 @@ THE SOFTWARE.
#include "hip/hcc_detail/host_defines.h"
#if !defined(_MSC_VER) || __clang__
#if defined(__has_attribute)
#if __has_attribute(ext_vector_type)
#define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
#else
@@ -694,7 +694,7 @@ THE SOFTWARE.
typename U = T,
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
inline __host__ __device__
HIP_vector_type operator-() noexcept
HIP_vector_type operator-() const noexcept
{
auto tmp(*this);
tmp.data = -tmp.data;
@@ -705,7 +705,7 @@ THE SOFTWARE.
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
inline __host__ __device__
HIP_vector_type operator~() noexcept
HIP_vector_type operator~() const noexcept
{
HIP_vector_type r{*this};
r.data = ~r.data;
@@ -1241,7 +1241,9 @@ DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);
DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);
#else // defined(_MSC_VER)
#else // !defined(__has_attribute)
#if defined(_MSC_VER)
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
@@ -1347,5 +1349,92 @@ typedef union { double4 data; } double3;
typedef union { __m256d data[2]; } double8;
typedef union { __m256d data[4]; } double16;
#else // !defined(_MSC_VER)
typedef union { char data; } char1;
typedef union { char data[2]; } char2;
typedef union { char data[4]; } char4;
typedef union { char data[8]; } char8;
typedef union { char data[16]; } char16;
typedef union { char4 data; } char3;
typedef union { unsigned char data; } uchar1;
typedef union { unsigned char data[2]; } uchar2;
typedef union { unsigned char data[4]; } uchar4;
typedef union { unsigned char data[8]; } uchar8;
typedef union { unsigned char data[16]; } uchar16;
typedef union { uchar4 data; } uchar3;
typedef union { short data; } short1;
typedef union { short data[2]; } short2;
typedef union { short data[4]; } short4;
typedef union { short data[8]; } short8;
typedef union { short data[16]; } short16;
typedef union { short4 data; } short3;
typedef union { unsigned short data; } ushort1;
typedef union { unsigned short data[2]; } ushort2;
typedef union { unsigned short data[4]; } ushort4;
typedef union { unsigned short data[8]; } ushort8;
typedef union { unsigned short data[16]; } ushort16;
typedef union { ushort4 data; } ushort3;
typedef union { int data; } int1;
typedef union { int data[2]; } int2;
typedef union { int data[4]; } int4;
typedef union { int data[8]; } int8;
typedef union { int data[16]; } int16;
typedef union { int4 data; } int3;
typedef union { unsigned int data; } uint1;
typedef union { unsigned int data[2]; } uint2;
typedef union { unsigned int data[4]; } uint4;
typedef union { unsigned int data[8]; } uint8;
typedef union { unsigned int data[16]; } uint16;
typedef union { uint4 data; } uint3;
typedef union { long data; } long1;
typedef union { long data[2]; } long2;
typedef union { long data[4]; } long4;
typedef union { long data[8]; } long8;
typedef union { long data[16]; } long16;
typedef union { long4 data; } long3;
typedef union { unsigned long data; } ulong1;
typedef union { unsigned long data[2]; } ulong2;
typedef union { unsigned long data[4]; } ulong4;
typedef union { unsigned long data[8]; } ulong8;
typedef union { unsigned long data[16]; } ulong16;
typedef union { ulong4 data; } ulong3;
typedef union { long long data; } longlong1;
typedef union { long long data[2]; } longlong2;
typedef union { long long data[4]; } longlong4;
typedef union { long long data[8]; } longlong8;
typedef union { long long data[16]; } longlong16;
typedef union { longlong4 data; } longlong3;
typedef union { unsigned long long data; } ulonglong1;
typedef union { unsigned long long data[2]; } ulonglong2;
typedef union { unsigned long long data[4]; } ulonglong4;
typedef union { unsigned long long data[8]; } ulonglong8;
typedef union { unsigned long long data[16]; } ulonglong16;
typedef union { ulonglong4 data; } ulonglong3;
typedef union { float data; } float1;
typedef union { float data[2]; } float2;
typedef union { float data[4]; } float4;
typedef union { float data[8]; } float8;
typedef union { float data[16]; } float16;
typedef union { float4 data; } float3;
typedef union { double data; } double1;
typedef union { double data[2]; } double2;
typedef union { double data[4]; } double4;
typedef union { double data[8]; } double8;
typedef union { double data[16]; } double16;
typedef union { double4 data; } double3;
#endif // defined(_MSC_VER)
#endif // defined(__has_attribute)
#endif
+4
파일 보기
@@ -28,6 +28,8 @@ extern "C" {
#include <stdlib.h>
#pragma GCC visibility push (default)
enum hiprtcResult {
HIPRTC_SUCCESS = 0,
HIPRTC_ERROR_OUT_OF_MEMORY = 1,
@@ -79,6 +81,8 @@ hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
#pragma GCC visibility pop
#ifdef __cplusplus
}
#endif /* __cplusplus */
+1 -1
파일 보기
@@ -60,7 +60,7 @@ THE SOFTWARE.
*/
// _restrict is supported by the compiler
#define __shared__ tile_static
#define __constant__ __attribute__((hc))
#define __constant__ __attribute__((hc, annotate("__HIP_constant__")))
#elif defined(__clang__) && defined(__HIP__)
-1
파일 보기
@@ -321,7 +321,6 @@ typedef enum hipDeviceAttribute_t {
hipDeviceAttributeIntegrated, ///< iGPU
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
+42 -1
파일 보기
@@ -186,6 +186,7 @@ typedef struct cudaArray hipArray;
typedef struct cudaArray* hipArray_t;
typedef struct cudaArray* hipArray_const_t;
typedef struct cudaFuncAttributes hipFuncAttributes;
typedef struct cudaLaunchParams hipLaunchParams;
#define hipFunction_attribute CUfunction_attribute
#define hip_Memcpy2D CUDA_MEMCPY2D
#define hipMemcpy3DParms cudaMemcpy3DParms
@@ -860,7 +861,7 @@ inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
}
inline hipError_t hipMemcpyWithStream(void* dst, const void* src,
inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
size_t sizeBytes, hipMemcpyKind copyKind,
hipStream_t stream) {
cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes,
@@ -1134,6 +1135,10 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
p_prop->integrated = cdprop.integrated;
p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
p_prop->maxTexture1D = cdprop.maxTexture1D;
p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
@@ -1271,6 +1276,12 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
case hipDeviceAttributeEccEnabled:
cdattr = cudaDevAttrEccEnabled;
break;
case hipDeviceAttributeCooperativeLaunch:
cdattr = cudaDevAttrCooperativeLaunch;
break;
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
break;
default:
return hipCUDAErrorTohipError(cudaErrorInvalidValue);
}
@@ -1679,6 +1690,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_
return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
}
inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
void** kernelParams, unsigned int sharedMemBytes,
hipStream_t stream) {
return hipCUDAErrorTohipError(
cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
}
inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags) {
return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
}
#ifdef __cplusplus
}
@@ -1686,6 +1708,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_
#ifdef __CUDACC__
template<class T>
inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
T func,
int blockSize,
size_t dynamicSMemSize) {
cudaError_t cerror;
cerror =
cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
return hipCUDAErrorTohipError(cerror);
}
template <class T>
inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
size_t dynamicSMemSize = 0,
@@ -1742,6 +1775,14 @@ template <class T>
inline static hipChannelFormatDesc hipCreateChannelDesc() {
return cudaCreateChannelDesc<T>();
}
template <class T>
inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
return hipCUDAErrorTohipError(
cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
}
#endif //__CUDACC__
#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
+1 -1
파일 보기
@@ -14,7 +14,7 @@ install(TARGETS lpl RUNTIME DESTINATION bin)
#-------------------------------------LPL--------------------------------------#
#-------------------------------------CA---------------------------------------#
add_executable(ca ca.cpp ${PROJECT_SOURCE_DIR}/src/code_object_bundle.cpp)
add_executable(ca ca.cpp)
set_target_properties(
ca PROPERTIES
CXX_STANDARD 11
+1 -1
파일 보기
@@ -2,7 +2,7 @@
#include "common.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "../src/code_object_bundle.inl"
#include "clara/clara.hpp"
+3 -4
파일 보기
@@ -25,16 +25,15 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0), llvm-amdgpu")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_base")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_base")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0, llvm-amdgpu")
set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_base")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_base")
set(CPACK_BINARY_RPM "ON")
+1 -2
파일 보기
@@ -32,9 +32,8 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_doc")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_doc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
+1 -2
파일 보기
@@ -37,9 +37,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
+1 -2
파일 보기
@@ -19,9 +19,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_nvcc")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_nvcc")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
+2 -3
파일 보기
@@ -19,10 +19,9 @@ set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-vdi (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_samples")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_samples")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
-2
파일 보기
@@ -114,8 +114,6 @@ set_target_properties(hip::device PROPERTIES
else()
set_target_properties(hip::device PROPERTIES
INTERFACE_LINK_LIBRARIES "hip::host"
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
)
endif()
+2 -10
파일 보기
@@ -27,11 +27,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), rocm-profiler, comgr (>= 1.1)")
else()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1)")
endif()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu")
set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_vdi, hip-hcc (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_vdi")
set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_vdi")
@@ -41,11 +37,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, rocm-profiler, comgr >= 1.1")
else()
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1")
endif()
set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu")
set(CPACK_RPM_PACKAGE_PROVIDES "hip_vdi, hip-hcc = ${HIP_BASE_VERSION}")
set(CPACK_RPM_PACKAGE_OBSOLETES "hip_vdi")
set(CPACK_RPM_PACKAGE_CONFLICTS "hip_vdi")
+2 -2
파일 보기
@@ -80,8 +80,8 @@ int main() {
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
delete[] A;
delete[] B;
hipCtxDestroy(context);
return 0;
}
+2 -2
파일 보기
@@ -107,8 +107,8 @@ int main() {
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
delete[] A;
delete[] B;
hipCtxDestroy(context);
return 0;
}
+2 -2
파일 보기
@@ -99,8 +99,8 @@ int main() {
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
delete[] A;
delete[] B;
hipCtxDestroy(context);
return 0;
}
+2 -2
파일 보기
@@ -154,8 +154,8 @@ int main() {
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
delete[] A;
delete[] B;
hipCtxDestroy(context);
return 0;
}
+2 -1
파일 보기
@@ -56,6 +56,7 @@ void printCompilerInfo() {
#endif
}
double bytesToKB(size_t s) { return (double)s / (1024.0); }
double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); }
#define printLimit(w1, limit, units) \
@@ -97,7 +98,7 @@ void printDeviceProp(int deviceId) {
cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2)
<< bytesToGB(props.totalGlobalMem) << " GB" << endl;
cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2)
<< bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
<< bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl;
cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl;
cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB"
<< endl;
-34
파일 보기
@@ -1,34 +0,0 @@
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include <hsa/hsa.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
using namespace std;
// CREATORS
hip_impl::Bundled_code_header::Bundled_code_header(const vector<char>& x)
: Bundled_code_header{x.cbegin(), x.cend()} {}
hip_impl::Bundled_code_header::Bundled_code_header(
const void* p) { // This is a pretty terrible interface, useful only because
// hipLoadModuleData is so poorly specified (for no fault of its own).
if (!p) return;
if (!valid(*static_cast<const Bundled_code_header*>(p))) return;
auto ph = static_cast<const Header_*>(p);
size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header);
auto pb = static_cast<const char*>(p) + sizeof(Header_);
auto n = ph->bundle_cnt_;
while (n--) {
sz += reinterpret_cast<const Bundled_code::Header*>(pb)->bundle_sz;
pb += sizeof(Bundled_code::Header);
}
read(static_cast<const char*>(p), static_cast<const char*>(p) + sz, *this);
}
@@ -92,10 +92,6 @@ struct Bundled_code {
#define magic_string_ "__CLANG_OFFLOAD_BUNDLE__"
#ifdef __GNUC__
#pragma GCC visibility push (default)
#endif
class Bundled_code_header {
// DATA - STATICS
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
@@ -167,8 +163,26 @@ class Bundled_code_header {
Bundled_code_header() = default;
template <typename RandomAccessIterator>
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
explicit Bundled_code_header(const std::vector<char>& blob);
explicit Bundled_code_header(const void* maybe_blob);
explicit Bundled_code_header(const std::vector<char>& blob)
: Bundled_code_header{blob.cbegin(), blob.cend()} {}
explicit Bundled_code_header(const void* maybe_blob) {
// This is a pretty terrible interface, useful only because
// hipLoadModuleData is so poorly specified (for no fault of its own).
if (!maybe_blob) return;
if (!valid(*static_cast<const Bundled_code_header*>(maybe_blob))) return;
auto ph = static_cast<const Header_*>(maybe_blob);
size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header);
auto pb = static_cast<const char*>(maybe_blob) + sizeof(Header_);
auto n = ph->bundle_cnt_;
while (n--) {
sz += reinterpret_cast<const Bundled_code::Header*>(pb)->bundle_sz;
pb += sizeof(Bundled_code::Header);
}
read(static_cast<const char*>(maybe_blob), static_cast<const char*>(maybe_blob) + sz, *this);
}
Bundled_code_header(const Bundled_code_header&) = default;
Bundled_code_header(Bundled_code_header&&) = default;
~Bundled_code_header() = default;
@@ -180,10 +194,6 @@ class Bundled_code_header {
size_t bundled_code_size = 0;
};
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
// CREATORS
template <typename RandomAccessIterator>
Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l)
+2 -2
파일 보기
@@ -51,7 +51,7 @@ __hipRegisterFatBinary(const void* data)
return nullptr;
}
auto modules = new std::vector<hipModule_t>{g_deviceCnt};
auto modules = new std::vector<hipModule_t>(g_deviceCnt);
if (!modules) {
return nullptr;
}
@@ -136,7 +136,7 @@ extern "C" void __hipRegisterFunction(
int* wSize)
{
HIP_INIT_API(NONE, modules, hostFunction, deviceFunction, deviceName);
std::vector<hipFunction_t> functions{g_deviceCnt};
std::vector<hipFunction_t> functions(g_deviceCnt);
assert(modules && modules->size() >= g_deviceCnt);
for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) {
+12
파일 보기
@@ -310,6 +310,18 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
*pi = prop->cooperativeMultiDeviceLaunch;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc:
*pi = prop->cooperativeMultiDeviceUnmatchedFunc;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim:
*pi = prop->cooperativeMultiDeviceUnmatchedGridDim;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim:
*pi = prop->cooperativeMultiDeviceUnmatchedBlockDim;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
*pi = prop->cooperativeMultiDeviceUnmatchedSharedMem;
break;
case hipDeviceAttributeMaxPitch:
*pi = prop->memPitch;
break;
+16 -23
파일 보기
@@ -677,7 +677,7 @@ hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
break;
case HSA_REGION_SEGMENT_GROUP:
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
&(p_prop->sharedMemPerBlock));
&(p_prop->maxSharedMemoryPerMultiProcessor));
break;
default:
break;
@@ -835,10 +835,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
DeviceErrorCheck(err);
// maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
// Group memory will not be paged out, so, the physical memory size is the total shared memory
// size, and also equal to the group pool size.
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
// Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal.
prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor;
// Get Max memory clock frequency
err =
@@ -897,9 +895,16 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
prop->integrated = 1;
}
// Enable the cooperative group for gfx9+
prop->cooperativeLaunch = (prop->gcnArch < 900) ? 0 : 1;
prop->cooperativeMultiDeviceLaunch = (prop->gcnArch < 900) ? 0 : 1;
// Enable the cooperative group for GPUs that support all the required features
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES,
&prop->cooperativeLaunch);
DeviceErrorCheck(err);
prop->cooperativeMultiDeviceLaunch = prop->cooperativeLaunch;
prop->cooperativeMultiDeviceUnmatchedFunc = prop->cooperativeMultiDeviceLaunch;
prop->cooperativeMultiDeviceUnmatchedGridDim = prop->cooperativeMultiDeviceLaunch;
prop->cooperativeMultiDeviceUnmatchedBlockDim = prop->cooperativeMultiDeviceLaunch;
prop->cooperativeMultiDeviceUnmatchedSharedMem = prop->cooperativeMultiDeviceLaunch;
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS,
&prop->maxTexture1D);
@@ -1515,20 +1520,6 @@ hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) {
return e;
}
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
hipError_t e = hipSuccess;
// Synchronize stream
tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n",
ToString(cb->_stream).c_str());
GET_TLS();
e = ihipStreamSynchronize(tls, cb->_stream);
// Call registered callback function
cb->_callback(cb->_stream, e, cb->_userData);
delete cb;
}
//---
// Get the stream to use for a command submission.
//
@@ -1619,7 +1610,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
const char* kernelNameStr, bool lockAcquired) {
stream = ihipSyncAndResolveStream(stream, lockAcquired);
if (stream == nullptr || stream != stream->getCtx()->_defaultStream) {
stream = ihipSyncAndResolveStream(stream, lockAcquired);
}
lp->grid_dim.x = grid.x;
lp->grid_dim.y = grid.y;
lp->grid_dim.z = grid.z;
-14
파일 보기
@@ -654,19 +654,6 @@ class ihipStream_t {
};
//----
// Internal structure for stream callback handler
class ihipStreamCallback_t {
public:
ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
: _stream(stream), _callback(callback), _userData(userData) {
};
hipStream_t _stream;
hipStreamCallback_t _callback;
void* _userData;
};
//----
// Internal event structure:
enum hipEventStatus_t {
@@ -980,7 +967,6 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
/**
* @brief Copies the memory address and size of symbol @p symbolName
+68 -33
파일 보기
@@ -309,31 +309,52 @@ void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si);
if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di);
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr,
di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr, di.agentBaseAddress);
if (res == HSA_STATUS_SUCCESS){
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
// If devices do not have access then fallback mechanism will be used
// copy will be slower
throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
}
inline
void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
hipMemcpyKind k) {
auto si{info(src)};
auto di{info(dst)};
if (!is_large_BAR){
// Pointer info takes presidence over hipMemcpyKind
// if there is mismatch b/w Memcpy kind and dst/src pointer
// E.g. dst(host pointer),src(device pointer) and hipMemcpyKind set as hipMemcpyHostToDevice
if (di.size == is_cpu_owned && si.size == is_cpu_owned)
k = hipMemcpyHostToHost;
else if (si.size == is_cpu_owned && di.size != is_cpu_owned)
k = hipMemcpyHostToDevice;
else if (di.size == is_cpu_owned && si.size != is_cpu_owned)
k = hipMemcpyDeviceToHost;
else
k = hipMemcpyDeviceToDevice;
}
switch (k) {
case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, di);
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, si);
case hipMemcpyDeviceToDevice: {
const auto di{info(dst)};
const auto si{info(src)};
throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr,
di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
nullptr, di.agentBaseAddress);
if (res == HSA_STATUS_SUCCESS){
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
// If devices do not have access then fallback mechanism will be used
// copy will be slower
throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
break;
}
default: return generic_copy(dst, src, n, info(dst), info(src));
default: return generic_copy(dst, src, n, di, si);
}
}
@@ -478,6 +499,10 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
hipError_t hip_status = hipSuccess;
if (sizeBytes == 0) {
return hipSuccess;
}
if (HIP_SYNC_HOST_ALLOC) {
hipDeviceSynchronize();
}
@@ -485,10 +510,6 @@ hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned i
auto ctx = ihipGetTlsDefaultCtx();
if ((ctx == nullptr) || (ptr == nullptr)) {
hip_status = hipErrorInvalidValue;
}
else if (sizeBytes == 0) {
hip_status = hipSuccess;
// TODO - should size of 0 return err or be siliently ignored?
} else {
unsigned trueFlags = flags;
if (flags == hipHostMallocDefault) {
@@ -673,14 +694,15 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) {
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
if (sizeBytes == 0) {
if (ptr) *ptr = NULL;
return ihipLogStatus(hipSuccess);
}
auto ctx = ihipGetTlsDefaultCtx();
// return NULL pointer when malloc size is 0
if ( nullptr == ctx || nullptr == ptr) {
hip_status = hipErrorInvalidValue;
}
else if (sizeBytes == 0) {
*ptr = NULL;
hip_status = hipSuccess;
} else {
auto device = ctx->getWriteableDevice();
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/,
@@ -700,14 +722,15 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
HIP_SET_DEVICE();
#if (__hcc_workweek__ >= 19115)
if (sizeBytes == 0) {
if (ptr) *ptr = NULL;
return ihipLogStatus(hipSuccess);
}
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
// return NULL pointer when malloc size is 0
if (sizeBytes == 0) {
*ptr = NULL;
hip_status = hipSuccess;
} else if ((ctx == nullptr) || (ptr == nullptr)) {
if ((ctx == nullptr) || (ptr == nullptr)) {
hip_status = hipErrorInvalidValue;
} else {
unsigned amFlags = 0;
@@ -736,6 +759,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
HIP_SET_DEVICE();
if (sizeBytes == 0) {
return ihipLogStatus(hipSuccess);
}
hipError_t hip_status = hipSuccess;
hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
return ihipLogStatus(hip_status);
@@ -744,6 +770,9 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
HIP_INIT_SPECIAL_API(hipMallocManaged, (TRACE_MEM), devPtr, size, flags);
HIP_SET_DEVICE();
if (size == 0) {
return ihipLogStatus(hipSuccess);
}
hipError_t hip_status = hipSuccess;
if(flags != hipMemAttachGlobal)
hip_status = hipErrorInvalidValue;
@@ -1224,6 +1253,7 @@ hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count,
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (count == 0) return ihipLogStatus(hipSuccess);
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
@@ -1246,6 +1276,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count,
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (count == 0) return ihipLogStatus(hipSuccess);
if (src == nullptr || dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
@@ -1269,6 +1300,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);
if (count == 0) return ihipLogStatus(hipSuccess);
if (dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
@@ -1301,6 +1333,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src);
if (count == 0) return ihipLogStatus(hipSuccess);
if (src == nullptr || dst == nullptr) {
return ihipLogStatus(hipErrorInvalidSymbol);
}
@@ -1592,6 +1625,7 @@ hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bo
srcXoffset = p->srcPos.x;
srcYoffset = p->srcPos.y;
srcZoffset = p->srcPos.z;
if (copyWidth == 0) return hipSuccess;
if (p->dstArray != nullptr) {
if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){
return hipErrorInvalidValue;
@@ -1933,6 +1967,7 @@ hipError_t getLockedPointer(void *hostPtr, size_t dataLen, void **devicePtrPtr)
// TODO - review and optimize
hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
size_t height, hipMemcpyKind kind) {
if (height == 0 || width == 0) return hipSuccess;
if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
@@ -1989,6 +2024,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
hipError_t ihipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
size_t height, hipMemcpyKind kind, hipStream_t stream) {
if (height == 0 || width == 0) return hipSuccess;
if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;
hipError_t e = hipSuccess;
int isLockedOrD2D = 0;
@@ -2043,6 +2079,7 @@ hipError_t ihip2dOffsetMemcpy(void* dst, size_t dpitch, const void* src, size_t
size_t height, size_t srcXOffsetInBytes, size_t srcYOffset,
size_t dstXOffsetInBytes, size_t dstYOffset,hipMemcpyKind kind,
hipStream_t stream, bool isAsync) {
if (height == 0 || width == 0) return hipSuccess;
if((spitch < width + srcXOffsetInBytes) || (srcYOffset >= height)){
return hipErrorInvalidValue;
} else if((dpitch < width + dstXOffsetInBytes) || (dstYOffset >= height)){
@@ -2061,6 +2098,7 @@ hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, hipStream_t stream, bool
if (pCopy == nullptr) {
return hipErrorInvalidValue;
}
if (pCopy->Height == 0 || pCopy->WidthInBytes == 0) return hipSuccess;
void* dst; const void* src;
size_t spitch = pCopy->srcPitch;
size_t dpitch = pCopy->dstPitch;
@@ -2140,6 +2178,7 @@ hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src,
hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream ){
HIP_INIT_SPECIAL_API(hipMemcpy2DFromArrayAsync, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind, stream);
size_t byteSize;
if (height == 0 || width == 0) return ihipLogStatus(hipSuccess);
if(src) {
switch (src->desc.f) {
case hipChannelFormatKindSigned:
@@ -2239,8 +2278,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
auto device = ctx->getWriteableDevice();
if (total) {
*total = device->_props.totalGlobalMem;
} else {
e = hipErrorInvalidValue;
}
if (free) {
@@ -2263,8 +2300,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
} else {
return ihipLogStatus(hipErrorInvalidValue);
}
} else {
e = hipErrorInvalidValue;
}
} else {
+316 -159
파일 보기
@@ -50,7 +50,7 @@ THE SOFTWARE.
#include <unordered_map>
#include <utility>
#include <vector>
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "code_object_bundle.inl"
#include "hip_fatbin.h"
// TODO Use Pool APIs from HCC to get memory regions.
@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -192,8 +192,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
if (impCoopParams) {
const auto p{static_cast<const char*>(*impCoopParams)};
// The sixth index is for multi-grid synchronization
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT,
(kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT);
}
/*
@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
hc::completion_future cf;
if (coopAV) {
lp.av = coopAV;
}
lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
(startEvent || stopEvent) ? &cf : nullptr
#if (__hcc_workweek__ > 17312)
@@ -326,22 +330,18 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
return hipErrorInvalidValue;
}
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
if (kds == nullptr) {
return hipErrorNotInitialized;
}
std::vector<hipFunction_t> kds(numDevices,0);
// prepare all kernel descriptors for each device as all streams will be locked in the next loop
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr) {
free(kds);
return hipErrorNotInitialized;
}
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream));
if (kds[i] == nullptr) {
free(kds);
return hipErrorInvalidValue;
}
if (!kds[i]->_kernarg_layout.empty()) continue;
@@ -396,8 +396,6 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
#endif
}
free(kds);
return result;
}
@@ -409,6 +407,90 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
}
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
*usedLDS = header->group_segment_fixed_size;
}
else {
const auto header = f->_header;
// VGPRs granularity is 4
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
*usedSGPRS = header->wavefront_sgpr_count + 2;
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
*usedLDS = header->workgroup_group_segment_byte_size;
}
}
static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx == nullptr) {
return hipErrorInvalidDevice;
}
if (numBlocks == nullptr) {
return hipErrorInvalidValue;
}
hipDeviceProp_t prop{};
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
if (blockSize > prop.maxThreadsPerBlock) {
*numBlocks = 0;
return hipSuccess;
}
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t simdPerCU = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
}
return hipSuccess;
}
namespace {
// kernel for initializing GWS
// nwm1 is the total number of work groups minus 1
@@ -417,25 +499,28 @@ __global__ void init_gws(uint nwm1) {
}
}
__attribute__((visibility("default")))
hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes,
hipStream_t stream, hip_impl::program_state& ps) {
#if (__hcc_workweek__ >= 20093)
hipError_t result;
if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
if (f == nullptr || kernelParams == nullptr) {
return hipErrorNotInitialized;
}
if (!stream->getDevice()->_props.cooperativeLaunch) {
stream = ihipSyncAndResolveStream(stream);
if (!stream->getDevice()->_props.cooperativeLaunch ||
blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) {
return hipErrorInvalidConfiguration;
}
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x;
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y;
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
@@ -469,28 +554,49 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
kd->_kernarg_layout = *reinterpret_cast<const std::vector<
std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());
GET_TLS();
uint32_t numBlocksPerSm = 0;
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
blockDim.x * blockDim.y * blockDim.z, sharedMemBytes);
if (result != hipSuccess) {
return hipErrorLaunchFailure;
}
int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
//check to see if the workload fits on the GPU
if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) {
return hipErrorCooperativeLaunchTooLarge;
}
void *gwsKernelParam[1];
// calculate total number of work groups minus 1 for the main kernel
uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
gwsKernelParam[0] = &nwm1;
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
#if (__hcc_workweek__ >= 19213)
streamCrit->_av.acquire_locked_hsa_queue();
#endif
hc::accelerator acc = stream->getDevice()->_acc;
// create a cooperative accelerated view for launching gws and main kernels
hc::accelerator_view coopAV = acc.create_cooperative_view();
GET_TLS();
// launch the init_gws kernel to initialize the GWS
LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
// the cooperative queue will wait until this stream completes its operations
hc::completion_future streamCF;
if (!streamCrit->_av.get_is_empty()) {
streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
coopAV.create_blocking_marker(streamCF, hc::accelerator_scope);
}
streamCrit->_av.acquire_locked_hsa_queue();
coopAV.acquire_locked_hsa_queue();
// launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);
if (result != hipSuccess) {
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAV.release_locked_hsa_queue();
return hipErrorLaunchFailure;
}
@@ -498,60 +604,106 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
void* impCoopParams[1];
impCoopParams[0] = &impCoopArg;
// launch the main kernel
// launch the main kernel in the cooperative queue
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x * blockDimX.x,
gridDim.y * blockDimX.y,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
gridDim.x * blockDim.x,
gridDim.y * blockDim.y,
gridDim.z * blockDim.z,
blockDim.x, blockDim.y, blockDim.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true, impCoopParams);
nullptr, 0, true, impCoopParams, &coopAV);
coopAV.release_locked_hsa_queue();
stream->criticalData()._av.release_locked_hsa_queue();
// this stream will wait until the cooperative queue completes its operations
hc::completion_future cooperativeCF;
if (!coopAV.get_is_empty()) {
cooperativeCF = coopAV.create_marker(hc::accelerator_scope);
streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope);
}
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
stream->criticalData()._av.release_locked_hsa_queue();
#endif
return result;
#else
return hipErrorInvalidConfiguration;
#endif
}
__attribute__((visibility("default")))
hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim,
dim3 blockDim, void** args,
size_t sharedMem, hipStream_t stream,
hip_impl::program_state& ps) {
// Skipping passing in ps, because the logging function does not like it
HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args,
sharedMem, stream);
return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim,
args, sharedMem, stream, ps));
}
hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices, unsigned int flags, hip_impl::program_state& ps) {
#if (__hcc_workweek__ >= 20093)
hipError_t result;
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
return hipErrorInvalidValue;
}
vector<hipStream_t> streams;
vector<uint64_t> deviceIDs;
// check to see if we have valid distinct streams/devices, if cooperative multi device
// launch is supported and if grid/block dimensions are valid
for (int i = 0; i < numDevices; ++i) {
if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr){
return hipErrorInvalidResourceHandle;
}
if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) {
streams.push_back(lp.stream);
} else {
return hipErrorInvalidDevice;
}
const ihipDevice_t* currentDevice = lp.stream->getDevice();
if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) {
deviceIDs.push_back(currentDevice->_deviceId);
} else {
return hipErrorInvalidDevice;
}
if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
return hipErrorInvalidConfiguration;
}
if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 ||
lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){
return hipErrorInvalidConfiguration;
}
}
hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
if (kds == nullptr || gwsKds == nullptr) {
return hipErrorNotInitialized;
}
vector<hipFunction_t> gwsKds;
vector<hipFunction_t> kds;
GET_TLS();
// prepare all kernel descriptors for initializing the GWS and the main kernels per device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
if (lp.stream == nullptr) {
free(gwsKds);
free(kds);
return hipErrorNotInitialized;
}
gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
hip_impl::target_agent(lp.stream));
gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
hip_impl::target_agent(lp.stream)));
if (gwsKds[i] == nullptr) {
free(gwsKds);
free(kds);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
@@ -560,23 +712,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
gwsKargs.getHandle());
kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream));
kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
hip_impl::target_agent(lp.stream)));
if (kds[i] == nullptr) {
free(gwsKds);
free(kds);
return hipErrorInvalidValue;
}
hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
reinterpret_cast<std::uintptr_t>(lp.func));
kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
kargs.getHandle());
uint32_t numBlocksPerSm = 0;
result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem);
if (result != hipSuccess) {
return hipErrorLaunchFailure;
}
int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
//check to see if the workload fits on the GPU
if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) {
return hipErrorCooperativeLaunchTooLarge;
}
}
vector<hc::accelerator_view> coopAVs;
// create cooperative accelerated views for launching gws and main kernels on each device
for (int i = 0; i < numDevices; ++i) {
hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
coopAVs.push_back(acc.create_cooperative_view());
}
mg_sync *mg_sync_ptr = 0;
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
vector<mg_info *> mg_info_ptr;
GET_TLS();
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
if (result != hipSuccess) {
return hipErrorInvalidValue;
@@ -586,7 +757,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
uint all_sum = 0;
for (int i = 0; i < numDevices; ++i) {
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
mg_info *mg_info_temp = nullptr;
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
if (result != hipSuccess) {
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < i; ++j) {
@@ -594,6 +766,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
}
return hipErrorInvalidValue;
}
mg_info_ptr.push_back(mg_info_temp);
// calculate the sum of sizes of all grids
const hipLaunchParams& lp = launchParamsList[i];
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
@@ -603,9 +776,15 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
for (int i = 0; i < numDevices; ++i) {
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
#if (__hcc_workweek__ >= 19213)
hc::completion_future streamCF;
if (!streamCrit->_av.get_is_empty()) {
streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope);
}
streamCrit->_av.acquire_locked_hsa_queue();
#endif
coopAVs[i].acquire_locked_hsa_queue();
}
// launch the init_gws kernel to initialize the GWS for each device
@@ -617,14 +796,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
gwsKernelParam[0] = &nwm1;
result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAVs[i].release_locked_hsa_queue();
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -670,14 +848,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
true, impCoopParams);
true, impCoopParams, &coopAVs[i]);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
coopAVs[i].release_locked_hsa_queue();
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -691,14 +868,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
// unlock all streams
for (int i = 0; i < numDevices; ++i) {
launchParamsList[i].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
coopAVs[i].release_locked_hsa_queue();
launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
free(gwsKds);
free(kds);
hc::completion_future cooperativeCF;
if (!coopAVs[i].get_is_empty()) {
cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope);
launchParamsList[i].stream->criticalData()._av.create_blocking_marker(
cooperativeCF, hc::accelerator_scope);
}
launchParamsList[i].stream->criticalData().unlock();
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
@@ -706,6 +887,24 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
}
return result;
#else
return hipErrorInvalidConfiguration;
#endif
}
__attribute__((visibility("default")))
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices,
unsigned int flags,
hip_impl::program_state& ps) {
// Skipping passing in ps, because the logging function does not like it
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList,
numDevices, flags);
return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList,
numDevices,
flags, ps));
}
namespace hip_impl {
@@ -1120,7 +1319,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
}
hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
hipFuncAttributes r{};
hipDeviceProp_t prop{};
@@ -1130,23 +1329,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
if (kd._is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
r.numRegs = kd._header->workitem_vgpr_count;
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);
r.numRegs = usedVGPRS;
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t numSIMD = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
size_t maxWaves = 0;
for (int i = 0; i < maxWavefrontsPerBlock; i++) {
size_t wavefronts = i + 1;
if (usedVGPRS > 0) {
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
// Calculate blocks occupancy per CU based on VGPR usage
if (vgprs_alu_occupancy < wavefronts)
break;
}
if (usedSGPRS > 0) {
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
if (sgprs_alu_occupancy < wavefronts)
break;
}
maxWaves = wavefronts;
}
r.maxThreadsPerBlock = maxWaves * wavefrontSize;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
@@ -1294,29 +1527,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
return ihipLogStatus(hipSuccess);
}
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
*usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
*usedLDS = header->group_segment_fixed_size;
}
else {
const auto header = f->_header;
// VGPRs granularity is 4
*usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
// adding 2 to take into account the 2 VCC registers & handle the granularity of 16
*usedSGPRS = header->wavefront_sgpr_count + 2;
*usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
*usedLDS = header->workgroup_group_segment_byte_size;
}
}
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
hipFunction_t f, size_t dynSharedMemPerBlk,
uint32_t blockSizeLimit)
@@ -1439,59 +1649,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx == nullptr) {
return hipErrorInvalidDevice;
}
hipDeviceProp_t prop{};
ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
// Due to SPI and private memory limitations, the max of wavefronts per CU in 32
size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t simdPerCU = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
{
+33 -5
파일 보기
@@ -257,11 +257,39 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
unsigned int flags) {
HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags);
hipError_t e = hipSuccess;
// Create a thread in detached mode to handle callback
ihipStreamCallback_t* cb = new ihipStreamCallback_t(stream, callback, userData);
std::thread(ihipStreamCallbackHandler, cb).detach();
auto stream_original{stream};
stream = ihipSyncAndResolveStream(stream);
return ihipLogStatus(e);
if (!stream) return hipErrorInvalidValue;
LockedAccessor_StreamCrit_t cs{stream->criticalData()};
// create first marker
auto cf = cs->_av.create_marker(hc::no_scope);
// get its signal
auto signal = *reinterpret_cast<hsa_signal_t*>(cf.get_native_handle());
// increment its signal value
hsa_signal_add_relaxed(signal, 1);
// create callback that can be passed to hsa_amd_signal_async_handler
// this function will call the user's callback, then sets first packet's signal to 0 to indicate completion
auto t{new std::function<void()>{[=]() {
callback(stream_original, hipSuccess, userData);
hsa_signal_store_relaxed(signal, 0);
}}};
// register above callback with HSA runtime to be called when first packet's signal
// is decremented from 2 to 1 by CP (or it is already at 1)
hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1,
[](hsa_signal_value_t x, void* p) {
(*static_cast<decltype(t)>(p))();
delete static_cast<decltype(t)>(p);
return false;
}, t);
// create additional marker that blocks on the first one
cs->_av.create_blocking_marker(cf, hc::no_scope);
return ihipLogStatus(hipSuccess);
}
+22 -4
파일 보기
@@ -301,7 +301,12 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSamplerDescriptor(samplerDescriptor, pTexDesc->addressMode[0], pTexDesc->filterMode,
pTexDesc->normalizedCoords);
if(hipResourceTypeLinear == pResDesc->resType) {
samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
} else if(!pTexDesc->normalizedCoords) {
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
}
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
if(hipResourceTypePitch2D != pResDesc->resType)
@@ -312,6 +317,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS !=
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
free(pTexture);
return ihipLogStatus(hipErrorRuntimeOther);
}
@@ -438,7 +444,13 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
imageDescriptor.format.channel_type = channelType;
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
if (normalizedCoords) {
samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED;
} else {
samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
}
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
@@ -449,6 +461,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS !=
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
free(pTexture);
return hipErrorRuntimeOther;
}
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
@@ -514,7 +527,9 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
if(!normalizedCoords) {
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
}
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
if( 0 == pitch)
@@ -525,6 +540,7 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS !=
hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
free(pTexture);
return hipErrorRuntimeOther;
}
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
@@ -620,7 +636,9 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls_, int dim, enum hipTextureRea
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
if(!normalizedCoords) {
samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
}
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT);
+1 -1
파일 보기
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
#include "../include/hip/hiprtc.h"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "code_object_bundle.inl"
#include "../include/hip/hcc_detail/elfio/elfio.hpp"
#include "../include/hip/hcc_detail/program_state.hpp"
+6 -3
파일 보기
@@ -1,6 +1,6 @@
#include "../include/hip/hcc_detail/program_state.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "code_object_bundle.inl"
#include "../include/hip/hcc_detail/hsa_helpers.hpp"
#if !defined(__cpp_exceptions)
@@ -357,8 +357,11 @@ public:
const auto it1 = get_symbol_addresses().find(x);
if (it1 == get_symbol_addresses().cend()) {
hip_throw(std::runtime_error{
"Global symbol: " + x + " is undefined."});
// For a unknown symbol, initialize it with a magic poison
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(),
reinterpret_cast<void*>(0xDEADBEEFDEADBEEFull));
continue;
}
hsa_status_t status;
+1 -2
파일 보기
@@ -385,8 +385,7 @@ double compute_BSR(BCRSArrays& bcsr, double *x , double *y){
cudaEventCreate(&startTime);
cudaEventCreate(&stopTime);
cudaEventRecord(startTime, bcsr.streamId);
// NOTE: cusparseDbsrmv and CUSPARSE_DIRECTION_COLUMN (of type cusparseDirection_t) are yet unsupported by HIP
// CHECK: cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
// CHECK: cusparseDbsrmv(bcsr.cusparseHandle, HIPSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE,
bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &alpha, descr,
bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize,
+46
파일 보기
@@ -0,0 +1,46 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#define SIZE 1024
int main(){
void *Sd;
hipError_t e;
char S[SIZE]="This is not a device symbol";
HIPCHECK(hipMalloc(&Sd,SIZE));
e = hipMemcpyFromSymbol(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost);
HIPASSERT(e==hipErrorInvalidSymbol);
e = hipMemcpyFromSymbol(S, NULL, SIZE, 0, hipMemcpyDeviceToHost);
HIPASSERT(e==hipErrorInvalidSymbol);
HIPCHECK(hipFree(Sd));
passed();
}
+49
파일 보기
@@ -0,0 +1,49 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#define SIZE 1024
int main(){
void *Sd;
hipError_t e;
char S[SIZE]="This is not a device symbol";
HIPCHECK(hipMalloc(&Sd,SIZE));
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
e = hipMemcpyFromSymbolAsync(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost, stream);
HIPASSERT(e==hipErrorInvalidSymbol);
e = hipMemcpyFromSymbolAsync(S, NULL, SIZE, 0, hipMemcpyDeviceToHost, stream);
HIPASSERT(e==hipErrorInvalidSymbol);
HIPCHECK(hipFree(Sd));
passed();
}
+46
파일 보기
@@ -0,0 +1,46 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#define SIZE 1024
int main(){
void *Sd;
hipError_t e;
char S[SIZE]="This is not a device symbol";
HIPCHECK(hipMalloc(&Sd,SIZE));
e = hipMemcpyToSymbol(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice);
HIPASSERT(e==hipErrorInvalidSymbol);
e = hipMemcpyToSymbol(NULL, S, SIZE, 0, hipMemcpyHostToDevice);
HIPASSERT(e==hipErrorInvalidSymbol);
HIPCHECK(hipFree(Sd));
passed();
}
+49
파일 보기
@@ -0,0 +1,49 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#define SIZE 100
int main(){
void *Sd;
hipError_t e;
char S[SIZE]="This is not a device symbol";
HIPCHECK(hipMalloc(&Sd,SIZE));
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
e = hipMemcpyToSymbolAsync(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice, stream);
HIPASSERT(e==hipErrorInvalidSymbol);
e = hipMemcpyToSymbolAsync(NULL, S, SIZE, 0, hipMemcpyHostToDevice, stream);
HIPASSERT(e==hipErrorInvalidSymbol);
HIPCHECK(hipFree(Sd));
passed();
}
+43
파일 보기
@@ -0,0 +1,43 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
* TEST: %t
* HIT_END
*/
#include "test_common.h"
#define SIZE 100
int main(){
hipError_t e;
char str[SIZE]="Hi, I am Ellesemere. What is ur name?";
e = hipMemcpy(0, str, SIZE, hipMemcpyHostToDevice);
HIPASSERT(e==hipErrorInvalidValue);
e = hipMemcpy(NULL, str, SIZE, hipMemcpyHostToDevice);
HIPASSERT(e==hipErrorInvalidValue);
e = hipMemset(0,99,80);
HIPASSERT(e==hipErrorInvalidValue);
passed();
}
+40
파일 보기
@@ -0,0 +1,40 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
* TEST: %t
* HIT_END
*/
#include "test_common.h"
int main(){
hipError_t e;
hipStream_t stream;
e = hipStreamCreateWithFlags(&stream, -1);
HIPASSERT(e==hipErrorInvalidValue);
e = hipStreamCreateWithFlags(&stream, 2);
HIPASSERT(e==hipErrorInvalidValue);
passed();
}
+39
파일 보기
@@ -96,6 +96,18 @@ void kernel_hisinf(__half* input, int* output) {
output[tx] = __hisinf(input[tx]);
}
__global__ void testHalfAbs(float* p) {
auto a = __float2half(*p);
a = __habs(a);
*p = __half2float(a);
}
__global__ void testHalf2Abs(float2* p) {
auto a = __float22half2_rn(*p);
a = __habs2(a);
*p = __half22float2(a);
}
#endif
@@ -237,6 +249,31 @@ void checkFunctional() {
return;
}
void checkHalfAbs() {
{
float *p;
hipMalloc(&p, sizeof(float));
float pp = -2.1f;
hipMemcpy(p, &pp, sizeof(float), hipMemcpyDefault);
hipLaunchKernelGGL(testHalfAbs, 1, 1, 0, 0, p);
hipMemcpy(&pp, p, sizeof(float), hipMemcpyDefault);
hipFree(p);
if(pp < 0.0f) { failed("Half Abs failed"); }
}
{
float2 *p;
hipMalloc(&p, sizeof(float2));
float2 pp;
pp.x = -2.1f;
pp.y = -1.1f;
hipMemcpy(p, &pp, sizeof(float2), hipMemcpyDefault);
hipLaunchKernelGGL(testHalf2Abs, 1, 1, 0, 0, p);
hipMemcpy(&pp, p, sizeof(float2), hipMemcpyDefault);
hipFree(p);
if(pp.x < 0.0f || pp.y < 0.0f) { failed("Half2 Abs Test Failed"); }
}
}
int main() {
bool* result{nullptr};
hipMemAllocHost((void**)&result, sizeof(result));
@@ -260,5 +297,7 @@ int main() {
// run some functional checks
checkFunctional();
checkHalfAbs();
passed();
}
+239
파일 보기
@@ -0,0 +1,239 @@
/*
Copyright (c) 2015-2019 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc HIPCC_OPTIONS -std=c++14
* TEST: %t
* HIT_END
*/
#include <hip/hip_runtime.h>
#include <type_traits>
#include <random>
#include "test_common.h"
static std::random_device dev;
static std::mt19937 rng(dev());
template <typename T, typename M>
__host__ __device__ inline constexpr int count() {
return sizeof(T) / sizeof(M);
}
inline float getRandomFloat(float min = 10, float max = 100) {
std::uniform_real_distribution<float> gen(min, max);
return gen(rng);
}
template <typename T, typename B>
void fillMatrix(T* a, int size) {
for (int i = 0; i < size; i++) {
T t;
t.x = getRandomFloat();
if constexpr (count<T, B>() >= 2) t.y = getRandomFloat();
if constexpr (count<T, B>() >= 3) t.z = getRandomFloat();
if constexpr (count<T, B>() >= 4) t.w = getRandomFloat();
a[i] = t;
}
}
// Test operations
template <typename T, typename B>
__host__ __device__ void testOperations(T& a, T& b) {
a.x += b.x;
a.x++;
b.x++;
if constexpr (count<T, B>() >= 2) {
a.y = b.x;
a.x = b.y;
}
if constexpr (count<T, B>() >= 3) {
if (a.x > 0) b.x /= a.x;
a.x *= b.z;
a.y--;
}
if constexpr (count<T, B>() >= 4) {
b.w = a.x;
a.w += (-b.y);
}
}
template <typename T, typename B>
__global__ void testOperationsGPU(T* d_a, T* d_b, int size) {
int id = threadIdx.x;
if (id > size) return;
T &a = d_a[id];
T &b = d_b[id];
testOperations<T, B>(a, b);
}
template <typename T>
void dcopy(T* a, T* b, int size) {
for (int i = 0; i < size; i++) {
a[i] = b[i];
}
}
template <typename T>
bool isEqual(T* a, T* b, int size) {
for (int i = 0; i < size; i++) {
if (a[i] != b[i]) {
return false;
}
}
return true;
}
// Main function that tests type
// T = what you want to test
// D = pack of 1 i.e. float1 int1
template <typename T, typename D>
void testType(int msize) {
T *fa, *fb, *fc, *h_fa, *h_fb;
fa = new T[msize];
fb = new T[msize];
fc = new T[msize];
h_fa = new T[msize];
h_fb = new T[msize];
T *d_fa, *d_fb;
constexpr int c = count<T, D>();
if (c <= 0 || c >= 5) {
failed("Invalid Size\n");
}
fillMatrix<T, D>(fa, msize);
dcopy(fb, fa, msize);
dcopy(h_fa, fa, msize);
dcopy(h_fb, fa, msize);
for (int i = 0; i < msize; i++) testOperations<T, D>(h_fa[i], h_fb[i]);
hipMalloc(&d_fa, sizeof(T) * msize);
hipMalloc(&d_fb, sizeof(T) * msize);
hipMemcpy(d_fa, fa, sizeof(T) * msize, hipMemcpyHostToDevice);
hipMemcpy(d_fb, fb, sizeof(T) * msize, hipMemcpyHostToDevice);
auto kernel = testOperationsGPU<T, D>;
hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize);
hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost);
bool pass = true;
if (!isEqual<T>(h_fa, fc, msize)) {
pass = false;
}
delete[] fa;
delete[] fb;
delete[] fc;
delete[] h_fa;
delete[] h_fb;
hipFree(d_fa);
hipFree(d_fb);
if (!pass) {
failed("Failed");
}
}
int main() {
const int msize = 100;
// double
testType<double1, double1>(msize);
testType<double2, double1>(msize);
testType<double3, double1>(msize);
testType<double4, double1>(msize);
// floats
testType<float1, float1>(msize);
testType<float2, float1>(msize);
testType<float3, float1>(msize);
testType<float4, float1>(msize);
// ints
testType<int1, int1>(msize);
testType<int2, int1>(msize);
testType<int3, int1>(msize);
testType<int4, int1>(msize);
// chars
testType<char1, char1>(msize);
testType<char2, char1>(msize);
testType<char3, char1>(msize);
testType<char4, char1>(msize);
// long
testType<long1, long1>(msize);
testType<long2, long1>(msize);
testType<long3, long1>(msize);
testType<long4, long1>(msize);
// longlong
testType<longlong1, longlong1>(msize);
testType<longlong2, longlong1>(msize);
testType<longlong3, longlong1>(msize);
testType<longlong4, longlong1>(msize);
// short
testType<short1, short1>(msize);
testType<short2, short1>(msize);
testType<short3, short1>(msize);
testType<short4, short1>(msize);
// uints
testType<uint1, uint1>(msize);
testType<uint2, uint1>(msize);
testType<uint3, uint1>(msize);
testType<uint4, uint1>(msize);
// uchars
testType<uchar1, uchar1>(msize);
testType<uchar2, uchar1>(msize);
testType<uchar3, uchar1>(msize);
testType<uchar4, uchar1>(msize);
// ulong
testType<ulong1, ulong1>(msize);
testType<ulong2, ulong1>(msize);
testType<ulong3, ulong1>(msize);
testType<ulong4, ulong1>(msize);
// ulonglong
testType<ulonglong1, ulonglong1>(msize);
testType<ulonglong2, ulonglong1>(msize);
testType<ulonglong3, ulonglong1>(msize);
testType<ulonglong4, ulonglong1>(msize);
// ushort
testType<ushort1, ushort1>(msize);
testType<ushort2, ushort1>(msize);
testType<ushort3, ushort1>(msize);
testType<ushort4, ushort1>(msize);
passed();
}
+1 -1
파일 보기
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
* TEST: %t
* HIT_END
*/
+1 -1
파일 보기
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
* TEST: %t
* HIT_END
*/
+53
파일 보기
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
* IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* */
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc vdi
* TEST: %t
* HIT_END
*/
#include "test_common.h"
int main() {
int* A;
int* Ad;
int* Bd;
// Allocation
HIPCHECK(hipMalloc((void**)&Ad, sizeof(int)));
HIPCHECK(hipMalloc((void**)&Bd, sizeof(int)));
HIPCHECK(hipHostMalloc((void**)&A,sizeof(int)));
// Kind should be ignored and test should pass even for incorrect kind
HIPCHECK(hipMemcpy(Ad, A, sizeof(int), hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(A, Ad, sizeof(int), hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Ad, Bd, sizeof(int), hipMemcpyHostToHost));
HIPCHECK(hipMemcpy(A, A, sizeof(int), hipMemcpyDeviceToDevice));
// nullptr passed as source or destination pointer
HIPASSERT(hipSuccess != hipMemcpy(nullptr, A, sizeof(int), hipMemcpyHostToDevice));
HIPASSERT(hipSuccess != hipMemcpy(Ad, nullptr, sizeof(int), hipMemcpyHostToDevice));
HIPCHECK(hipFree(Ad));
HIPCHECK(hipFree(Bd));
HIPCHECK(hipFree(A));
passed();
}
+2 -2
파일 보기
@@ -107,8 +107,8 @@ void run(const std::vector<char>& buffer) {
hipFree(Ad);
hipFree(Bd);
delete A;
delete B;
delete[] A;
delete[] B;
hipCtxDestroy(context);
}
+145
파일 보기
@@ -0,0 +1,145 @@
#include <stdio.h>
#include <hip/hip_runtime.h>
#include <unistd.h>
#include "test_common.h"
#include <atomic>
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* TEST: %t
* HIT_END
*/
enum class ExecState
{
EXEC_NOT_STARTED,
EXEC_STARTED,
EXEC_CB_STARTED,
EXEC_CB_FINISHED,
EXEC_FINISHED
};
struct UserData
{
size_t size;
int* ptr;
};
// Global variable to check exection order
std::atomic<ExecState> gData(ExecState::EXEC_NOT_STARTED);
void myCallback(hipStream_t stream, hipError_t status, void* user_data)
{
if(gData.load() != ExecState::EXEC_STARTED)
return; // Error hence return early
gData.store(ExecState::EXEC_CB_STARTED);
UserData* data = reinterpret_cast<UserData*>(user_data);
printf("Callback started\n");
sleep(1);
printf("Callback ending.\n");
gData.store(ExecState::EXEC_CB_FINISHED);
}
bool test(int count)
{
printf("\n============ Test iteration %d =============\n",count);
// Stream
hipStream_t stream;
bool result = true;
gData.store(ExecState::EXEC_STARTED);
HIPCHECK(hipStreamCreate(&stream));
// Array size
size_t size = 10000;
// Device array
int *data = NULL;
HIPCHECK(hipMalloc((void**)&data, sizeof(int) * size));
// Initialize device array to -1
HIPCHECK(hipMemset(data, -1, sizeof(int) * size));
// Host array
int *host = NULL;
HIPCHECK(hipHostMalloc((void**)&host, sizeof(int) * size));
// Print host ptr address
printf("In main thread\n");
// Initialize user_data for callback
UserData arg;
arg.size = size;
arg.ptr = host;
// Synchronize device
HIPCHECK(hipDeviceSynchronize());
// Asynchronous copy from device to host
HIPCHECK(hipMemcpyAsync(host, data, sizeof(int) * size, hipMemcpyDeviceToHost, stream));
// Asynchronous memset on device
HIPCHECK(hipMemsetAsync(data, 0, sizeof(int) * size, stream));
// Add callback - should happen after hipMemsetAsync()
HIPCHECK(hipStreamAddCallback(stream, myCallback, &arg, 0));
printf("Will wait in main thread until callback completes\n");
//This should synchronize the stream (including the callback)
HIPCHECK(hipStreamSynchronize(stream));
if(gData.load() != ExecState::EXEC_CB_FINISHED)
{
std::cout<<"Callback is not finished\n";
return false;
}
printf("Callback completed will resume main thread execution\n");
if(host[size/2] != -1)
{
// Print some host data that just got copied
printf("Pseudo host data printing (should be -1): %d\n", host[size/2]);
result = false;
}
HIPCHECK(hipMemcpy(host, data, sizeof(int)*size, hipMemcpyDeviceToHost));
if(host[size-1] != 0)
{
printf("Pseudo host data printing (should be 0): %d\n", host[size-1]);
result = false;
}
HIPCHECK(hipFree(data));
HIPCHECK(hipHostFree(host));
HIPCHECK(hipStreamDestroy(stream));
gData.store(ExecState::EXEC_FINISHED);
return result;
}
int main()
{
// Test involves multithreading hence running multiple times
// to make sure consitency in the behavior
bool status = true;
for(int i=0; i < 10; i++){
status = test(i+1);
if(status == false)
{
failed("Test Failed!\n");
break;
}
}
if(status == true) passed();
return 0;
}
+409
파일 보기
@@ -0,0 +1,409 @@
#include <hip/hip_runtime.h>
#include <stdexcept>
#include <memory>
#include <functional>
#include <mutex>
#include <condition_variable>
#include <thread>
#include <future>
#include "test_common.h"
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM vdi
* TEST: %t
* HIT_END
*/
#define WORKAROUND 0 // Enable (1) this to make stream thread-safe by a workaround
template<bool IsBlocking> // <true> = queue blocks, until task is finished in enqueue(queue,task)
class QueueHipRt;
// Queue types used in the tests
using TestQueues = std::tuple<QueueHipRt<true>, QueueHipRt<false>>;
// --- Implementation
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
#define HIP_ASSERT_IGNORE(x,ign) auto err=x; HIP_ASSERT(err==ign ? hipSuccess : err)
#ifdef __HIP_PLATFORM_HCC__
#define HIPRT_CB
#endif
template<bool isBlocking>
static auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue) -> void;
template<bool IsBlocking>
class QueueHipRt
{
public:
static constexpr bool isBlocking = IsBlocking;
//-----------------------------------------------------------------------------
QueueHipRt(
int dev) :
m_dev(dev),
m_HipQueue()
{
HIP_ASSERT(
hipSetDevice(
m_dev));
HIP_ASSERT(
hipStreamCreateWithFlags(
&m_HipQueue,
hipStreamNonBlocking));
}
//-----------------------------------------------------------------------------
QueueHipRt(QueueHipRt const &) = delete;
//-----------------------------------------------------------------------------
QueueHipRt(QueueHipRt &&) = delete;
//-----------------------------------------------------------------------------
auto operator=(QueueHipRt const &) -> QueueHipRt & = delete;
//-----------------------------------------------------------------------------
auto operator=(QueueHipRt &&) -> QueueHipRt & = delete;
//-----------------------------------------------------------------------------
~QueueHipRt()
{
if(isBlocking) {
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
// we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed
currentThreadWaitFor(*this);
#endif
}
HIP_ASSERT(
hipSetDevice(
m_dev));
HIP_ASSERT(
hipStreamDestroy(
m_HipQueue));
}
public:
int m_dev; //!< The device this queue is bound to.
hipStream_t m_HipQueue;
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
int m_callees = 0;
std::mutex m_mutex;
#endif
};
template<typename TTask>
struct Enqueue
{
//#############################################################################
enum class CallbackState
{
enqueued,
notified,
finished,
};
//#############################################################################
struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
{
std::mutex m_mutex;
std::condition_variable m_event;
CallbackState state = CallbackState::enqueued;
};
//-----------------------------------------------------------------------------
static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
{
// explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
// Notify the executing thread.
{
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
pCallbackSynchronizationData->state = CallbackState::notified;
}
pCallbackSynchronizationData->m_event.notify_one();
// Wait for the executing thread to finish the task if it has not already finished.
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
if(pCallbackSynchronizationData->state != CallbackState::finished)
{
pCallbackSynchronizationData->m_event.wait(
lock,
[pCallbackSynchronizationData](){
return pCallbackSynchronizationData->state == CallbackState::finished;
}
);
}
}
//-----------------------------------------------------------------------------
template<bool isBlocking>
static auto enqueue(
QueueHipRt<isBlocking> & queue,
TTask const & task)
-> void
{
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
{
// thread-safe callee incrementing
std::lock_guard<std::mutex> guard(queue.m_mutex);
queue.m_callees += 1;
}
#endif
auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
// test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
HIP_ASSERT(hipStreamAddCallback(
queue.m_HipQueue,
hipRtCallback,
pCallbackSynchronizationData.get(),
0u));
// We start a new std::thread which stores the task to be executed.
// This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
// The HIP thread signals the std::thread when it is ready to execute the task.
// The HIP thread is waiting for the std::thread to signal that it is finished executing the task
// before it executes the next task in the queue (HIP stream).
std::thread t(
[pCallbackSynchronizationData,
task
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
,&queue // requires queue's destructor to wait for all tasks
#endif
](){
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
// thread-safe task execution and callee decrementing
std::lock_guard<std::mutex> guard(queue.m_mutex);
#endif
// If the callback has not yet been called, we wait for it.
{
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
if(pCallbackSynchronizationData->state != CallbackState::notified)
{
pCallbackSynchronizationData->m_event.wait(
lock,
[pCallbackSynchronizationData](){
return pCallbackSynchronizationData->state == CallbackState::notified;
}
);
}
task();
// Notify the waiting HIP thread.
pCallbackSynchronizationData->state = CallbackState::finished;
}
pCallbackSynchronizationData->m_event.notify_one();
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
queue.m_callees -= 1;
#endif
}
);
if(isBlocking)
t.join(); // => waiting for task completion
else
t.detach(); // => do not wait for task completion
}
};
//#############################################################################
//! The HIP RT non-blocking queue test trait specialization.
struct Empty
{
//-----------------------------------------------------------------------------
template<bool isBlocking>
static auto empty(
QueueHipRt<isBlocking> const & queue)
-> bool
{
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
return (queue.m_callees==0);
#else
// Query is allowed even for queues on non current device.
hipError_t ret = hipSuccess;
HIP_ASSERT_IGNORE(
ret = hipStreamQuery(
queue.m_HipQueue),
hipErrorNotReady);
return (ret == hipSuccess);
#endif
}
};
template<bool isBlocking>
auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue) -> void
{
#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
while(queue.m_callees>0) {
std::this_thread::sleep_for(std::chrono::milliseconds(10u));
}
#else
// Sync is allowed even for queues on non current device.
HIP_ASSERT( hipStreamSynchronize(
queue.m_HipQueue));
#endif
}
// --- Tests
#define TEMPLATE_LIST_TEST_CASE(TestName) \
template<typename TestType> static void TestName (std::atomic<int> &check); \
static int TestName##Runner () { \
std::atomic<int> check{0}; \
TestName< QueueHipRt<true> >(check); \
fprintf(stderr, "After " #TestName " < QueueHipRt<true> > errors=%d\n", check.load()); \
TestName< QueueHipRt<false> >(check); \
fprintf(stderr, "After " #TestName " < QueueHipRt<false> > errors=%d\n", check.load()); \
return check.load(); \
} \
template<typename TestType> static void TestName (std::atomic<int> &check)
// add 1 if a check fails
#define CHECK(result) do{int arg=(!(result)); fprintf(stderr, "Checking " #result " %d\n", arg); check.fetch_add(arg);}while(false)
//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE( queueIsInitiallyEmpty )
{
TestType queue{0};
CHECK(Empty::empty(queue));
}
//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE( queueCallbackIsWorking )
{
std::promise<bool> promise;
auto task = [&](){ promise.set_value(true); };
TestType queue{0};
Enqueue<decltype(task)> enqueue;
enqueue.enqueue(
queue,
task
);
CHECK(promise.get_future().get());
}
//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE( queueWaitShouldWork )
{
bool CallbackFinished = false;
auto task =
[&CallbackFinished]() noexcept
{
std::this_thread::sleep_for(std::chrono::milliseconds(100u));
CallbackFinished = true;
};
TestType queue{0};
Enqueue<decltype(task)> enqueue;
enqueue.enqueue(
queue,
task
);
currentThreadWaitFor(queue);
CHECK(CallbackFinished);
}
//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE( queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished )
{
bool CallbackFinished = false;
TestType queue{0};
auto task = [&queue, &CallbackFinished, &check]() noexcept
{
CHECK(!Empty::empty(queue));
std::this_thread::sleep_for(std::chrono::milliseconds(100u));
CallbackFinished = true;
};
Enqueue<decltype(task)> enqueue;
enqueue.enqueue(
queue,
task
);
// A non-blocking queue will always stay empty because the task has been executed immediately.
if(!TestType::isBlocking)
{
currentThreadWaitFor(queue);
}
CHECK(Empty::empty(queue));
CHECK(CallbackFinished);
}
//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE( queueShouldNotExecuteTasksInParallel )
{
std::atomic<bool> taskIsExecuting(false);
std::promise<void> firstTaskFinished;
std::future<void> firstTaskFinishedFuture = firstTaskFinished.get_future();
std::promise<void> secondTaskFinished;
std::future<void> secondTaskFinishedFuture = secondTaskFinished.get_future();
TestType queue{0};
std::thread thread1(
[&queue, &taskIsExecuting, &firstTaskFinished, &check]()
{
auto task1 = [&taskIsExecuting, &firstTaskFinished, &check]() noexcept
{
CHECK(!taskIsExecuting.exchange(true));
std::this_thread::sleep_for(std::chrono::milliseconds(100u));
CHECK(taskIsExecuting.exchange(false));
firstTaskFinished.set_value();
};
Enqueue<decltype(task1)> enqueue;
enqueue.enqueue(
queue,
task1
);
});
std::thread thread2(
[&queue, &taskIsExecuting, &secondTaskFinished, &check]()
{
auto task2 = [&taskIsExecuting, &secondTaskFinished, &check]() noexcept
{
CHECK(!taskIsExecuting.exchange(true));
std::this_thread::sleep_for(std::chrono::milliseconds(100u));
CHECK(taskIsExecuting.exchange(false));
secondTaskFinished.set_value();
};
Enqueue<decltype(task2)> enqueue;
enqueue.enqueue(
queue,
task2
);
});
// Both tasks have to be enqueued
thread1.join();
thread2.join();
currentThreadWaitFor(queue);
firstTaskFinishedFuture.get();
secondTaskFinishedFuture.get();
}
#define TESTER(name) do { \
int result = name (); \
fprintf(stderr, #name " %s\n", result?"Errors":"No Errors"); \
if (result) { failed(#name " failed\n"); } \
} while (false)
int main()
{
TESTER(queueIsInitiallyEmptyRunner);
TESTER(queueCallbackIsWorkingRunner);
TESTER(queueWaitShouldWorkRunner);
TESTER(queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinishedRunner);
TESTER(queueShouldNotExecuteTasksInParallelRunner);
passed();
}
+122
파일 보기
@@ -0,0 +1,122 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
* TEST: %t
* HIT_END
*/
#include "hip/hip_runtime.h"
#include "../test_common.h"
#define N 16
#define offset 3
__global__ void tex1dKernel(float *val, hipTextureObject_t obj) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k < N)
val[k] = tex1Dfetch<float>(obj, k+offset);
}
int runTest(hipTextureAddressMode, hipTextureFilterMode);
int main(int argc, char **argv) {
int testResult = runTest(hipAddressModeClamp,hipFilterModePoint);
testResult = runTest(hipAddressModeClamp,hipFilterModeLinear);
testResult = runTest(hipAddressModeWrap,hipFilterModePoint);
testResult = runTest(hipAddressModeWrap,hipFilterModeLinear);
if(testResult) {
passed();
} else {
exit(EXIT_FAILURE);
}
}
int runTest(hipTextureAddressMode addressMode, hipTextureFilterMode filterMode) {
int testResult = 1;
hipCtx_t HipContext;
hipDevice_t HipDevice;
int deviceID = 0;
hipDeviceGet(&HipDevice, deviceID);
hipCtxCreate(&HipContext, 0, HipDevice);
// Allocating the required buffer on gpu device
float *texBuf, *texBufOut;
float val[N], output[N];
for (int i = 0; i < N; i++) {
val[i] = i+1;
output[i] = 0.0;
}
HIPCHECK(hipMalloc(&texBuf, N * sizeof(float)));
HIPCHECK(hipMalloc(&texBufOut, N * sizeof(float)));
HIPCHECK(hipMemcpy(texBuf, val, N * sizeof(float), hipMemcpyHostToDevice));
HIPCHECK(hipMemset(texBufOut, 0, N * sizeof(float)));
hipResourceDesc resDescLinear;
memset(&resDescLinear, 0, sizeof(resDescLinear));
resDescLinear.resType = hipResourceTypeLinear;
resDescLinear.res.linear.devPtr = texBuf;
resDescLinear.res.linear.desc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
resDescLinear.res.linear.sizeInBytes = N * sizeof(float);
hipTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = hipReadModeElementType;
texDesc.addressMode[0] = addressMode;
texDesc.addressMode[1] = addressMode;
texDesc.filterMode = filterMode;
texDesc.normalizedCoords = false;
// Creating texture object
hipTextureObject_t texObj = 0;
HIPCHECK(hipCreateTextureObject(&texObj, &resDescLinear, &texDesc, NULL));
dim3 dimBlock(1, 1, 1);
dim3 dimGrid(N , 1, 1);
hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
texBufOut, texObj);
HIPCHECK(hipDeviceSynchronize());
HIPCHECK(hipMemcpy(output, texBufOut, N * sizeof(float), hipMemcpyDeviceToHost));
for (int i = offset; i < N; i++) {
if (output[i-offset] != val[i]) {
testResult = 0;
break;
}
}
if(testResult){
for(int i = N-offset; i < N; i++){
if (output[i] != 0){
testResult = 0;
break;
}
}
}
HIPCHECK(hipDestroyTextureObject(texObj));
HIPCHECK(hipFree(texBuf));
HIPCHECK(hipFree(texBufOut));
return testResult;
}