merge from master
Этот коммит содержится в:
+15
-2
@@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
target_link_libraries(hip_hcc PRIVATE hc_am)
|
||||
target_link_libraries(hip_hcc_static PRIVATE hc_am)
|
||||
|
||||
add_library(hiprtc SHARED src/hiprtc.cpp)
|
||||
add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
|
||||
target_include_directories(
|
||||
hiprtc SYSTEM
|
||||
PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include)
|
||||
@@ -504,7 +504,7 @@ endif()
|
||||
find_program(CPPCHECK_EXE cppcheck)
|
||||
if(CPPCHECK_EXE)
|
||||
add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
endif()
|
||||
|
||||
#############################
|
||||
@@ -545,4 +545,17 @@ else()
|
||||
message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Code analysis
|
||||
#############################
|
||||
# Target: clang
|
||||
if(HIP_HIPCC_EXECUTABLE)
|
||||
add_custom_target(analyze
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./
|
||||
WORKING_DIRECTORY ${HIP_SRC_PATH})
|
||||
if(CPPCHECK_EXE)
|
||||
add_dependencies(analyze cppcheck)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
|
||||
+9
-23
@@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") {
|
||||
if ($HIP_RUNTIME eq "HCC" ) {
|
||||
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
|
||||
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
|
||||
} else {
|
||||
$HIPCXXFLAGS .= " -fhip-new-launch-api";
|
||||
}
|
||||
|
||||
} elsif ($HIP_PLATFORM eq "hcc") {
|
||||
@@ -352,7 +354,6 @@ my $runCmd = 1;
|
||||
my $buildDeps = 0;
|
||||
my $linkType = 1;
|
||||
my $setLinkType = 0;
|
||||
my $coFormatv3 = 1;
|
||||
|
||||
my @options = ();
|
||||
my @inputs = ();
|
||||
@@ -472,22 +473,6 @@ foreach $arg (@ARGV)
|
||||
$swallowArg = 1;
|
||||
}
|
||||
|
||||
# code object format parsing
|
||||
if ($trimarg eq '-mcode-object-v3') {
|
||||
$coFormatv3 = 1;
|
||||
# hip-clang already recognizes -mcode-object-v3, so we just pass it on
|
||||
if ($HIP_PLATFORM eq 'hcc') {
|
||||
$swallowArg = 1;
|
||||
}
|
||||
}
|
||||
if ($trimarg eq '-mno-code-object-v3') {
|
||||
$coFormatv3 = 0;
|
||||
# hip-clang already recognizes -mno-code-object-v3, so we just pass it on
|
||||
if ($HIP_PLATFORM eq 'hcc') {
|
||||
$swallowArg = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) {
|
||||
$arg = "--cuda-device-only";
|
||||
}
|
||||
@@ -526,6 +511,13 @@ foreach $arg (@ARGV)
|
||||
$optArg = $arg;
|
||||
}
|
||||
|
||||
## This is a temporary workaround for CMake detection of OpenMP support.
|
||||
## It should be removed when the OpenMP detection c++ test in CMake is updated
|
||||
## and corrected CMake version is available.
|
||||
if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) {
|
||||
$HIPCXXFLAGS .= " -D_OPENMP "
|
||||
}
|
||||
|
||||
## process linker response file for hip-clang
|
||||
## extract object files from static library and pass them directly to
|
||||
## hip-clang in command line.
|
||||
@@ -863,12 +855,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
|
||||
}
|
||||
}
|
||||
|
||||
# hcc defaults to v2, so we need to convert to the appropriate flag
|
||||
# hip-clang defaults to v3, so we don't need to do anything
|
||||
if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
|
||||
$HIPLDFLAGS .= " -Wl,-hcc-cov3 ";
|
||||
}
|
||||
|
||||
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
|
||||
$HIPCXXFLAGS .= " -x cu";
|
||||
}
|
||||
|
||||
@@ -983,7 +983,6 @@ sub simpleSubstitutions {
|
||||
$ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g;
|
||||
$ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g;
|
||||
$ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g;
|
||||
$ft{'type'} += s/\bwarpSize\b/hipWarpSize/g;
|
||||
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g;
|
||||
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g;
|
||||
$ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g;
|
||||
@@ -1676,6 +1675,12 @@ sub transformKernelLaunch {
|
||||
}
|
||||
}
|
||||
|
||||
sub transformCubNamespace {
|
||||
my $k = 0;
|
||||
$k += s/using\s*namespace\s*cub/using namespace hipcub/g;
|
||||
return $k;
|
||||
}
|
||||
|
||||
sub transformHostFunctions {
|
||||
my $k = 0;
|
||||
foreach $func (
|
||||
@@ -2368,6 +2373,7 @@ while (@ARGV) {
|
||||
simpleSubstitutions();
|
||||
transformExternShared();
|
||||
transformKernelLaunch();
|
||||
transformCubNamespace();
|
||||
if ($print_stats) {
|
||||
while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) {
|
||||
$convertedTags{$1}++;
|
||||
|
||||
@@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording
|
||||
|
||||
|
||||
#### How to enable profiling at HIP build time
|
||||
Recent pre-built packages of HIP are always built with profiling support enabled.
|
||||
For developer builds, you must enable marker support manually when compiling HIP.
|
||||
Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP.
|
||||
|
||||
1. Build HIP with ATP markers enabled
|
||||
HIP pre-built packages are enabled with ATP marker support by default.
|
||||
|
||||
@@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map):
|
||||
# Generating the callbacks data structure
|
||||
f.write('\n// HIP API callbacks data structure\n')
|
||||
f.write(
|
||||
'struct hip_api_data_t {\n' +
|
||||
'typedef struct hip_api_data_t {\n' +
|
||||
' uint64_t correlation_id;\n' +
|
||||
' uint32_t phase;\n' +
|
||||
' union {\n'
|
||||
@@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map):
|
||||
f.write(' } ' + name + ';\n')
|
||||
f.write(
|
||||
' } args;\n' +
|
||||
'};\n'
|
||||
'} hip_api_data_t;\n'
|
||||
)
|
||||
|
||||
# Generating the callbacks args data filling macros
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
if (CUDA_VERSION VERSION_GREATER "9.2")
|
||||
cmake_minimum_required(VERSION 3.12.3)
|
||||
else()
|
||||
cmake_minimum_required(VERSION 3.7.2)
|
||||
endif()
|
||||
cmake_minimum_required(VERSION 3.5.1)
|
||||
|
||||
project(hipify-clang)
|
||||
|
||||
@@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE
|
||||
LLVMOption
|
||||
LLVMCore)
|
||||
|
||||
if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7")
|
||||
if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1")
|
||||
target_link_libraries(hipify-clang PRIVATE clangToolingInclusions)
|
||||
endif()
|
||||
|
||||
@@ -72,7 +68,7 @@ else()
|
||||
set(StdCpp "-std=c++")
|
||||
endif()
|
||||
|
||||
if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0")
|
||||
if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0")
|
||||
string(APPEND StdCpp "14")
|
||||
# MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest
|
||||
elseif (MSVC)
|
||||
@@ -84,7 +80,38 @@ endif()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"")
|
||||
|
||||
install(TARGETS hipify-clang DESTINATION bin)
|
||||
set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang")
|
||||
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
|
||||
elseif(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
if (BIN_INSTALL_DIR)
|
||||
set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
|
||||
else()
|
||||
set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
|
||||
endif()
|
||||
elseif(BIN_INSTALL_DIR)
|
||||
set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
|
||||
endif()
|
||||
|
||||
install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX})
|
||||
|
||||
install(
|
||||
DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/
|
||||
DESTINATION ${CMAKE_INSTALL_PREFIX}
|
||||
COMPONENT clang-resource-headers
|
||||
FILES_MATCHING
|
||||
PATTERN "*.h"
|
||||
PATTERN "*.modulemap"
|
||||
PATTERN "algorithm"
|
||||
PATTERN "complex"
|
||||
PATTERN "new"
|
||||
PATTERN "ppc_wrappers" EXCLUDE
|
||||
PATTERN "openmp_wrappers" EXCLUDE
|
||||
)
|
||||
|
||||
if (HIPIFY_CLANG_TESTS)
|
||||
find_package(PythonInterp 2.7 REQUIRED)
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
|
||||
|
||||
Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
|
||||
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
|
||||
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
|
||||
|
||||
## <a name="build-and-install"></a> Build and install
|
||||
|
||||
@@ -154,7 +154,7 @@ To run it:
|
||||
|
||||
* Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option:
|
||||
|
||||
- Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB`
|
||||
- Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`
|
||||
|
||||
- Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
|
||||
|
||||
@@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38
|
||||
|
||||
Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38
|
||||
|
||||
Build system for the above configurations:
|
||||
Minimum build system requirements for the above configurations:
|
||||
|
||||
Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min).
|
||||
Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.
|
||||
|
||||
Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:
|
||||
|
||||
@@ -208,7 +208,7 @@ cmake
|
||||
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \
|
||||
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
|
||||
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \
|
||||
-DCUDA_CUB_ROOT_DIR=/srv/CUB \
|
||||
-DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
|
||||
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \
|
||||
..
|
||||
```
|
||||
@@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS
|
||||
64 - hipify-clang binary bitness
|
||||
64 - python 2.7.12 binary bitness
|
||||
========================================
|
||||
-- Testing: 64 tests, 12 threads --
|
||||
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64)
|
||||
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64)
|
||||
PASS: hipify :: unit_tests/device/atomics.cu (3 of 64)
|
||||
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64)
|
||||
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64)
|
||||
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64)
|
||||
PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64)
|
||||
PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64)
|
||||
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64)
|
||||
PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64)
|
||||
PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64)
|
||||
PASS: hipify :: unit_tests/samples/intro.cu (61 of 64)
|
||||
PASS: hipify :: unit_tests/samples/square.cu (62 of 64)
|
||||
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64)
|
||||
PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64)
|
||||
Testing Time: 2.98s
|
||||
Expected Passes : 64
|
||||
-- Testing: 65 tests, 12 threads --
|
||||
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65)
|
||||
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65)
|
||||
PASS: hipify :: unit_tests/device/atomics.cu (3 of 65)
|
||||
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65)
|
||||
PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65)
|
||||
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65)
|
||||
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65)
|
||||
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65)
|
||||
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65)
|
||||
PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65)
|
||||
PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65)
|
||||
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65)
|
||||
PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65)
|
||||
PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65)
|
||||
PASS: hipify :: unit_tests/samples/intro.cu (61 of 65)
|
||||
PASS: hipify :: unit_tests/samples/square.cu (62 of 65)
|
||||
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65)
|
||||
PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65)
|
||||
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65)
|
||||
Testing Time: 3.01s
|
||||
Expected Passes : 65
|
||||
[100%] Built target test-hipify
|
||||
```
|
||||
### <a name="windows"></a >Windows
|
||||
|
||||
On Windows 10 the following configurations are tested:
|
||||
|
||||
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18
|
||||
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
|
||||
|
||||
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38
|
||||
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38
|
||||
|
||||
LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38
|
||||
LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38
|
||||
|
||||
Build system for the above configurations:
|
||||
Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2:
|
||||
|
||||
Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4).
|
||||
Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5).
|
||||
|
||||
Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
|
||||
|
||||
@@ -371,7 +372,7 @@ cmake
|
||||
-- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm
|
||||
-- - Include path : F:/LLVM/9.0.0/dist/include
|
||||
-- - Binary path : F:/LLVM/9.0.0/dist/bin
|
||||
-- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6")
|
||||
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6")
|
||||
-- Found lit: C:/Program Files/Python36/Scripts/lit.exe
|
||||
-- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe
|
||||
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "CUDA2HIP.h"
|
||||
|
||||
// Maps the names of CUDA CUB API types to the corresponding HIP types
|
||||
const std::map<llvm::StringRef, hipCounter> CUDA_CUB_TYPE_NAME_MAP{
|
||||
{"cub", {"hipcub", "", CONV_TYPE, API_CUB}},
|
||||
};
|
||||
@@ -251,6 +251,11 @@ namespace perl {
|
||||
*streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl;
|
||||
}
|
||||
|
||||
void generateCubNamespace(unique_ptr<ostream>& streamPtr) {
|
||||
*streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl;
|
||||
*streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl;
|
||||
}
|
||||
|
||||
void generateHostFunctions(unique_ptr<ostream>& streamPtr) {
|
||||
*streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl;
|
||||
set<string> &funcSet = DeviceSymbolFunctions0;
|
||||
@@ -358,6 +363,7 @@ namespace perl {
|
||||
generateSimpleSubstitutions(streamPtr);
|
||||
generateExternShared(streamPtr);
|
||||
generateKernelLaunch(streamPtr);
|
||||
generateCubNamespace(streamPtr);
|
||||
generateHostFunctions(streamPtr);
|
||||
generateDeviceFunctions(streamPtr);
|
||||
*streamPtr.get() << endl << "# Count of transforms in all files" << endl;
|
||||
@@ -401,6 +407,7 @@ namespace perl {
|
||||
*streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl;
|
||||
*streamPtr.get() << tab_2 << "transformExternShared();" << endl;
|
||||
*streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl;
|
||||
*streamPtr.get() << tab_2 << "transformCubNamespace();" << endl;
|
||||
*streamPtr.get() << tab_2 << "if ($print_stats) {" << endl;
|
||||
*streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl;
|
||||
*streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl;
|
||||
|
||||
@@ -31,8 +31,6 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
|
||||
{"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}},
|
||||
// no analogue
|
||||
{"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}},
|
||||
// NOTE: int warpSize is a field of cudaDeviceProp
|
||||
{"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}},
|
||||
|
||||
// no analogue
|
||||
{"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
|
||||
@@ -33,16 +33,21 @@ THE SOFTWARE.
|
||||
#include "StringUtils.h"
|
||||
#include "ArgParse.h"
|
||||
|
||||
namespace ct = clang::tooling;
|
||||
namespace mat = clang::ast_matchers;
|
||||
|
||||
const std::string sHIP = "HIP";
|
||||
const std::string sROC = "ROC";
|
||||
const std::string sCub = "cub";
|
||||
const std::string sHipcub = "hipcub";
|
||||
const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED";
|
||||
const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
|
||||
std::string sHIP_SYMBOL = "HIP_SYMBOL";
|
||||
std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
|
||||
std::string s_reinterpret_cast = "reinterpret_cast<const void*>";
|
||||
const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL(";
|
||||
const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL";
|
||||
const std::string sDim3 = "dim3(";
|
||||
|
||||
const std::string s_hiprand_kernel_h = "hiprand_kernel.h";
|
||||
const std::string s_hiprand_h = "hiprand.h";
|
||||
const std::string sOnce = "once";
|
||||
const std::string s_string_literal = "[string literal]";
|
||||
// CUDA identifiers, used in matchers
|
||||
const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol";
|
||||
const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync";
|
||||
const std::string sCudaGetSymbolSize = "cudaGetSymbolSize";
|
||||
@@ -51,6 +56,14 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol";
|
||||
const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync";
|
||||
const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig";
|
||||
const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes";
|
||||
// Matchers' names
|
||||
const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar";
|
||||
const StringRef sCudaLaunchKernel = "cudaLaunchKernel";
|
||||
const StringRef sCudaHostFuncCall = "cudaHostFuncCall";
|
||||
const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall";
|
||||
const StringRef sCubNamespacePrefix = "cubNamespacePrefix";
|
||||
const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl";
|
||||
const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl";
|
||||
|
||||
std::set<std::string> DeviceSymbolFunctions0 {
|
||||
{sCudaMemcpyToSymbol},
|
||||
@@ -78,7 +91,7 @@ std::set<std::string> ReinterpretFunctions1{
|
||||
};
|
||||
|
||||
void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
size_t begin = 0;
|
||||
while ((begin = s.find("cu", begin)) != StringRef::npos) {
|
||||
const size_t end = s.find_first_of(" ", begin + 4);
|
||||
@@ -86,7 +99,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
|
||||
const auto found = CUDA_RENAMES_MAP().find(name);
|
||||
if (found != CUDA_RENAMES_MAP().end()) {
|
||||
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
|
||||
hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
|
||||
hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
|
||||
Statistics::current().incrementCounter(counter, name.str());
|
||||
if (!Statistics::isUnsupported(counter)) {
|
||||
clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
|
||||
@@ -95,13 +108,24 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
|
||||
insertReplacement(Rep, fullSL);
|
||||
}
|
||||
}
|
||||
if (end == StringRef::npos) {
|
||||
break;
|
||||
}
|
||||
if (end == StringRef::npos) break;
|
||||
begin = end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) {
|
||||
clang::SourceLocation sl(sr.getBegin());
|
||||
clang::SourceLocation end(sr.getEnd());
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
|
||||
StringRef sfull = StringRef(SM.getCharacterData(sl), length);
|
||||
size_t offset = sfull.find(str);
|
||||
if (offset > 0) {
|
||||
sl = sl.getLocWithOffset(offset);
|
||||
}
|
||||
return sl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Look at, and consider altering, a given token.
|
||||
*
|
||||
@@ -109,7 +133,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
|
||||
* If it's an unsupported CUDA identifier, a warning is emitted.
|
||||
* Otherwise, the source file is updated with the corresponding hipification.
|
||||
*/
|
||||
void HipifyAction::RewriteToken(const clang::Token& t) {
|
||||
void HipifyAction::RewriteToken(const clang::Token &t) {
|
||||
// String literals containing CUDA references need fixing.
|
||||
if (t.is(clang::tok::string_literal)) {
|
||||
StringRef s(t.getLiteralData(), t.getLength());
|
||||
@@ -124,20 +148,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) {
|
||||
FindAndReplace(name, sl, CUDA_RENAMES_MAP());
|
||||
}
|
||||
|
||||
void HipifyAction::FindAndReplace(llvm::StringRef name,
|
||||
void HipifyAction::FindAndReplace(StringRef name,
|
||||
clang::SourceLocation sl,
|
||||
const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace) {
|
||||
const std::map<StringRef, hipCounter> &repMap,
|
||||
bool bReplace) {
|
||||
const auto found = repMap.find(name);
|
||||
if (found == repMap.end()) {
|
||||
// So it's an identifier, but not CUDA? Boring.
|
||||
return;
|
||||
}
|
||||
Statistics::current().incrementCounter(found->second, name.str());
|
||||
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
|
||||
clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
|
||||
// Warn the user about unsupported identifier.
|
||||
if (Statistics::isUnsupported(found->second)) {
|
||||
std::string sWarn;
|
||||
Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP";
|
||||
Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
|
||||
sWarn = "" + sWarn;
|
||||
const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0.");
|
||||
DE.Report(sl, ID) << sWarn;
|
||||
@@ -147,7 +172,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
|
||||
return;
|
||||
}
|
||||
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
ct::Replacement Rep(SM, sl, name.size(), repName.str());
|
||||
clang::FullSourceLoc fullSL(sl, SM);
|
||||
insertReplacement(Rep, fullSL);
|
||||
@@ -155,7 +180,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
|
||||
|
||||
namespace {
|
||||
|
||||
clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
|
||||
clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
|
||||
clang::SourceLocation begin = exprRange.getBegin();
|
||||
clang::SourceLocation end = exprRange.getEnd();
|
||||
bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
|
||||
@@ -167,7 +192,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan
|
||||
}
|
||||
}
|
||||
|
||||
clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
|
||||
clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
|
||||
clang::SourceLocation begin = exprRange.getBegin();
|
||||
clang::SourceLocation end = exprRange.getEnd();
|
||||
// If the range is contained within a macro, update the macro definition.
|
||||
@@ -178,7 +203,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa
|
||||
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
|
||||
}
|
||||
|
||||
StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
|
||||
StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
|
||||
return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
|
||||
}
|
||||
|
||||
@@ -186,53 +211,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp
|
||||
* Get a string representation of the expression `arg`, unless it's a defaulting function
|
||||
* call argument, in which case get a 0. Used for building argument lists to kernel calls.
|
||||
*/
|
||||
std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
|
||||
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
|
||||
return "0";
|
||||
} else {
|
||||
return readSourceText(SM, arg->getSourceRange());
|
||||
}
|
||||
std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
|
||||
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
|
||||
else return readSourceText(SM, arg->getSourceRange());
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
bool HipifyAction::Exclude(const hipCounter & hipToken) {
|
||||
bool HipifyAction::Exclude(const hipCounter &hipToken) {
|
||||
switch (hipToken.type) {
|
||||
case CONV_INCLUDE_CUDA_MAIN_H:
|
||||
switch (hipToken.apiType) {
|
||||
case API_DRIVER:
|
||||
case API_RUNTIME:
|
||||
if (insertedRuntimeHeader) { return true; }
|
||||
if (insertedRuntimeHeader) return true;
|
||||
insertedRuntimeHeader = true;
|
||||
return false;
|
||||
case API_BLAS:
|
||||
if (insertedBLASHeader) { return true; }
|
||||
if (insertedBLASHeader) return true;
|
||||
insertedBLASHeader = true;
|
||||
return false;
|
||||
case API_RAND:
|
||||
if (hipToken.hipName == "hiprand_kernel.h") {
|
||||
if (insertedRAND_kernelHeader) { return true; }
|
||||
if (hipToken.hipName == s_hiprand_kernel_h) {
|
||||
if (insertedRAND_kernelHeader) return true;
|
||||
insertedRAND_kernelHeader = true;
|
||||
return false;
|
||||
} else if (hipToken.hipName == "hiprand.h") {
|
||||
if (insertedRANDHeader) { return true; }
|
||||
} else if (hipToken.hipName == s_hiprand_h) {
|
||||
if (insertedRANDHeader) return true;
|
||||
insertedRANDHeader = true;
|
||||
return false;
|
||||
}
|
||||
case API_DNN:
|
||||
if (insertedDNNHeader) { return true; }
|
||||
if (insertedDNNHeader) return true;
|
||||
insertedDNNHeader = true;
|
||||
return false;
|
||||
case API_FFT:
|
||||
if (insertedFFTHeader) { return true; }
|
||||
if (insertedFFTHeader) return true;
|
||||
insertedFFTHeader = true;
|
||||
return false;
|
||||
case API_COMPLEX:
|
||||
if (insertedComplexHeader) { return true; }
|
||||
if (insertedComplexHeader) return true;
|
||||
insertedComplexHeader = true;
|
||||
return false;
|
||||
case API_SPARSE:
|
||||
if (insertedSPARSEHeader) { return true; }
|
||||
if (insertedSPARSEHeader) return true;
|
||||
insertedSPARSEHeader = true;
|
||||
return false;
|
||||
default:
|
||||
@@ -240,13 +262,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
|
||||
}
|
||||
return false;
|
||||
case CONV_INCLUDE:
|
||||
if (hipToken.hipName.empty()) {
|
||||
return true;
|
||||
}
|
||||
if (hipToken.hipName.empty()) return true;
|
||||
switch (hipToken.apiType) {
|
||||
case API_RAND:
|
||||
if (hipToken.hipName == "hiprand_kernel.h") {
|
||||
if (insertedRAND_kernelHeader) { return true; }
|
||||
if (hipToken.hipName == s_hiprand_kernel_h) {
|
||||
if (insertedRAND_kernelHeader) return true;
|
||||
insertedRAND_kernelHeader = true;
|
||||
}
|
||||
return false;
|
||||
@@ -267,24 +287,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
clang::CharSourceRange filename_range,
|
||||
const clang::FileEntry*, StringRef,
|
||||
StringRef, const clang::Module*) {
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(hash_loc)) {
|
||||
return;
|
||||
}
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(hash_loc)) return;
|
||||
if (!firstHeader) {
|
||||
firstHeader = true;
|
||||
firstHeaderLoc = hash_loc;
|
||||
}
|
||||
const auto found = CUDA_INCLUDE_MAP.find(file_name);
|
||||
if (found == CUDA_INCLUDE_MAP.end()) {
|
||||
return;
|
||||
}
|
||||
if (found == CUDA_INCLUDE_MAP.end()) return;
|
||||
bool exclude = Exclude(found->second);
|
||||
Statistics::current().incrementCounter(found->second, file_name.str());
|
||||
|
||||
clang::SourceLocation sl = filename_range.getBegin();
|
||||
if (Statistics::isUnsupported(found->second)) {
|
||||
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
|
||||
clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
|
||||
DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header."));
|
||||
return;
|
||||
}
|
||||
@@ -293,11 +308,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
if (!exclude) {
|
||||
clang::SmallString<128> includeBuffer;
|
||||
llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
|
||||
if (is_angled) {
|
||||
newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
|
||||
} else {
|
||||
newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
|
||||
}
|
||||
if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
|
||||
else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
|
||||
} else {
|
||||
// hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
|
||||
sl = hash_loc;
|
||||
@@ -309,48 +321,52 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
}
|
||||
|
||||
void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
|
||||
if (pragmaOnce) {
|
||||
return;
|
||||
}
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(Loc)) {
|
||||
return;
|
||||
}
|
||||
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
|
||||
if (pragmaOnce) return;
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(Loc)) return;
|
||||
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
|
||||
clang::Token tok;
|
||||
PP.Lex(tok);
|
||||
StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
|
||||
if (Text == "once") {
|
||||
if (Text == sOnce) {
|
||||
pragmaOnce = true;
|
||||
pragmaOnceLoc = tok.getEndLoc();
|
||||
}
|
||||
}
|
||||
|
||||
bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
StringRef refName = "cudaLaunchKernel";
|
||||
const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
|
||||
if (!launchKernel) {
|
||||
return false;
|
||||
}
|
||||
const clang::Expr* calleeExpr = launchKernel->getCallee();
|
||||
if (!calleeExpr) {
|
||||
return false;
|
||||
}
|
||||
const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
|
||||
if (!caleeDecl) {
|
||||
return false;
|
||||
}
|
||||
const clang::CallExpr* config = launchKernel->getConfig();
|
||||
if (!config) {
|
||||
return false;
|
||||
}
|
||||
bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) {
|
||||
auto *launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(sCudaLaunchKernel);
|
||||
if (!launchKernel) return false;
|
||||
auto *calleeExpr = launchKernel->getCallee();
|
||||
if (!calleeExpr) return false;
|
||||
auto *caleeDecl = launchKernel->getDirectCallee();
|
||||
if (!caleeDecl) return false;
|
||||
auto *config = launchKernel->getConfig();
|
||||
if (!config) return false;
|
||||
clang::SmallString<40> XStr;
|
||||
llvm::raw_svector_ostream OS(XStr);
|
||||
clang::LangOptions DefaultLangOptions;
|
||||
clang::SourceManager* SM = Result.SourceManager;
|
||||
OS << sHipLaunchKernelGGL;
|
||||
if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "(";
|
||||
OS << readSourceText(*SM, calleeExpr->getSourceRange());
|
||||
auto *SM = Result.SourceManager;
|
||||
clang::SourceRange sr = calleeExpr->getSourceRange();
|
||||
std::string kern = readSourceText(*SM, sr).str();
|
||||
OS << sHipLaunchKernelGGL << "(";
|
||||
if (caleeDecl->isTemplateInstantiation()) {
|
||||
OS << sHIP_KERNEL_NAME << "(";
|
||||
std::string cub = sCub + "::";
|
||||
std::string hipcub;
|
||||
const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub);
|
||||
if (found != CUDA_CUB_TYPE_NAME_MAP.end()) {
|
||||
hipcub = found->second.hipName.str() + "::";
|
||||
} else {
|
||||
hipcub = sHipcub + "::";
|
||||
}
|
||||
size_t pos = kern.find(cub);
|
||||
while (pos != std::string::npos) {
|
||||
kern.replace(pos, cub.size(), hipcub);
|
||||
pos = kern.find(cub, pos + hipcub.size());
|
||||
}
|
||||
}
|
||||
OS << kern;
|
||||
if (caleeDecl->isTemplateInstantiation()) OS << ")";
|
||||
OS << ", ";
|
||||
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
|
||||
@@ -381,29 +397,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
|
||||
ct::Replacement Rep(*SM, launchStart, length, OS.str());
|
||||
clang::FullSourceLoc fullSL(launchStart, *SM);
|
||||
insertReplacement(Rep, fullSL);
|
||||
hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
|
||||
Statistics::current().incrementCounter(counter, refName.str());
|
||||
hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
|
||||
Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
StringRef refName = "cudaSharedIncompleteArrayVar";
|
||||
auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
|
||||
if (!sharedVar) {
|
||||
return false;
|
||||
}
|
||||
bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) {
|
||||
auto *sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(sCudaSharedIncompleteArrayVar);
|
||||
if (!sharedVar) return false;
|
||||
// Example: extern __shared__ uint sRadix1[];
|
||||
if (!sharedVar->hasExternalFormalLinkage()) {
|
||||
return false;
|
||||
}
|
||||
if (!sharedVar->hasExternalFormalLinkage()) return false;
|
||||
clang::QualType QT = sharedVar->getType();
|
||||
std::string typeName;
|
||||
if (QT->isIncompleteArrayType()) {
|
||||
const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
|
||||
const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
|
||||
QT = AT->getElementType();
|
||||
if (QT.getTypePtr()->isBuiltinType()) {
|
||||
QT = QT.getCanonicalType();
|
||||
const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
|
||||
auto *BT = clang::dyn_cast<clang::BuiltinType>(QT);
|
||||
if (BT) {
|
||||
clang::LangOptions LO;
|
||||
LO.CUDA = true;
|
||||
@@ -417,7 +428,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
|
||||
if (!typeName.empty()) {
|
||||
clang::SourceLocation slStart = sharedVar->getOuterLocStart();
|
||||
clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc());
|
||||
clang::SourceManager* SM = Result.SourceManager;
|
||||
auto *SM = Result.SourceManager;
|
||||
size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
|
||||
std::string varName = sharedVar->getNameAsString();
|
||||
std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")";
|
||||
@@ -425,74 +436,84 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
|
||||
clang::FullSourceLoc fullSL(slStart, *SM);
|
||||
insertReplacement(Rep, fullSL);
|
||||
hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME};
|
||||
Statistics::current().incrementCounter(counter, refName.str());
|
||||
Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaDeviceFuncCall")) {
|
||||
const clang::FunctionDecl *funcDcl = call->getDirectCallee();
|
||||
if (!funcDcl) {
|
||||
return false;
|
||||
}
|
||||
bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaDeviceFuncCall)) {
|
||||
auto *funcDcl = call->getDirectCallee();
|
||||
if (!funcDcl) return false;
|
||||
FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>("cubNamespacePrefix")) {
|
||||
if (!decl) {
|
||||
return false;
|
||||
}
|
||||
bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (auto *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>(sCubNamespacePrefix)) {
|
||||
clang::QualType QT = decl->getUnderlyingType();
|
||||
const clang::Type* t = QT.getTypePtr();
|
||||
if (!t) {
|
||||
return false;
|
||||
}
|
||||
const clang::ElaboratedType* et = t->getAs<clang::ElaboratedType>();
|
||||
if (!et) {
|
||||
return false;
|
||||
}
|
||||
auto *t = QT.getTypePtr();
|
||||
if (!t) return false;
|
||||
const clang::ElaboratedType *et = t->getAs<clang::ElaboratedType>();
|
||||
if (!et) return false;
|
||||
const clang::NestedNameSpecifier *nns = et->getQualifier();
|
||||
if (!nns) {
|
||||
return false;
|
||||
}
|
||||
if (!nns) return false;
|
||||
const clang::NamespaceDecl *nsd = nns->getAsNamespace();
|
||||
if (!nsd) {
|
||||
return false;
|
||||
}
|
||||
if (!nsd) return false;
|
||||
const clang::TypeSourceInfo *si = decl->getTypeSourceInfo();
|
||||
const clang::TypeLoc tloc = si->getTypeLoc();
|
||||
const clang::SourceRange sr = tloc.getSourceRange();
|
||||
clang::SourceLocation sl(sr.getBegin());
|
||||
clang::SourceLocation end(sr.getEnd());
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
|
||||
StringRef sfull = StringRef(SM.getCharacterData(sl), length);
|
||||
std::string name = nsd->getDeclName().getAsString();
|
||||
size_t offset = sfull.find(name);
|
||||
if (offset > 0) {
|
||||
sl = sl.getLocWithOffset(offset);
|
||||
}
|
||||
FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP);
|
||||
FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
if (const clang::CallExpr * call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaHostFuncCall")) {
|
||||
if (!call->getNumArgs()) {
|
||||
return false;
|
||||
bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (auto *decl = Result.Nodes.getNodeAs<clang::UsingDirectiveDecl>(sCubUsingNamespaceDecl)) {
|
||||
if (auto nsd = decl->getNominatedNamespace()) {
|
||||
FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP);
|
||||
return true;
|
||||
}
|
||||
const clang::FunctionDecl* funcDcl = call->getDirectCallee();
|
||||
if (!funcDcl) {
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (auto *decl = Result.Nodes.getNodeAs<clang::FunctionTemplateDecl>(sCubFunctionTemplateDecl)) {
|
||||
auto *Tparams = decl->getTemplateParameters();
|
||||
bool ret = false;
|
||||
for (size_t I = 0; I < Tparams->size(); ++I) {
|
||||
const clang::ValueDecl *valueDecl = dyn_cast<clang::ValueDecl>(Tparams->getParam(I));
|
||||
if (!valueDecl) continue;
|
||||
clang::QualType QT = valueDecl->getType();
|
||||
auto *t = QT.getTypePtr();
|
||||
if (!t) continue;
|
||||
const clang::ElaboratedType *et = t->getAs<clang::ElaboratedType>();
|
||||
if (!et) continue;
|
||||
const clang::NestedNameSpecifier *nns = et->getQualifier();
|
||||
if (!nns) continue;
|
||||
const clang::NamespaceDecl *nsd = nns->getAsNamespace();
|
||||
if (!nsd) continue;
|
||||
const clang::SourceRange sr = valueDecl->getSourceRange();
|
||||
std::string name = nsd->getDeclName().getAsString();
|
||||
FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
|
||||
ret = true;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (auto *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaHostFuncCall)) {
|
||||
if (!call->getNumArgs()) return false;
|
||||
auto *funcDcl = call->getDirectCallee();
|
||||
if (!funcDcl) return false;
|
||||
std::string sName = funcDcl->getDeclName().getAsString();
|
||||
unsigned int argNum = 0;
|
||||
bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false;
|
||||
@@ -506,7 +527,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
|
||||
clang::SmallString<40> XStr;
|
||||
llvm::raw_svector_ostream OS(XStr);
|
||||
clang::SourceRange sr = call->getArg(argNum)->getSourceRange();
|
||||
clang::SourceManager* SM = Result.SourceManager;
|
||||
auto *SM = Result.SourceManager;
|
||||
OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")";
|
||||
clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() });
|
||||
clang::SourceLocation s = replacementRange.getBegin();
|
||||
@@ -521,7 +542,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
|
||||
return false;
|
||||
}
|
||||
|
||||
void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
|
||||
void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) {
|
||||
llcompat::insertReplacement(*replacements, rep);
|
||||
if (PrintStats) {
|
||||
rep.getLength();
|
||||
@@ -530,10 +551,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
|
||||
Finder.reset(new clang::ast_matchers::MatchFinder);
|
||||
std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) {
|
||||
Finder.reset(new mat::MatchFinder);
|
||||
// Replace the <<<...>>> language extension with a hip kernel launch
|
||||
Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
|
||||
Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this);
|
||||
Finder->addMatcher(
|
||||
mat::varDecl(
|
||||
mat::isExpansionInMainFile(),
|
||||
@@ -541,7 +562,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
|
||||
mat::hasAttr(clang::attr::CUDAShared),
|
||||
mat::hasType(mat::incompleteArrayType())
|
||||
)
|
||||
).bind("cudaSharedIncompleteArrayVar"),
|
||||
).bind(sCudaSharedIncompleteArrayVar),
|
||||
this
|
||||
);
|
||||
Finder->addMatcher(
|
||||
@@ -561,7 +582,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
|
||||
)
|
||||
)
|
||||
)
|
||||
).bind("cudaHostFuncCall"),
|
||||
).bind(sCudaHostFuncCall),
|
||||
this
|
||||
);
|
||||
Finder->addMatcher(
|
||||
@@ -576,7 +597,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
|
||||
mat::unless(mat::hasAttr(clang::attr::CUDAHost))
|
||||
)
|
||||
)
|
||||
).bind("cudaDeviceFuncCall"),
|
||||
).bind(sCudaDeviceFuncCall),
|
||||
this
|
||||
);
|
||||
Finder->addMatcher(
|
||||
@@ -586,12 +607,26 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
|
||||
mat::elaboratedType(
|
||||
mat::hasQualifier(
|
||||
mat::specifiesNamespace(
|
||||
mat::hasName("cub")
|
||||
mat::hasName(sCub)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
).bind("cubNamespacePrefix"),
|
||||
).bind(sCubNamespacePrefix),
|
||||
this
|
||||
);
|
||||
// TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl
|
||||
Finder->addMatcher(
|
||||
mat::functionTemplateDecl(
|
||||
mat::isExpansionInMainFile()
|
||||
).bind(sCubFunctionTemplateDecl),
|
||||
this
|
||||
);
|
||||
// TODO: Maybe worth to make it more concrete
|
||||
Finder->addMatcher(
|
||||
mat::usingDirectiveDecl(
|
||||
mat::isExpansionInMainFile()
|
||||
).bind(sCubUsingNamespaceDecl),
|
||||
this
|
||||
);
|
||||
// Ownership is transferred to the caller.
|
||||
@@ -599,10 +634,8 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
|
||||
}
|
||||
|
||||
void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) {
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(Loc)) {
|
||||
return;
|
||||
}
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(Loc)) return;
|
||||
StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength());
|
||||
Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc()));
|
||||
}
|
||||
@@ -615,12 +648,12 @@ void HipifyAction::EndSourceFileAction() {
|
||||
// one copy of the hip include into every file.
|
||||
bool placeForIncludeCalculated = false;
|
||||
clang::SourceLocation sl, controllingMacroLoc;
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
|
||||
clang::HeaderSearch& HS = PP.getHeaderSearchInfo();
|
||||
clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup();
|
||||
const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID());
|
||||
const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
|
||||
clang::HeaderSearch &HS = PP.getHeaderSearchInfo();
|
||||
clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup();
|
||||
const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID());
|
||||
const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
|
||||
if (controllingMacro) {
|
||||
auto found = Ifndefs.find(controllingMacro->getName().str());
|
||||
if (found != Ifndefs.end()) {
|
||||
@@ -629,19 +662,13 @@ void HipifyAction::EndSourceFileAction() {
|
||||
}
|
||||
}
|
||||
if (pragmaOnce) {
|
||||
if (placeForIncludeCalculated) {
|
||||
sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
|
||||
} else {
|
||||
sl = pragmaOnceLoc;
|
||||
}
|
||||
if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
|
||||
else sl = pragmaOnceLoc;
|
||||
placeForIncludeCalculated = true;
|
||||
}
|
||||
if (!placeForIncludeCalculated) {
|
||||
if (firstHeader) {
|
||||
sl = firstHeaderLoc;
|
||||
} else {
|
||||
sl = SM.getLocForStartOfFile(SM.getMainFileID());
|
||||
}
|
||||
if (firstHeader) sl = firstHeaderLoc;
|
||||
else sl = SM.getLocForStartOfFile(SM.getMainFileID());
|
||||
}
|
||||
clang::FullSourceLoc fullSL(sl, SM);
|
||||
ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
|
||||
@@ -656,15 +683,15 @@ namespace {
|
||||
* A silly little class to proxy PPCallbacks back to the HipifyAction class.
|
||||
*/
|
||||
class PPCallbackProxy : public clang::PPCallbacks {
|
||||
HipifyAction& hipifyAction;
|
||||
HipifyAction &hipifyAction;
|
||||
|
||||
public:
|
||||
explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
|
||||
explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {}
|
||||
|
||||
void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
|
||||
void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token,
|
||||
StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
|
||||
const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
|
||||
const clang::Module* imported
|
||||
const clang::FileEntry *file, StringRef search_path, StringRef relative_path,
|
||||
const clang::Module *imported
|
||||
#if LLVM_VERSION_MAJOR > 6
|
||||
, clang::SrcMgr::CharacteristicKind FileType
|
||||
#endif
|
||||
@@ -688,10 +715,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) {
|
||||
}
|
||||
|
||||
void HipifyAction::ExecuteAction() {
|
||||
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
|
||||
auto &SM = getCompilerInstance().getSourceManager();
|
||||
// Start lexing the specified input file.
|
||||
const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
|
||||
const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID());
|
||||
clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
|
||||
RawLex.SetKeepWhitespaceMode(true);
|
||||
// Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
|
||||
@@ -709,10 +736,12 @@ void HipifyAction::ExecuteAction() {
|
||||
clang::ASTFrontendAction::ExecuteAction();
|
||||
}
|
||||
|
||||
void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
|
||||
void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) {
|
||||
if (cudaLaunchKernel(Result)) return;
|
||||
if (cudaSharedIncompleteArrayVar(Result)) return;
|
||||
if (cudaHostFuncCall(Result)) return;
|
||||
if (cudaDeviceFuncCall(Result)) return;
|
||||
if (cubNamespacePrefix(Result)) return;
|
||||
if (cubFunctionTemplateDecl(Result)) return;
|
||||
if (cubUsingNamespaceDecl(Result)) return;
|
||||
}
|
||||
|
||||
@@ -31,17 +31,18 @@ THE SOFTWARE.
|
||||
#include "Statistics.h"
|
||||
|
||||
namespace ct = clang::tooling;
|
||||
namespace mat = clang::ast_matchers;
|
||||
using namespace llvm;
|
||||
|
||||
/**
|
||||
* A FrontendAction that hipifies CUDA programs.
|
||||
*/
|
||||
class HipifyAction : public clang::ASTFrontendAction,
|
||||
public clang::ast_matchers::MatchFinder::MatchCallback {
|
||||
public mat::MatchFinder::MatchCallback {
|
||||
private:
|
||||
ct::Replacements* replacements;
|
||||
ct::Replacements *replacements;
|
||||
std::map<std::string, clang::SourceLocation> Ifndefs;
|
||||
std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
|
||||
std::unique_ptr<mat::MatchFinder> Finder;
|
||||
// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
|
||||
// ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
|
||||
// not, we insert it at the top of the file when we finish processing it.
|
||||
@@ -62,17 +63,20 @@ private:
|
||||
void RewriteString(StringRef s, clang::SourceLocation start);
|
||||
// Replace a CUDA identifier with the corresponding hip identifier, if applicable.
|
||||
void RewriteToken(const clang::Token &t);
|
||||
// Calculate str's SourceLocation in SourceRange sr
|
||||
clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr);
|
||||
|
||||
public:
|
||||
explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(),
|
||||
replacements(replacements) {}
|
||||
// MatchCallback listeners
|
||||
bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result);
|
||||
bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result);
|
||||
bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result);
|
||||
// Called by the preprocessor for each include directive during the non-raw lexing pass.
|
||||
void InclusionDirective(clang::SourceLocation hash_loc,
|
||||
const clang::Token &include_token,
|
||||
@@ -91,7 +95,7 @@ public:
|
||||
|
||||
protected:
|
||||
// Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
|
||||
void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
|
||||
void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL);
|
||||
// FrontendAction entry point.
|
||||
void ExecuteAction() override;
|
||||
// Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options.
|
||||
@@ -99,8 +103,8 @@ protected:
|
||||
// Called at the start of each new file to process.
|
||||
void EndSourceFileAction() override;
|
||||
// MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for.
|
||||
void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
|
||||
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
|
||||
bool Exclude(const hipCounter & hipToken);
|
||||
void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace = true);
|
||||
void run(const mat::MatchFinder::MatchResult &Result) override;
|
||||
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override;
|
||||
bool Exclude(const hipCounter &hipToken);
|
||||
void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map<StringRef, hipCounter> &repMap, bool bReplace = true);
|
||||
};
|
||||
|
||||
@@ -199,6 +199,7 @@ int main(int argc, const char **argv) {
|
||||
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN));
|
||||
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN));
|
||||
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
|
||||
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN));
|
||||
if (!CudaPath.empty()) {
|
||||
std::string sCudaPath = "--cuda-path=" + CudaPath;
|
||||
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN));
|
||||
|
||||
@@ -86,7 +86,7 @@ struct Bundled_code {
|
||||
char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
|
||||
} header;
|
||||
std::string triple;
|
||||
std::vector<char> blob;
|
||||
std::string blob;
|
||||
};
|
||||
|
||||
#define magic_string_ "__CLANG_OFFLOAD_BUNDLE__"
|
||||
|
||||
@@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl(
|
||||
} // Namespace hip_impl.
|
||||
|
||||
|
||||
template <typename F>
|
||||
inline
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
|
||||
F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
|
||||
|
||||
using namespace hip_impl;
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
|
||||
dynSharedMemPerBlk, blockSizeLimit);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
|
||||
uint32_t blockSize, size_t dynSharedMemPerBlk) {
|
||||
|
||||
using namespace hip_impl;
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
inline
|
||||
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
|
||||
@@ -59,12 +59,17 @@ float atomicAdd(float* address, float val)
|
||||
{
|
||||
unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
|
||||
unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
|
||||
unsigned int r;
|
||||
|
||||
unsigned int r;
|
||||
do {
|
||||
r = old;
|
||||
r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
|
||||
if (r != old) { r = old; continue; }
|
||||
|
||||
old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
|
||||
} while (r != old);
|
||||
|
||||
if (r == old) break;
|
||||
} while (true);
|
||||
|
||||
return __uint_as_float(r);
|
||||
}
|
||||
@@ -74,13 +79,18 @@ double atomicAdd(double* address, double val)
|
||||
{
|
||||
unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(address)};
|
||||
unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
|
||||
unsigned long long r;
|
||||
|
||||
unsigned long long r;
|
||||
do {
|
||||
r = old;
|
||||
r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
|
||||
if (r != old) { r = old; continue; }
|
||||
|
||||
old = atomicCAS(
|
||||
uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
|
||||
} while (r != old);
|
||||
|
||||
if (r == old) break;
|
||||
} while (true);
|
||||
|
||||
return __longlong_as_double(r);
|
||||
}
|
||||
@@ -144,7 +154,13 @@ unsigned long long atomicMin(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
|
||||
while (val < tmp) { tmp = atomicCAS(address, tmp, val); }
|
||||
while (val < tmp) {
|
||||
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
|
||||
|
||||
if (tmp1 != tmp) { tmp = tmp1; continue; }
|
||||
|
||||
tmp = atomicCAS(address, tmp, val);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
@@ -167,7 +183,13 @@ unsigned long long atomicMax(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
|
||||
while (tmp < val) { tmp = atomicCAS(address, tmp, val); }
|
||||
while (tmp < val) {
|
||||
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
|
||||
|
||||
if (tmp1 != tmp) { tmp = tmp1; continue; }
|
||||
|
||||
tmp = atomicCAS(address, tmp, val);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
@@ -177,7 +199,7 @@ inline
|
||||
unsigned int atomicInc(unsigned int* address, unsigned int val)
|
||||
{
|
||||
__device__
|
||||
extern
|
||||
extern
|
||||
unsigned int __builtin_amdgcn_atomic_inc(
|
||||
unsigned int*,
|
||||
unsigned int,
|
||||
@@ -194,7 +216,7 @@ inline
|
||||
unsigned int atomicDec(unsigned int* address, unsigned int val)
|
||||
{
|
||||
__device__
|
||||
extern
|
||||
extern
|
||||
unsigned int __builtin_amdgcn_atomic_dec(
|
||||
unsigned int*,
|
||||
unsigned int,
|
||||
|
||||
@@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
|
||||
*
|
||||
* @param[out] dst Data ptr to be filled
|
||||
* @param[in] constant value to be set
|
||||
* @param[in] sizeBytes Data size in bytes
|
||||
* @param[in] number of values to be set
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes);
|
||||
hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
|
||||
|
||||
/**
|
||||
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
|
||||
@@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte
|
||||
*
|
||||
* @param[out] dst Data ptr to be filled
|
||||
* @param[in] constant value to be set
|
||||
* @param[in] sizeBytes Data size in bytes
|
||||
* @param[in] number of values to be set
|
||||
* @param[in] stream - Stream identifier
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0));
|
||||
hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
|
||||
|
||||
/**
|
||||
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
|
||||
@@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz
|
||||
*
|
||||
* @param[out] dst Data ptr to be filled
|
||||
* @param[in] constant value to be set
|
||||
* @param[in] sizeBytes Data size in bytes
|
||||
* @param[in] number of values to be set
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes);
|
||||
hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
|
||||
|
||||
/**
|
||||
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
|
||||
@@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy
|
||||
*
|
||||
* @param[out] dst Data ptr to be filled
|
||||
* @param[in] constant value to be set
|
||||
* @param[in] sizeBytes Data size in bytes
|
||||
* @param[in] number of values to be set
|
||||
* @param[in] stream - Stream identifier
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0));
|
||||
hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
|
||||
|
||||
/**
|
||||
* @brief Fills the memory area pointed to by dest with the constant integer
|
||||
@@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
* @param [out] gridSize minimum grid size for maximum potential occupancy
|
||||
* @param [out] blockSize block size for maximum potential occupancy
|
||||
* @param [in] f kernel function for which occupancy is calulated
|
||||
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
|
||||
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
|
||||
* @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue
|
||||
*/
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit);
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
|
||||
const void* f, size_t dynamicSMemSize,
|
||||
int blockSizeLimit);
|
||||
|
||||
/**
|
||||
* @brief Returns occupancy for a device function.
|
||||
@@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
* @param [out] numBlocks Returned occupancy
|
||||
* @param [in] func Kernel function for which occupancy is calulated
|
||||
* @param [in] blockSize Block size the kernel is intended to be launched with
|
||||
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
|
||||
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);
|
||||
|
||||
/**
|
||||
* @brief Returns occupancy for a device function.
|
||||
@@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
* @param [out] numBlocks Returned occupancy
|
||||
* @param [in] func Kernel function for which occupancy is calulated
|
||||
* @param [in] blockSize Block size the kernel is intended to be launched with
|
||||
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
|
||||
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
|
||||
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);
|
||||
|
||||
/**
|
||||
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
|
||||
@@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize, flags);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
|
||||
T f, size_t dynamicSMemSize, int blockSizeLimit) {
|
||||
return hipOccupancyMaxPotentialBlockSize(
|
||||
gridSize, blockSize, reinterpret_cast<const void*>(f), dynamicSMemSize, blockSizeLimit);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
|
||||
|
||||
// HIP VDI Op IDs enumeration
|
||||
enum HipVdiOpId {
|
||||
kHipVdiOpIdDispatch = 0,
|
||||
kHipVdiOpIdCopy = 1,
|
||||
kHipVdiOpIdBarrier = 2,
|
||||
kHipVdiOpIdNumber = 3
|
||||
};
|
||||
|
||||
// Types of VDI commands
|
||||
enum HipVdiCommandKind {
|
||||
kHipVdiCommandKernel = 0x11F0,
|
||||
kHipVdiMemcpyDeviceToHost = 0x11F3,
|
||||
kHipHipVdiMemcpyHostToDevice = 0x11F4,
|
||||
kHipVdiMemcpyDeviceToDevice = 0x11F5,
|
||||
kHipVidMemcpyDeviceToHostRect = 0x1201,
|
||||
kHipVdiMemcpyHostToDeviceRect = 0x1202,
|
||||
kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
|
||||
kHipVdiFillMemory = 0x1207,
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Initializes activity callback
|
||||
*
|
||||
* @param [input] id_callback Event ID callback function
|
||||
* @param [input] op_callback Event operation callback function
|
||||
* @param [input] arg Arguments passed into callback
|
||||
*
|
||||
* @returns None
|
||||
*/
|
||||
void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
|
||||
|
||||
/**
|
||||
* @brief Enables activity callback
|
||||
*
|
||||
* @param [input] op Operation, which will trigger a callback (@see HipVdiOpId)
|
||||
* @param [input] enable Enable state for the callback
|
||||
*
|
||||
* @returns True if successful
|
||||
*/
|
||||
bool hipEnableActivityCallback(uint32_t op, bool enable);
|
||||
|
||||
/**
|
||||
* @brief Returns the description string for the operation kind
|
||||
*
|
||||
* @param [input] id Command kind id (@see HipVdiCommandKind)
|
||||
*
|
||||
* @returns A pointer to a const string with the command description
|
||||
*/
|
||||
const char* hipGetCmdName(uint32_t id);
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
|
||||
|
||||
@@ -47,6 +47,95 @@ THE SOFTWARE.
|
||||
#if defined(__cplusplus)
|
||||
#include <type_traits>
|
||||
|
||||
namespace hip_impl {
|
||||
template<typename T, typename Vector, unsigned int idx>
|
||||
struct Scalar_accessor {
|
||||
// Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
|
||||
Vector data;
|
||||
|
||||
__host__ __device__
|
||||
operator T() const noexcept { return data[idx]; }
|
||||
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator=(T x) noexcept {
|
||||
data[idx] = x;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator+=(T x) noexcept {
|
||||
data[idx] += x;
|
||||
return *this;
|
||||
}
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator-=(T x) noexcept {
|
||||
data[idx] -= x;
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator*=(T x) noexcept {
|
||||
data[idx] *= x;
|
||||
return *this;
|
||||
}
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator/=(T x) noexcept {
|
||||
data[idx] /= x;
|
||||
return *this;
|
||||
}
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator%=(T x) noexcept {
|
||||
data[idx] %= x;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator>>=(T x) noexcept {
|
||||
data[idx] >>= x;
|
||||
return *this;
|
||||
}
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator<<=(T x) noexcept {
|
||||
data[idx] <<= x;
|
||||
return *this;
|
||||
}
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator&=(T x) noexcept {
|
||||
data[idx] &= x;
|
||||
return *this;
|
||||
}
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator|=(T x) noexcept {
|
||||
data[idx] |= x;
|
||||
return *this;
|
||||
}
|
||||
template<
|
||||
typename U = T,
|
||||
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
||||
__host__ __device__
|
||||
Scalar_accessor& operator^=(T x) noexcept {
|
||||
data[idx] ^= x;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
} // Namespace hip_impl.
|
||||
|
||||
template<typename T, unsigned int n> struct HIP_vector_base;
|
||||
|
||||
template<typename T>
|
||||
@@ -55,9 +144,7 @@ THE SOFTWARE.
|
||||
|
||||
union {
|
||||
Native_vec_ data;
|
||||
struct {
|
||||
T x;
|
||||
};
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -67,10 +154,8 @@ THE SOFTWARE.
|
||||
|
||||
union {
|
||||
Native_vec_ data;
|
||||
struct {
|
||||
T x;
|
||||
T y;
|
||||
};
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -238,12 +323,10 @@ THE SOFTWARE.
|
||||
|
||||
union {
|
||||
Native_vec_ data;
|
||||
struct {
|
||||
T x;
|
||||
T y;
|
||||
T z;
|
||||
T w;
|
||||
};
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 2> z;
|
||||
hip_impl::Scalar_accessor<T, Native_vec_, 3> w;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
Обычный файл → Исполняемый файл
+16
-16
@@ -30,7 +30,7 @@ THE SOFTWARE.
|
||||
#define fileName "tex2dKernel.code"
|
||||
|
||||
texture<float, 2, hipReadModeElementType> tex;
|
||||
bool testResult = false;
|
||||
bool testResult = true;
|
||||
|
||||
#define HIP_CHECK(cmd) \
|
||||
{ \
|
||||
@@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) {
|
||||
desc.NumChannels = 1;
|
||||
desc.Width = width;
|
||||
desc.Height = height;
|
||||
hipArrayCreate(&array, &desc);
|
||||
HIP_CHECK(hipArrayCreate(&array, &desc));
|
||||
|
||||
hip_Memcpy2D copyParam;
|
||||
memset(©Param, 0, sizeof(copyParam));
|
||||
@@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) {
|
||||
copyParam.srcPitch = width * sizeof(float);
|
||||
copyParam.WidthInBytes = copyParam.srcPitch;
|
||||
copyParam.Height = height;
|
||||
hipMemcpyParam2D(©Param);
|
||||
HIP_CHECK(hipMemcpyParam2D(©Param));
|
||||
|
||||
textureReference* texref;
|
||||
hipModuleGetTexRef(&texref, Module, "tex");
|
||||
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
|
||||
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
|
||||
hipTexRefSetFilterMode(texref, hipFilterModePoint);
|
||||
hipTexRefSetFlags(texref, 0);
|
||||
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
|
||||
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
|
||||
HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex"));
|
||||
HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap));
|
||||
HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap));
|
||||
HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint));
|
||||
HIP_CHECK(hipTexRefSetFlags(texref, 0));
|
||||
HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1));
|
||||
HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
|
||||
|
||||
float* dData = NULL;
|
||||
hipMalloc((void**)&dData, size);
|
||||
HIP_CHECK(hipMalloc((void**)&dData, size));
|
||||
|
||||
struct {
|
||||
void* _Ad;
|
||||
@@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) {
|
||||
|
||||
float* hOutputData = (float*)malloc(size);
|
||||
memset(hOutputData, 0, size);
|
||||
hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
|
||||
HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i = 0; i < height; i++) {
|
||||
for (int j = 0; j < width; j++) {
|
||||
@@ -124,10 +124,10 @@ bool runTest(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
}
|
||||
hipUnbindTexture(tex);
|
||||
hipFree(dData);
|
||||
hipFreeArray(array);
|
||||
return true;
|
||||
HIP_CHECK(hipUnbindTexture(tex));
|
||||
HIP_CHECK(hipFree(dData));
|
||||
HIP_CHECK(hipFreeArray(array));
|
||||
return testResult;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
@@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){
|
||||
const unsigned threadsperblock = 32;
|
||||
const unsigned blocks = (NUM/threadsperblock)+1;
|
||||
|
||||
uint32_t mingridSize = 0;
|
||||
uint32_t gridSize = 0;
|
||||
uint32_t blockSize = 0;
|
||||
int mingridSize = 0;
|
||||
int gridSize = 0;
|
||||
int blockSize = 0;
|
||||
|
||||
if (manual){
|
||||
blockSize = threadsperblock;
|
||||
@@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){
|
||||
printf("kernel Execution time = %6.3fms\n", eventMs);
|
||||
|
||||
//Calculate Occupancy
|
||||
uint32_t numBlock = 0;
|
||||
int numBlock = 0;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
|
||||
|
||||
if(devProp.maxThreadsPerMultiProcessor){
|
||||
|
||||
@@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
|
||||
namespace hip_internal {
|
||||
hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
|
||||
hipStream_t stream);
|
||||
|
||||
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
|
||||
hipError_t ihipHostFree(TlsData *tls, void* ptr);
|
||||
|
||||
};
|
||||
|
||||
#define MAX_COOPERATIVE_GPUs 255
|
||||
|
||||
// do not change these two structs without changing the device library
|
||||
struct mg_sync {
|
||||
uint w0;
|
||||
uint w1;
|
||||
};
|
||||
|
||||
struct mg_info {
|
||||
struct mg_sync *mgs;
|
||||
uint grid_id;
|
||||
uint num_grids;
|
||||
ulong prev_sum;
|
||||
ulong all_sum;
|
||||
};
|
||||
|
||||
//---
|
||||
|
||||
+113
-105
@@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
|
||||
return ptr;
|
||||
}
|
||||
|
||||
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if ((ctx == nullptr) || (ptr == nullptr)) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
}
|
||||
else if (sizeBytes == 0) {
|
||||
hip_status = hipSuccess;
|
||||
// TODO - should size of 0 return err or be siliently ignored?
|
||||
} else {
|
||||
unsigned trueFlags = flags;
|
||||
if (flags == hipHostMallocDefault) {
|
||||
// HCC/ROCM provide a modern system with unified memory and should set both of these
|
||||
// flags by default:
|
||||
trueFlags = hipHostMallocMapped | hipHostMallocPortable;
|
||||
}
|
||||
|
||||
|
||||
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
|
||||
hipHostMallocWriteCombined | hipHostMallocCoherent |
|
||||
hipHostMallocNonCoherent;
|
||||
|
||||
|
||||
const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
|
||||
|
||||
if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
|
||||
*ptr = nullptr;
|
||||
// can't specify unsupported flags, can't specify both Coherent + NonCoherent
|
||||
hip_status = hipErrorInvalidValue;
|
||||
} else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
#if (__hcc_workweek__ >= 19115)
|
||||
//Avoid mapping host pinned memory to all devices by HCC
|
||||
unsigned amFlags = amHostUnmapped;
|
||||
#else
|
||||
unsigned amFlags = 0;
|
||||
#endif
|
||||
if (flags & hipHostMallocCoherent) {
|
||||
amFlags |= amHostCoherent;
|
||||
} else if (flags & hipHostMallocNonCoherent) {
|
||||
amFlags |= amHostNonCoherent;
|
||||
} else {
|
||||
// depends on env variables:
|
||||
amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
|
||||
}
|
||||
|
||||
|
||||
*ptr = hip_internal::allocAndSharePtr(
|
||||
(amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
|
||||
true /*shareWithAll*/, amFlags, flags, 0);
|
||||
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
return hip_status;
|
||||
}
|
||||
|
||||
hipError_t ihipHostFree(TlsData *tls, void* ptr) {
|
||||
|
||||
// Synchronize to ensure all work has finished.
|
||||
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits
|
||||
// for all activity to finish.
|
||||
|
||||
hipError_t hipStatus = hipErrorInvalidValue;
|
||||
if (ptr) {
|
||||
hc::accelerator acc;
|
||||
#if (__hcc_workweek__ >= 17332)
|
||||
hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
|
||||
#else
|
||||
hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
|
||||
#endif
|
||||
am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
|
||||
if (status == AM_SUCCESS) {
|
||||
if (amPointerInfo._hostPointer == ptr) {
|
||||
hc::am_free(ptr);
|
||||
hipStatus = hipSuccess;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// free NULL pointer succeeds and is common technique to initialize runtime
|
||||
hipStatus = hipSuccess;
|
||||
}
|
||||
|
||||
return hipStatus;
|
||||
}
|
||||
|
||||
|
||||
} // end namespace hip_internal
|
||||
|
||||
@@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if ((ctx == nullptr) || (ptr == nullptr)) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
}
|
||||
else if (sizeBytes == 0) {
|
||||
hip_status = hipSuccess;
|
||||
// TODO - should size of 0 return err or be siliently ignored?
|
||||
} else {
|
||||
unsigned trueFlags = flags;
|
||||
if (flags == hipHostMallocDefault) {
|
||||
// HCC/ROCM provide a modern system with unified memory and should set both of these
|
||||
// flags by default:
|
||||
trueFlags = hipHostMallocMapped | hipHostMallocPortable;
|
||||
}
|
||||
|
||||
|
||||
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
|
||||
hipHostMallocWriteCombined | hipHostMallocCoherent |
|
||||
hipHostMallocNonCoherent;
|
||||
|
||||
|
||||
const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
|
||||
|
||||
if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
|
||||
*ptr = nullptr;
|
||||
// can't specify unsupported flags, can't specify both Coherent + NonCoherent
|
||||
hip_status = hipErrorInvalidValue;
|
||||
} else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
#if (__hcc_workweek__ >= 19115)
|
||||
//Avoid mapping host pinned memory to all devices by HCC
|
||||
unsigned amFlags = amHostUnmapped;
|
||||
#else
|
||||
unsigned amFlags = 0;
|
||||
#endif
|
||||
if (flags & hipHostMallocCoherent) {
|
||||
amFlags |= amHostCoherent;
|
||||
} else if (flags & hipHostMallocNonCoherent) {
|
||||
amFlags |= amHostNonCoherent;
|
||||
} else {
|
||||
// depends on env variables:
|
||||
amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
|
||||
}
|
||||
|
||||
|
||||
*ptr = hip_internal::allocAndSharePtr(
|
||||
(amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
|
||||
true /*shareWithAll*/, amFlags, flags, 0);
|
||||
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (HIP_SYNC_HOST_ALLOC) {
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
return hip_status;
|
||||
}
|
||||
|
||||
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags);
|
||||
hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
@@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
|
||||
if(flags != hipMemAttachGlobal)
|
||||
hip_status = hipErrorInvalidValue;
|
||||
else
|
||||
hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
|
||||
hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
@@ -1935,15 +1965,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si
|
||||
return ihipLogStatus(e);
|
||||
};
|
||||
|
||||
hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) {
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes);
|
||||
hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) {
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count);
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
hipStream_t stream = hipStreamNull;
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
if (stream) {
|
||||
e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar);
|
||||
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar);
|
||||
stream->locked_wait();
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1951,23 +1981,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) {
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
|
||||
hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) {
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream);
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
if (stream) {
|
||||
return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar));
|
||||
return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar));
|
||||
} else {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes);
|
||||
hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count);
|
||||
hipError_t e = hipSuccess;
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
if (stream) {
|
||||
e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort);
|
||||
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort);
|
||||
if(hipSuccess == e)
|
||||
stream->locked_wait();
|
||||
} else {
|
||||
@@ -1976,12 +2006,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
|
||||
hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){
|
||||
HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream);
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
if (stream) {
|
||||
return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort));
|
||||
return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort));
|
||||
} else {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) {
|
||||
hipError_t hipHostFree(void* ptr) {
|
||||
HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr);
|
||||
|
||||
// Synchronize to ensure all work has finished.
|
||||
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits
|
||||
// for all activity to finish.
|
||||
|
||||
|
||||
hipError_t hipStatus = hipErrorInvalidValue;
|
||||
if (ptr) {
|
||||
hc::accelerator acc;
|
||||
#if (__hcc_workweek__ >= 17332)
|
||||
hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
|
||||
#else
|
||||
hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
|
||||
#endif
|
||||
am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
|
||||
if (status == AM_SUCCESS) {
|
||||
if (amPointerInfo._hostPointer == ptr) {
|
||||
hc::am_free(ptr);
|
||||
hipStatus = hipSuccess;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// free NULL pointer succeeds and is common technique to initialize runtime
|
||||
hipStatus = hipSuccess;
|
||||
}
|
||||
hipError_t hipStatus = hipSuccess;
|
||||
hipStatus = hip_internal::ihipHostFree(tls, ptr);
|
||||
|
||||
return ihipLogStatus(hipStatus);
|
||||
};
|
||||
|
||||
+129
-49
@@ -109,6 +109,7 @@ struct ihipModuleSymbol_t {
|
||||
amd_kernel_code_t const* _header{};
|
||||
string _name; // TODO - review for performance cost. Name is just used for debug.
|
||||
vector<pair<size_t, size_t>> _kernarg_layout{};
|
||||
bool _is_code_object_v3{};
|
||||
};
|
||||
|
||||
template <>
|
||||
@@ -137,7 +138,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
|
||||
void** impCoopParams = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -181,10 +183,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
// Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
|
||||
// Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
|
||||
size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
|
||||
kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);
|
||||
|
||||
if (impCoopParams) {
|
||||
const auto p{static_cast<const char*>(*impCoopParams)};
|
||||
// The sixth index is for multi-grid synchronization
|
||||
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
|
||||
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
|
||||
}
|
||||
|
||||
/*
|
||||
Kernel argument preparation.
|
||||
*/
|
||||
@@ -208,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
|
||||
aql.grid_size_x = globalWorkSizeX;
|
||||
aql.grid_size_y = globalWorkSizeY;
|
||||
aql.grid_size_z = globalWorkSizeZ;
|
||||
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
if (f->_is_code_object_v3) {
|
||||
const auto* header =
|
||||
reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
aql.group_segment_size =
|
||||
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
|
||||
size_t impCoopArg = 1;
|
||||
void* impCoopParams[1];
|
||||
impCoopParams[0] = &impCoopArg;
|
||||
|
||||
// launch the main kernel
|
||||
result = ihipModuleLaunchKernel(tls, kd,
|
||||
gridDim.x * blockDimX.x,
|
||||
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
gridDim.z * blockDimX.z,
|
||||
blockDimX.x, blockDimX.y, blockDimX.z,
|
||||
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, true);
|
||||
nullptr, 0, true, impCoopParams);
|
||||
|
||||
stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
|
||||
hipError_t result;
|
||||
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
|
||||
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
kargs.getHandle());
|
||||
}
|
||||
|
||||
mg_sync *mg_sync_ptr = 0;
|
||||
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
|
||||
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
mg_sync_ptr->w0 = 0;
|
||||
mg_sync_ptr->w1 = 0;
|
||||
|
||||
uint all_sum = 0;
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
|
||||
if (result != hipSuccess) {
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < i; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
// calculate the sum of sizes of all grids
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
|
||||
}
|
||||
|
||||
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
|
||||
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
#endif
|
||||
}
|
||||
|
||||
// launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
|
||||
// launch the init_gws kernel to initialize the GWS for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
}
|
||||
|
||||
void* impCoopParams[1];
|
||||
ulong prev_sum = 0;
|
||||
// launch the main kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
//initialize and setup the implicit kernel argument for multi-grid sync
|
||||
mg_info_ptr[i]->mgs = mg_sync_ptr;
|
||||
mg_info_ptr[i]->grid_id = i;
|
||||
mg_info_ptr[i]->num_grids = numDevices;
|
||||
mg_info_ptr[i]->all_sum = all_sum;
|
||||
mg_info_ptr[i]->prev_sum = prev_sum;
|
||||
prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
|
||||
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
|
||||
|
||||
|
||||
impCoopParams[0] = &mg_info_ptr[i];
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, kds[i],
|
||||
lp.gridDim.x * lp.blockDim.x,
|
||||
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
true);
|
||||
true, impCoopParams);
|
||||
|
||||
if (result != hipSuccess) {
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
launchParamsList[j].stream->criticalData().unlock();
|
||||
#if (__hcc_workweek__ >= 19213)
|
||||
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
|
||||
#endif
|
||||
}
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hipErrorLaunchFailure);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// unlock all streams
|
||||
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
|
||||
free(gwsKds);
|
||||
free(kds);
|
||||
|
||||
hip_internal::ihipHostFree(tls, mg_sync_ptr);
|
||||
for (int j = 0; j < numDevices; ++j) {
|
||||
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
|
||||
}
|
||||
|
||||
return ihipLogStatus(result);
|
||||
}
|
||||
|
||||
@@ -977,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
|
||||
// available per CU, therefore we hardcode it to 64 KiRegisters.
|
||||
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
|
||||
|
||||
bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
if (kd._is_code_object_v3) {
|
||||
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
|
||||
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
|
||||
} else {
|
||||
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
|
||||
}
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
if (is_code_object_v3) {
|
||||
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
|
||||
} else {
|
||||
r.numRegs = kd._header->workitem_vgpr_count;
|
||||
}
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
if (is_code_object_v3) {
|
||||
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
|
||||
// object format version?
|
||||
} else {
|
||||
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
|
||||
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
|
||||
r.numRegs = kd._header->workitem_vgpr_count;
|
||||
r.binaryVersion =
|
||||
kd._header->amd_machine_version_major * 10 +
|
||||
kd._header->amd_machine_version_minor;
|
||||
}
|
||||
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
|
||||
r.maxThreadsPerBlock = r.numRegs ?
|
||||
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
|
||||
prop.maxThreadsPerBlock;
|
||||
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
|
||||
|
||||
return r;
|
||||
@@ -1099,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima
|
||||
content.data(), content.size(), (*module)->executable,
|
||||
this_agent());
|
||||
|
||||
std::vector<char> blob(content.cbegin(), content.cend());
|
||||
program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
|
||||
program_state_impl::read_kernarg_metadata(content, (*module)->kernargs);
|
||||
|
||||
// compute the hash of the code object
|
||||
(*module)->hash = checksum(content.length(), content.data());
|
||||
@@ -1152,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
|
||||
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
|
||||
{
|
||||
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
|
||||
if (is_code_object_v3) {
|
||||
if (f->_is_code_object_v3) {
|
||||
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
|
||||
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
|
||||
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
|
||||
@@ -1174,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize,
|
||||
hipFunction_t f, size_t dynamicSMemSize,
|
||||
int blockSizeLimit)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
@@ -1257,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
|
||||
}
|
||||
else {
|
||||
size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor;
|
||||
size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk);
|
||||
size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize);
|
||||
wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG;
|
||||
}
|
||||
|
||||
@@ -1286,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
|
||||
hipFunction_t f, size_t dynSharedMemPerBlk,
|
||||
uint32_t blockSizeLimit)
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
|
||||
const void* f, size_t dynamicSMemSize,
|
||||
int blockSizeLimit)
|
||||
{
|
||||
HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
|
||||
|
||||
HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit);
|
||||
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
|
||||
hip_impl::target_agent(0));
|
||||
return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
|
||||
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
|
||||
gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit));
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
@@ -1326,45 +1401,50 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
|
||||
|
||||
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
|
||||
size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
|
||||
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on VGPR usage
|
||||
*numBlocks = vgprs_alu_occupancy / numWavefronts;
|
||||
|
||||
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
|
||||
size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd
|
||||
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
|
||||
*numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
|
||||
|
||||
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
|
||||
size_t total_used_lds = usedLDS + dynamicSMemSize;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
|
||||
*numBlocks = std::min(*numBlocks, (int) lds_occupancy);
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize)
|
||||
{
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize);
|
||||
|
||||
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
|
||||
hip_impl::target_agent(0));
|
||||
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
|
||||
tls, numBlocks, F, blockSize, dynamicSMemSize));
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize,
|
||||
unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags);
|
||||
|
||||
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
|
||||
hip_impl::target_agent(0));
|
||||
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
|
||||
tls, numBlocks, F, blockSize, dynamicSMemSize));
|
||||
}
|
||||
|
||||
hipError_t hipLaunchKernel(
|
||||
|
||||
+137
-55
@@ -89,9 +89,10 @@ struct Symbol {
|
||||
|
||||
class Kernel_descriptor {
|
||||
std::uint64_t kernel_object_{};
|
||||
amd_kernel_code_t const* kernel_header_{nullptr};
|
||||
std::string name_{};
|
||||
amd_kernel_code_t const* header_{};
|
||||
std::string name_;
|
||||
std::vector<std::pair<std::size_t, std::size_t>> kernarg_layout_{};
|
||||
bool is_code_object_v3_{};
|
||||
public:
|
||||
Kernel_descriptor() = default;
|
||||
Kernel_descriptor(
|
||||
@@ -101,7 +102,8 @@ public:
|
||||
:
|
||||
kernel_object_{kernel_object},
|
||||
name_{name},
|
||||
kernarg_layout_{std::move(kernarg_layout)}
|
||||
kernarg_layout_{std::move(kernarg_layout)},
|
||||
is_code_object_v3_{name.find(".kd") != std::string::npos}
|
||||
{
|
||||
bool supported{false};
|
||||
std::uint16_t min_v{UINT16_MAX};
|
||||
@@ -123,7 +125,7 @@ public:
|
||||
|
||||
r = tbl.hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<void*>(kernel_object_),
|
||||
reinterpret_cast<const void**>(&kernel_header_));
|
||||
reinterpret_cast<const void**>(&header_));
|
||||
|
||||
if (r != HSA_STATUS_SUCCESS) return;
|
||||
}
|
||||
@@ -149,7 +151,7 @@ public:
|
||||
std::string,
|
||||
std::unordered_map<
|
||||
hsa_isa_t,
|
||||
std::vector<std::vector<char>>>>> code_object_blobs;
|
||||
std::vector<std::string>>>> code_object_blobs;
|
||||
|
||||
std::pair<
|
||||
std::once_flag,
|
||||
@@ -213,7 +215,7 @@ public:
|
||||
std::string,
|
||||
std::unordered_map<
|
||||
hsa_isa_t,
|
||||
std::vector<std::vector<char>>>>& get_code_object_blobs() {
|
||||
std::vector<std::string>>>& get_code_object_blobs() {
|
||||
|
||||
std::call_once(code_object_blobs.first, [this]() {
|
||||
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
|
||||
@@ -584,6 +586,68 @@ public:
|
||||
return functions[agent].second;
|
||||
}
|
||||
|
||||
static
|
||||
std::size_t parse_args_v2(
|
||||
const std::string& metadata,
|
||||
std::size_t f,
|
||||
std::size_t l,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
if (f == l) return f;
|
||||
if (!size_align.empty()) return l;
|
||||
|
||||
do {
|
||||
static constexpr size_t size_sz{5};
|
||||
f = metadata.find("Size:", f) + size_sz;
|
||||
|
||||
if (l <= f) return f;
|
||||
|
||||
auto size = std::strtoul(&metadata[f], nullptr, 10);
|
||||
|
||||
static constexpr size_t align_sz{6};
|
||||
f = metadata.find("Align:", f) + align_sz;
|
||||
|
||||
char* l{};
|
||||
auto align = std::strtoul(&metadata[f], &l, 10);
|
||||
|
||||
f += (l - &metadata[f]) + 1;
|
||||
|
||||
size_align.emplace_back(size, align);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
static
|
||||
void read_kernarg_metadata_v2(
|
||||
const std::string& kernels_md,
|
||||
std::size_t dx,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
do {
|
||||
dx = kernels_md.find("Name:", dx);
|
||||
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(kernels_md.size()) name_sz{5};
|
||||
dx = kernels_md.find_first_not_of(" '", dx + name_sz);
|
||||
|
||||
auto fn =
|
||||
kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx);
|
||||
dx += fn.size();
|
||||
|
||||
auto dx1 = kernels_md.find("CodeProps", dx);
|
||||
dx = kernels_md.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
dx = dx1;
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(kernels_md.size()) args_sz{5};
|
||||
dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]);
|
||||
} while (true);
|
||||
}
|
||||
|
||||
static
|
||||
std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
|
||||
std::string str;
|
||||
@@ -598,9 +662,8 @@ public:
|
||||
}
|
||||
|
||||
static
|
||||
void parse_args(
|
||||
void parse_args_v3(
|
||||
const amd_comgr_metadata_node_t& args_md,
|
||||
bool is_code_object_v3,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
|
||||
size_t arg_count = 0;
|
||||
if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
|
||||
@@ -615,9 +678,7 @@ public:
|
||||
return;
|
||||
|
||||
amd_comgr_metadata_node_t arg_size_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md,
|
||||
is_code_object_v3 ? ".size" : "Size",
|
||||
&arg_size_md)
|
||||
if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
@@ -629,35 +690,21 @@ public:
|
||||
|
||||
size_t arg_align;
|
||||
|
||||
if (is_code_object_v3) {
|
||||
amd_comgr_metadata_node_t arg_offset_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
amd_comgr_metadata_node_t arg_offset_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
size_t arg_offset
|
||||
= std::stoul(metadata_to_string(arg_offset_md));
|
||||
size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md));
|
||||
|
||||
if (amd_comgr_destroy_metadata(arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
if (amd_comgr_destroy_metadata(arg_offset_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
arg_align = 1;
|
||||
while (arg_offset && (arg_offset & 1) == 0) {
|
||||
arg_offset >>= 1;
|
||||
arg_align <<= 1;
|
||||
}
|
||||
} else {
|
||||
amd_comgr_metadata_node_t arg_align_md;
|
||||
if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
arg_align = std::stoul(metadata_to_string(arg_align_md));
|
||||
|
||||
if (amd_comgr_destroy_metadata(arg_align_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
arg_align = 1;
|
||||
while (arg_offset && (arg_offset & 1) == 0) {
|
||||
arg_offset >>= 1;
|
||||
arg_align <<= 1;
|
||||
}
|
||||
|
||||
size_align.emplace_back(arg_size, arg_align);
|
||||
@@ -669,11 +716,11 @@ public:
|
||||
}
|
||||
|
||||
static
|
||||
void read_kernarg_metadata(
|
||||
const std::vector<char>& blob,
|
||||
void read_kernarg_metadata_v3(
|
||||
const std::string& blob,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
|
||||
amd_comgr_data_t dataIn;
|
||||
amd_comgr_status_t status;
|
||||
|
||||
@@ -690,7 +737,6 @@ public:
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
|
||||
bool is_code_object_v3 = false;
|
||||
amd_comgr_metadata_node_t kernels_md;
|
||||
if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS) {
|
||||
@@ -699,7 +745,6 @@ public:
|
||||
&kernels_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
return;
|
||||
is_code_object_v3 = true;
|
||||
}
|
||||
|
||||
size_t kernel_count = 0;
|
||||
@@ -715,9 +760,7 @@ public:
|
||||
continue;
|
||||
|
||||
amd_comgr_metadata_node_t name_md;
|
||||
if (amd_comgr_metadata_lookup(kernel_md,
|
||||
is_code_object_v3 ? ".name" : "Name",
|
||||
&name_md)
|
||||
if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
@@ -727,21 +770,15 @@ public:
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
if (is_code_object_v3)
|
||||
kernel_name_str.append(".kd");
|
||||
|
||||
|
||||
amd_comgr_metadata_node_t args_md;
|
||||
if (amd_comgr_metadata_lookup(kernel_md,
|
||||
is_code_object_v3 ? ".args" : "Args",
|
||||
&args_md)
|
||||
if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md)
|
||||
!= AMD_COMGR_STATUS_SUCCESS)
|
||||
continue;
|
||||
|
||||
auto foundKernel = kernargs.find(kernel_name_str);
|
||||
// parse arguments for a given kernel only once
|
||||
if (foundKernel == kernargs.end()) {
|
||||
parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
|
||||
parse_args_v3(args_md, kernargs[kernel_name_str]);
|
||||
}
|
||||
|
||||
if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
|
||||
@@ -757,7 +794,52 @@ public:
|
||||
amd_comgr_release_data(dataIn);
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string,
|
||||
static
|
||||
void read_kernarg_metadata(
|
||||
const std::string& blob,
|
||||
std::unordered_map<
|
||||
std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs)
|
||||
{
|
||||
std::istringstream istr{blob};
|
||||
ELFIO::elfio reader;
|
||||
|
||||
if (!reader.load(istr)) return;
|
||||
|
||||
// TODO: this is inefficient.
|
||||
auto it = find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_NOTE;
|
||||
});
|
||||
|
||||
if (!it) return;
|
||||
|
||||
const ELFIO::note_section_accessor acc{reader, it};
|
||||
auto n{acc.get_notes_num()};
|
||||
while (n--) {
|
||||
ELFIO::Elf_Word type{};
|
||||
std::string name{};
|
||||
void* desc{};
|
||||
ELFIO::Elf_Word desc_size{};
|
||||
|
||||
acc.get_note(n, type, name, desc, desc_size);
|
||||
|
||||
if (name == "AMDGPU") {
|
||||
return read_kernarg_metadata_v3(blob, kernargs);
|
||||
}
|
||||
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
|
||||
|
||||
std::string tmp{
|
||||
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
|
||||
|
||||
auto dx = tmp.find("Kernels:");
|
||||
|
||||
if (dx == std::string::npos) continue;
|
||||
|
||||
return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:".
|
||||
}
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string,
|
||||
std::vector<std::pair<std::size_t, std::size_t>>>& get_kernargs() {
|
||||
|
||||
std::call_once(kernargs.first, [this]() {
|
||||
|
||||
@@ -12,18 +12,33 @@ import lit.util
|
||||
site_cfg = lit_config.params.get('site_config', None)
|
||||
lit_config.load_config(config, site_cfg)
|
||||
|
||||
print(str("========================================"))
|
||||
config.excludes = ['cmdparser.hpp']
|
||||
config.excludes.append('spatial_batch_norm_op.h')
|
||||
config.excludes.append('common_cudnn.h')
|
||||
|
||||
delimiter = "===============================================================";
|
||||
print(delimiter)
|
||||
print("CUDA " + config.cuda_version + " - will be used for testing")
|
||||
print("LLVM " + config.llvm_version + " - will be used for testing")
|
||||
print(platform.machine() + " - Platform architecture")
|
||||
print(platform.system() + " " + platform.release() + " - Platform OS")
|
||||
print(str(config.pointer_size * 8) + " - hipify-clang binary bitness")
|
||||
print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness")
|
||||
print(str("========================================"))
|
||||
|
||||
config.excludes = ['cmdparser.hpp']
|
||||
config.excludes.append('spatial_batch_norm_op.h')
|
||||
config.excludes.append('common_cudnn.h')
|
||||
print(delimiter)
|
||||
warns = None
|
||||
if not config.cuda_dnn_root:
|
||||
config.excludes.append('cudnn_convolution_forward.cu')
|
||||
config.excludes.append('cudnn_softmax.cu')
|
||||
print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR")
|
||||
warns = True
|
||||
if not config.cuda_cub_root:
|
||||
config.excludes.append('cub_01.cu')
|
||||
config.excludes.append('cub_02.cu')
|
||||
config.excludes.append('cub_03.cu')
|
||||
print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR")
|
||||
warns = True
|
||||
if warns:
|
||||
print(delimiter)
|
||||
|
||||
if config.cuda_version_major == 7 and config.cuda_version_minor == 0:
|
||||
config.excludes.append('headers_test_09.cu')
|
||||
@@ -90,17 +105,47 @@ else:
|
||||
run_test_ext = ".sh"
|
||||
# CUDA SDK ROOT
|
||||
clang_arguments += " -isystem'%s'/samples/common/inc"
|
||||
# cuDNN ROOT
|
||||
clang_arguments += " -I'%s'/include"
|
||||
if config.pointer_size == 8:
|
||||
clang_arguments += " -D__LP64__"
|
||||
|
||||
# Set max clang's CudaArch for corresponding CUDA version
|
||||
# to support maximum CUDA features in offline tests
|
||||
if config.cuda_version_major == 7:
|
||||
if config.cuda_version_minor == 5:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_53"
|
||||
else:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_52"
|
||||
elif config.cuda_version_major == 8:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_62"
|
||||
elif config.cuda_version_major == 9:
|
||||
if config.cuda_version_minor == 2:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_72"
|
||||
else:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_70"
|
||||
elif config.cuda_version_major == 10:
|
||||
clang_arguments += " --cuda-gpu-arch=sm_75"
|
||||
|
||||
# cuDNN ROOT
|
||||
if config.cuda_dnn_root:
|
||||
clang_arguments += " -I'%s'/include"
|
||||
# CUB ROOT
|
||||
clang_arguments += " -I'%s'"
|
||||
if config.cuda_cub_root:
|
||||
clang_arguments += " -I'%s'"
|
||||
|
||||
hipify_arguments = "--cuda-path='%s'"
|
||||
if config.cuda_dnn_root and config.cuda_cub_root:
|
||||
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
|
||||
elif config.cuda_dnn_root:
|
||||
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root)))
|
||||
elif config.cuda_cub_root:
|
||||
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root)))
|
||||
else:
|
||||
config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root))
|
||||
|
||||
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
|
||||
config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root)))
|
||||
if config.llvm_version_major < 4:
|
||||
hipify_arguments = "-I'%s'/include"
|
||||
else:
|
||||
hipify_arguments = "--cuda-path='%s'"
|
||||
|
||||
config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root))
|
||||
config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
|
||||
config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
|
||||
|
||||
@@ -276,7 +276,9 @@ int main(int argc, char** argv) {
|
||||
runTest<unsigned int>();
|
||||
runTest<unsigned long long>();
|
||||
runTest<float>();
|
||||
#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
|
||||
runTest<double>();
|
||||
#endif
|
||||
// CHECK: hipDeviceReset();
|
||||
cudaDeviceReset();
|
||||
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
// CHECK: #include <hiprand.h>
|
||||
#include <curand.h>
|
||||
// CHECK: #include <hipcub/hipcub.hpp>
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
template <int BLOCK_WIDTH, int ITEMS_PER_THREAD,
|
||||
// CHECK: hipcub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
|
||||
cub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
|
||||
// CHECK: hipcub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
|
||||
cub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
|
||||
typename T>
|
||||
__global__ void sort(const T* data_in, T* data_out){
|
||||
// CHECK: typedef hipcub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
|
||||
typedef cub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
|
||||
// CHECK: typedef hipcub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
|
||||
typedef cub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
|
||||
// CHECK: typedef hipcub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
|
||||
typedef cub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
|
||||
__shared__ union {
|
||||
typename BlockLoadT::TempStorage load;
|
||||
typename BlockRadixSortT::TempStorage sort;
|
||||
typename BlockStoreT::TempStorage store;
|
||||
} tmp_storage;
|
||||
T items[ITEMS_PER_THREAD];
|
||||
BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
|
||||
__syncthreads();
|
||||
BlockRadixSortT(tmp_storage.sort).Sort(items);
|
||||
__syncthreads();
|
||||
BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
|
||||
}
|
||||
|
||||
int main() {
|
||||
double* d_gpu = NULL;
|
||||
double* result_gpu = NULL;
|
||||
double* data_sorted = new double[1000*4096];
|
||||
// Allocate memory on the GPU
|
||||
// CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double));
|
||||
cudaMalloc(&d_gpu, 1000*4096 * sizeof(double));
|
||||
// CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double));
|
||||
cudaMalloc(&result_gpu, 1000*4096 * sizeof(double));
|
||||
// CHECK: hiprandGenerator_t gen;
|
||||
curandGenerator_t gen;
|
||||
// Create generator
|
||||
// CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
|
||||
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
|
||||
// Fill array with random numbers
|
||||
// CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
|
||||
curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
|
||||
// Destroy generator
|
||||
// CHECK: hiprandDestroyGenerator(gen);
|
||||
curandDestroyGenerator(gen);
|
||||
// Sort data
|
||||
// CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu);
|
||||
sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu);
|
||||
// CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu);
|
||||
sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu);
|
||||
// CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost);
|
||||
// Write the sorted data to standard out
|
||||
for (int i = 0; i < 4095; ++i) {
|
||||
std::cout << data_sorted[i] << ", ";
|
||||
}
|
||||
std::cout << data_sorted[4095] << std::endl;
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
// CHECK: #include <hipcub/hipcub.hpp>
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
// using namespace hipcub;
|
||||
using namespace cub;
|
||||
|
||||
// Simple CUDA kernel for computing tiled partial sums
|
||||
template <int BLOCK_THREADS, int ITEMS_PER_THREAD,
|
||||
// CHECK: hipcub::BlockLoadAlgorithm LOAD_ALGO,
|
||||
cub::BlockLoadAlgorithm LOAD_ALGO,
|
||||
// CHECK: hipcub::BlockScanAlgorithm SCAN_ALGO>
|
||||
cub::BlockScanAlgorithm SCAN_ALGO>
|
||||
__global__ void ScanTilesKernel(int *d_in, int *d_out) {
|
||||
// Specialize collective types for problem context
|
||||
// CHECK: typedef ::hipcub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
|
||||
typedef ::cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
|
||||
typedef BlockScan<int, BLOCK_THREADS, SCAN_ALGO> BlockScanT;
|
||||
// Allocate on-chip temporary storage
|
||||
__shared__ union {
|
||||
typename BlockLoadT::TempStorage load;
|
||||
typename BlockScanT::TempStorage reduce;
|
||||
} temp_storage;
|
||||
// Load data per thread
|
||||
int thread_data[ITEMS_PER_THREAD];
|
||||
int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
|
||||
BlockLoadT(temp_storage.load).Load(d_in + offset, offset);
|
||||
__syncthreads();
|
||||
// Compute the block-wide prefix sum
|
||||
BlockScanT(temp_storage).Sum(thread_data);
|
||||
}
|
||||
@@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank)
|
||||
cudaError_t cudareturn;
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, gpudevice);
|
||||
// CHECK: if (deviceProp.hipWarpSize <= 1)
|
||||
if (deviceProp.warpSize <= 1)
|
||||
{
|
||||
printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
|
||||
|
||||
@@ -29,7 +29,12 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
|
||||
#if __HIP_ARCH_GFX803__ || \
|
||||
__HIP_ARCH_GFX900__ || \
|
||||
__HIP_ARCH_GFX906__ || \
|
||||
__HIP_ARCH_GFX908__ || \
|
||||
__HIP_ARCH_GFX1010__ || \
|
||||
__HIP_ARCH_GFX1012__
|
||||
|
||||
__global__ void kernel_abs_int64(long long *input, long long *output) {
|
||||
int tx = threadIdx.x;
|
||||
|
||||
@@ -28,7 +28,12 @@ THE SOFTWARE.
|
||||
|
||||
#include "test_common.h"
|
||||
|
||||
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
|
||||
#if __HIP_ARCH_GFX803__ || \
|
||||
__HIP_ARCH_GFX900__ || \
|
||||
__HIP_ARCH_GFX906__ || \
|
||||
__HIP_ARCH_GFX908__ || \
|
||||
__HIP_ARCH_GFX1010__ || \
|
||||
__HIP_ARCH_GFX1012__
|
||||
|
||||
__device__ void test_convert() {
|
||||
__half x;
|
||||
|
||||
@@ -32,7 +32,12 @@ THE SOFTWARE.
|
||||
|
||||
using namespace std;
|
||||
|
||||
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
|
||||
#if __HIP_ARCH_GFX803__ || \
|
||||
__HIP_ARCH_GFX900__ || \
|
||||
__HIP_ARCH_GFX906__ || \
|
||||
__HIP_ARCH_GFX908__ || \
|
||||
__HIP_ARCH_GFX1010__ || \
|
||||
__HIP_ARCH_GFX1012__
|
||||
|
||||
__global__
|
||||
void __halfTest(bool* result, __half a) {
|
||||
|
||||
@@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) {
|
||||
HIPCHECK(hipHostFree(A_h1));
|
||||
HIPCHECK(hipHostFree(A_h2));
|
||||
} else {
|
||||
free(A_h1);
|
||||
free(A_h2);
|
||||
aligned_free(A_h1);
|
||||
aligned_free(A_h2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice)
|
||||
char *A_d;
|
||||
char *A_h;
|
||||
bool testResult = true;
|
||||
HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) );
|
||||
HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16));
|
||||
A_h = (char*)malloc(sizeElements);
|
||||
HIPASSERT(A_h != NULL);
|
||||
for (size_t i=0; i<elements; i++) {
|
||||
@@ -89,8 +89,9 @@ bool testhipMemset2DAsync(int memsetval,int p_gpuDevice)
|
||||
}
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
HIPCHECK ( hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
|
||||
HIPCHECK ( hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
|
||||
HIPCHECK(hipStreamSynchronize(stream));
|
||||
HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i=0; i<elements; i++) {
|
||||
if (A_h[i] != memsetval) {
|
||||
@@ -112,9 +113,9 @@ int main(int argc, char *argv[])
|
||||
hipCtx_t context;
|
||||
hipCtxCreate(&context, 0, p_gpuDevice);
|
||||
|
||||
bool testResult = false;
|
||||
testResult = testhipMemset2D(memsetval, p_gpuDevice);
|
||||
testResult = testhipMemset2DAsync(memsetval, p_gpuDevice);
|
||||
bool testResult = true;
|
||||
testResult &= testhipMemset2D(memsetval, p_gpuDevice);
|
||||
testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice);
|
||||
hipCtxDestroy(context);
|
||||
if(testResult){
|
||||
passed();
|
||||
|
||||
@@ -41,27 +41,27 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
|
||||
size_t elements = numW* numH* depth;
|
||||
|
||||
|
||||
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
|
||||
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
|
||||
char *A_h;
|
||||
bool testResult = true;
|
||||
hipExtent extent = make_hipExtent(width, numH, depth);
|
||||
hipPitchedPtr devPitchedPtr;
|
||||
|
||||
HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
|
||||
A_h = (char*)malloc(sizeElements);
|
||||
HIPASSERT(A_h != NULL);
|
||||
for (size_t i=0; i<elements; i++) {
|
||||
A_h = (char*)malloc(sizeElements);
|
||||
HIPASSERT(A_h != NULL);
|
||||
for (size_t i=0; i<elements; i++) {
|
||||
A_h[i] = 1;
|
||||
}
|
||||
HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) );
|
||||
hipMemcpy3DParms myparms = {0};
|
||||
myparms.srcPos = make_hipPos(0,0,0);
|
||||
myparms.dstPos = make_hipPos(0,0,0);
|
||||
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
|
||||
myparms.srcPtr = devPitchedPtr;
|
||||
myparms.extent = extent;
|
||||
HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
|
||||
hipMemcpy3DParms myparms = {0};
|
||||
myparms.srcPos = make_hipPos(0,0,0);
|
||||
myparms.dstPos = make_hipPos(0,0,0);
|
||||
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
|
||||
myparms.srcPtr = devPitchedPtr;
|
||||
myparms.extent = extent;
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
|
||||
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
|
||||
#else
|
||||
myparms.kind = hipMemcpyDeviceToHost;
|
||||
#endif
|
||||
@@ -69,7 +69,58 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
|
||||
|
||||
for (int i=0; i<elements; i++) {
|
||||
if (A_h[i] != memsetval) {
|
||||
testResult = false;
|
||||
testResult = false;
|
||||
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
|
||||
break;
|
||||
}
|
||||
}
|
||||
HIPCHECK(hipFree(devPitchedPtr.ptr));
|
||||
free(A_h);
|
||||
return testResult;
|
||||
}
|
||||
|
||||
bool testhipMemset3DAsync(int memsetval,int p_gpuDevice)
|
||||
{
|
||||
size_t numH = 256;
|
||||
size_t numW = 256;
|
||||
size_t depth = 10;
|
||||
size_t width = numW * sizeof(char);
|
||||
size_t sizeElements = width * numH * depth;
|
||||
size_t elements = numW* numH* depth;
|
||||
|
||||
|
||||
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
|
||||
char *A_h;
|
||||
bool testResult = true;
|
||||
hipExtent extent = make_hipExtent(width, numH, depth);
|
||||
hipPitchedPtr devPitchedPtr;
|
||||
|
||||
HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
|
||||
A_h = (char*)malloc(sizeElements);
|
||||
HIPASSERT(A_h != NULL);
|
||||
for (size_t i=0; i<elements; i++) {
|
||||
A_h[i] = 1;
|
||||
}
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreate(&stream));
|
||||
HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
|
||||
HIPCHECK(hipStreamSynchronize(stream));
|
||||
hipMemcpy3DParms myparms = {0};
|
||||
myparms.srcPos = make_hipPos(0,0,0);
|
||||
myparms.dstPos = make_hipPos(0,0,0);
|
||||
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
|
||||
myparms.srcPtr = devPitchedPtr;
|
||||
myparms.extent = extent;
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
|
||||
#else
|
||||
myparms.kind = hipMemcpyDeviceToHost;
|
||||
#endif
|
||||
HIPCHECK(hipMemcpy3D(&myparms));
|
||||
|
||||
for (int i=0; i<elements; i++) {
|
||||
if (A_h[i] != memsetval) {
|
||||
testResult = false;
|
||||
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
|
||||
break;
|
||||
}
|
||||
@@ -82,9 +133,10 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
bool testResult = false;
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
testResult = testhipMemset3D(memsetval, p_gpuDevice);
|
||||
bool testResult = true;
|
||||
testResult &= testhipMemset3D(memsetval, p_gpuDevice);
|
||||
testResult &= testhipMemset3DAsync(memsetval, p_gpuDevice);
|
||||
if (testResult) {
|
||||
passed();
|
||||
} else {
|
||||
|
||||
@@ -116,7 +116,7 @@ int main() {
|
||||
|
||||
dimBlock.x = workgroups[i];
|
||||
// Calculate the device occupancy to know how many blocks can be run concurrently
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
|
||||
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
|
||||
|
||||
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
|
||||
|
||||
+3
-18
@@ -30,10 +30,6 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#define fileName "vcpy_kernel.code"
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
|
||||
__global__ void f1(float *a) { *a = 1.0; }
|
||||
|
||||
template <typename T>
|
||||
@@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; }
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
// test case for using kernel function pointer
|
||||
uint32_t gridSize = 0;
|
||||
uint32_t blockSize = 0;
|
||||
int gridSize = 0;
|
||||
int blockSize = 0;
|
||||
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
uint32_t numBlock = 0;
|
||||
int numBlock = 0;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
|
||||
assert(numBlock != 0);
|
||||
|
||||
|
||||
// test case for using kernel function pointer with template
|
||||
gridSize = 0;
|
||||
blockSize = 0;
|
||||
@@ -64,15 +59,5 @@ int main(int argc, char* argv[]) {
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
|
||||
assert(numBlock != 0);
|
||||
|
||||
|
||||
// test case for using kernel with hipFunction_t type
|
||||
numBlock = 0;
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
HIPCHECK(hipModuleLoad(&Module, fileName));
|
||||
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
|
||||
assert(numBlock != 0);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -30,22 +30,16 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#define fileName "vcpy_kernel.code"
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
|
||||
__global__ void f1(float *a) { *a = 1.0; }
|
||||
|
||||
template <typename T>
|
||||
__global__ void f2(T *a) { *a = 1; }
|
||||
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
// test case for using kernel function pointer
|
||||
uint32_t gridSize = 0;
|
||||
uint32_t blockSize = 0;
|
||||
int gridSize = 0;
|
||||
int blockSize = 0;
|
||||
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
@@ -55,15 +49,5 @@ int main(int argc, char* argv[]) {
|
||||
hipOccupancyMaxPotentialBlockSize<void(*)(int *)>(&gridSize, &blockSize, f2, 0, 0);
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
// test case for using kernel with hipFunction_t type
|
||||
gridSize = 0;
|
||||
blockSize = 0;
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
HIPCHECK(hipModuleLoad(&Module, fileName));
|
||||
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
|
||||
HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0));
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -99,10 +99,13 @@ THE SOFTWARE.
|
||||
|
||||
#ifdef _WIN64
|
||||
#include <tchar.h>
|
||||
#define aligned_alloc _aligned_malloc
|
||||
#define aligned_alloc(x,y) _aligned_malloc(y,x)
|
||||
#define aligned_free(x) _aligned_free(x)
|
||||
#define popen(x,y) _popen(x,y)
|
||||
#define pclose(x) _pclose(x)
|
||||
#define setenv(x,y,z) _putenv_s(x,y)
|
||||
#else
|
||||
#define aligned_free(x) free(x)
|
||||
#endif
|
||||
|
||||
// standard command-line variables:
|
||||
|
||||
Ссылка в новой задаче
Block a user