diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ad40a10ea..57369a9039 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
target_link_libraries(hip_hcc PRIVATE hc_am)
target_link_libraries(hip_hcc_static PRIVATE hc_am)
- add_library(hiprtc SHARED src/hiprtc.cpp)
+ add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
target_include_directories(
hiprtc SYSTEM
PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include)
@@ -504,7 +504,7 @@ endif()
find_program(CPPCHECK_EXE cppcheck)
if(CPPCHECK_EXE)
add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
#############################
@@ -545,4 +545,17 @@ else()
message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
endif()
+#############################
+# Code analysis
+#############################
+# Target: clang
+if(HIP_HIPCC_EXECUTABLE)
+ add_custom_target(analyze
+ COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./
+ WORKING_DIRECTORY ${HIP_SRC_PATH})
+ if(CPPCHECK_EXE)
+ add_dependencies(analyze cppcheck)
+ endif()
+endif()
+
# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/bin/hipcc b/bin/hipcc
index 35fbb54397..9d325c2bf9 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") {
if ($HIP_RUNTIME eq "HCC" ) {
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
+ } else {
+ $HIPCXXFLAGS .= " -fhip-new-launch-api";
}
} elsif ($HIP_PLATFORM eq "hcc") {
@@ -352,7 +354,6 @@ my $runCmd = 1;
my $buildDeps = 0;
my $linkType = 1;
my $setLinkType = 0;
-my $coFormatv3 = 1;
my @options = ();
my @inputs = ();
@@ -472,22 +473,6 @@ foreach $arg (@ARGV)
$swallowArg = 1;
}
- # code object format parsing
- if ($trimarg eq '-mcode-object-v3') {
- $coFormatv3 = 1;
- # hip-clang already recognizes -mcode-object-v3, so we just pass it on
- if ($HIP_PLATFORM eq 'hcc') {
- $swallowArg = 1;
- }
- }
- if ($trimarg eq '-mno-code-object-v3') {
- $coFormatv3 = 0;
- # hip-clang already recognizes -mno-code-object-v3, so we just pass it on
- if ($HIP_PLATFORM eq 'hcc') {
- $swallowArg = 1;
- }
- }
-
if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) {
$arg = "--cuda-device-only";
}
@@ -526,6 +511,13 @@ foreach $arg (@ARGV)
$optArg = $arg;
}
+ ## This is a temporary workaround for CMake detection of OpenMP support.
+ ## It should be removed when the OpenMP detection c++ test in CMake is updated
+ ## and corrected CMake version is available.
+ if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) {
+ $HIPCXXFLAGS .= " -D_OPENMP "
+ }
+
## process linker response file for hip-clang
## extract object files from static library and pass them directly to
## hip-clang in command line.
@@ -863,12 +855,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
}
}
-# hcc defaults to v2, so we need to convert to the appropriate flag
-# hip-clang defaults to v3, so we don't need to do anything
-if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
- $HIPLDFLAGS .= " -Wl,-hcc-cov3 ";
-}
-
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
$HIPCXXFLAGS .= " -x cu";
}
diff --git a/bin/hipify-perl b/bin/hipify-perl
index 2e391ab8d2..80f721c58e 100755
--- a/bin/hipify-perl
+++ b/bin/hipify-perl
@@ -983,7 +983,6 @@ sub simpleSubstitutions {
$ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g;
$ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g;
$ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g;
- $ft{'type'} += s/\bwarpSize\b/hipWarpSize/g;
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g;
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g;
$ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g;
@@ -1676,6 +1675,12 @@ sub transformKernelLaunch {
}
}
+sub transformCubNamespace {
+ my $k = 0;
+ $k += s/using\s*namespace\s*cub/using namespace hipcub/g;
+ return $k;
+}
+
sub transformHostFunctions {
my $k = 0;
foreach $func (
@@ -2368,6 +2373,7 @@ while (@ARGV) {
simpleSubstitutions();
transformExternShared();
transformKernelLaunch();
+ transformCubNamespace();
if ($print_stats) {
while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) {
$convertedTags{$1}++;
diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md
index 8a44368680..28ed37e321 100644
--- a/docs/markdown/hip_profiling.md
+++ b/docs/markdown/hip_profiling.md
@@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording
#### How to enable profiling at HIP build time
-Recent pre-built packages of HIP are always built with profiling support enabled.
-For developer builds, you must enable marker support manually when compiling HIP.
+Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP.
1. Build HIP with ATP markers enabled
HIP pre-built packages are enabled with ATP marker support by default.
diff --git a/hip_prof_gen.py b/hip_prof_gen.py
index d1203a64d6..9e90c1558c 100755
--- a/hip_prof_gen.py
+++ b/hip_prof_gen.py
@@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map):
# Generating the callbacks data structure
f.write('\n// HIP API callbacks data structure\n')
f.write(
- 'struct hip_api_data_t {\n' +
+ 'typedef struct hip_api_data_t {\n' +
' uint64_t correlation_id;\n' +
' uint32_t phase;\n' +
' union {\n'
@@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map):
f.write(' } ' + name + ';\n')
f.write(
' } args;\n' +
- '};\n'
+ '} hip_api_data_t;\n'
)
# Generating the callbacks args data filling macros
diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt
index 5eb71b3a3e..2f24c6c6fe 100644
--- a/hipify-clang/CMakeLists.txt
+++ b/hipify-clang/CMakeLists.txt
@@ -1,8 +1,4 @@
-if (CUDA_VERSION VERSION_GREATER "9.2")
- cmake_minimum_required(VERSION 3.12.3)
-else()
- cmake_minimum_required(VERSION 3.7.2)
-endif()
+cmake_minimum_required(VERSION 3.5.1)
project(hipify-clang)
@@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE
LLVMOption
LLVMCore)
-if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7")
+if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1")
target_link_libraries(hipify-clang PRIVATE clangToolingInclusions)
endif()
@@ -72,7 +68,7 @@ else()
set(StdCpp "-std=c++")
endif()
-if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0")
+if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0")
string(APPEND StdCpp "14")
# MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest
elseif (MSVC)
@@ -84,7 +80,38 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"")
-install(TARGETS hipify-clang DESTINATION bin)
+set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang")
+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+ if(CMAKE_BUILD_TYPE MATCHES Debug)
+ set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+ elseif(CMAKE_BUILD_TYPE MATCHES Release)
+ if (BIN_INSTALL_DIR)
+ set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+ else()
+ set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+ endif()
+ else()
+ message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
+ endif()
+elseif(BIN_INSTALL_DIR)
+ set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+endif()
+
+install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX})
+
+install(
+ DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/
+ DESTINATION ${CMAKE_INSTALL_PREFIX}
+ COMPONENT clang-resource-headers
+ FILES_MATCHING
+ PATTERN "*.h"
+ PATTERN "*.modulemap"
+ PATTERN "algorithm"
+ PATTERN "complex"
+ PATTERN "new"
+ PATTERN "ppc_wrappers" EXCLUDE
+ PATTERN "openmp_wrappers" EXCLUDE
+ )
if (HIPIFY_CLANG_TESTS)
find_package(PythonInterp 2.7 REQUIRED)
diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index 07466dbe62..da5abc19da 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -63,7 +63,7 @@
In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
-[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
+[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
## Build and install
@@ -154,7 +154,7 @@ To run it:
* Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option:
- - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB`
+ - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`
- Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
@@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38
Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38
-Build system for the above configurations:
+Minimum build system requirements for the above configurations:
-Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min).
+Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.
Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:
@@ -208,7 +208,7 @@ cmake
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \
- -DCUDA_CUB_ROOT_DIR=/srv/CUB \
+ -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \
..
```
@@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS
64 - hipify-clang binary bitness
64 - python 2.7.12 binary bitness
========================================
--- Testing: 64 tests, 12 threads --
-PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64)
-PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64)
-PASS: hipify :: unit_tests/device/atomics.cu (3 of 64)
-PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64)
-PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64)
-PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64)
-PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64)
-PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64)
-PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64)
-PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64)
-PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64)
-PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64)
-PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64)
-PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64)
-PASS: hipify :: unit_tests/samples/intro.cu (61 of 64)
-PASS: hipify :: unit_tests/samples/square.cu (62 of 64)
-PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64)
-PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64)
-Testing Time: 2.98s
- Expected Passes : 64
+-- Testing: 65 tests, 12 threads --
+PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65)
+PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65)
+PASS: hipify :: unit_tests/device/atomics.cu (3 of 65)
+PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65)
+PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65)
+PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65)
+PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65)
+PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65)
+PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65)
+PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65)
+PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65)
+PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65)
+PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65)
+PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65)
+PASS: hipify :: unit_tests/samples/intro.cu (61 of 65)
+PASS: hipify :: unit_tests/samples/square.cu (62 of 65)
+PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65)
+PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65)
+PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65)
+Testing Time: 3.01s
+ Expected Passes : 65
[100%] Built target test-hipify
```
### Windows
On Windows 10 the following configurations are tested:
-LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18
+LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
-LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38
+LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38
-LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38
+LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38
-Build system for the above configurations:
+Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2:
-Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4).
+Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5).
Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
@@ -371,7 +372,7 @@ cmake
-- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm
-- - Include path : F:/LLVM/9.0.0/dist/include
-- - Binary path : F:/LLVM/9.0.0/dist/bin
--- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6")
+-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6")
-- Found lit: C:/Program Files/Python36/Scripts/lit.exe
-- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
diff --git a/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp
new file mode 100644
index 0000000000..0ef1912b54
--- /dev/null
+++ b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "CUDA2HIP.h"
+
+// Maps the names of CUDA CUB API types to the corresponding HIP types
+const std::map CUDA_CUB_TYPE_NAME_MAP{
+ {"cub", {"hipcub", "", CONV_TYPE, API_CUB}},
+};
diff --git a/hipify-clang/src/CUDA2HIP_Perl.cpp b/hipify-clang/src/CUDA2HIP_Perl.cpp
index df83bf8d57..8d59089d4f 100644
--- a/hipify-clang/src/CUDA2HIP_Perl.cpp
+++ b/hipify-clang/src/CUDA2HIP_Perl.cpp
@@ -251,6 +251,11 @@ namespace perl {
*streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl;
}
+ void generateCubNamespace(unique_ptr& streamPtr) {
+ *streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl;
+ *streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl;
+ }
+
void generateHostFunctions(unique_ptr& streamPtr) {
*streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl;
set &funcSet = DeviceSymbolFunctions0;
@@ -358,6 +363,7 @@ namespace perl {
generateSimpleSubstitutions(streamPtr);
generateExternShared(streamPtr);
generateKernelLaunch(streamPtr);
+ generateCubNamespace(streamPtr);
generateHostFunctions(streamPtr);
generateDeviceFunctions(streamPtr);
*streamPtr.get() << endl << "# Count of transforms in all files" << endl;
@@ -401,6 +407,7 @@ namespace perl {
*streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl;
*streamPtr.get() << tab_2 << "transformExternShared();" << endl;
*streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl;
+ *streamPtr.get() << tab_2 << "transformCubNamespace();" << endl;
*streamPtr.get() << tab_2 << "if ($print_stats) {" << endl;
*streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl;
*streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl;
diff --git a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp
index fda9b34b2c..7b34d97ab4 100644
--- a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp
+++ b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp
@@ -31,8 +31,6 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP {
{"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}},
// no analogue
{"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}},
- // NOTE: int warpSize is a field of cudaDeviceProp
- {"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}},
// no analogue
{"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp
index c6d10f0cdc..510d91978a 100644
--- a/hipify-clang/src/HipifyAction.cpp
+++ b/hipify-clang/src/HipifyAction.cpp
@@ -33,16 +33,21 @@ THE SOFTWARE.
#include "StringUtils.h"
#include "ArgParse.h"
-namespace ct = clang::tooling;
-namespace mat = clang::ast_matchers;
-
+const std::string sHIP = "HIP";
+const std::string sROC = "ROC";
+const std::string sCub = "cub";
+const std::string sHipcub = "hipcub";
const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED";
+const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
std::string sHIP_SYMBOL = "HIP_SYMBOL";
-std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
std::string s_reinterpret_cast = "reinterpret_cast";
-const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL(";
+const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL";
const std::string sDim3 = "dim3(";
-
+const std::string s_hiprand_kernel_h = "hiprand_kernel.h";
+const std::string s_hiprand_h = "hiprand.h";
+const std::string sOnce = "once";
+const std::string s_string_literal = "[string literal]";
+// CUDA identifiers, used in matchers
const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol";
const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync";
const std::string sCudaGetSymbolSize = "cudaGetSymbolSize";
@@ -51,6 +56,14 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol";
const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync";
const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig";
const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes";
+// Matchers' names
+const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar";
+const StringRef sCudaLaunchKernel = "cudaLaunchKernel";
+const StringRef sCudaHostFuncCall = "cudaHostFuncCall";
+const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall";
+const StringRef sCubNamespacePrefix = "cubNamespacePrefix";
+const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl";
+const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl";
std::set DeviceSymbolFunctions0 {
{sCudaMemcpyToSymbol},
@@ -78,7 +91,7 @@ std::set ReinterpretFunctions1{
};
void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+ auto &SM = getCompilerInstance().getSourceManager();
size_t begin = 0;
while ((begin = s.find("cu", begin)) != StringRef::npos) {
const size_t end = s.find_first_of(" ", begin + 4);
@@ -86,7 +99,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
const auto found = CUDA_RENAMES_MAP().find(name);
if (found != CUDA_RENAMES_MAP().end()) {
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
- hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
+ hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
Statistics::current().incrementCounter(counter, name.str());
if (!Statistics::isUnsupported(counter)) {
clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
@@ -95,13 +108,24 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
insertReplacement(Rep, fullSL);
}
}
- if (end == StringRef::npos) {
- break;
- }
+ if (end == StringRef::npos) break;
begin = end + 1;
}
}
+clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) {
+ clang::SourceLocation sl(sr.getBegin());
+ clang::SourceLocation end(sr.getEnd());
+ auto &SM = getCompilerInstance().getSourceManager();
+ size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
+ StringRef sfull = StringRef(SM.getCharacterData(sl), length);
+ size_t offset = sfull.find(str);
+ if (offset > 0) {
+ sl = sl.getLocWithOffset(offset);
+ }
+ return sl;
+}
+
/**
* Look at, and consider altering, a given token.
*
@@ -109,7 +133,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
* If it's an unsupported CUDA identifier, a warning is emitted.
* Otherwise, the source file is updated with the corresponding hipification.
*/
-void HipifyAction::RewriteToken(const clang::Token& t) {
+void HipifyAction::RewriteToken(const clang::Token &t) {
// String literals containing CUDA references need fixing.
if (t.is(clang::tok::string_literal)) {
StringRef s(t.getLiteralData(), t.getLength());
@@ -124,20 +148,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) {
FindAndReplace(name, sl, CUDA_RENAMES_MAP());
}
-void HipifyAction::FindAndReplace(llvm::StringRef name,
+void HipifyAction::FindAndReplace(StringRef name,
clang::SourceLocation sl,
- const std::map& repMap, bool bReplace) {
+ const std::map &repMap,
+ bool bReplace) {
const auto found = repMap.find(name);
if (found == repMap.end()) {
// So it's an identifier, but not CUDA? Boring.
return;
}
Statistics::current().incrementCounter(found->second, name.str());
- clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+ clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
// Warn the user about unsupported identifier.
if (Statistics::isUnsupported(found->second)) {
std::string sWarn;
- Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP";
+ Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
sWarn = "" + sWarn;
const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0.");
DE.Report(sl, ID) << sWarn;
@@ -147,7 +172,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
return;
}
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+ auto &SM = getCompilerInstance().getSourceManager();
ct::Replacement Rep(SM, sl, name.size(), repName.str());
clang::FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
@@ -155,7 +180,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
namespace {
-clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
@@ -167,7 +192,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan
}
}
-clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
// If the range is contained within a macro, update the macro definition.
@@ -178,7 +203,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
-StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
}
@@ -186,53 +211,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp
* Get a string representation of the expression `arg`, unless it's a defaulting function
* call argument, in which case get a 0. Used for building argument lists to kernel calls.
*/
-std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
- if (clang::isa(arg)) {
- return "0";
- } else {
- return readSourceText(SM, arg->getSourceRange());
- }
+std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
+ if (clang::isa(arg)) return "0";
+ else return readSourceText(SM, arg->getSourceRange());
}
} // anonymous namespace
-bool HipifyAction::Exclude(const hipCounter & hipToken) {
+bool HipifyAction::Exclude(const hipCounter &hipToken) {
switch (hipToken.type) {
case CONV_INCLUDE_CUDA_MAIN_H:
switch (hipToken.apiType) {
case API_DRIVER:
case API_RUNTIME:
- if (insertedRuntimeHeader) { return true; }
+ if (insertedRuntimeHeader) return true;
insertedRuntimeHeader = true;
return false;
case API_BLAS:
- if (insertedBLASHeader) { return true; }
+ if (insertedBLASHeader) return true;
insertedBLASHeader = true;
return false;
case API_RAND:
- if (hipToken.hipName == "hiprand_kernel.h") {
- if (insertedRAND_kernelHeader) { return true; }
+ if (hipToken.hipName == s_hiprand_kernel_h) {
+ if (insertedRAND_kernelHeader) return true;
insertedRAND_kernelHeader = true;
return false;
- } else if (hipToken.hipName == "hiprand.h") {
- if (insertedRANDHeader) { return true; }
+ } else if (hipToken.hipName == s_hiprand_h) {
+ if (insertedRANDHeader) return true;
insertedRANDHeader = true;
return false;
}
case API_DNN:
- if (insertedDNNHeader) { return true; }
+ if (insertedDNNHeader) return true;
insertedDNNHeader = true;
return false;
case API_FFT:
- if (insertedFFTHeader) { return true; }
+ if (insertedFFTHeader) return true;
insertedFFTHeader = true;
return false;
case API_COMPLEX:
- if (insertedComplexHeader) { return true; }
+ if (insertedComplexHeader) return true;
insertedComplexHeader = true;
return false;
case API_SPARSE:
- if (insertedSPARSEHeader) { return true; }
+ if (insertedSPARSEHeader) return true;
insertedSPARSEHeader = true;
return false;
default:
@@ -240,13 +262,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
}
return false;
case CONV_INCLUDE:
- if (hipToken.hipName.empty()) {
- return true;
- }
+ if (hipToken.hipName.empty()) return true;
switch (hipToken.apiType) {
case API_RAND:
- if (hipToken.hipName == "hiprand_kernel.h") {
- if (insertedRAND_kernelHeader) { return true; }
+ if (hipToken.hipName == s_hiprand_kernel_h) {
+ if (insertedRAND_kernelHeader) return true;
insertedRAND_kernelHeader = true;
}
return false;
@@ -267,24 +287,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
clang::CharSourceRange filename_range,
const clang::FileEntry*, StringRef,
StringRef, const clang::Module*) {
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
- if (!SM.isWrittenInMainFile(hash_loc)) {
- return;
- }
+ auto &SM = getCompilerInstance().getSourceManager();
+ if (!SM.isWrittenInMainFile(hash_loc)) return;
if (!firstHeader) {
firstHeader = true;
firstHeaderLoc = hash_loc;
}
const auto found = CUDA_INCLUDE_MAP.find(file_name);
- if (found == CUDA_INCLUDE_MAP.end()) {
- return;
- }
+ if (found == CUDA_INCLUDE_MAP.end()) return;
bool exclude = Exclude(found->second);
Statistics::current().incrementCounter(found->second, file_name.str());
-
clang::SourceLocation sl = filename_range.getBegin();
if (Statistics::isUnsupported(found->second)) {
- clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+ clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header."));
return;
}
@@ -293,11 +308,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
if (!exclude) {
clang::SmallString<128> includeBuffer;
llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
- if (is_angled) {
- newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
- } else {
- newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
- }
+ if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
+ else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
} else {
// hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
sl = hash_loc;
@@ -309,48 +321,52 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
}
void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
- if (pragmaOnce) {
- return;
- }
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
- if (!SM.isWrittenInMainFile(Loc)) {
- return;
- }
- clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
+ if (pragmaOnce) return;
+ auto &SM = getCompilerInstance().getSourceManager();
+ if (!SM.isWrittenInMainFile(Loc)) return;
+ clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
clang::Token tok;
PP.Lex(tok);
StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
- if (Text == "once") {
+ if (Text == sOnce) {
pragmaOnce = true;
pragmaOnceLoc = tok.getEndLoc();
}
}
-bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
- StringRef refName = "cudaLaunchKernel";
- const auto* launchKernel = Result.Nodes.getNodeAs(refName);
- if (!launchKernel) {
- return false;
- }
- const clang::Expr* calleeExpr = launchKernel->getCallee();
- if (!calleeExpr) {
- return false;
- }
- const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
- if (!caleeDecl) {
- return false;
- }
- const clang::CallExpr* config = launchKernel->getConfig();
- if (!config) {
- return false;
- }
+bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) {
+ auto *launchKernel = Result.Nodes.getNodeAs(sCudaLaunchKernel);
+ if (!launchKernel) return false;
+ auto *calleeExpr = launchKernel->getCallee();
+ if (!calleeExpr) return false;
+ auto *caleeDecl = launchKernel->getDirectCallee();
+ if (!caleeDecl) return false;
+ auto *config = launchKernel->getConfig();
+ if (!config) return false;
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::LangOptions DefaultLangOptions;
- clang::SourceManager* SM = Result.SourceManager;
- OS << sHipLaunchKernelGGL;
- if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "(";
- OS << readSourceText(*SM, calleeExpr->getSourceRange());
+ auto *SM = Result.SourceManager;
+ clang::SourceRange sr = calleeExpr->getSourceRange();
+ std::string kern = readSourceText(*SM, sr).str();
+ OS << sHipLaunchKernelGGL << "(";
+ if (caleeDecl->isTemplateInstantiation()) {
+ OS << sHIP_KERNEL_NAME << "(";
+ std::string cub = sCub + "::";
+ std::string hipcub;
+ const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub);
+ if (found != CUDA_CUB_TYPE_NAME_MAP.end()) {
+ hipcub = found->second.hipName.str() + "::";
+ } else {
+ hipcub = sHipcub + "::";
+ }
+ size_t pos = kern.find(cub);
+ while (pos != std::string::npos) {
+ kern.replace(pos, cub.size(), hipcub);
+ pos = kern.find(cub, pos + hipcub.size());
+ }
+ }
+ OS << kern;
if (caleeDecl->isTemplateInstantiation()) OS << ")";
OS << ", ";
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
@@ -381,29 +397,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
ct::Replacement Rep(*SM, launchStart, length, OS.str());
clang::FullSourceLoc fullSL(launchStart, *SM);
insertReplacement(Rep, fullSL);
- hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
- Statistics::current().incrementCounter(counter, refName.str());
+ hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
+ Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str());
return true;
}
-bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
- StringRef refName = "cudaSharedIncompleteArrayVar";
- auto* sharedVar = Result.Nodes.getNodeAs(refName);
- if (!sharedVar) {
- return false;
- }
+bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) {
+ auto *sharedVar = Result.Nodes.getNodeAs(sCudaSharedIncompleteArrayVar);
+ if (!sharedVar) return false;
// Example: extern __shared__ uint sRadix1[];
- if (!sharedVar->hasExternalFormalLinkage()) {
- return false;
- }
+ if (!sharedVar->hasExternalFormalLinkage()) return false;
clang::QualType QT = sharedVar->getType();
std::string typeName;
if (QT->isIncompleteArrayType()) {
- const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
+ const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
QT = AT->getElementType();
if (QT.getTypePtr()->isBuiltinType()) {
QT = QT.getCanonicalType();
- const auto* BT = clang::dyn_cast(QT);
+ auto *BT = clang::dyn_cast(QT);
if (BT) {
clang::LangOptions LO;
LO.CUDA = true;
@@ -417,7 +428,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
if (!typeName.empty()) {
clang::SourceLocation slStart = sharedVar->getOuterLocStart();
clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc());
- clang::SourceManager* SM = Result.SourceManager;
+ auto *SM = Result.SourceManager;
size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
std::string varName = sharedVar->getNameAsString();
std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")";
@@ -425,74 +436,84 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
clang::FullSourceLoc fullSL(slStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME};
- Statistics::current().incrementCounter(counter, refName.str());
+ Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str());
return true;
}
return false;
}
-bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
- if (const clang::CallExpr *call = Result.Nodes.getNodeAs("cudaDeviceFuncCall")) {
- const clang::FunctionDecl *funcDcl = call->getDirectCallee();
- if (!funcDcl) {
- return false;
- }
+bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) {
+ if (const clang::CallExpr *call = Result.Nodes.getNodeAs(sCudaDeviceFuncCall)) {
+ auto *funcDcl = call->getDirectCallee();
+ if (!funcDcl) return false;
FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false);
return true;
}
return false;
}
-bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
- if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs("cubNamespacePrefix")) {
- if (!decl) {
- return false;
- }
+bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) {
+ if (auto *decl = Result.Nodes.getNodeAs(sCubNamespacePrefix)) {
clang::QualType QT = decl->getUnderlyingType();
- const clang::Type* t = QT.getTypePtr();
- if (!t) {
- return false;
- }
- const clang::ElaboratedType* et = t->getAs();
- if (!et) {
- return false;
- }
+ auto *t = QT.getTypePtr();
+ if (!t) return false;
+ const clang::ElaboratedType *et = t->getAs();
+ if (!et) return false;
const clang::NestedNameSpecifier *nns = et->getQualifier();
- if (!nns) {
- return false;
- }
+ if (!nns) return false;
const clang::NamespaceDecl *nsd = nns->getAsNamespace();
- if (!nsd) {
- return false;
- }
+ if (!nsd) return false;
const clang::TypeSourceInfo *si = decl->getTypeSourceInfo();
const clang::TypeLoc tloc = si->getTypeLoc();
const clang::SourceRange sr = tloc.getSourceRange();
- clang::SourceLocation sl(sr.getBegin());
- clang::SourceLocation end(sr.getEnd());
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
- size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
- StringRef sfull = StringRef(SM.getCharacterData(sl), length);
std::string name = nsd->getDeclName().getAsString();
- size_t offset = sfull.find(name);
- if (offset > 0) {
- sl = sl.getLocWithOffset(offset);
- }
- FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP);
+ FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
return true;
}
return false;
}
-bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
- if (const clang::CallExpr * call = Result.Nodes.getNodeAs("cudaHostFuncCall")) {
- if (!call->getNumArgs()) {
- return false;
+bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) {
+ if (auto *decl = Result.Nodes.getNodeAs(sCubUsingNamespaceDecl)) {
+ if (auto nsd = decl->getNominatedNamespace()) {
+ FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP);
+ return true;
}
- const clang::FunctionDecl* funcDcl = call->getDirectCallee();
- if (!funcDcl) {
- return false;
+ }
+ return false;
+}
+
+bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) {
+ if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) {
+ auto *Tparams = decl->getTemplateParameters();
+ bool ret = false;
+ for (size_t I = 0; I < Tparams->size(); ++I) {
+ const clang::ValueDecl *valueDecl = dyn_cast(Tparams->getParam(I));
+ if (!valueDecl) continue;
+ clang::QualType QT = valueDecl->getType();
+ auto *t = QT.getTypePtr();
+ if (!t) continue;
+ const clang::ElaboratedType *et = t->getAs();
+ if (!et) continue;
+ const clang::NestedNameSpecifier *nns = et->getQualifier();
+ if (!nns) continue;
+ const clang::NamespaceDecl *nsd = nns->getAsNamespace();
+ if (!nsd) continue;
+ const clang::SourceRange sr = valueDecl->getSourceRange();
+ std::string name = nsd->getDeclName().getAsString();
+ FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
+ ret = true;
}
+ return ret;
+ }
+ return false;
+}
+
+bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) {
+ if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) {
+ if (!call->getNumArgs()) return false;
+ auto *funcDcl = call->getDirectCallee();
+ if (!funcDcl) return false;
std::string sName = funcDcl->getDeclName().getAsString();
unsigned int argNum = 0;
bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false;
@@ -506,7 +527,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::SourceRange sr = call->getArg(argNum)->getSourceRange();
- clang::SourceManager* SM = Result.SourceManager;
+ auto *SM = Result.SourceManager;
OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")";
clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() });
clang::SourceLocation s = replacementRange.getBegin();
@@ -521,7 +542,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
return false;
}
-void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
+void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) {
llcompat::insertReplacement(*replacements, rep);
if (PrintStats) {
rep.getLength();
@@ -530,10 +551,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu
}
}
-std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
- Finder.reset(new clang::ast_matchers::MatchFinder);
+std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) {
+ Finder.reset(new mat::MatchFinder);
// Replace the <<<...>>> language extension with a hip kernel launch
- Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
+ Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this);
Finder->addMatcher(
mat::varDecl(
mat::isExpansionInMainFile(),
@@ -541,7 +562,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi
mat::hasAttr(clang::attr::CUDAShared),
mat::hasType(mat::incompleteArrayType())
)
- ).bind("cudaSharedIncompleteArrayVar"),
+ ).bind(sCudaSharedIncompleteArrayVar),
this
);
Finder->addMatcher(
@@ -561,7 +582,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi
)
)
)
- ).bind("cudaHostFuncCall"),
+ ).bind(sCudaHostFuncCall),
this
);
Finder->addMatcher(
@@ -576,7 +597,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi
mat::unless(mat::hasAttr(clang::attr::CUDAHost))
)
)
- ).bind("cudaDeviceFuncCall"),
+ ).bind(sCudaDeviceFuncCall),
this
);
Finder->addMatcher(
@@ -586,12 +607,26 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi
mat::elaboratedType(
mat::hasQualifier(
mat::specifiesNamespace(
- mat::hasName("cub")
+ mat::hasName(sCub)
)
)
)
)
- ).bind("cubNamespacePrefix"),
+ ).bind(sCubNamespacePrefix),
+ this
+ );
+ // TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl
+ Finder->addMatcher(
+ mat::functionTemplateDecl(
+ mat::isExpansionInMainFile()
+ ).bind(sCubFunctionTemplateDecl),
+ this
+ );
+ // TODO: Maybe worth to make it more concrete
+ Finder->addMatcher(
+ mat::usingDirectiveDecl(
+ mat::isExpansionInMainFile()
+ ).bind(sCubUsingNamespaceDecl),
this
);
// Ownership is transferred to the caller.
@@ -599,10 +634,8 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi
}
void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) {
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
- if (!SM.isWrittenInMainFile(Loc)) {
- return;
- }
+ auto &SM = getCompilerInstance().getSourceManager();
+ if (!SM.isWrittenInMainFile(Loc)) return;
StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength());
Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc()));
}
@@ -615,12 +648,12 @@ void HipifyAction::EndSourceFileAction() {
// one copy of the hip include into every file.
bool placeForIncludeCalculated = false;
clang::SourceLocation sl, controllingMacroLoc;
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
- clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
- clang::HeaderSearch& HS = PP.getHeaderSearchInfo();
- clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup();
- const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID());
- const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
+ auto &SM = getCompilerInstance().getSourceManager();
+ clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
+ clang::HeaderSearch &HS = PP.getHeaderSearchInfo();
+ clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup();
+ const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID());
+ const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
if (controllingMacro) {
auto found = Ifndefs.find(controllingMacro->getName().str());
if (found != Ifndefs.end()) {
@@ -629,19 +662,13 @@ void HipifyAction::EndSourceFileAction() {
}
}
if (pragmaOnce) {
- if (placeForIncludeCalculated) {
- sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
- } else {
- sl = pragmaOnceLoc;
- }
+ if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
+ else sl = pragmaOnceLoc;
placeForIncludeCalculated = true;
}
if (!placeForIncludeCalculated) {
- if (firstHeader) {
- sl = firstHeaderLoc;
- } else {
- sl = SM.getLocForStartOfFile(SM.getMainFileID());
- }
+ if (firstHeader) sl = firstHeaderLoc;
+ else sl = SM.getLocForStartOfFile(SM.getMainFileID());
}
clang::FullSourceLoc fullSL(sl, SM);
ct::Replacement Rep(SM, sl, 0, "\n#include \n");
@@ -656,15 +683,15 @@ namespace {
* A silly little class to proxy PPCallbacks back to the HipifyAction class.
*/
class PPCallbackProxy : public clang::PPCallbacks {
- HipifyAction& hipifyAction;
+ HipifyAction &hipifyAction;
public:
- explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
+ explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {}
- void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
+ void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token,
StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
- const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
- const clang::Module* imported
+ const clang::FileEntry *file, StringRef search_path, StringRef relative_path,
+ const clang::Module *imported
#if LLVM_VERSION_MAJOR > 6
, clang::SrcMgr::CharacteristicKind FileType
#endif
@@ -688,10 +715,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) {
}
void HipifyAction::ExecuteAction() {
- clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
- clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+ clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
+ auto &SM = getCompilerInstance().getSourceManager();
// Start lexing the specified input file.
- const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
+ const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID());
clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
RawLex.SetKeepWhitespaceMode(true);
// Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
@@ -709,10 +736,12 @@ void HipifyAction::ExecuteAction() {
clang::ASTFrontendAction::ExecuteAction();
}
-void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
+void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) {
if (cudaLaunchKernel(Result)) return;
if (cudaSharedIncompleteArrayVar(Result)) return;
if (cudaHostFuncCall(Result)) return;
if (cudaDeviceFuncCall(Result)) return;
if (cubNamespacePrefix(Result)) return;
+ if (cubFunctionTemplateDecl(Result)) return;
+ if (cubUsingNamespaceDecl(Result)) return;
}
diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h
index a24404deee..f70d17dd0b 100644
--- a/hipify-clang/src/HipifyAction.h
+++ b/hipify-clang/src/HipifyAction.h
@@ -31,17 +31,18 @@ THE SOFTWARE.
#include "Statistics.h"
namespace ct = clang::tooling;
+namespace mat = clang::ast_matchers;
using namespace llvm;
/**
* A FrontendAction that hipifies CUDA programs.
*/
class HipifyAction : public clang::ASTFrontendAction,
- public clang::ast_matchers::MatchFinder::MatchCallback {
+ public mat::MatchFinder::MatchCallback {
private:
- ct::Replacements* replacements;
+ ct::Replacements *replacements;
std::map Ifndefs;
- std::unique_ptr Finder;
+ std::unique_ptr Finder;
// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
// ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
// not, we insert it at the top of the file when we finish processing it.
@@ -62,17 +63,20 @@ private:
void RewriteString(StringRef s, clang::SourceLocation start);
// Replace a CUDA identifier with the corresponding hip identifier, if applicable.
void RewriteToken(const clang::Token &t);
+ // Calculate str's SourceLocation in SourceRange sr
+ clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr);
public:
explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(),
replacements(replacements) {}
// MatchCallback listeners
- bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
- bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
- bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
- bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
- bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
- bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result);
+ bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result);
+ bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result);
+ bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result);
+ bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result);
+ bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result);
+ bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result);
+ bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result);
// Called by the preprocessor for each include directive during the non-raw lexing pass.
void InclusionDirective(clang::SourceLocation hash_loc,
const clang::Token &include_token,
@@ -91,7 +95,7 @@ public:
protected:
// Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
- void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
+ void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL);
// FrontendAction entry point.
void ExecuteAction() override;
// Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options.
@@ -99,8 +103,8 @@ protected:
// Called at the start of each new file to process.
void EndSourceFileAction() override;
// MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for.
- void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
- std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
- bool Exclude(const hipCounter & hipToken);
- void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map& repMap, bool bReplace = true);
+ void run(const mat::MatchFinder::MatchResult &Result) override;
+ std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override;
+ bool Exclude(const hipCounter &hipToken);
+ void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map &repMap, bool bReplace = true);
};
diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp
index 2214567df3..64037c43dd 100644
--- a/hipify-clang/src/main.cpp
+++ b/hipify-clang/src/main.cpp
@@ -199,6 +199,7 @@ int main(int argc, const char **argv) {
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
+ Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN));
if (!CudaPath.empty()) {
std::string sCudaPath = "--cuda-path=" + CudaPath;
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN));
diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp
index 32b0c0dbc8..f312d2e79b 100644
--- a/include/hip/hcc_detail/code_object_bundle.hpp
+++ b/include/hip/hcc_detail/code_object_bundle.hpp
@@ -86,7 +86,7 @@ struct Bundled_code {
char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
} header;
std::string triple;
- std::vector blob;
+ std::string blob;
};
#define magic_string_ "__CLANG_OFFLOAD_BUNDLE__"
diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp
index c493eec933..da57240ebd 100644
--- a/include/hip/hcc_detail/functional_grid_launch.hpp
+++ b/include/hip/hcc_detail/functional_grid_launch.hpp
@@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl(
} // Namespace hip_impl.
-template
-inline
-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
- F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
-
- using namespace hip_impl;
-
- hip_impl::hip_init();
- auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel),
- target_agent(0));
-
- return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
- dynSharedMemPerBlk, blockSizeLimit);
-}
-
-template
-inline
-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
- uint32_t blockSize, size_t dynSharedMemPerBlk) {
-
- using namespace hip_impl;
-
- hip_impl::hip_init();
- auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel),
- target_agent(0));
-
- return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
-}
-
template
inline
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h
index a5ac94a74b..2c13411319 100644
--- a/include/hip/hcc_detail/hip_atomic.h
+++ b/include/hip/hcc_detail/hip_atomic.h
@@ -59,12 +59,17 @@ float atomicAdd(float* address, float val)
{
unsigned int* uaddr{reinterpret_cast(address)};
unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
- unsigned int r;
+ unsigned int r;
do {
- r = old;
+ r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+
+ if (r != old) { r = old; continue; }
+
old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
- } while (r != old);
+
+ if (r == old) break;
+ } while (true);
return __uint_as_float(r);
}
@@ -74,13 +79,18 @@ double atomicAdd(double* address, double val)
{
unsigned long long* uaddr{reinterpret_cast(address)};
unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
- unsigned long long r;
+ unsigned long long r;
do {
- r = old;
+ r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+
+ if (r != old) { r = old; continue; }
+
old = atomicCAS(
uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
- } while (r != old);
+
+ if (r == old) break;
+ } while (true);
return __longlong_as_double(r);
}
@@ -144,7 +154,13 @@ unsigned long long atomicMin(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
- while (val < tmp) { tmp = atomicCAS(address, tmp, val); }
+ while (val < tmp) {
+ const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+ if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+ tmp = atomicCAS(address, tmp, val);
+ }
return tmp;
}
@@ -167,7 +183,13 @@ unsigned long long atomicMax(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
- while (tmp < val) { tmp = atomicCAS(address, tmp, val); }
+ while (tmp < val) {
+ const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+ if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+ tmp = atomicCAS(address, tmp, val);
+ }
return tmp;
}
@@ -177,7 +199,7 @@ inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
__device__
- extern
+ extern
unsigned int __builtin_amdgcn_atomic_inc(
unsigned int*,
unsigned int,
@@ -194,7 +216,7 @@ inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
__device__
- extern
+ extern
unsigned int __builtin_amdgcn_atomic_dec(
unsigned int*,
unsigned int,
diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index c07d2ad9f1..3a81305ba3 100644
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
- * @param[in] sizeBytes Data size in bytes
+ * @param[in] number of values to be set
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
-hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes);
+hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
- * @param[in] sizeBytes Data size in bytes
+ * @param[in] number of values to be set
* @param[in] stream - Stream identifier
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
-hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0));
+hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
- * @param[in] sizeBytes Data size in bytes
+ * @param[in] number of values to be set
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
-hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes);
+hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
- * @param[in] sizeBytes Data size in bytes
+ * @param[in] number of values to be set
* @param[in] stream - Stream identifier
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
-hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0));
+hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
/**
* @brief Fills the memory area pointed to by dest with the constant integer
@@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
* @param [out] gridSize minimum grid size for maximum potential occupancy
* @param [out] blockSize block size for maximum potential occupancy
* @param [in] f kernel function for which occupancy is calulated
- * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
* @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
*
* @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue
*/
-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
- hipFunction_t f, size_t dynSharedMemPerBlk,
- uint32_t blockSizeLimit);
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ const void* f, size_t dynamicSMemSize,
+ int blockSizeLimit);
/**
* @brief Returns occupancy for a device function.
@@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
* @param [out] numBlocks Returned occupancy
* @param [in] func Kernel function for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
- * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
- uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
+ int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);
/**
* @brief Returns occupancy for a device function.
@@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
* @param [out] numBlocks Returned occupancy
* @param [in] func Kernel function for which occupancy is calulated
* @param [in] blockSize Block size the kernel is intended to be launched with
- * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
- uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+ int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);
/**
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
@@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex,
return hipSuccess;
}
+template
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
+ return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize);
+}
+template
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+ return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags);
+}
+
+template
+inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ T f, size_t dynamicSMemSize, int blockSizeLimit) {
+ return hipOccupancyMaxPotentialBlockSize(
+ gridSize, blockSize, reinterpret_cast(f), dynamicSMemSize, blockSizeLimit);
+}
+
template
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
diff --git a/include/hip/hcc_detail/hip_runtime_prof.h b/include/hip/hcc_detail/hip_runtime_prof.h
new file mode 100644
index 0000000000..4d4eccb54d
--- /dev/null
+++ b/include/hip/hcc_detail/hip_runtime_prof.h
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
+// HIP VDI Op IDs enumeration
+enum HipVdiOpId {
+ kHipVdiOpIdDispatch = 0,
+ kHipVdiOpIdCopy = 1,
+ kHipVdiOpIdBarrier = 2,
+ kHipVdiOpIdNumber = 3
+};
+
+// Types of VDI commands
+enum HipVdiCommandKind {
+ kHipVdiCommandKernel = 0x11F0,
+ kHipVdiMemcpyDeviceToHost = 0x11F3,
+ kHipHipVdiMemcpyHostToDevice = 0x11F4,
+ kHipVdiMemcpyDeviceToDevice = 0x11F5,
+ kHipVidMemcpyDeviceToHostRect = 0x1201,
+ kHipVdiMemcpyHostToDeviceRect = 0x1202,
+ kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
+ kHipVdiFillMemory = 0x1207,
+};
+
+/**
+ * @brief Initializes activity callback
+ *
+ * @param [input] id_callback Event ID callback function
+ * @param [input] op_callback Event operation callback function
+ * @param [input] arg Arguments passed into callback
+ *
+ * @returns None
+ */
+void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
+
+/**
+ * @brief Enables activity callback
+ *
+ * @param [input] op Operation, which will trigger a callback (@see HipVdiOpId)
+ * @param [input] enable Enable state for the callback
+ *
+ * @returns True if successful
+ */
+bool hipEnableActivityCallback(uint32_t op, bool enable);
+
+/**
+ * @brief Returns the description string for the operation kind
+ *
+ * @param [input] id Command kind id (@see HipVdiCommandKind)
+ *
+ * @returns A pointer to a const string with the command description
+ */
+const char* hipGetCmdName(uint32_t id);
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h
index a03a46b8cf..f80745038a 100644
--- a/include/hip/hcc_detail/hip_vector_types.h
+++ b/include/hip/hcc_detail/hip_vector_types.h
@@ -47,6 +47,95 @@ THE SOFTWARE.
#if defined(__cplusplus)
#include
+ namespace hip_impl {
+ template
+ struct Scalar_accessor {
+ // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
+ Vector data;
+
+ __host__ __device__
+ operator T() const noexcept { return data[idx]; }
+
+ __host__ __device__
+ Scalar_accessor& operator=(T x) noexcept {
+ data[idx] = x;
+
+ return *this;
+ }
+
+ __host__ __device__
+ Scalar_accessor& operator+=(T x) noexcept {
+ data[idx] += x;
+ return *this;
+ }
+ __host__ __device__
+ Scalar_accessor& operator-=(T x) noexcept {
+ data[idx] -= x;
+ return *this;
+ }
+
+ __host__ __device__
+ Scalar_accessor& operator*=(T x) noexcept {
+ data[idx] *= x;
+ return *this;
+ }
+ __host__ __device__
+ Scalar_accessor& operator/=(T x) noexcept {
+ data[idx] /= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator%=(T x) noexcept {
+ data[idx] %= x;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator>>=(T x) noexcept {
+ data[idx] >>= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator<<=(T x) noexcept {
+ data[idx] <<= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator&=(T x) noexcept {
+ data[idx] &= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator|=(T x) noexcept {
+ data[idx] |= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator^=(T x) noexcept {
+ data[idx] ^= x;
+ return *this;
+ }
+ };
+ } // Namespace hip_impl.
+
template struct HIP_vector_base;
template
@@ -55,9 +144,7 @@ THE SOFTWARE.
union {
Native_vec_ data;
- struct {
- T x;
- };
+ hip_impl::Scalar_accessor x;
};
};
@@ -67,10 +154,8 @@ THE SOFTWARE.
union {
Native_vec_ data;
- struct {
- T x;
- T y;
- };
+ hip_impl::Scalar_accessor x;
+ hip_impl::Scalar_accessor y;
};
};
@@ -238,12 +323,10 @@ THE SOFTWARE.
union {
Native_vec_ data;
- struct {
- T x;
- T y;
- T z;
- T w;
- };
+ hip_impl::Scalar_accessor x;
+ hip_impl::Scalar_accessor y;
+ hip_impl::Scalar_accessor z;
+ hip_impl::Scalar_accessor w;
};
};
diff --git a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp
old mode 100644
new mode 100755
index 0aa9ef2913..7c22e19d5d
--- a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp
+++ b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define fileName "tex2dKernel.code"
texture tex;
-bool testResult = false;
+bool testResult = true;
#define HIP_CHECK(cmd) \
{ \
@@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) {
desc.NumChannels = 1;
desc.Width = width;
desc.Height = height;
- hipArrayCreate(&array, &desc);
+ HIP_CHECK(hipArrayCreate(&array, &desc));
hip_Memcpy2D copyParam;
memset(©Param, 0, sizeof(copyParam));
@@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) {
copyParam.srcPitch = width * sizeof(float);
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
- hipMemcpyParam2D(©Param);
+ HIP_CHECK(hipMemcpyParam2D(©Param));
textureReference* texref;
- hipModuleGetTexRef(&texref, Module, "tex");
- hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
- hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
- hipTexRefSetFilterMode(texref, hipFilterModePoint);
- hipTexRefSetFlags(texref, 0);
- hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
- hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
+ HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex"));
+ HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap));
+ HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap));
+ HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint));
+ HIP_CHECK(hipTexRefSetFlags(texref, 0));
+ HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1));
+ HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
float* dData = NULL;
- hipMalloc((void**)&dData, size);
+ HIP_CHECK(hipMalloc((void**)&dData, size));
struct {
void* _Ad;
@@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) {
float* hOutputData = (float*)malloc(size);
memset(hOutputData, 0, size);
- hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
+ HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
@@ -124,10 +124,10 @@ bool runTest(int argc, char** argv) {
}
}
}
- hipUnbindTexture(tex);
- hipFree(dData);
- hipFreeArray(array);
- return true;
+ HIP_CHECK(hipUnbindTexture(tex));
+ HIP_CHECK(hipFree(dData));
+ HIP_CHECK(hipFreeArray(array));
+ return testResult;
}
int main(int argc, char** argv) {
diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp
index a9f4e198b0..01fa7aafed 100644
--- a/samples/2_Cookbook/13_occupancy/occupancy.cpp
+++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp
@@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){
const unsigned threadsperblock = 32;
const unsigned blocks = (NUM/threadsperblock)+1;
- uint32_t mingridSize = 0;
- uint32_t gridSize = 0;
- uint32_t blockSize = 0;
+ int mingridSize = 0;
+ int gridSize = 0;
+ int blockSize = 0;
if (manual){
blockSize = threadsperblock;
@@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){
printf("kernel Execution time = %6.3fms\n", eventMs);
//Calculate Occupancy
- uint32_t numBlock = 0;
+ int numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
if(devProp.maxThreadsPerMultiProcessor){
diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h
index 3ee14577b0..658cfbf576 100644
--- a/src/hip_hcc_internal.h
+++ b/src/hip_hcc_internal.h
@@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
namespace hip_internal {
hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
hipStream_t stream);
+
+hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
+
+hipError_t ihipHostFree(TlsData *tls, void* ptr);
+
+};
+
+#define MAX_COOPERATIVE_GPUs 255
+
+// do not change these two structs without changing the device library
+struct mg_sync {
+ uint w0;
+ uint w1;
+};
+
+struct mg_info {
+ struct mg_sync *mgs;
+ uint grid_id;
+ uint num_grids;
+ ulong prev_sum;
+ ulong all_sum;
};
//---
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index ed1422fcda..c8369685ec 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
return ptr;
}
+hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
+ hipError_t hip_status = hipSuccess;
+
+ if (HIP_SYNC_HOST_ALLOC) {
+ hipDeviceSynchronize();
+ }
+
+ auto ctx = ihipGetTlsDefaultCtx();
+ if ((ctx == nullptr) || (ptr == nullptr)) {
+ hip_status = hipErrorInvalidValue;
+ }
+ else if (sizeBytes == 0) {
+ hip_status = hipSuccess;
+ // TODO - should size of 0 return err or be siliently ignored?
+ } else {
+ unsigned trueFlags = flags;
+ if (flags == hipHostMallocDefault) {
+ // HCC/ROCM provide a modern system with unified memory and should set both of these
+ // flags by default:
+ trueFlags = hipHostMallocMapped | hipHostMallocPortable;
+ }
+
+
+ const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
+ hipHostMallocWriteCombined | hipHostMallocCoherent |
+ hipHostMallocNonCoherent;
+
+
+ const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
+
+ if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
+ *ptr = nullptr;
+ // can't specify unsupported flags, can't specify both Coherent + NonCoherent
+ hip_status = hipErrorInvalidValue;
+ } else {
+ auto device = ctx->getWriteableDevice();
+#if (__hcc_workweek__ >= 19115)
+ //Avoid mapping host pinned memory to all devices by HCC
+ unsigned amFlags = amHostUnmapped;
+#else
+ unsigned amFlags = 0;
+#endif
+ if (flags & hipHostMallocCoherent) {
+ amFlags |= amHostCoherent;
+ } else if (flags & hipHostMallocNonCoherent) {
+ amFlags |= amHostNonCoherent;
+ } else {
+ // depends on env variables:
+ amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
+ }
+
+
+ *ptr = hip_internal::allocAndSharePtr(
+ (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
+ true /*shareWithAll*/, amFlags, flags, 0);
+
+ if (sizeBytes && (*ptr == NULL)) {
+ hip_status = hipErrorMemoryAllocation;
+ }
+ }
+ }
+
+ if (HIP_SYNC_HOST_ALLOC) {
+ hipDeviceSynchronize();
+ }
+ return hip_status;
+}
+
+hipError_t ihipHostFree(TlsData *tls, void* ptr) {
+
+ // Synchronize to ensure all work has finished.
+ ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits
+ // for all activity to finish.
+
+ hipError_t hipStatus = hipErrorInvalidValue;
+ if (ptr) {
+ hc::accelerator acc;
+#if (__hcc_workweek__ >= 17332)
+ hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
+#else
+ hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
+#endif
+ am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
+ if (status == AM_SUCCESS) {
+ if (amPointerInfo._hostPointer == ptr) {
+ hc::am_free(ptr);
+ hipStatus = hipSuccess;
+ }
+ }
+ } else {
+ // free NULL pointer succeeds and is common technique to initialize runtime
+ hipStatus = hipSuccess;
+ }
+
+ return hipStatus;
+}
+
} // end namespace hip_internal
@@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
return ihipLogStatus(hip_status);
}
-hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
- hipError_t hip_status = hipSuccess;
-
- if (HIP_SYNC_HOST_ALLOC) {
- hipDeviceSynchronize();
- }
-
- auto ctx = ihipGetTlsDefaultCtx();
- if ((ctx == nullptr) || (ptr == nullptr)) {
- hip_status = hipErrorInvalidValue;
- }
- else if (sizeBytes == 0) {
- hip_status = hipSuccess;
- // TODO - should size of 0 return err or be siliently ignored?
- } else {
- unsigned trueFlags = flags;
- if (flags == hipHostMallocDefault) {
- // HCC/ROCM provide a modern system with unified memory and should set both of these
- // flags by default:
- trueFlags = hipHostMallocMapped | hipHostMallocPortable;
- }
-
-
- const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
- hipHostMallocWriteCombined | hipHostMallocCoherent |
- hipHostMallocNonCoherent;
-
-
- const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
-
- if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
- *ptr = nullptr;
- // can't specify unsupported flags, can't specify both Coherent + NonCoherent
- hip_status = hipErrorInvalidValue;
- } else {
- auto device = ctx->getWriteableDevice();
-#if (__hcc_workweek__ >= 19115)
- //Avoid mapping host pinned memory to all devices by HCC
- unsigned amFlags = amHostUnmapped;
-#else
- unsigned amFlags = 0;
-#endif
- if (flags & hipHostMallocCoherent) {
- amFlags |= amHostCoherent;
- } else if (flags & hipHostMallocNonCoherent) {
- amFlags |= amHostNonCoherent;
- } else {
- // depends on env variables:
- amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
- }
-
-
- *ptr = hip_internal::allocAndSharePtr(
- (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
- true /*shareWithAll*/, amFlags, flags, 0);
-
- if (sizeBytes && (*ptr == NULL)) {
- hip_status = hipErrorMemoryAllocation;
- }
- }
- }
-
- if (HIP_SYNC_HOST_ALLOC) {
- hipDeviceSynchronize();
- }
- return hip_status;
-}
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
- hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags);
+ hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
return ihipLogStatus(hip_status);
}
@@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
if(flags != hipMemAttachGlobal)
hip_status = hipErrorInvalidValue;
else
- hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
+ hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
return ihipLogStatus(hip_status);
}
@@ -1935,15 +1965,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si
return ihipLogStatus(e);
};
-hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) {
- HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes);
+hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) {
+ HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count);
hipError_t e = hipSuccess;
hipStream_t stream = hipStreamNull;
stream = ihipSyncAndResolveStream(stream);
if (stream) {
- e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar);
+ e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar);
stream->locked_wait();
} else {
e = hipErrorInvalidValue;
@@ -1951,23 +1981,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes
return ihipLogStatus(e);
}
-hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) {
- HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
+hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) {
+ HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream);
stream = ihipSyncAndResolveStream(stream);
if (stream) {
- return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar));
+ return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar));
} else {
return ihipLogStatus(hipErrorInvalidValue);
}
}
-hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){
- HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes);
+hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){
+ HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count);
hipError_t e = hipSuccess;
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
if (stream) {
- e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort);
+ e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort);
if(hipSuccess == e)
stream->locked_wait();
} else {
@@ -1976,12 +2006,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt
return ihipLogStatus(e);
}
-hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){
- HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
+hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){
+ HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream);
stream = ihipSyncAndResolveStream(stream);
if (stream) {
- return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort));
+ return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort));
} else {
return ihipLogStatus(hipErrorInvalidValue);
}
@@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) {
hipError_t hipHostFree(void* ptr) {
HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr);
- // Synchronize to ensure all work has finished.
- ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits
- // for all activity to finish.
-
-
- hipError_t hipStatus = hipErrorInvalidValue;
- if (ptr) {
- hc::accelerator acc;
-#if (__hcc_workweek__ >= 17332)
- hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
-#else
- hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
-#endif
- am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
- if (status == AM_SUCCESS) {
- if (amPointerInfo._hostPointer == ptr) {
- hc::am_free(ptr);
- hipStatus = hipSuccess;
- }
- }
- } else {
- // free NULL pointer succeeds and is common technique to initialize runtime
- hipStatus = hipSuccess;
- }
+ hipError_t hipStatus = hipSuccess;
+ hipStatus = hip_internal::ihipHostFree(tls, ptr);
return ihipLogStatus(hipStatus);
};
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index d98b98a378..c15cc34cd5 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -109,6 +109,7 @@ struct ihipModuleSymbol_t {
amd_kernel_code_t const* _header{};
string _name; // TODO - review for performance cost. Name is just used for debug.
vector> _kernarg_layout{};
+ bool _is_code_object_v3{};
};
template <>
@@ -137,7 +138,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
- hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
+ hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
+ void** impCoopParams = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -181,10 +183,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
return hipErrorInvalidValue;
}
- // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
+ // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);
+ if (impCoopParams) {
+ const auto p{static_cast(*impCoopParams)};
+ // The sixth index is for multi-grid synchronization
+ kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
+ p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
+ }
+
/*
Kernel argument preparation.
*/
@@ -208,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
- bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
- if (is_code_object_v3) {
+ if (f->_is_code_object_v3) {
const auto* header =
reinterpret_cast(f->_header);
aql.group_segment_size =
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
return ihipLogStatus(hipErrorLaunchFailure);
}
+ size_t impCoopArg = 1;
+ void* impCoopParams[1];
+ impCoopParams[0] = &impCoopArg;
+
// launch the main kernel
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x * blockDimX.x,
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
- nullptr, 0, true);
+ nullptr, 0, true, impCoopParams);
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
hipError_t result;
- if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
+ if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
return ihipLogStatus(hipErrorInvalidValue);
}
@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
kargs.getHandle());
}
+ mg_sync *mg_sync_ptr = 0;
+ mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
+
+ result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
+ if (result != hipSuccess) {
+ return ihipLogStatus(hipErrorInvalidValue);
+ }
+ mg_sync_ptr->w0 = 0;
+ mg_sync_ptr->w1 = 0;
+
+ uint all_sum = 0;
+ for (int i = 0; i < numDevices; ++i) {
+ result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
+ if (result != hipSuccess) {
+ hip_internal::ihipHostFree(tls, mg_sync_ptr);
+ for (int j = 0; j < i; ++j) {
+ hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+ }
+ return ihipLogStatus(hipErrorInvalidValue);
+ }
+ // calculate the sum of sizes of all grids
+ const hipLaunchParams& lp = launchParamsList[i];
+ all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
+ lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
+ }
+
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
for (int i = 0; i < numDevices; ++i) {
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
#endif
}
- // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
+ // launch the init_gws kernel to initialize the GWS for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
+ hip_internal::ihipHostFree(tls, mg_sync_ptr);
+ for (int j = 0; j < numDevices; ++j) {
+ hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+ }
+
return ihipLogStatus(hipErrorLaunchFailure);
}
+ }
+
+ void* impCoopParams[1];
+ ulong prev_sum = 0;
+ // launch the main kernels for each device
+ for (int i = 0; i < numDevices; ++i) {
+ const hipLaunchParams& lp = launchParamsList[i];
+
+ //initialize and setup the implicit kernel argument for multi-grid sync
+ mg_info_ptr[i]->mgs = mg_sync_ptr;
+ mg_info_ptr[i]->grid_id = i;
+ mg_info_ptr[i]->num_grids = numDevices;
+ mg_info_ptr[i]->all_sum = all_sum;
+ mg_info_ptr[i]->prev_sum = prev_sum;
+ prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
+ lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
+
+
+ impCoopParams[0] = &mg_info_ptr[i];
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x * lp.blockDim.x,
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
- true);
+ true, impCoopParams);
+
+ if (result != hipSuccess) {
+ for (int j = 0; j < numDevices; ++j) {
+ launchParamsList[j].stream->criticalData().unlock();
+#if (__hcc_workweek__ >= 19213)
+ launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
+#endif
+ }
+ hip_internal::ihipHostFree(tls, mg_sync_ptr);
+ for (int j = 0; j < numDevices; ++j) {
+ hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+ }
+
+ return ihipLogStatus(hipErrorLaunchFailure);
+ }
+
}
// unlock all streams
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
free(gwsKds);
free(kds);
+ hip_internal::ihipHostFree(tls, mg_sync_ptr);
+ for (int j = 0; j < numDevices; ++j) {
+ hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+ }
+
return ihipLogStatus(result);
}
@@ -977,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
- bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
- if (is_code_object_v3) {
+ if (kd._is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
- } else {
- r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
- r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
- }
- r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
- if (is_code_object_v3) {
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
- } else {
- r.numRegs = kd._header->workitem_vgpr_count;
- }
- r.maxThreadsPerBlock = r.numRegs ?
- std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
- prop.maxThreadsPerBlock;
- if (is_code_object_v3) {
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
} else {
+ r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
+ r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
+ r.numRegs = kd._header->workitem_vgpr_count;
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
+ r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
+ r.maxThreadsPerBlock = r.numRegs ?
+ std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
+ prop.maxThreadsPerBlock;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
@@ -1099,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima
content.data(), content.size(), (*module)->executable,
this_agent());
- std::vector blob(content.cbegin(), content.cend());
- program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
+ program_state_impl::read_kernarg_metadata(content, (*module)->kernargs);
// compute the hash of the code object
(*module)->hash = checksum(content.length(), content.data());
@@ -1152,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
- bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
- if (is_code_object_v3) {
+ if (f->_is_code_object_v3) {
const auto header = reinterpret_cast(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
@@ -1174,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size
}
}
-hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
- hipFunction_t f, size_t dynSharedMemPerBlk,
- uint32_t blockSizeLimit)
+hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize,
+ hipFunction_t f, size_t dynamicSMemSize,
+ int blockSizeLimit)
{
using namespace hip_impl;
@@ -1257,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
}
else {
size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor;
- size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk);
+ size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize);
wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG;
}
@@ -1286,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
return hipSuccess;
}
-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
- hipFunction_t f, size_t dynSharedMemPerBlk,
- uint32_t blockSizeLimit)
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ const void* f, size_t dynamicSMemSize,
+ int blockSizeLimit)
{
- HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
-
+ HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit);
+ auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+ hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
- gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
+ gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit));
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
- TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+ TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize)
{
using namespace hip_impl;
@@ -1326,45 +1401,50 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
- size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
+ size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
+ : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
- size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd
+ size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
- *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
+ *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
- size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
+ size_t total_used_lds = usedLDS + dynamicSMemSize;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
- *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+ *numBlocks = std::min(*numBlocks, (int) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
- uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+ int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize)
{
- HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
+ HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize);
+ auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+ hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
- tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+ tls, numBlocks, F, blockSize, dynamicSMemSize));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
- uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
+ int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize,
unsigned int flags)
{
- HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
+ HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags);
+ auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+ hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
- tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+ tls, numBlocks, F, blockSize, dynamicSMemSize));
}
hipError_t hipLaunchKernel(
diff --git a/src/program_state.inl b/src/program_state.inl
index 4f05d2763e..9feabbc2f7 100644
--- a/src/program_state.inl
+++ b/src/program_state.inl
@@ -89,9 +89,10 @@ struct Symbol {
class Kernel_descriptor {
std::uint64_t kernel_object_{};
- amd_kernel_code_t const* kernel_header_{nullptr};
- std::string name_{};
+ amd_kernel_code_t const* header_{};
+ std::string name_;
std::vector> kernarg_layout_{};
+ bool is_code_object_v3_{};
public:
Kernel_descriptor() = default;
Kernel_descriptor(
@@ -101,7 +102,8 @@ public:
:
kernel_object_{kernel_object},
name_{name},
- kernarg_layout_{std::move(kernarg_layout)}
+ kernarg_layout_{std::move(kernarg_layout)},
+ is_code_object_v3_{name.find(".kd") != std::string::npos}
{
bool supported{false};
std::uint16_t min_v{UINT16_MAX};
@@ -123,7 +125,7 @@ public:
r = tbl.hsa_ven_amd_loader_query_host_address(
reinterpret_cast(kernel_object_),
- reinterpret_cast(&kernel_header_));
+ reinterpret_cast(&header_));
if (r != HSA_STATUS_SUCCESS) return;
}
@@ -149,7 +151,7 @@ public:
std::string,
std::unordered_map<
hsa_isa_t,
- std::vector>>>> code_object_blobs;
+ std::vector>>> code_object_blobs;
std::pair<
std::once_flag,
@@ -213,7 +215,7 @@ public:
std::string,
std::unordered_map<
hsa_isa_t,
- std::vector>>>& get_code_object_blobs() {
+ std::vector>>& get_code_object_blobs() {
std::call_once(code_object_blobs.first, [this]() {
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
@@ -584,6 +586,68 @@ public:
return functions[agent].second;
}
+ static
+ std::size_t parse_args_v2(
+ const std::string& metadata,
+ std::size_t f,
+ std::size_t l,
+ std::vector>& size_align) {
+ if (f == l) return f;
+ if (!size_align.empty()) return l;
+
+ do {
+ static constexpr size_t size_sz{5};
+ f = metadata.find("Size:", f) + size_sz;
+
+ if (l <= f) return f;
+
+ auto size = std::strtoul(&metadata[f], nullptr, 10);
+
+ static constexpr size_t align_sz{6};
+ f = metadata.find("Align:", f) + align_sz;
+
+ char* l{};
+ auto align = std::strtoul(&metadata[f], &l, 10);
+
+ f += (l - &metadata[f]) + 1;
+
+ size_align.emplace_back(size, align);
+ } while (true);
+ }
+
+ static
+ void read_kernarg_metadata_v2(
+ const std::string& kernels_md,
+ std::size_t dx,
+ std::unordered_map<
+ std::string,
+ std::vector>>& kernargs) {
+ do {
+ dx = kernels_md.find("Name:", dx);
+
+ if (dx == std::string::npos) break;
+
+ static constexpr decltype(kernels_md.size()) name_sz{5};
+ dx = kernels_md.find_first_not_of(" '", dx + name_sz);
+
+ auto fn =
+ kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx);
+ dx += fn.size();
+
+ auto dx1 = kernels_md.find("CodeProps", dx);
+ dx = kernels_md.find("Args:", dx);
+
+ if (dx1 < dx) {
+ dx = dx1;
+ continue;
+ }
+ if (dx == std::string::npos) break;
+
+ static constexpr decltype(kernels_md.size()) args_sz{5};
+ dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]);
+ } while (true);
+ }
+
static
std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
std::string str;
@@ -598,9 +662,8 @@ public:
}
static
- void parse_args(
+ void parse_args_v3(
const amd_comgr_metadata_node_t& args_md,
- bool is_code_object_v3,
std::vector>& size_align) {
size_t arg_count = 0;
if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
@@ -615,9 +678,7 @@ public:
return;
amd_comgr_metadata_node_t arg_size_md;
- if (amd_comgr_metadata_lookup(arg_md,
- is_code_object_v3 ? ".size" : "Size",
- &arg_size_md)
+ if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
@@ -629,35 +690,21 @@ public:
size_t arg_align;
- if (is_code_object_v3) {
- amd_comgr_metadata_node_t arg_offset_md;
- if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
- != AMD_COMGR_STATUS_SUCCESS)
- return;
+ amd_comgr_metadata_node_t arg_offset_md;
+ if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
+ != AMD_COMGR_STATUS_SUCCESS)
+ return;
- size_t arg_offset
- = std::stoul(metadata_to_string(arg_offset_md));
+ size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md));
- if (amd_comgr_destroy_metadata(arg_offset_md)
- != AMD_COMGR_STATUS_SUCCESS)
- return;
+ if (amd_comgr_destroy_metadata(arg_offset_md)
+ != AMD_COMGR_STATUS_SUCCESS)
+ return;
- arg_align = 1;
- while (arg_offset && (arg_offset & 1) == 0) {
- arg_offset >>= 1;
- arg_align <<= 1;
- }
- } else {
- amd_comgr_metadata_node_t arg_align_md;
- if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
- != AMD_COMGR_STATUS_SUCCESS)
- return;
-
- arg_align = std::stoul(metadata_to_string(arg_align_md));
-
- if (amd_comgr_destroy_metadata(arg_align_md)
- != AMD_COMGR_STATUS_SUCCESS)
- return;
+ arg_align = 1;
+ while (arg_offset && (arg_offset & 1) == 0) {
+ arg_offset >>= 1;
+ arg_align <<= 1;
}
size_align.emplace_back(arg_size, arg_align);
@@ -669,11 +716,11 @@ public:
}
static
- void read_kernarg_metadata(
- const std::vector& blob,
+ void read_kernarg_metadata_v3(
+ const std::string& blob,
std::unordered_map<
- std::string,
- std::vector>>& kernargs) {
+ std::string,
+ std::vector>>& kernargs) {
amd_comgr_data_t dataIn;
amd_comgr_status_t status;
@@ -690,7 +737,6 @@ public:
!= AMD_COMGR_STATUS_SUCCESS)
return;
- bool is_code_object_v3 = false;
amd_comgr_metadata_node_t kernels_md;
if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
!= AMD_COMGR_STATUS_SUCCESS) {
@@ -699,7 +745,6 @@ public:
&kernels_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
- is_code_object_v3 = true;
}
size_t kernel_count = 0;
@@ -715,9 +760,7 @@ public:
continue;
amd_comgr_metadata_node_t name_md;
- if (amd_comgr_metadata_lookup(kernel_md,
- is_code_object_v3 ? ".name" : "Name",
- &name_md)
+ if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
@@ -727,21 +770,15 @@ public:
!= AMD_COMGR_STATUS_SUCCESS)
continue;
- if (is_code_object_v3)
- kernel_name_str.append(".kd");
-
-
amd_comgr_metadata_node_t args_md;
- if (amd_comgr_metadata_lookup(kernel_md,
- is_code_object_v3 ? ".args" : "Args",
- &args_md)
+ if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
auto foundKernel = kernargs.find(kernel_name_str);
// parse arguments for a given kernel only once
if (foundKernel == kernargs.end()) {
- parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
+ parse_args_v3(args_md, kernargs[kernel_name_str]);
}
if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
@@ -757,7 +794,52 @@ public:
amd_comgr_release_data(dataIn);
}
- const std::unordered_map>>& kernargs)
+ {
+ std::istringstream istr{blob};
+ ELFIO::elfio reader;
+
+ if (!reader.load(istr)) return;
+
+ // TODO: this is inefficient.
+ auto it = find_section_if(reader, [](const ELFIO::section* x) {
+ return x->get_type() == SHT_NOTE;
+ });
+
+ if (!it) return;
+
+ const ELFIO::note_section_accessor acc{reader, it};
+ auto n{acc.get_notes_num()};
+ while (n--) {
+ ELFIO::Elf_Word type{};
+ std::string name{};
+ void* desc{};
+ ELFIO::Elf_Word desc_size{};
+
+ acc.get_note(n, type, name, desc, desc_size);
+
+ if (name == "AMDGPU") {
+ return read_kernarg_metadata_v3(blob, kernargs);
+ }
+ if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
+
+ std::string tmp{
+ static_cast(desc), static_cast(desc) + desc_size};
+
+ auto dx = tmp.find("Kernels:");
+
+ if (dx == std::string::npos) continue;
+
+ return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:".
+ }
+ }
+
+ const std::unordered_map>>& get_kernargs() {
std::call_once(kernargs.first, [this]() {
diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg
index 1d092a4327..ea496e6a69 100644
--- a/tests/hipify-clang/lit.cfg
+++ b/tests/hipify-clang/lit.cfg
@@ -12,18 +12,33 @@ import lit.util
site_cfg = lit_config.params.get('site_config', None)
lit_config.load_config(config, site_cfg)
-print(str("========================================"))
+config.excludes = ['cmdparser.hpp']
+config.excludes.append('spatial_batch_norm_op.h')
+config.excludes.append('common_cudnn.h')
+
+delimiter = "===============================================================";
+print(delimiter)
print("CUDA " + config.cuda_version + " - will be used for testing")
print("LLVM " + config.llvm_version + " - will be used for testing")
print(platform.machine() + " - Platform architecture")
print(platform.system() + " " + platform.release() + " - Platform OS")
print(str(config.pointer_size * 8) + " - hipify-clang binary bitness")
print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness")
-print(str("========================================"))
-
-config.excludes = ['cmdparser.hpp']
-config.excludes.append('spatial_batch_norm_op.h')
-config.excludes.append('common_cudnn.h')
+print(delimiter)
+warns = None
+if not config.cuda_dnn_root:
+ config.excludes.append('cudnn_convolution_forward.cu')
+ config.excludes.append('cudnn_softmax.cu')
+ print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR")
+ warns = True
+if not config.cuda_cub_root:
+ config.excludes.append('cub_01.cu')
+ config.excludes.append('cub_02.cu')
+ config.excludes.append('cub_03.cu')
+ print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR")
+ warns = True
+if warns:
+ print(delimiter)
if config.cuda_version_major == 7 and config.cuda_version_minor == 0:
config.excludes.append('headers_test_09.cu')
@@ -90,17 +105,47 @@ else:
run_test_ext = ".sh"
# CUDA SDK ROOT
clang_arguments += " -isystem'%s'/samples/common/inc"
-# cuDNN ROOT
-clang_arguments += " -I'%s'/include"
if config.pointer_size == 8:
clang_arguments += " -D__LP64__"
+# Set max clang's CudaArch for corresponding CUDA version
+# to support maximum CUDA features in offline tests
+if config.cuda_version_major == 7:
+ if config.cuda_version_minor == 5:
+ clang_arguments += " --cuda-gpu-arch=sm_53"
+ else:
+ clang_arguments += " --cuda-gpu-arch=sm_52"
+elif config.cuda_version_major == 8:
+ clang_arguments += " --cuda-gpu-arch=sm_62"
+elif config.cuda_version_major == 9:
+ if config.cuda_version_minor == 2:
+ clang_arguments += " --cuda-gpu-arch=sm_72"
+ else:
+ clang_arguments += " --cuda-gpu-arch=sm_70"
+elif config.cuda_version_major == 10:
+ clang_arguments += " --cuda-gpu-arch=sm_75"
+
+# cuDNN ROOT
+if config.cuda_dnn_root:
+ clang_arguments += " -I'%s'/include"
# CUB ROOT
-clang_arguments += " -I'%s'"
+if config.cuda_cub_root:
+ clang_arguments += " -I'%s'"
-hipify_arguments = "--cuda-path='%s'"
+if config.cuda_dnn_root and config.cuda_cub_root:
+ config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
+elif config.cuda_dnn_root:
+ config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root)))
+elif config.cuda_cub_root:
+ config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root)))
+else:
+ config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root))
-config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
-config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root)))
+if config.llvm_version_major < 4:
+ hipify_arguments = "-I'%s'/include"
+else:
+ hipify_arguments = "--cuda-path='%s'"
+
+config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root))
config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
diff --git a/tests/hipify-clang/unit_tests/device/atomics.cu b/tests/hipify-clang/unit_tests/device/atomics.cu
index 1afd1ab541..3089efe1b8 100644
--- a/tests/hipify-clang/unit_tests/device/atomics.cu
+++ b/tests/hipify-clang/unit_tests/device/atomics.cu
@@ -276,7 +276,9 @@ int main(int argc, char** argv) {
runTest();
runTest();
runTest();
+#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
runTest();
+#endif
// CHECK: hipDeviceReset();
cudaDeviceReset();
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu
new file mode 100644
index 0000000000..21898baa03
--- /dev/null
+++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu
@@ -0,0 +1,69 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
+// CHECK: #include
+#include
+// CHECK: #include
+#include
+// CHECK: #include
+#include
+
+#include
+
+template
+__global__ void sort(const T* data_in, T* data_out){
+ // CHECK: typedef hipcub::BlockLoad BlockLoadT;
+ typedef cub::BlockLoad BlockLoadT;
+ // CHECK: typedef hipcub::BlockRadixSort BlockRadixSortT;
+ typedef cub::BlockRadixSort BlockRadixSortT;
+ // CHECK: typedef hipcub::BlockStore BlockStoreT;
+ typedef cub::BlockStore BlockStoreT;
+ __shared__ union {
+ typename BlockLoadT::TempStorage load;
+ typename BlockRadixSortT::TempStorage sort;
+ typename BlockStoreT::TempStorage store;
+ } tmp_storage;
+ T items[ITEMS_PER_THREAD];
+ BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
+ __syncthreads();
+ BlockRadixSortT(tmp_storage.sort).Sort(items);
+ __syncthreads();
+ BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
+}
+
+int main() {
+ double* d_gpu = NULL;
+ double* result_gpu = NULL;
+ double* data_sorted = new double[1000*4096];
+ // Allocate memory on the GPU
+ // CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double));
+ cudaMalloc(&d_gpu, 1000*4096 * sizeof(double));
+ // CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double));
+ cudaMalloc(&result_gpu, 1000*4096 * sizeof(double));
+ // CHECK: hiprandGenerator_t gen;
+ curandGenerator_t gen;
+ // Create generator
+ // CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
+ curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+ // Fill array with random numbers
+ // CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
+ curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
+ // Destroy generator
+ // CHECK: hiprandDestroyGenerator(gen);
+ curandDestroyGenerator(gen);
+ // Sort data
+ // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu);
+ sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu);
+ // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu);
+ sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu);
+ // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost);
+ cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost);
+ // Write the sorted data to standard out
+ for (int i = 0; i < 4095; ++i) {
+ std::cout << data_sorted[i] << ", ";
+ }
+ std::cout << data_sorted[4095] << std::endl;
+}
diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu
new file mode 100644
index 0000000000..bc914d419d
--- /dev/null
+++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu
@@ -0,0 +1,33 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
+// CHECK: #include
+#include
+// CHECK: #include
+#include
+
+// using namespace hipcub;
+using namespace cub;
+
+// Simple CUDA kernel for computing tiled partial sums
+template
+ cub::BlockScanAlgorithm SCAN_ALGO>
+__global__ void ScanTilesKernel(int *d_in, int *d_out) {
+ // Specialize collective types for problem context
+ // CHECK: typedef ::hipcub::BlockLoad BlockLoadT;
+ typedef ::cub::BlockLoad BlockLoadT;
+ typedef BlockScan BlockScanT;
+ // Allocate on-chip temporary storage
+ __shared__ union {
+ typename BlockLoadT::TempStorage load;
+ typename BlockScanT::TempStorage reduce;
+ } temp_storage;
+ // Load data per thread
+ int thread_data[ITEMS_PER_THREAD];
+ int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
+ BlockLoadT(temp_storage.load).Load(d_in + offset, offset);
+ __syncthreads();
+ // Compute the block-wide prefix sum
+ BlockScanT(temp_storage).Sum(thread_data);
+}
diff --git a/tests/hipify-clang/unit_tests/samples/vec_add.cu b/tests/hipify-clang/unit_tests/samples/vec_add.cu
index bc8219bf8c..a6d8950e0b 100644
--- a/tests/hipify-clang/unit_tests/samples/vec_add.cu
+++ b/tests/hipify-clang/unit_tests/samples/vec_add.cu
@@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank)
cudaError_t cudareturn;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpudevice);
- // CHECK: if (deviceProp.hipWarpSize <= 1)
if (deviceProp.warpSize <= 1)
{
printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp
index dc064da189..b1b0e8334a 100644
--- a/tests/src/deviceLib/hipMathFunctions.cpp
+++ b/tests/src/deviceLib/hipMathFunctions.cpp
@@ -29,7 +29,12 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__ || \
+ __HIP_ARCH_GFX900__ || \
+ __HIP_ARCH_GFX906__ || \
+ __HIP_ARCH_GFX908__ || \
+ __HIP_ARCH_GFX1010__ || \
+ __HIP_ARCH_GFX1012__
__global__ void kernel_abs_int64(long long *input, long long *output) {
int tx = threadIdx.x;
diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
index b78e1d8c63..751d44e242 100644
--- a/tests/src/deviceLib/hipTestHalf.cpp
+++ b/tests/src/deviceLib/hipTestHalf.cpp
@@ -28,7 +28,12 @@ THE SOFTWARE.
#include "test_common.h"
-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__ || \
+ __HIP_ARCH_GFX900__ || \
+ __HIP_ARCH_GFX906__ || \
+ __HIP_ARCH_GFX908__ || \
+ __HIP_ARCH_GFX1010__ || \
+ __HIP_ARCH_GFX1012__
__device__ void test_convert() {
__half x;
diff --git a/tests/src/deviceLib/hipTestNativeHalf.cpp b/tests/src/deviceLib/hipTestNativeHalf.cpp
index 55213d446d..8639127c7c 100644
--- a/tests/src/deviceLib/hipTestNativeHalf.cpp
+++ b/tests/src/deviceLib/hipTestNativeHalf.cpp
@@ -32,7 +32,12 @@ THE SOFTWARE.
using namespace std;
-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__ || \
+ __HIP_ARCH_GFX900__ || \
+ __HIP_ARCH_GFX906__ || \
+ __HIP_ARCH_GFX908__ || \
+ __HIP_ARCH_GFX1010__ || \
+ __HIP_ARCH_GFX1012__
__global__
void __halfTest(bool* result, __half a) {
diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
index 8ce02d6164..4aacfa866d 100644
--- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
@@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) {
HIPCHECK(hipHostFree(A_h1));
HIPCHECK(hipHostFree(A_h2));
} else {
- free(A_h1);
- free(A_h2);
+ aligned_free(A_h1);
+ aligned_free(A_h2);
}
}
diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp
index b3bcf42222..2eb62a859f 100644
--- a/tests/src/runtimeApi/memory/hipMemset2D.cpp
+++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp
@@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice)
char *A_d;
char *A_h;
bool testResult = true;
- HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) );
+ HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16));
A_h = (char*)malloc(sizeElements);
HIPASSERT(A_h != NULL);
for (size_t i=0; i(&numBlocks),
+ hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
index ebf656b72f..2838c09cd1 100644
--- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
+++ b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp
@@ -30,10 +30,6 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
__global__ void f1(float *a) { *a = 1.0; }
template
@@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; }
int main(int argc, char* argv[]) {
// test case for using kernel function pointer
- uint32_t gridSize = 0;
- uint32_t blockSize = 0;
+ int gridSize = 0;
+ int blockSize = 0;
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
- uint32_t numBlock = 0;
+ int numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
assert(numBlock != 0);
-
// test case for using kernel function pointer with template
gridSize = 0;
blockSize = 0;
@@ -64,15 +59,5 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0);
assert(numBlock != 0);
-
- // test case for using kernel with hipFunction_t type
- numBlock = 0;
- hipModule_t Module;
- hipFunction_t Function;
- HIPCHECK(hipModuleLoad(&Module, fileName));
- HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
- HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
- assert(numBlock != 0);
-
passed();
}
diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp
index a81862952d..22a3f05283 100644
--- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp
+++ b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp
@@ -30,22 +30,16 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
__global__ void f1(float *a) { *a = 1.0; }
template
__global__ void f2(T *a) { *a = 1; }
-
-
int main(int argc, char* argv[]) {
// test case for using kernel function pointer
- uint32_t gridSize = 0;
- uint32_t blockSize = 0;
+ int gridSize = 0;
+ int blockSize = 0;
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
@@ -55,15 +49,5 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0);
assert(gridSize != 0 && blockSize != 0);
- // test case for using kernel with hipFunction_t type
- gridSize = 0;
- blockSize = 0;
- hipModule_t Module;
- hipFunction_t Function;
- HIPCHECK(hipModuleLoad(&Module, fileName));
- HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
- HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0));
- assert(gridSize != 0 && blockSize != 0);
-
passed();
}
diff --git a/tests/src/test_common.h b/tests/src/test_common.h
index 73a952b0d1..67a8e5e60a 100644
--- a/tests/src/test_common.h
+++ b/tests/src/test_common.h
@@ -99,10 +99,13 @@ THE SOFTWARE.
#ifdef _WIN64
#include
-#define aligned_alloc _aligned_malloc
+#define aligned_alloc(x,y) _aligned_malloc(y,x)
+#define aligned_free(x) _aligned_free(x)
#define popen(x,y) _popen(x,y)
#define pclose(x) _pclose(x)
#define setenv(x,y,z) _putenv_s(x,y)
+#else
+#define aligned_free(x) free(x)
#endif
// standard command-line variables: