Этот коммит содержится в:
Anusha Godavarthy Surya
2019-10-25 15:52:09 +05:30
родитель 259d8b4cdf 70f2cd1317
Коммит 5f47e99ffe
40 изменённых файлов: 1341 добавлений и 690 удалений
+15 -2
Просмотреть файл
@@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
target_link_libraries(hip_hcc PRIVATE hc_am)
target_link_libraries(hip_hcc_static PRIVATE hc_am)
add_library(hiprtc SHARED src/hiprtc.cpp)
add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
target_include_directories(
hiprtc SYSTEM
PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include)
@@ -504,7 +504,7 @@ endif()
find_program(CPPCHECK_EXE cppcheck)
if(CPPCHECK_EXE)
add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
#############################
@@ -545,4 +545,17 @@ else()
message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
endif()
#############################
# Code analysis
#############################
# Target: clang
# Static-analysis target: runs hipcc with --analyze over the sources under
# src/, from HIP_SRC_PATH. Only defined when HIP_HIPCC_EXECUTABLE is set.
# NOTE(review): src/*.cpp is passed to the command as written; presumably the
# build shell expands the glob -- confirm for generators that do not run
# commands through a shell.
# NOTE(review): /opt/rocm/include is hard-coded here -- confirm this is meant
# to ignore a non-default ROCm install location.
if(HIP_HIPCC_EXECUTABLE)
    add_custom_target(
        analyze
        COMMAND
            ${HIP_HIPCC_EXECUTABLE}
            -fvisibility=hidden
            -fvisibility-inlines-hidden
            --analyze
            --analyzer-outputtext
            -isystem /opt/rocm/include
            ${HIP_HCC_BUILD_FLAGS}
            -Wno-unused-command-line-argument
            -I/opt/rocm/include
            -c src/*.cpp
            -Iinclude/
            -I./
        WORKING_DIRECTORY ${HIP_SRC_PATH})

    # When cppcheck was found, make the cppcheck target a dependency of analyze.
    if(CPPCHECK_EXE)
        add_dependencies(analyze cppcheck)
    endif()
endif()
# vim: ts=4:sw=4:expandtab:smartindent
+9 -23
Просмотреть файл
@@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") {
if ($HIP_RUNTIME eq "HCC" ) {
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
$HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
} else {
$HIPCXXFLAGS .= " -fhip-new-launch-api";
}
} elsif ($HIP_PLATFORM eq "hcc") {
@@ -352,7 +354,6 @@ my $runCmd = 1;
my $buildDeps = 0;
my $linkType = 1;
my $setLinkType = 0;
my $coFormatv3 = 1;
my @options = ();
my @inputs = ();
@@ -472,22 +473,6 @@ foreach $arg (@ARGV)
$swallowArg = 1;
}
# code object format parsing
if ($trimarg eq '-mcode-object-v3') {
$coFormatv3 = 1;
# hip-clang already recognizes -mcode-object-v3, so we just pass it on
if ($HIP_PLATFORM eq 'hcc') {
$swallowArg = 1;
}
}
if ($trimarg eq '-mno-code-object-v3') {
$coFormatv3 = 0;
# hip-clang already recognizes -mno-code-object-v3, so we just pass it on
if ($HIP_PLATFORM eq 'hcc') {
$swallowArg = 1;
}
}
if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) {
$arg = "--cuda-device-only";
}
@@ -526,6 +511,13 @@ foreach $arg (@ARGV)
$optArg = $arg;
}
## This is a temporary workaround for CMake detection of OpenMP support.
## It should be removed when the OpenMP detection c++ test in CMake is updated
## and corrected CMake version is available.
if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) {
$HIPCXXFLAGS .= " -D_OPENMP "
}
## process linker response file for hip-clang
## extract object files from static library and pass them directly to
## hip-clang in command line.
@@ -863,12 +855,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
}
}
# hcc defaults to v2, so we need to convert to the appropriate flag
# hip-clang defaults to v3, so we don't need to do anything
if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
$HIPLDFLAGS .= " -Wl,-hcc-cov3 ";
}
if ($hasC and $HIP_PLATFORM eq 'nvcc') {
$HIPCXXFLAGS .= " -x cu";
}
+7 -1
Просмотреть файл
@@ -983,7 +983,6 @@ sub simpleSubstitutions {
$ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g;
$ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g;
$ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g;
$ft{'type'} += s/\bwarpSize\b/hipWarpSize/g;
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g;
$ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g;
$ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g;
@@ -1676,6 +1675,12 @@ sub transformKernelLaunch {
}
}
# Rewrites `using namespace cub` directives in $_ to `using namespace hipcub`.
# Returns the number of substitutions performed (0 when nothing matched).
sub transformCubNamespace {
    my $k = 0;
    # \b on both ends keeps the match to the whole identifier `cub`, so that
    # e.g. `using namespace cubic;` is left alone instead of being turned
    # into `using namespace hipcubic;`.
    $k += s/\busing\s*namespace\s*cub\b/using namespace hipcub/g;
    return $k;
}
sub transformHostFunctions {
my $k = 0;
foreach $func (
@@ -2368,6 +2373,7 @@ while (@ARGV) {
simpleSubstitutions();
transformExternShared();
transformKernelLaunch();
transformCubNamespace();
if ($print_stats) {
while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) {
$convertedTags{$1}++;
+1 -2
Просмотреть файл
@@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording
#### How to enable profiling at HIP build time
Recent pre-built packages of HIP are always built with profiling support enabled.
For developer builds, you must enable marker support manually when compiling HIP.
Pre-built packages of HIP are not built with profiling support enabled. You must enable marker support manually when compiling HIP.
1. Build HIP with ATP markers enabled
HIP pre-built packages are enabled with ATP marker support by default.
+2 -2
Просмотреть файл
@@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map):
# Generating the callbacks data structure
f.write('\n// HIP API callbacks data structure\n')
f.write(
'struct hip_api_data_t {\n' +
'typedef struct hip_api_data_t {\n' +
' uint64_t correlation_id;\n' +
' uint32_t phase;\n' +
' union {\n'
@@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map):
f.write(' } ' + name + ';\n')
f.write(
' } args;\n' +
'};\n'
'} hip_api_data_t;\n'
)
# Generating the callbacks args data filling macros
+35 -8
Просмотреть файл
@@ -1,8 +1,4 @@
if (CUDA_VERSION VERSION_GREATER "9.2")
cmake_minimum_required(VERSION 3.12.3)
else()
cmake_minimum_required(VERSION 3.7.2)
endif()
cmake_minimum_required(VERSION 3.5.1)
project(hipify-clang)
@@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE
LLVMOption
LLVMCore)
if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7")
if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1")
target_link_libraries(hipify-clang PRIVATE clangToolingInclusions)
endif()
@@ -72,7 +68,7 @@ else()
set(StdCpp "-std=c++")
endif()
if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0")
if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0")
string(APPEND StdCpp "14")
# MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest
elseif (MSVC)
@@ -84,7 +80,38 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"")
install(TARGETS hipify-clang DESTINATION bin)
# Install location for hipify-clang.
#
# When the install prefix is still CMake's default, it is replaced according
# to CMAKE_BUILD_TYPE:
#   Debug   -> <source dir>/bin
#   Release -> BIN_INSTALL_DIR if set, otherwise <project binary dir>/bin
#   other   -> configuration fails
# When a prefix was given explicitly, BIN_INSTALL_DIR (if set) still wins.
# NOTE(review): an empty CMAKE_BUILD_TYPE takes the "other" branch -- confirm
# this is acceptable for multi-config generators.
set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang")

if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
    if(CMAKE_BUILD_TYPE MATCHES Debug)
        set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin"
            CACHE PATH "${INSTALL_PATH_DOC_STRING}" FORCE)
    elseif(CMAKE_BUILD_TYPE MATCHES Release)
        if(NOT BIN_INSTALL_DIR)
            set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin"
                CACHE PATH "${INSTALL_PATH_DOC_STRING}" FORCE)
        else()
            set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}"
                CACHE PATH "${INSTALL_PATH_DOC_STRING}" FORCE)
        endif()
    else()
        message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
    endif()
elseif(BIN_INSTALL_DIR)
    set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}"
        CACHE PATH "${INSTALL_PATH_DOC_STRING}" FORCE)
endif()

# The executable goes directly into the chosen prefix (no bin/ subdirectory).
install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX})

# Copy the clang resource directory of the LLVM being built against next to
# the executable: headers, module maps and the listed extension-less wrapper
# headers; the ppc_wrappers and openmp_wrappers subdirectories are left out.
install(
    DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/
    DESTINATION ${CMAKE_INSTALL_PREFIX}
    COMPONENT clang-resource-headers
    FILES_MATCHING
        PATTERN "*.h"
        PATTERN "*.modulemap"
        PATTERN "algorithm"
        PATTERN "complex"
        PATTERN "new"
        PATTERN "ppc_wrappers" EXCLUDE
        PATTERN "openmp_wrappers" EXCLUDE
)
if (HIPIFY_CLANG_TESTS)
find_package(PythonInterp 2.7 REQUIRED)
+79 -78
Просмотреть файл
@@ -63,7 +63,7 @@
In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
## <a name="build-and-install"></a> Build and install
@@ -154,7 +154,7 @@ To run it:
* Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option:
- Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB`
- Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`
- Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
@@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38
Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38
Build system for the above configurations:
Minimum build system requirements for the above configurations:
Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min).
Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.
Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:
@@ -208,7 +208,7 @@ cmake
-DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \
-DCUDA_CUB_ROOT_DIR=/srv/CUB \
-DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \
..
```
@@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS
64 - hipify-clang binary bitness
64 - python 2.7.12 binary bitness
========================================
-- Testing: 64 tests, 12 threads --
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64)
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64)
PASS: hipify :: unit_tests/device/atomics.cu (3 of 64)
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64)
PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64)
PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64)
PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64)
PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64)
PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64)
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64)
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64)
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64)
PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64)
PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64)
PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64)
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64)
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64)
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64)
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64)
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64)
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64)
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64)
PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64)
PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64)
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64)
PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64)
PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64)
PASS: hipify :: unit_tests/samples/intro.cu (61 of 64)
PASS: hipify :: unit_tests/samples/square.cu (62 of 64)
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64)
PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64)
Testing Time: 2.98s
Expected Passes : 64
-- Testing: 65 tests, 12 threads --
PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65)
PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65)
PASS: hipify :: unit_tests/device/atomics.cu (3 of 65)
PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65)
PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65)
PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65)
PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65)
PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65)
PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65)
PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65)
PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65)
PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65)
PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65)
PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65)
PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65)
PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65)
PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65)
PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65)
PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65)
PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65)
PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65)
PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65)
PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65)
PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65)
PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65)
PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65)
PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65)
PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65)
PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65)
PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65)
PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65)
PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65)
PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65)
PASS: hipify :: unit_tests/samples/intro.cu (61 of 65)
PASS: hipify :: unit_tests/samples/square.cu (62 of 65)
PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65)
PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65)
PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (65 of 65)
Testing Time: 3.01s
Expected Passes : 65
[100%] Built target test-hipify
```
### <a name="windows"></a> Windows
On Windows 10 the following configurations are tested:
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18
LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38
LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38
LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38
LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38
Build system for the above configurations:
Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2:
Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4).
Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5).
Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
@@ -371,7 +372,7 @@ cmake
-- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm
-- - Include path : F:/LLVM/9.0.0/dist/include
-- - Binary path : F:/LLVM/9.0.0/dist/bin
-- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6")
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6")
-- Found lit: C:/Program Files/Python36/Scripts/lit.exe
-- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
+28
Просмотреть файл
@@ -0,0 +1,28 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "CUDA2HIP.h"
// Maps CUDA CUB identifiers to their hipCUB counterparts.
// The single entry renames the `cub` namespace identifier to `hipcub`; it is
// registered with conversion type CONV_TYPE under the API_CUB API category.
// The empty second string is presumably the ROC-specific name (none for CUB) -- TODO confirm
// against the hipCounter declaration.
const std::map<llvm::StringRef, hipCounter> CUDA_CUB_TYPE_NAME_MAP{
{"cub", {"hipcub", "", CONV_TYPE, API_CUB}},
};
+7
Просмотреть файл
@@ -251,6 +251,11 @@ namespace perl {
*streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl;
}
// Writes the Perl subroutine `transformCubNamespace` to the output stream.
// The emitted sub rewrites `using namespace cub` to `using namespace hipcub`
// and returns the number of substitutions.
void generateCubNamespace(unique_ptr<ostream>& streamPtr) {
  ostream &out = *streamPtr;
  out << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl;
  // \b on both ends restricts the match to the whole identifier `cub`, so that
  // e.g. `using namespace cubic;` is not rewritten to `using namespace hipcubic;`.
  out << tab << "$k += s/\\busing\\s*namespace\\s*cub\\b/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl;
}
void generateHostFunctions(unique_ptr<ostream>& streamPtr) {
*streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl;
set<string> &funcSet = DeviceSymbolFunctions0;
@@ -358,6 +363,7 @@ namespace perl {
generateSimpleSubstitutions(streamPtr);
generateExternShared(streamPtr);
generateKernelLaunch(streamPtr);
generateCubNamespace(streamPtr);
generateHostFunctions(streamPtr);
generateDeviceFunctions(streamPtr);
*streamPtr.get() << endl << "# Count of transforms in all files" << endl;
@@ -401,6 +407,7 @@ namespace perl {
*streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl;
*streamPtr.get() << tab_2 << "transformExternShared();" << endl;
*streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl;
*streamPtr.get() << tab_2 << "transformCubNamespace();" << endl;
*streamPtr.get() << tab_2 << "if ($print_stats) {" << endl;
*streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl;
*streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl;
-2
Просмотреть файл
@@ -31,8 +31,6 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
{"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}},
// no analogue
{"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}},
// NOTE: int warpSize is a field of cudaDeviceProp
{"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}},
// no analogue
{"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
+212 -183
Просмотреть файл
@@ -33,16 +33,21 @@ THE SOFTWARE.
#include "StringUtils.h"
#include "ArgParse.h"
namespace ct = clang::tooling;
namespace mat = clang::ast_matchers;
const std::string sHIP = "HIP";
const std::string sROC = "ROC";
const std::string sCub = "cub";
const std::string sHipcub = "hipcub";
const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED";
const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
std::string sHIP_SYMBOL = "HIP_SYMBOL";
std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
std::string s_reinterpret_cast = "reinterpret_cast<const void*>";
const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL(";
const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL";
const std::string sDim3 = "dim3(";
const std::string s_hiprand_kernel_h = "hiprand_kernel.h";
const std::string s_hiprand_h = "hiprand.h";
const std::string sOnce = "once";
const std::string s_string_literal = "[string literal]";
// CUDA identifiers, used in matchers
const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol";
const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync";
const std::string sCudaGetSymbolSize = "cudaGetSymbolSize";
@@ -51,6 +56,14 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol";
const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync";
const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig";
const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes";
// Matchers' names
const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar";
const StringRef sCudaLaunchKernel = "cudaLaunchKernel";
const StringRef sCudaHostFuncCall = "cudaHostFuncCall";
const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall";
const StringRef sCubNamespacePrefix = "cubNamespacePrefix";
const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl";
const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl";
std::set<std::string> DeviceSymbolFunctions0 {
{sCudaMemcpyToSymbol},
@@ -78,7 +91,7 @@ std::set<std::string> ReinterpretFunctions1{
};
void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
auto &SM = getCompilerInstance().getSourceManager();
size_t begin = 0;
while ((begin = s.find("cu", begin)) != StringRef::npos) {
const size_t end = s.find_first_of(" ", begin + 4);
@@ -86,7 +99,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
const auto found = CUDA_RENAMES_MAP().find(name);
if (found != CUDA_RENAMES_MAP().end()) {
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
Statistics::current().incrementCounter(counter, name.str());
if (!Statistics::isUnsupported(counter)) {
clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
@@ -95,13 +108,24 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
insertReplacement(Rep, fullSL);
}
}
if (end == StringRef::npos) {
break;
}
if (end == StringRef::npos) break;
begin = end + 1;
}
}
/**
 * Find the first occurrence of `str` in the source text covered by `sr`.
 *
 * The searched text runs from the character at sr.getBegin() up to, but not
 * including, the character at sr.getEnd().
 *
 * Returns the location of the match, or sr.getBegin() unchanged when `str`
 * does not occur in that text (or occurs at its very start).
 */
clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) {
  clang::SourceLocation sl(sr.getBegin());
  clang::SourceLocation end(sr.getEnd());
  auto &SM = getCompilerInstance().getSourceManager();
  size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
  StringRef sfull = StringRef(SM.getCharacterData(sl), length);
  size_t offset = sfull.find(str);
  // StringRef::find reports a miss as npos, which is also "> 0"; without the
  // explicit npos check a failed search would shift the location by a bogus offset.
  if (offset != StringRef::npos && offset > 0) {
    sl = sl.getLocWithOffset(offset);
  }
  return sl;
}
/**
* Look at, and consider altering, a given token.
*
@@ -109,7 +133,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
* If it's an unsupported CUDA identifier, a warning is emitted.
* Otherwise, the source file is updated with the corresponding hipification.
*/
void HipifyAction::RewriteToken(const clang::Token& t) {
void HipifyAction::RewriteToken(const clang::Token &t) {
// String literals containing CUDA references need fixing.
if (t.is(clang::tok::string_literal)) {
StringRef s(t.getLiteralData(), t.getLength());
@@ -124,20 +148,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) {
FindAndReplace(name, sl, CUDA_RENAMES_MAP());
}
void HipifyAction::FindAndReplace(llvm::StringRef name,
void HipifyAction::FindAndReplace(StringRef name,
clang::SourceLocation sl,
const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace) {
const std::map<StringRef, hipCounter> &repMap,
bool bReplace) {
const auto found = repMap.find(name);
if (found == repMap.end()) {
// So it's an identifier, but not CUDA? Boring.
return;
}
Statistics::current().incrementCounter(found->second, name.str());
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
// Warn the user about unsupported identifier.
if (Statistics::isUnsupported(found->second)) {
std::string sWarn;
Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP";
Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
sWarn = "" + sWarn;
const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0.");
DE.Report(sl, ID) << sWarn;
@@ -147,7 +172,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
return;
}
StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
auto &SM = getCompilerInstance().getSourceManager();
ct::Replacement Rep(SM, sl, name.size(), repName.str());
clang::FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
@@ -155,7 +180,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
namespace {
clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
@@ -167,7 +192,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan
}
}
clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
// If the range is contained within a macro, update the macro definition.
@@ -178,7 +203,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
}
@@ -186,53 +211,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp
* Get a string representation of the expression `arg`, unless it's a defaulting function
* call argument, in which case get a 0. Used for building argument lists to kernel calls.
*/
std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
return "0";
} else {
return readSourceText(SM, arg->getSourceRange());
}
std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
else return readSourceText(SM, arg->getSourceRange());
}
} // anonymous namespace
bool HipifyAction::Exclude(const hipCounter & hipToken) {
bool HipifyAction::Exclude(const hipCounter &hipToken) {
switch (hipToken.type) {
case CONV_INCLUDE_CUDA_MAIN_H:
switch (hipToken.apiType) {
case API_DRIVER:
case API_RUNTIME:
if (insertedRuntimeHeader) { return true; }
if (insertedRuntimeHeader) return true;
insertedRuntimeHeader = true;
return false;
case API_BLAS:
if (insertedBLASHeader) { return true; }
if (insertedBLASHeader) return true;
insertedBLASHeader = true;
return false;
case API_RAND:
if (hipToken.hipName == "hiprand_kernel.h") {
if (insertedRAND_kernelHeader) { return true; }
if (hipToken.hipName == s_hiprand_kernel_h) {
if (insertedRAND_kernelHeader) return true;
insertedRAND_kernelHeader = true;
return false;
} else if (hipToken.hipName == "hiprand.h") {
if (insertedRANDHeader) { return true; }
} else if (hipToken.hipName == s_hiprand_h) {
if (insertedRANDHeader) return true;
insertedRANDHeader = true;
return false;
}
case API_DNN:
if (insertedDNNHeader) { return true; }
if (insertedDNNHeader) return true;
insertedDNNHeader = true;
return false;
case API_FFT:
if (insertedFFTHeader) { return true; }
if (insertedFFTHeader) return true;
insertedFFTHeader = true;
return false;
case API_COMPLEX:
if (insertedComplexHeader) { return true; }
if (insertedComplexHeader) return true;
insertedComplexHeader = true;
return false;
case API_SPARSE:
if (insertedSPARSEHeader) { return true; }
if (insertedSPARSEHeader) return true;
insertedSPARSEHeader = true;
return false;
default:
@@ -240,13 +262,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
}
return false;
case CONV_INCLUDE:
if (hipToken.hipName.empty()) {
return true;
}
if (hipToken.hipName.empty()) return true;
switch (hipToken.apiType) {
case API_RAND:
if (hipToken.hipName == "hiprand_kernel.h") {
if (insertedRAND_kernelHeader) { return true; }
if (hipToken.hipName == s_hiprand_kernel_h) {
if (insertedRAND_kernelHeader) return true;
insertedRAND_kernelHeader = true;
}
return false;
@@ -267,24 +287,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
clang::CharSourceRange filename_range,
const clang::FileEntry*, StringRef,
StringRef, const clang::Module*) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(hash_loc)) {
return;
}
auto &SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(hash_loc)) return;
if (!firstHeader) {
firstHeader = true;
firstHeaderLoc = hash_loc;
}
const auto found = CUDA_INCLUDE_MAP.find(file_name);
if (found == CUDA_INCLUDE_MAP.end()) {
return;
}
if (found == CUDA_INCLUDE_MAP.end()) return;
bool exclude = Exclude(found->second);
Statistics::current().incrementCounter(found->second, file_name.str());
clang::SourceLocation sl = filename_range.getBegin();
if (Statistics::isUnsupported(found->second)) {
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header."));
return;
}
@@ -293,11 +308,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
if (!exclude) {
clang::SmallString<128> includeBuffer;
llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
if (is_angled) {
newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
} else {
newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
}
if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
} else {
// hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
sl = hash_loc;
@@ -309,48 +321,52 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
}
void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
if (pragmaOnce) {
return;
}
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(Loc)) {
return;
}
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
if (pragmaOnce) return;
auto &SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(Loc)) return;
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
clang::Token tok;
PP.Lex(tok);
StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
if (Text == "once") {
if (Text == sOnce) {
pragmaOnce = true;
pragmaOnceLoc = tok.getEndLoc();
}
}
bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
StringRef refName = "cudaLaunchKernel";
const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
if (!launchKernel) {
return false;
}
const clang::Expr* calleeExpr = launchKernel->getCallee();
if (!calleeExpr) {
return false;
}
const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
if (!caleeDecl) {
return false;
}
const clang::CallExpr* config = launchKernel->getConfig();
if (!config) {
return false;
}
bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) {
auto *launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(sCudaLaunchKernel);
if (!launchKernel) return false;
auto *calleeExpr = launchKernel->getCallee();
if (!calleeExpr) return false;
auto *caleeDecl = launchKernel->getDirectCallee();
if (!caleeDecl) return false;
auto *config = launchKernel->getConfig();
if (!config) return false;
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::LangOptions DefaultLangOptions;
clang::SourceManager* SM = Result.SourceManager;
OS << sHipLaunchKernelGGL;
if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "(";
OS << readSourceText(*SM, calleeExpr->getSourceRange());
auto *SM = Result.SourceManager;
clang::SourceRange sr = calleeExpr->getSourceRange();
std::string kern = readSourceText(*SM, sr).str();
OS << sHipLaunchKernelGGL << "(";
if (caleeDecl->isTemplateInstantiation()) {
OS << sHIP_KERNEL_NAME << "(";
std::string cub = sCub + "::";
std::string hipcub;
const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub);
if (found != CUDA_CUB_TYPE_NAME_MAP.end()) {
hipcub = found->second.hipName.str() + "::";
} else {
hipcub = sHipcub + "::";
}
size_t pos = kern.find(cub);
while (pos != std::string::npos) {
kern.replace(pos, cub.size(), hipcub);
pos = kern.find(cub, pos + hipcub.size());
}
}
OS << kern;
if (caleeDecl->isTemplateInstantiation()) OS << ")";
OS << ", ";
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
@@ -381,29 +397,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
ct::Replacement Rep(*SM, launchStart, length, OS.str());
clang::FullSourceLoc fullSL(launchStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, refName.str());
hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str());
return true;
}
bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
StringRef refName = "cudaSharedIncompleteArrayVar";
auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
if (!sharedVar) {
return false;
}
bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) {
auto *sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(sCudaSharedIncompleteArrayVar);
if (!sharedVar) return false;
// Example: extern __shared__ uint sRadix1[];
if (!sharedVar->hasExternalFormalLinkage()) {
return false;
}
if (!sharedVar->hasExternalFormalLinkage()) return false;
clang::QualType QT = sharedVar->getType();
std::string typeName;
if (QT->isIncompleteArrayType()) {
const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
QT = AT->getElementType();
if (QT.getTypePtr()->isBuiltinType()) {
QT = QT.getCanonicalType();
const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
auto *BT = clang::dyn_cast<clang::BuiltinType>(QT);
if (BT) {
clang::LangOptions LO;
LO.CUDA = true;
@@ -417,7 +428,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
if (!typeName.empty()) {
clang::SourceLocation slStart = sharedVar->getOuterLocStart();
clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc());
clang::SourceManager* SM = Result.SourceManager;
auto *SM = Result.SourceManager;
size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
std::string varName = sharedVar->getNameAsString();
std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")";
@@ -425,74 +436,84 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
clang::FullSourceLoc fullSL(slStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, refName.str());
Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str());
return true;
}
return false;
}
bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaDeviceFuncCall")) {
const clang::FunctionDecl *funcDcl = call->getDirectCallee();
if (!funcDcl) {
return false;
}
bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) {
if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaDeviceFuncCall)) {
auto *funcDcl = call->getDirectCallee();
if (!funcDcl) return false;
FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false);
return true;
}
return false;
}
bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>("cubNamespacePrefix")) {
if (!decl) {
return false;
}
bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) {
if (auto *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>(sCubNamespacePrefix)) {
clang::QualType QT = decl->getUnderlyingType();
const clang::Type* t = QT.getTypePtr();
if (!t) {
return false;
}
const clang::ElaboratedType* et = t->getAs<clang::ElaboratedType>();
if (!et) {
return false;
}
auto *t = QT.getTypePtr();
if (!t) return false;
const clang::ElaboratedType *et = t->getAs<clang::ElaboratedType>();
if (!et) return false;
const clang::NestedNameSpecifier *nns = et->getQualifier();
if (!nns) {
return false;
}
if (!nns) return false;
const clang::NamespaceDecl *nsd = nns->getAsNamespace();
if (!nsd) {
return false;
}
if (!nsd) return false;
const clang::TypeSourceInfo *si = decl->getTypeSourceInfo();
const clang::TypeLoc tloc = si->getTypeLoc();
const clang::SourceRange sr = tloc.getSourceRange();
clang::SourceLocation sl(sr.getBegin());
clang::SourceLocation end(sr.getEnd());
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
StringRef sfull = StringRef(SM.getCharacterData(sl), length);
std::string name = nsd->getDeclName().getAsString();
size_t offset = sfull.find(name);
if (offset > 0) {
sl = sl.getLocWithOffset(offset);
}
FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP);
FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
return true;
}
return false;
}
bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
if (const clang::CallExpr * call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaHostFuncCall")) {
if (!call->getNumArgs()) {
return false;
bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) {
if (auto *decl = Result.Nodes.getNodeAs<clang::UsingDirectiveDecl>(sCubUsingNamespaceDecl)) {
if (auto nsd = decl->getNominatedNamespace()) {
FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP);
return true;
}
const clang::FunctionDecl* funcDcl = call->getDirectCallee();
if (!funcDcl) {
return false;
}
return false;
}
// Match callback for nodes bound to sCubFunctionTemplateDecl.
// Walks the template parameter list of the matched function template and, for every
// parameter that is a ValueDecl whose type is an ElaboratedType qualified by a namespace,
// hands that namespace's name to FindAndReplace using CUDA_CUB_TYPE_NAME_MAP.
// Returns true when at least one parameter reached the FindAndReplace call; returns false
// otherwise, including when the match result holds no node under that binding.
bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) {
  auto *templateDecl = Result.Nodes.getNodeAs<clang::FunctionTemplateDecl>(sCubFunctionTemplateDecl);
  if (!templateDecl) return false;
  auto *paramList = templateDecl->getTemplateParameters();
  bool anyHandled = false;
  for (size_t idx = 0; idx < paramList->size(); ++idx) {
    // Parameters that are not ValueDecls are skipped.
    const auto *param = dyn_cast<clang::ValueDecl>(paramList->getParam(idx));
    if (!param) continue;
    clang::QualType paramType = param->getType();
    const auto *typePtr = paramType.getTypePtr();
    if (!typePtr) continue;
    // Only types spelled with a nested-name qualifier that names a namespace are of interest.
    const auto *elaborated = typePtr->getAs<clang::ElaboratedType>();
    if (!elaborated) continue;
    const auto *qualifier = elaborated->getQualifier();
    if (!qualifier) continue;
    const auto *nsDecl = qualifier->getAsNamespace();
    if (!nsDecl) continue;
    const clang::SourceRange paramRange = param->getSourceRange();
    const std::string nsName = nsDecl->getDeclName().getAsString();
    // Locate the namespace name inside the parameter's source range and rewrite it there.
    FindAndReplace(nsName, GetSubstrLocation(nsName, paramRange), CUDA_CUB_TYPE_NAME_MAP);
    anyHandled = true;
  }
  return anyHandled;
}
bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) {
if (auto *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaHostFuncCall)) {
if (!call->getNumArgs()) return false;
auto *funcDcl = call->getDirectCallee();
if (!funcDcl) return false;
std::string sName = funcDcl->getDeclName().getAsString();
unsigned int argNum = 0;
bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false;
@@ -506,7 +527,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::SourceRange sr = call->getArg(argNum)->getSourceRange();
clang::SourceManager* SM = Result.SourceManager;
auto *SM = Result.SourceManager;
OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")";
clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() });
clang::SourceLocation s = replacementRange.getBegin();
@@ -521,7 +542,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
return false;
}
void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) {
llcompat::insertReplacement(*replacements, rep);
if (PrintStats) {
rep.getLength();
@@ -530,10 +551,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu
}
}
std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
Finder.reset(new clang::ast_matchers::MatchFinder);
std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) {
Finder.reset(new mat::MatchFinder);
// Replace the <<<...>>> language extension with a hip kernel launch
Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this);
Finder->addMatcher(
mat::varDecl(
mat::isExpansionInMainFile(),
@@ -541,7 +562,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
mat::hasAttr(clang::attr::CUDAShared),
mat::hasType(mat::incompleteArrayType())
)
).bind("cudaSharedIncompleteArrayVar"),
).bind(sCudaSharedIncompleteArrayVar),
this
);
Finder->addMatcher(
@@ -561,7 +582,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
)
)
)
).bind("cudaHostFuncCall"),
).bind(sCudaHostFuncCall),
this
);
Finder->addMatcher(
@@ -576,7 +597,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
mat::unless(mat::hasAttr(clang::attr::CUDAHost))
)
)
).bind("cudaDeviceFuncCall"),
).bind(sCudaDeviceFuncCall),
this
);
Finder->addMatcher(
@@ -586,12 +607,26 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
mat::elaboratedType(
mat::hasQualifier(
mat::specifiesNamespace(
mat::hasName("cub")
mat::hasName(sCub)
)
)
)
)
).bind("cubNamespacePrefix"),
).bind(sCubNamespacePrefix),
this
);
// TODO: Consider making this matcher more specific once cubFunctionTemplateDecl is finalized.
Finder->addMatcher(
mat::functionTemplateDecl(
mat::isExpansionInMainFile()
).bind(sCubFunctionTemplateDecl),
this
);
// TODO: Consider making this matcher more specific.
Finder->addMatcher(
mat::usingDirectiveDecl(
mat::isExpansionInMainFile()
).bind(sCubUsingNamespaceDecl),
this
);
// Ownership is transferred to the caller.
@@ -599,10 +634,8 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
}
void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(Loc)) {
return;
}
auto &SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(Loc)) return;
StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength());
Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc()));
}
@@ -615,12 +648,12 @@ void HipifyAction::EndSourceFileAction() {
// one copy of the hip include into every file.
bool placeForIncludeCalculated = false;
clang::SourceLocation sl, controllingMacroLoc;
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
clang::HeaderSearch& HS = PP.getHeaderSearchInfo();
clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup();
const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID());
const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
auto &SM = getCompilerInstance().getSourceManager();
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
clang::HeaderSearch &HS = PP.getHeaderSearchInfo();
clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup();
const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID());
const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
if (controllingMacro) {
auto found = Ifndefs.find(controllingMacro->getName().str());
if (found != Ifndefs.end()) {
@@ -629,19 +662,13 @@ void HipifyAction::EndSourceFileAction() {
}
}
if (pragmaOnce) {
if (placeForIncludeCalculated) {
sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
} else {
sl = pragmaOnceLoc;
}
if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
else sl = pragmaOnceLoc;
placeForIncludeCalculated = true;
}
if (!placeForIncludeCalculated) {
if (firstHeader) {
sl = firstHeaderLoc;
} else {
sl = SM.getLocForStartOfFile(SM.getMainFileID());
}
if (firstHeader) sl = firstHeaderLoc;
else sl = SM.getLocForStartOfFile(SM.getMainFileID());
}
clang::FullSourceLoc fullSL(sl, SM);
ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
@@ -656,15 +683,15 @@ namespace {
* A silly little class to proxy PPCallbacks back to the HipifyAction class.
*/
class PPCallbackProxy : public clang::PPCallbacks {
HipifyAction& hipifyAction;
HipifyAction &hipifyAction;
public:
explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {}
void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token,
StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
const clang::Module* imported
const clang::FileEntry *file, StringRef search_path, StringRef relative_path,
const clang::Module *imported
#if LLVM_VERSION_MAJOR > 6
, clang::SrcMgr::CharacteristicKind FileType
#endif
@@ -688,10 +715,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) {
}
void HipifyAction::ExecuteAction() {
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
auto &SM = getCompilerInstance().getSourceManager();
// Start lexing the specified input file.
const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID());
clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
RawLex.SetKeepWhitespaceMode(true);
// Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
@@ -709,10 +736,12 @@ void HipifyAction::ExecuteAction() {
clang::ASTFrontendAction::ExecuteAction();
}
void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) {
if (cudaLaunchKernel(Result)) return;
if (cudaSharedIncompleteArrayVar(Result)) return;
if (cudaHostFuncCall(Result)) return;
if (cudaDeviceFuncCall(Result)) return;
if (cubNamespacePrefix(Result)) return;
if (cubFunctionTemplateDecl(Result)) return;
if (cubUsingNamespaceDecl(Result)) return;
}
+18 -14
Просмотреть файл
@@ -31,17 +31,18 @@ THE SOFTWARE.
#include "Statistics.h"
namespace ct = clang::tooling;
namespace mat = clang::ast_matchers;
using namespace llvm;
/**
* A FrontendAction that hipifies CUDA programs.
*/
class HipifyAction : public clang::ASTFrontendAction,
public clang::ast_matchers::MatchFinder::MatchCallback {
public mat::MatchFinder::MatchCallback {
private:
ct::Replacements* replacements;
ct::Replacements *replacements;
std::map<std::string, clang::SourceLocation> Ifndefs;
std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
std::unique_ptr<mat::MatchFinder> Finder;
// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
// ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
// not, we insert it at the top of the file when we finish processing it.
@@ -62,17 +63,20 @@ private:
void RewriteString(StringRef s, clang::SourceLocation start);
// Replace a CUDA identifier with the corresponding hip identifier, if applicable.
void RewriteToken(const clang::Token &t);
// Calculate str's SourceLocation in SourceRange sr
clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr);
public:
explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(),
replacements(replacements) {}
// MatchCallback listeners
bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result);
bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result);
bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result);
bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result);
bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result);
bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result);
bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result);
// Called by the preprocessor for each include directive during the non-raw lexing pass.
void InclusionDirective(clang::SourceLocation hash_loc,
const clang::Token &include_token,
@@ -91,7 +95,7 @@ public:
protected:
// Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL);
// FrontendAction entry point.
void ExecuteAction() override;
// Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options.
@@ -99,8 +103,8 @@ protected:
// Called at the start of each new file to process.
void EndSourceFileAction() override;
// MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for.
void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
bool Exclude(const hipCounter & hipToken);
void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace = true);
void run(const mat::MatchFinder::MatchResult &Result) override;
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override;
bool Exclude(const hipCounter &hipToken);
void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map<StringRef, hipCounter> &repMap, bool bReplace = true);
};
+1
Просмотреть файл
@@ -199,6 +199,7 @@ int main(int argc, const char **argv) {
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN));
if (!CudaPath.empty()) {
std::string sCudaPath = "--cuda-path=" + CudaPath;
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN));
+1 -1
Просмотреть файл
@@ -86,7 +86,7 @@ struct Bundled_code {
char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
} header;
std::string triple;
std::vector<char> blob;
std::string blob;
};
#define magic_string_ "__CLANG_OFFLOAD_BUNDLE__"
-29
Просмотреть файл
@@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl(
} // Namespace hip_impl.
template <typename F>
inline
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
using namespace hip_impl;
hip_impl::hip_init();
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
target_agent(0));
return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
dynSharedMemPerBlk, blockSizeLimit);
}
template <typename F>
inline
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
uint32_t blockSize, size_t dynSharedMemPerBlk) {
using namespace hip_impl;
hip_impl::hip_init();
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
target_agent(0));
return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
}
template <typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+32 -10
Просмотреть файл
@@ -59,12 +59,17 @@ float atomicAdd(float* address, float val)
{
unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
unsigned int r;
unsigned int r;
do {
r = old;
r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
if (r != old) { r = old; continue; }
old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
} while (r != old);
if (r == old) break;
} while (true);
return __uint_as_float(r);
}
@@ -74,13 +79,18 @@ double atomicAdd(double* address, double val)
{
unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(address)};
unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
unsigned long long r;
unsigned long long r;
do {
r = old;
r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
if (r != old) { r = old; continue; }
old = atomicCAS(
uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
} while (r != old);
if (r == old) break;
} while (true);
return __longlong_as_double(r);
}
@@ -144,7 +154,13 @@ unsigned long long atomicMin(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (val < tmp) { tmp = atomicCAS(address, tmp, val); }
while (val < tmp) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
@@ -167,7 +183,13 @@ unsigned long long atomicMax(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (tmp < val) { tmp = atomicCAS(address, tmp, val); }
while (tmp < val) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
@@ -177,7 +199,7 @@ inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
__device__
extern
extern
unsigned int __builtin_amdgcn_atomic_inc(
unsigned int*,
unsigned int,
@@ -194,7 +216,7 @@ inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
__device__
extern
extern
unsigned int __builtin_amdgcn_atomic_dec(
unsigned int*,
unsigned int,
+36 -16
Просмотреть файл
@@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
* @param[in] sizeBytes Data size in bytes
* @param[in] count number of values to be set
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes);
hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte
*
* @param[out] dst Data ptr to be filled
* @param[in] constant value to be set
* @param[in] sizeBytes Data size in bytes
* @param[in] count number of values to be set
* @param[in] stream - Stream identifier
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0));
hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz
*
* @param[out] dest Data ptr to be filled
* @param[in] value Constant value to be set
* @param[in] sizeBytes Data size in bytes
* @param[in] count Number of values to be set
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes);
hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy
*
* @param[out] dest Data ptr to be filled
* @param[in] value Constant value to be set
* @param[in] sizeBytes Data size in bytes
* @param[in] count Number of values to be set
* @param[in] stream - Stream identifier
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0));
hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
/**
* @brief Fills the memory area pointed to by dest with the constant integer
@@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
* @param [out] gridSize minimum grid size for maximum potential occupancy
* @param [out] blockSize block size for maximum potential occupancy
* @param [in] f kernel function for which occupancy is calculated
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
* @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
*
* @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue
*/
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
hipFunction_t f, size_t dynSharedMemPerBlk,
uint32_t blockSizeLimit);
hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
const void* f, size_t dynamicSMemSize,
int blockSizeLimit);
/**
* @brief Returns occupancy for a device function.
@@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
* @param [out] numBlocks Returned occupancy
* @param [in] f Kernel function for which occupancy is calculated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);
/**
* @brief Returns occupancy for a device function.
@@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
* @param [out] numBlocks Returned occupancy
* @param [in] f Kernel function for which occupancy is calculated
* @param [in] blockSize Block size the kernel is intended to be launched with
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
* @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);
/**
* @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
@@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
return hipSuccess;
}
// Typed convenience wrapper: takes the kernel handle as an arbitrary type T,
// reinterprets it as an untyped `const void*` and calls the same-named
// function again with that pointer. All other arguments pass through unchanged.
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
    int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
    const void* const kernel = reinterpret_cast<const void*>(f);
    return hipOccupancyMaxActiveBlocksPerMultiprocessor(
        numBlocks, kernel, blockSize, dynamicSMemSize);
}
// Typed convenience wrapper: takes the kernel handle as an arbitrary type T,
// reinterprets it as an untyped `const void*` and calls the same-named
// function again with that pointer. `flags` and the remaining arguments pass
// through unchanged.
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
    int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
    const void* const kernel = reinterpret_cast<const void*>(f);
    return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        numBlocks, kernel, blockSize, dynamicSMemSize, flags);
}
// Typed convenience wrapper: takes the kernel handle as an arbitrary type T,
// reinterprets it as an untyped `const void*` and calls the same-named
// function again with that pointer. The output pointers and the limits pass
// through unchanged.
template <class T>
inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
                                                    T f, size_t dynamicSMemSize,
                                                    int blockSizeLimit) {
    const void* const kernel = reinterpret_cast<const void*>(f);
    return hipOccupancyMaxPotentialBlockSize(
        gridSize, blockSize, kernel, dynamicSMemSize, blockSizeLimit);
}
template <class T>
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+77
Просмотреть файл
@@ -0,0 +1,77 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
// HIP VDI Op IDs enumeration
// Operation classes for activity callbacks; the documentation of
// hipEnableActivityCallback in this header refers to this enum for its `op` argument.
enum HipVdiOpId {
kHipVdiOpIdDispatch = 0,
kHipVdiOpIdCopy = 1,
kHipVdiOpIdBarrier = 2,
// Equals the number of operation ids listed above; presumably a count/sentinel
// rather than a real operation -- TODO confirm.
kHipVdiOpIdNumber = 3
};
// Types of VDI commands
// Command-kind identifiers; the documentation of hipGetCmdName in this header
// refers to this enum for its `id` argument. The numeric values are fixed and
// must not be renumbered.
//
// Two enumerators were originally introduced with misspelled names
// (kHipHipVdi... and kHipVid...). They are kept so existing code keeps
// compiling, and correctly spelled aliases with identical values are provided
// next to them.
enum HipVdiCommandKind {
    kHipVdiCommandKernel = 0x11F0,
    kHipVdiMemcpyDeviceToHost = 0x11F3,
    kHipHipVdiMemcpyHostToDevice = 0x11F4,  // legacy spelling, kept for compatibility
    kHipVdiMemcpyHostToDevice = kHipHipVdiMemcpyHostToDevice,
    kHipVdiMemcpyDeviceToDevice = 0x11F5,
    kHipVidMemcpyDeviceToHostRect = 0x1201,  // legacy spelling, kept for compatibility
    kHipVdiMemcpyDeviceToHostRect = kHipVidMemcpyDeviceToHostRect,
    kHipVdiMemcpyHostToDeviceRect = 0x1202,
    kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
    kHipVdiFillMemory = 0x1207,
};
/**
* @brief Initializes activity callback
*
* @param [in] id_callback Event ID callback function
* @param [in] op_callback Event operation callback function
* @param [in] arg Arguments passed into callback
*
* @note NOTE(review): both callbacks are passed as untyped void*; the function
*       signatures they are expected to have are not visible in this header.
*
* @returns None
*/
void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
/**
* @brief Enables activity callback
*
* @param [in] op Operation, which will trigger a callback (@see HipVdiOpId)
* @param [in] enable Enable state for the callback
*
* @note NOTE(review): this header uses uint32_t without including a header
*       that declares it; it relies on the including file -- confirm.
*
* @returns True if successful
*/
bool hipEnableActivityCallback(uint32_t op, bool enable);
/**
* @brief Returns the description string for the operation kind
*
* @param [in] id Command kind id (@see HipVdiCommandKind)
*
* @returns A pointer to a const string with the command description
*/
const char* hipGetCmdName(uint32_t id);
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+96 -13
Просмотреть файл
@@ -47,6 +47,95 @@ THE SOFTWARE.
#if defined(__cplusplus)
#include <type_traits>
namespace hip_impl {
    // Proxy for one lane of a native vector.
    //
    // The object stores a complete `Vector` but every operation touches only
    // element `idx`, converting to and from the scalar type `T`. This lets it
    // stand in for a plain scalar component.
    // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
    template<typename T, typename Vector, unsigned int idx>
    struct Scalar_accessor {
        Vector data;

        // Read access: implicit conversion yields the value held in lane `idx`.
        __host__ __device__
        operator T() const noexcept { return data[idx]; }

        // Plain assignment overwrites lane `idx` and nothing else.
        __host__ __device__
        Scalar_accessor& operator=(T rhs) noexcept {
            data[idx] = rhs;
            return *this;
        }

        // Arithmetic compound assignments, available for every T.
        __host__ __device__
        Scalar_accessor& operator+=(T rhs) noexcept {
            data[idx] += rhs;
            return *this;
        }

        __host__ __device__
        Scalar_accessor& operator-=(T rhs) noexcept {
            data[idx] -= rhs;
            return *this;
        }

        __host__ __device__
        Scalar_accessor& operator*=(T rhs) noexcept {
            data[idx] *= rhs;
            return *this;
        }

        __host__ __device__
        Scalar_accessor& operator/=(T rhs) noexcept {
            data[idx] /= rhs;
            return *this;
        }

        // The remaining compound assignments only take part in overload
        // resolution when T is an integral type (SFINAE on the defaulted U).
        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator%=(T rhs) noexcept {
            data[idx] %= rhs;
            return *this;
        }

        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator>>=(T rhs) noexcept {
            data[idx] >>= rhs;
            return *this;
        }

        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator<<=(T rhs) noexcept {
            data[idx] <<= rhs;
            return *this;
        }

        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator&=(T rhs) noexcept {
            data[idx] &= rhs;
            return *this;
        }

        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator|=(T rhs) noexcept {
            data[idx] |= rhs;
            return *this;
        }

        template<
            typename U = T,
            typename std::enable_if<std::is_integral<U>::value>::type* = nullptr>
        __host__ __device__
        Scalar_accessor& operator^=(T rhs) noexcept {
            data[idx] ^= rhs;
            return *this;
        }
    };
} // Namespace hip_impl.
template<typename T, unsigned int n> struct HIP_vector_base;
template<typename T>
@@ -55,9 +144,7 @@ THE SOFTWARE.
union {
Native_vec_ data;
struct {
T x;
};
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
};
};
@@ -67,10 +154,8 @@ THE SOFTWARE.
union {
Native_vec_ data;
struct {
T x;
T y;
};
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
};
};
@@ -238,12 +323,10 @@ THE SOFTWARE.
union {
Native_vec_ data;
struct {
T x;
T y;
T z;
T w;
};
hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
hip_impl::Scalar_accessor<T, Native_vec_, 2> z;
hip_impl::Scalar_accessor<T, Native_vec_, 3> w;
};
};
Обычный файл → Исполняемый файл
+16 -16
Просмотреть файл
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define fileName "tex2dKernel.code"
texture<float, 2, hipReadModeElementType> tex;
bool testResult = false;
bool testResult = true;
#define HIP_CHECK(cmd) \
{ \
@@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) {
desc.NumChannels = 1;
desc.Width = width;
desc.Height = height;
hipArrayCreate(&array, &desc);
HIP_CHECK(hipArrayCreate(&array, &desc));
hip_Memcpy2D copyParam;
memset(&copyParam, 0, sizeof(copyParam));
@@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) {
copyParam.srcPitch = width * sizeof(float);
copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height;
hipMemcpyParam2D(&copyParam);
HIP_CHECK(hipMemcpyParam2D(&copyParam));
textureReference* texref;
hipModuleGetTexRef(&texref, Module, "tex");
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
hipTexRefSetFilterMode(texref, hipFilterModePoint);
hipTexRefSetFlags(texref, 0);
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex"));
HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap));
HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap));
HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint));
HIP_CHECK(hipTexRefSetFlags(texref, 0));
HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1));
HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
float* dData = NULL;
hipMalloc((void**)&dData, size);
HIP_CHECK(hipMalloc((void**)&dData, size));
struct {
void* _Ad;
@@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) {
float* hOutputData = (float*)malloc(size);
memset(hOutputData, 0, size);
hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
@@ -124,10 +124,10 @@ bool runTest(int argc, char** argv) {
}
}
}
hipUnbindTexture(tex);
hipFree(dData);
hipFreeArray(array);
return true;
HIP_CHECK(hipUnbindTexture(tex));
HIP_CHECK(hipFree(dData));
HIP_CHECK(hipFreeArray(array));
return testResult;
}
int main(int argc, char** argv) {
+4 -4
Просмотреть файл
@@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){
const unsigned threadsperblock = 32;
const unsigned blocks = (NUM/threadsperblock)+1;
uint32_t mingridSize = 0;
uint32_t gridSize = 0;
uint32_t blockSize = 0;
int mingridSize = 0;
int gridSize = 0;
int blockSize = 0;
if (manual){
blockSize = threadsperblock;
@@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){
printf("kernel Execution time = %6.3fms\n", eventMs);
//Calculate Occupancy
uint32_t numBlock = 0;
int numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
if(devProp.maxThreadsPerMultiProcessor){
+21
Просмотреть файл
@@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
// Internal helpers used by the public API entry points.
namespace hip_internal {
// NOTE(review): definition not visible here; presumably performs the copy
// described by `kind` asynchronously on `stream` -- confirm.
hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
hipStream_t stream);
// Host-memory allocation helper taking the caller's TlsData; stores the
// allocation in *ptr and returns a hipError_t status.
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
// Counterpart of ihipHostMalloc: releases a host allocation and returns a
// hipError_t status.
hipError_t ihipHostFree(TlsData *tls, void* ptr);
};
// Upper bound on the number of devices accepted by a multi-device cooperative
// launch; also used to size the per-device mg_info pointer array.
#define MAX_COOPERATIVE_GPUs 255
// do not change these two structs without changing the device library
// (applies to mg_sync and mg_info: their layout is shared with device code).
//
// Two-word synchronization record for multi-grid launches. The host zeroes
// both words before launching.
// NOTE(review): the meaning of w0/w1 is defined by the device library and is
// not visible here.
struct mg_sync {
uint w0;
uint w1;
};
// Per-grid descriptor for multi-grid synchronization. The multi-device
// cooperative launch path fills one per device before launching its kernel.
// Layout is shared with the device library -- do not reorder or resize fields.
struct mg_info {
struct mg_sync *mgs; // synchronization record shared by all grids of the launch
uint grid_id; // index of this grid (device index in the launch list)
uint num_grids; // number of grids (devices) in the launch
ulong prev_sum; // combined size of all grids with a lower index
ulong all_sum; // combined size of all grids in the launch
};
//---
+113 -105
Просмотреть файл
@@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
return ptr;
}
// Allocates host memory through allocAndSharePtr and stores it in *ptr.
//
// tls        Caller's thread-local data (not read by this function).
// ptr        Receives the allocation; must not be null.
// sizeBytes  Requested size. A size of 0 succeeds without touching *ptr.
// flags      hipHostMalloc* flags. Unsupported bits, or Coherent together with
//            NonCoherent, are rejected.
//
// Returns hipSuccess, hipErrorInvalidValue (null ctx/ptr or bad flags; *ptr is
// set to nullptr in the bad-flags case) or hipErrorMemoryAllocation.
// When HIP_SYNC_HOST_ALLOC is set the device is synchronized before and after.
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
    hipError_t hip_status = hipSuccess;

    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }

    auto ctx = ihipGetTlsDefaultCtx();

    if ((ctx == nullptr) || (ptr == nullptr)) {
        hip_status = hipErrorInvalidValue;
    }
    else if (sizeBytes == 0) {
        hip_status = hipSuccess;
        // TODO - should size of 0 return err or be silently ignored?
    } else {
        // NOTE(review): an earlier revision computed a local `trueFlags`
        // (hipHostMallocMapped | hipHostMallocPortable when flags ==
        // hipHostMallocDefault) but never read it. The dead local was removed;
        // the caller's `flags` are validated and forwarded unchanged, exactly
        // as before.
        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
                                        hipHostMallocWriteCombined | hipHostMallocCoherent |
                                        hipHostMallocNonCoherent;

        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;

        if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
            *ptr = nullptr;
            // can't specify unsupported flags, can't specify both Coherent + NonCoherent
            hip_status = hipErrorInvalidValue;
        } else {
            // NOTE(review): `device` is not used below; the call is kept in
            // case it is relied on -- confirm and remove.
            auto device = ctx->getWriteableDevice();
#if (__hcc_workweek__ >= 19115)
            //Avoid mapping host pinned memory to all devices by HCC
            unsigned amFlags = amHostUnmapped;
#else
            unsigned amFlags = 0;
#endif
            if (flags & hipHostMallocCoherent) {
                amFlags |= amHostCoherent;
            } else if (flags & hipHostMallocNonCoherent) {
                amFlags |= amHostNonCoherent;
            } else {
                // depends on env variables:
                amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
            }

            *ptr = hip_internal::allocAndSharePtr(
                (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
                true /*shareWithAll*/, amFlags, flags, 0);

            // sizeBytes is known to be non-zero in this branch, so a null
            // result always means the allocation failed.
            if (*ptr == NULL) {
                hip_status = hipErrorMemoryAllocation;
            }
        }
    }

    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }
    return hip_status;
}
// Releases a host allocation.
//
// tls  Caller's thread-local data (not read by this function).
// ptr  Pointer to free; null is accepted.
//
// Returns hipSuccess for a null pointer or a pointer that the memory tracker
// reports as a host pointer equal to `ptr`; hipErrorInvalidValue otherwise.
hipError_t ihipHostFree(TlsData *tls, void* ptr) {
    // Wait for outstanding work before releasing anything (per the original
    // note, non-blocking streams are ignored by this wait).
    ihipGetTlsDefaultCtx()->locked_waitAllStreams();

    // free NULL pointer succeeds and is common technique to initialize runtime
    if (!ptr) {
        return hipSuccess;
    }

    hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
    hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
#endif
    const am_status_t lookup = hc::am_memtracker_getinfo(&amPointerInfo, ptr);

    // Only free pointers the tracker knows about and whose recorded host
    // pointer is exactly `ptr`.
    if (lookup != AM_SUCCESS || amPointerInfo._hostPointer != ptr) {
        return hipErrorInvalidValue;
    }

    hc::am_free(ptr);
    return hipSuccess;
}
} // end namespace hip_internal
@@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
return ihipLogStatus(hip_status);
}
// Allocates host memory through allocAndSharePtr and stores it in *ptr.
//
// tls        Caller's thread-local data (not read by this function).
// ptr        Receives the allocation; must not be null.
// sizeBytes  Requested size. A size of 0 succeeds without touching *ptr.
// flags      hipHostMalloc* flags. Unsupported bits, or Coherent together with
//            NonCoherent, are rejected.
//
// Returns hipSuccess, hipErrorInvalidValue (null ctx/ptr or bad flags; *ptr is
// set to nullptr in the bad-flags case) or hipErrorMemoryAllocation.
// When HIP_SYNC_HOST_ALLOC is set the device is synchronized before and after.
hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
    hipError_t hip_status = hipSuccess;

    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }

    auto ctx = ihipGetTlsDefaultCtx();

    if ((ctx == nullptr) || (ptr == nullptr)) {
        hip_status = hipErrorInvalidValue;
    }
    else if (sizeBytes == 0) {
        hip_status = hipSuccess;
        // TODO - should size of 0 return err or be silently ignored?
    } else {
        // NOTE(review): an earlier revision computed a local `trueFlags`
        // (hipHostMallocMapped | hipHostMallocPortable when flags ==
        // hipHostMallocDefault) but never read it. The dead local was removed;
        // the caller's `flags` are validated and forwarded unchanged, exactly
        // as before.
        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
                                        hipHostMallocWriteCombined | hipHostMallocCoherent |
                                        hipHostMallocNonCoherent;

        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;

        if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
            *ptr = nullptr;
            // can't specify unsupported flags, can't specify both Coherent + NonCoherent
            hip_status = hipErrorInvalidValue;
        } else {
            // NOTE(review): `device` is not used below; the call is kept in
            // case it is relied on -- confirm and remove.
            auto device = ctx->getWriteableDevice();
#if (__hcc_workweek__ >= 19115)
            //Avoid mapping host pinned memory to all devices by HCC
            unsigned amFlags = amHostUnmapped;
#else
            unsigned amFlags = 0;
#endif
            if (flags & hipHostMallocCoherent) {
                amFlags |= amHostCoherent;
            } else if (flags & hipHostMallocNonCoherent) {
                amFlags |= amHostNonCoherent;
            } else {
                // depends on env variables:
                amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
            }

            *ptr = hip_internal::allocAndSharePtr(
                (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
                true /*shareWithAll*/, amFlags, flags, 0);

            // sizeBytes is known to be non-zero in this branch, so a null
            // result always means the allocation failed.
            if (*ptr == NULL) {
                hip_status = hipErrorMemoryAllocation;
            }
        }
    }

    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }
    return hip_status;
}
// Public entry point for host allocation: runs the API-entry macros, delegates
// the allocation to ihipHostMalloc and returns the status through ihipLogStatus.
// NOTE(review): `tls` is not declared in this body; presumably it is introduced
// by HIP_INIT_SPECIAL_API -- confirm.
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
// NOTE(review): both the unqualified and the hip_internal:: qualified call are
// present, which would allocate twice and drop the first result. This looks
// like the old and new side of a change shown together -- only one should remain.
hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags);
hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
return ihipLogStatus(hip_status);
}
@@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
if(flags != hipMemAttachGlobal)
hip_status = hipErrorInvalidValue;
else
hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
return ihipLogStatus(hip_status);
}
@@ -1935,15 +1965,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si
return ihipLogStatus(e);
};
hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) {
HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes);
hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) {
HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count);
hipError_t e = hipSuccess;
hipStream_t stream = hipStreamNull;
stream = ihipSyncAndResolveStream(stream);
if (stream) {
e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar);
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar);
stream->locked_wait();
} else {
e = hipErrorInvalidValue;
@@ -1951,23 +1981,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes
return ihipLogStatus(e);
}
hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) {
HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) {
HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream);
stream = ihipSyncAndResolveStream(stream);
if (stream) {
return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar));
return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar));
} else {
return ihipLogStatus(hipErrorInvalidValue);
}
}
hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){
HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes);
hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){
HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count);
hipError_t e = hipSuccess;
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
if (stream) {
e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort);
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort);
if(hipSuccess == e)
stream->locked_wait();
} else {
@@ -1976,12 +2006,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt
return ihipLogStatus(e);
}
hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){
HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){
HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream);
stream = ihipSyncAndResolveStream(stream);
if (stream) {
return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort));
return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort));
} else {
return ihipLogStatus(hipErrorInvalidValue);
}
@@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) {
// Public entry point for freeing host memory; the status is returned through
// ihipLogStatus.
// NOTE(review): this body contains both an inline free sequence and a
// delegation to hip_internal::ihipHostFree, and `hipStatus` is declared twice
// in the same scope. This looks like the old and new side of a change shown
// together -- only one implementation should remain.
hipError_t hipHostFree(void* ptr) {
HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr);
// Synchronize to ensure all work has finished.
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits
// for all activity to finish.
hipError_t hipStatus = hipErrorInvalidValue;
if (ptr) {
hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
#endif
am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
if (status == AM_SUCCESS) {
// Free only when the tracker's recorded host pointer is exactly `ptr`.
if (amPointerInfo._hostPointer == ptr) {
hc::am_free(ptr);
hipStatus = hipSuccess;
}
}
} else {
// free NULL pointer succeeds and is common technique to initialize runtime
hipStatus = hipSuccess;
}
hipError_t hipStatus = hipSuccess;
hipStatus = hip_internal::ihipHostFree(tls, ptr);
return ihipLogStatus(hipStatus);
};
+129 -49
Просмотреть файл
@@ -109,6 +109,7 @@ struct ihipModuleSymbol_t {
amd_kernel_code_t const* _header{};
string _name; // TODO - review for performance cost. Name is just used for debug.
vector<pair<size_t, size_t>> _kernarg_layout{};
bool _is_code_object_v3{};
};
template <>
@@ -137,7 +138,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
using namespace hip_impl;
auto ctx = ihipGetTlsDefaultCtx();
@@ -181,10 +183,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
return hipErrorInvalidValue;
}
// Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
// Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);
if (impCoopParams) {
const auto p{static_cast<const char*>(*impCoopParams)};
// The sixth index is for multi-grid synchronization
kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
}
/*
Kernel argument preparation.
*/
@@ -208,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
if (f->_is_code_object_v3) {
const auto* header =
reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
aql.group_segment_size =
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
return ihipLogStatus(hipErrorLaunchFailure);
}
size_t impCoopArg = 1;
void* impCoopParams[1];
impCoopParams[0] = &impCoopArg;
// launch the main kernel
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x * blockDimX.x,
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true);
nullptr, 0, true, impCoopParams);
stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
hipError_t result;
if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
return ihipLogStatus(hipErrorInvalidValue);
}
@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
kargs.getHandle());
}
mg_sync *mg_sync_ptr = 0;
mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
if (result != hipSuccess) {
return ihipLogStatus(hipErrorInvalidValue);
}
mg_sync_ptr->w0 = 0;
mg_sync_ptr->w1 = 0;
uint all_sum = 0;
for (int i = 0; i < numDevices; ++i) {
result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
if (result != hipSuccess) {
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < i; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorInvalidValue);
}
// calculate the sum of sizes of all grids
const hipLaunchParams& lp = launchParamsList[i];
all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
}
// lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
for (int i = 0; i < numDevices; ++i) {
LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
#endif
}
// launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
// launch the init_gws kernel to initialize the GWS for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorLaunchFailure);
}
}
void* impCoopParams[1];
ulong prev_sum = 0;
// launch the main kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
//initialize and setup the implicit kernel argument for multi-grid sync
mg_info_ptr[i]->mgs = mg_sync_ptr;
mg_info_ptr[i]->grid_id = i;
mg_info_ptr[i]->num_grids = numDevices;
mg_info_ptr[i]->all_sum = all_sum;
mg_info_ptr[i]->prev_sum = prev_sum;
prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
lp.gridDim.x * lp.gridDim.y * lp.gridDim.z;
impCoopParams[0] = &mg_info_ptr[i];
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x * lp.blockDim.x,
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
true);
true, impCoopParams);
if (result != hipSuccess) {
for (int j = 0; j < numDevices; ++j) {
launchParamsList[j].stream->criticalData().unlock();
#if (__hcc_workweek__ >= 19213)
launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
#endif
}
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(hipErrorLaunchFailure);
}
}
// unlock all streams
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
free(gwsKds);
free(kds);
hip_internal::ihipHostFree(tls, mg_sync_ptr);
for (int j = 0; j < numDevices; ++j) {
hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
}
return ihipLogStatus(result);
}
@@ -977,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
if (kd._is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
if (is_code_object_v3) {
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
} else {
r.numRegs = kd._header->workitem_vgpr_count;
}
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
if (is_code_object_v3) {
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
r.numRegs = kd._header->workitem_vgpr_count;
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
@@ -1099,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima
content.data(), content.size(), (*module)->executable,
this_agent());
std::vector<char> blob(content.cbegin(), content.cend());
program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
program_state_impl::read_kernarg_metadata(content, (*module)->kernargs);
// compute the hash of the code object
(*module)->hash = checksum(content.length(), content.data());
@@ -1152,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
{
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
@@ -1174,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size
}
}
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
hipFunction_t f, size_t dynSharedMemPerBlk,
uint32_t blockSizeLimit)
hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize,
hipFunction_t f, size_t dynamicSMemSize,
int blockSizeLimit)
{
using namespace hip_impl;
@@ -1257,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
}
else {
size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor;
size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk);
size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize);
wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG;
}
@@ -1286,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
return hipSuccess;
}
hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
hipFunction_t f, size_t dynSharedMemPerBlk,
uint32_t blockSizeLimit)
hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
const void* f, size_t dynamicSMemSize,
int blockSizeLimit)
{
HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit));
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize)
{
using namespace hip_impl;
@@ -1326,45 +1401,50 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
// Calculate blocks occupancy per CU based on VGPR usage
*numBlocks = vgprs_alu_occupancy / numWavefronts;
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd
size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
*numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
size_t total_used_lds = usedLDS + dynamicSMemSize;
if (total_used_lds != 0) {
// Calculate LDS occupancy per CU: lds_per_cu / (static_lds + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
*numBlocks = std::min(*numBlocks, (int) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynamicSMemSize));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize,
unsigned int flags)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynamicSMemSize));
}
hipError_t hipLaunchKernel(
+137 -55
Просмотреть файл
@@ -89,9 +89,10 @@ struct Symbol {
class Kernel_descriptor {
std::uint64_t kernel_object_{};
amd_kernel_code_t const* kernel_header_{nullptr};
std::string name_{};
amd_kernel_code_t const* header_{};
std::string name_;
std::vector<std::pair<std::size_t, std::size_t>> kernarg_layout_{};
bool is_code_object_v3_{};
public:
Kernel_descriptor() = default;
Kernel_descriptor(
@@ -101,7 +102,8 @@ public:
:
kernel_object_{kernel_object},
name_{name},
kernarg_layout_{std::move(kernarg_layout)}
kernarg_layout_{std::move(kernarg_layout)},
is_code_object_v3_{name.find(".kd") != std::string::npos}
{
bool supported{false};
std::uint16_t min_v{UINT16_MAX};
@@ -123,7 +125,7 @@ public:
r = tbl.hsa_ven_amd_loader_query_host_address(
reinterpret_cast<void*>(kernel_object_),
reinterpret_cast<const void**>(&kernel_header_));
reinterpret_cast<const void**>(&header_));
if (r != HSA_STATUS_SUCCESS) return;
}
@@ -149,7 +151,7 @@ public:
std::string,
std::unordered_map<
hsa_isa_t,
std::vector<std::vector<char>>>>> code_object_blobs;
std::vector<std::string>>>> code_object_blobs;
std::pair<
std::once_flag,
@@ -213,7 +215,7 @@ public:
std::string,
std::unordered_map<
hsa_isa_t,
std::vector<std::vector<char>>>>& get_code_object_blobs() {
std::vector<std::string>>>& get_code_object_blobs() {
std::call_once(code_object_blobs.first, [this]() {
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
@@ -584,6 +586,68 @@ public:
return functions[agent].second;
}
// Collects (size, alignment) pairs for one kernel's arguments from textual
// metadata.
//
// Scans `metadata` starting at offset `f` for successive "Size:" / "Align:"
// keys, parsing the decimal number that follows each, and appends one
// (size, align) pair per argument to `size_align`. `l` is the offset at which
// this kernel's argument list ends (std::string::npos if unbounded).
//
// Returns the offset from which the caller should resume scanning:
//   - `f` when the range is empty (f == l);
//   - `l` when `size_align` was already populated (arguments are parsed only
//     once), or when no further well-formed "Size:"/"Align:" pair exists
//     before `l`;
//   - the offset just past the first "Size:" key found at or beyond `l`.
static
std::size_t parse_args_v2(
    const std::string& metadata,
    std::size_t f,
    std::size_t l,
    std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
    if (f == l) return f;
    if (!size_align.empty()) return l;

    static constexpr std::size_t size_sz{5};   // Length of "Size:".
    static constexpr std::size_t align_sz{6};  // Length of "Align:".

    while (true) {
        const auto size_pos = metadata.find("Size:", f);
        // Without this check npos + size_sz would wrap around to a small
        // offset and the scan would restart from the beginning of the text.
        if (size_pos == std::string::npos) return l;

        f = size_pos + size_sz;
        if (l <= f) return f;

        const auto size = std::strtoul(&metadata[f], nullptr, 10);

        const auto align_pos = metadata.find("Align:", f);
        // An argument whose "Align:" is missing, or lies past the end of this
        // kernel's range, is malformed: stop rather than pair the size with
        // an unrelated value.
        if (align_pos == std::string::npos || l <= align_pos) return l;

        f = align_pos + align_sz;

        char* end{};
        const auto align = std::strtoul(&metadata[f], &end, 10);
        // Step over the parsed number and the character that follows it.
        f += (end - &metadata[f]) + 1;

        size_align.emplace_back(size, align);
    }
}
// Walks textual kernel metadata and records the argument layout of every
// kernel it finds.
//
// Starting at offset `dx` in `kernels_md`, repeatedly locates a "Name:" key,
// extracts the name that follows it (leading spaces and single quotes are
// skipped; the name ends at the next single quote or newline), and then looks
// for that kernel's "Args:" list. If "CodeProps" appears before "Args:", the
// kernel is treated as having no arguments and scanning resumes from
// "CodeProps". Otherwise the arguments are parsed by parse_args_v2 into
// `kernargs[name]` (which creates the entry if it did not exist).
static
void read_kernarg_metadata_v2(
    const std::string& kernels_md,
    std::size_t dx,
    std::unordered_map<
        std::string,
        std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
    static constexpr std::size_t name_sz{5};  // Length of "Name:".
    static constexpr std::size_t args_sz{5};  // Length of "Args:".

    while (true) {
        dx = kernels_md.find("Name:", dx);
        if (dx == std::string::npos) break;

        dx = kernels_md.find_first_not_of(" '", dx + name_sz);
        // A "Name:" key followed only by spaces/quotes up to the end of the
        // text has no value; substr(npos) would throw std::out_of_range.
        if (dx == std::string::npos) break;

        auto fn =
            kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx);
        dx += fn.size();

        const auto dx1 = kernels_md.find("CodeProps", dx);
        dx = kernels_md.find("Args:", dx);

        if (dx1 < dx) {
            // "CodeProps" comes first: no "Args:" list belongs to this kernel.
            dx = dx1;
            continue;
        }
        if (dx == std::string::npos) break;

        dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]);
    }
}
static
std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
std::string str;
@@ -598,9 +662,8 @@ public:
}
static
void parse_args(
void parse_args_v3(
const amd_comgr_metadata_node_t& args_md,
bool is_code_object_v3,
std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
size_t arg_count = 0;
if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
@@ -615,9 +678,7 @@ public:
return;
amd_comgr_metadata_node_t arg_size_md;
if (amd_comgr_metadata_lookup(arg_md,
is_code_object_v3 ? ".size" : "Size",
&arg_size_md)
if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
@@ -629,35 +690,21 @@ public:
size_t arg_align;
if (is_code_object_v3) {
amd_comgr_metadata_node_t arg_offset_md;
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
amd_comgr_metadata_node_t arg_offset_md;
if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
size_t arg_offset
= std::stoul(metadata_to_string(arg_offset_md));
size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md));
if (amd_comgr_destroy_metadata(arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
if (amd_comgr_destroy_metadata(arg_offset_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
arg_align = 1;
while (arg_offset && (arg_offset & 1) == 0) {
arg_offset >>= 1;
arg_align <<= 1;
}
} else {
amd_comgr_metadata_node_t arg_align_md;
if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
arg_align = std::stoul(metadata_to_string(arg_align_md));
if (amd_comgr_destroy_metadata(arg_align_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
arg_align = 1;
while (arg_offset && (arg_offset & 1) == 0) {
arg_offset >>= 1;
arg_align <<= 1;
}
size_align.emplace_back(arg_size, arg_align);
@@ -669,11 +716,11 @@ public:
}
static
void read_kernarg_metadata(
const std::vector<char>& blob,
void read_kernarg_metadata_v3(
const std::string& blob,
std::unordered_map<
std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
amd_comgr_data_t dataIn;
amd_comgr_status_t status;
@@ -690,7 +737,6 @@ public:
!= AMD_COMGR_STATUS_SUCCESS)
return;
bool is_code_object_v3 = false;
amd_comgr_metadata_node_t kernels_md;
if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
!= AMD_COMGR_STATUS_SUCCESS) {
@@ -699,7 +745,6 @@ public:
&kernels_md)
!= AMD_COMGR_STATUS_SUCCESS)
return;
is_code_object_v3 = true;
}
size_t kernel_count = 0;
@@ -715,9 +760,7 @@ public:
continue;
amd_comgr_metadata_node_t name_md;
if (amd_comgr_metadata_lookup(kernel_md,
is_code_object_v3 ? ".name" : "Name",
&name_md)
if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
@@ -727,21 +770,15 @@ public:
!= AMD_COMGR_STATUS_SUCCESS)
continue;
if (is_code_object_v3)
kernel_name_str.append(".kd");
amd_comgr_metadata_node_t args_md;
if (amd_comgr_metadata_lookup(kernel_md,
is_code_object_v3 ? ".args" : "Args",
&args_md)
if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md)
!= AMD_COMGR_STATUS_SUCCESS)
continue;
auto foundKernel = kernargs.find(kernel_name_str);
// parse arguments for a given kernel only once
if (foundKernel == kernargs.end()) {
parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
parse_args_v3(args_md, kernargs[kernel_name_str]);
}
if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
@@ -757,7 +794,52 @@ public:
amd_comgr_release_data(dataIn);
}
const std::unordered_map<std::string,
static
void read_kernarg_metadata(
const std::string& blob,
std::unordered_map<
std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& kernargs)
{
std::istringstream istr{blob};
ELFIO::elfio reader;
if (!reader.load(istr)) return;
// TODO: this is inefficient.
auto it = find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_NOTE;
});
if (!it) return;
const ELFIO::note_section_accessor acc{reader, it};
auto n{acc.get_notes_num()};
while (n--) {
ELFIO::Elf_Word type{};
std::string name{};
void* desc{};
ELFIO::Elf_Word desc_size{};
acc.get_note(n, type, name, desc, desc_size);
if (name == "AMDGPU") {
return read_kernarg_metadata_v3(blob, kernargs);
}
if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
std::string tmp{
static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
auto dx = tmp.find("Kernels:");
if (dx == std::string::npos) continue;
return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:".
}
}
const std::unordered_map<std::string,
std::vector<std::pair<std::size_t, std::size_t>>>& get_kernargs() {
std::call_once(kernargs.first, [this]() {
+57 -12
Просмотреть файл
@@ -12,18 +12,33 @@ import lit.util
site_cfg = lit_config.params.get('site_config', None)
lit_config.load_config(config, site_cfg)
print(str("========================================"))
config.excludes = ['cmdparser.hpp']
config.excludes.append('spatial_batch_norm_op.h')
config.excludes.append('common_cudnn.h')
delimiter = "===============================================================";
print(delimiter)
print("CUDA " + config.cuda_version + " - will be used for testing")
print("LLVM " + config.llvm_version + " - will be used for testing")
print(platform.machine() + " - Platform architecture")
print(platform.system() + " " + platform.release() + " - Platform OS")
print(str(config.pointer_size * 8) + " - hipify-clang binary bitness")
print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness")
print(str("========================================"))
config.excludes = ['cmdparser.hpp']
config.excludes.append('spatial_batch_norm_op.h')
config.excludes.append('common_cudnn.h')
print(delimiter)
warns = None
if not config.cuda_dnn_root:
config.excludes.append('cudnn_convolution_forward.cu')
config.excludes.append('cudnn_softmax.cu')
print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR")
warns = True
if not config.cuda_cub_root:
config.excludes.append('cub_01.cu')
config.excludes.append('cub_02.cu')
config.excludes.append('cub_03.cu')
print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR")
warns = True
if warns:
print(delimiter)
if config.cuda_version_major == 7 and config.cuda_version_minor == 0:
config.excludes.append('headers_test_09.cu')
@@ -90,17 +105,47 @@ else:
run_test_ext = ".sh"
# CUDA SDK ROOT
clang_arguments += " -isystem'%s'/samples/common/inc"
# cuDNN ROOT
clang_arguments += " -I'%s'/include"
if config.pointer_size == 8:
clang_arguments += " -D__LP64__"
# Set max clang's CudaArch for corresponding CUDA version
# to support maximum CUDA features in offline tests
if config.cuda_version_major == 7:
if config.cuda_version_minor == 5:
clang_arguments += " --cuda-gpu-arch=sm_53"
else:
clang_arguments += " --cuda-gpu-arch=sm_52"
elif config.cuda_version_major == 8:
clang_arguments += " --cuda-gpu-arch=sm_62"
elif config.cuda_version_major == 9:
if config.cuda_version_minor == 2:
clang_arguments += " --cuda-gpu-arch=sm_72"
else:
clang_arguments += " --cuda-gpu-arch=sm_70"
elif config.cuda_version_major == 10:
clang_arguments += " --cuda-gpu-arch=sm_75"
# cuDNN ROOT
if config.cuda_dnn_root:
clang_arguments += " -I'%s'/include"
# CUB ROOT
clang_arguments += " -I'%s'"
if config.cuda_cub_root:
clang_arguments += " -I'%s'"
hipify_arguments = "--cuda-path='%s'"
if config.cuda_dnn_root and config.cuda_cub_root:
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
elif config.cuda_dnn_root:
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root)))
elif config.cuda_cub_root:
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root)))
else:
config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root))
config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root)))
if config.llvm_version_major < 4:
hipify_arguments = "-I'%s'/include"
else:
hipify_arguments = "--cuda-path='%s'"
config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root))
config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
+2
Просмотреть файл
@@ -276,7 +276,9 @@ int main(int argc, char** argv) {
runTest<unsigned int>();
runTest<unsigned long long>();
runTest<float>();
#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
runTest<double>();
#endif
// CHECK: hipDeviceReset();
cudaDeviceReset();
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
+69
Просмотреть файл
@@ -0,0 +1,69 @@
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
// CHECK: #include <hip/hip_runtime.h>
#include <iostream>
// CHECK: #include <hiprand.h>
#include <curand.h>
// CHECK: #include <hipcub/hipcub.hpp>
#include <cub/cub.cuh>
#include <iostream>
// Block-level sort kernel used as translation input for this test.
// Each block loads BLOCK_WIDTH * ITEMS_PER_THREAD elements of data_in
// (starting at blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD), sorts them with a
// block radix sort and stores them at the same offset in data_out.
// BLOCK_LOAD_ALGO / BLOCK_STORE_ALGO select the load and store strategies.
template <int BLOCK_WIDTH, int ITEMS_PER_THREAD,
// CHECK: hipcub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
cub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
// CHECK: hipcub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
cub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
typename T>
__global__ void sort(const T* data_in, T* data_out){
// CHECK: typedef hipcub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
typedef cub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
// CHECK: typedef hipcub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
typedef cub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
// CHECK: typedef hipcub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
typedef cub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
// The load, sort and store phases run one after another, so their temporary
// storage shares one shared-memory union; the __syncthreads() calls below
// separate the phases that reuse it.
__shared__ union {
typename BlockLoadT::TempStorage load;
typename BlockRadixSortT::TempStorage sort;
typename BlockStoreT::TempStorage store;
} tmp_storage;
// Per-thread working set.
T items[ITEMS_PER_THREAD];
BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
__syncthreads();
BlockRadixSortT(tmp_storage.sort).Sort(items);
__syncthreads();
BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
}
// Host driver: fills a device buffer of 1000*4096 doubles with normally
// distributed random values, launches the sort kernel in two configurations
// (512 threads x 8 items and 256 threads x 16 items, i.e. 4096 elements per
// block over 1000 blocks), copies the result back and prints the first 4096
// values. Return codes of the runtime calls are not checked, and this function
// does not release d_gpu, result_gpu or data_sorted.
int main() {
double* d_gpu = NULL;
double* result_gpu = NULL;
// Host buffer that receives the kernel output.
double* data_sorted = new double[1000*4096];
// Allocate memory on the GPU
// CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double));
cudaMalloc(&d_gpu, 1000*4096 * sizeof(double));
// CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double));
cudaMalloc(&result_gpu, 1000*4096 * sizeof(double));
// CHECK: hiprandGenerator_t gen;
curandGenerator_t gen;
// Create generator
// CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
// Fill array with random numbers
// CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
// Destroy generator
// CHECK: hiprandDestroyGenerator(gen);
curandDestroyGenerator(gen);
// Sort data
// Both launches read d_gpu and write result_gpu, so the second launch
// overwrites the output of the first.
// CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu);
sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu);
// CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu);
sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu);
// CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost);
cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost);
// Write the sorted data to standard out
// Only indices 0..4095 (the first block's worth of elements) are printed.
for (int i = 0; i < 4095; ++i) {
std::cout << data_sorted[i] << ", ";
}
std::cout << data_sorted[4095] << std::endl;
}
+33
Просмотреть файл
@@ -0,0 +1,33 @@
// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
// CHECK: #include <hip/hip_runtime.h>
#include <iostream>
// CHECK: #include <hipcub/hipcub.hpp>
#include <cub/cub.cuh>
// using namespace hipcub;
using namespace cub;
// Simple CUDA kernel for computing tiled partial sums
// Translation input for this test: a kernel that declares block-wide load and
// scan collectives over BLOCK_THREADS * ITEMS_PER_THREAD ints per block.
// LOAD_ALGO / SCAN_ALGO select the load and scan strategies.
// Simple CUDA kernel for computing tiled partial sums
template <int BLOCK_THREADS, int ITEMS_PER_THREAD,
// CHECK: hipcub::BlockLoadAlgorithm LOAD_ALGO,
cub::BlockLoadAlgorithm LOAD_ALGO,
// CHECK: hipcub::BlockScanAlgorithm SCAN_ALGO>
cub::BlockScanAlgorithm SCAN_ALGO>
__global__ void ScanTilesKernel(int *d_in, int *d_out) {
// Specialize collective types for problem context
// CHECK: typedef ::hipcub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
typedef ::cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
// Unqualified name: resolved through the using-directive earlier in the file.
typedef BlockScan<int, BLOCK_THREADS, SCAN_ALGO> BlockScanT;
// Allocate on-chip temporary storage
__shared__ union {
typename BlockLoadT::TempStorage load;
typename BlockScanT::TempStorage reduce;
} temp_storage;
// Load data per thread
int thread_data[ITEMS_PER_THREAD];
int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
// NOTE(review): the second argument is `offset`, not `thread_data`, and d_out
// is never written - presumably this file only needs to be parsed and
// translated, not executed; confirm before reusing as a functional example.
BlockLoadT(temp_storage.load).Load(d_in + offset, offset);
__syncthreads();
// Compute the block-wide prefix sum
// NOTE(review): passes the whole union rather than temp_storage.reduce - confirm intent.
BlockScanT(temp_storage).Sum(thread_data);
}
-1
Просмотреть файл
@@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank)
cudaError_t cudareturn;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpudevice);
// CHECK: if (deviceProp.hipWarpSize <= 1)
if (deviceProp.warpSize <= 1)
{
printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
+6 -1
Просмотреть файл
@@ -29,7 +29,12 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
#if __HIP_ARCH_GFX803__ || \
__HIP_ARCH_GFX900__ || \
__HIP_ARCH_GFX906__ || \
__HIP_ARCH_GFX908__ || \
__HIP_ARCH_GFX1010__ || \
__HIP_ARCH_GFX1012__
__global__ void kernel_abs_int64(long long *input, long long *output) {
int tx = threadIdx.x;
+6 -1
Просмотреть файл
@@ -28,7 +28,12 @@ THE SOFTWARE.
#include "test_common.h"
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
#if __HIP_ARCH_GFX803__ || \
__HIP_ARCH_GFX900__ || \
__HIP_ARCH_GFX906__ || \
__HIP_ARCH_GFX908__ || \
__HIP_ARCH_GFX1010__ || \
__HIP_ARCH_GFX1012__
__device__ void test_convert() {
__half x;
+6 -1
Просмотреть файл
@@ -32,7 +32,12 @@ THE SOFTWARE.
using namespace std;
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
#if __HIP_ARCH_GFX803__ || \
__HIP_ARCH_GFX900__ || \
__HIP_ARCH_GFX906__ || \
__HIP_ARCH_GFX908__ || \
__HIP_ARCH_GFX1010__ || \
__HIP_ARCH_GFX1012__
__global__
void __halfTest(bool* result, __half a) {
+2 -2
Просмотреть файл
@@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) {
HIPCHECK(hipHostFree(A_h1));
HIPCHECK(hipHostFree(A_h2));
} else {
free(A_h1);
free(A_h2);
aligned_free(A_h1);
aligned_free(A_h2);
}
}
+7 -6
Просмотреть файл
@@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice)
char *A_d;
char *A_h;
bool testResult = true;
HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) );
HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16));
A_h = (char*)malloc(sizeElements);
HIPASSERT(A_h != NULL);
for (size_t i=0; i<elements; i++) {
@@ -89,8 +89,9 @@ bool testhipMemset2DAsync(int memsetval,int p_gpuDevice)
}
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
HIPCHECK ( hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
HIPCHECK(hipStreamSynchronize(stream));
HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
for (int i=0; i<elements; i++) {
if (A_h[i] != memsetval) {
@@ -112,9 +113,9 @@ int main(int argc, char *argv[])
hipCtx_t context;
hipCtxCreate(&context, 0, p_gpuDevice);
bool testResult = false;
testResult = testhipMemset2D(memsetval, p_gpuDevice);
testResult = testhipMemset2DAsync(memsetval, p_gpuDevice);
bool testResult = true;
testResult &= testhipMemset2D(memsetval, p_gpuDevice);
testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice);
hipCtxDestroy(context);
if(testResult){
passed();
+67 -15
Просмотреть файл
@@ -41,27 +41,27 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
size_t elements = numW* numH* depth;
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
char *A_h;
bool testResult = true;
hipExtent extent = make_hipExtent(width, numH, depth);
hipPitchedPtr devPitchedPtr;
HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
A_h = (char*)malloc(sizeElements);
HIPASSERT(A_h != NULL);
for (size_t i=0; i<elements; i++) {
A_h = (char*)malloc(sizeElements);
HIPASSERT(A_h != NULL);
for (size_t i=0; i<elements; i++) {
A_h[i] = 1;
}
HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) );
hipMemcpy3DParms myparms = {0};
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
myparms.srcPtr = devPitchedPtr;
myparms.extent = extent;
HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
hipMemcpy3DParms myparms = {0};
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
myparms.srcPtr = devPitchedPtr;
myparms.extent = extent;
#ifdef __HIP_PLATFORM_NVCC__
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
#else
myparms.kind = hipMemcpyDeviceToHost;
#endif
@@ -69,7 +69,58 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
for (int i=0; i<elements; i++) {
if (A_h[i] != memsetval) {
testResult = false;
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
break;
}
}
HIPCHECK(hipFree(devPitchedPtr.ptr));
free(A_h);
return testResult;
}
bool testhipMemset3DAsync(int memsetval,int p_gpuDevice)
{
size_t numH = 256;
size_t numW = 256;
size_t depth = 10;
size_t width = numW * sizeof(char);
size_t sizeElements = width * numH * depth;
size_t elements = numW* numH* depth;
printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
char *A_h;
bool testResult = true;
hipExtent extent = make_hipExtent(width, numH, depth);
hipPitchedPtr devPitchedPtr;
HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
A_h = (char*)malloc(sizeElements);
HIPASSERT(A_h != NULL);
for (size_t i=0; i<elements; i++) {
A_h[i] = 1;
}
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
HIPCHECK(hipStreamSynchronize(stream));
hipMemcpy3DParms myparms = {0};
myparms.srcPos = make_hipPos(0,0,0);
myparms.dstPos = make_hipPos(0,0,0);
myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
myparms.srcPtr = devPitchedPtr;
myparms.extent = extent;
#ifdef __HIP_PLATFORM_NVCC__
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
#else
myparms.kind = hipMemcpyDeviceToHost;
#endif
HIPCHECK(hipMemcpy3D(&myparms));
for (int i=0; i<elements; i++) {
if (A_h[i] != memsetval) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
break;
}
@@ -82,9 +133,10 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
int main(int argc, char *argv[])
{
HipTest::parseStandardArguments(argc, argv, true);
bool testResult = false;
HIPCHECK(hipSetDevice(p_gpuDevice));
testResult = testhipMemset3D(memsetval, p_gpuDevice);
bool testResult = true;
testResult &= testhipMemset3D(memsetval, p_gpuDevice);
testResult &= testhipMemset3DAsync(memsetval, p_gpuDevice);
if (testResult) {
passed();
} else {
+1 -1
Просмотреть файл
@@ -116,7 +116,7 @@ int main() {
dimBlock.x = workgroups[i];
// Calculate the device occupancy to know how many blocks can be run concurrently
hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
+3 -18
Просмотреть файл
@@ -30,10 +30,6 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>
@@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; }
int main(int argc, char* argv[]) {
// test case for using kernel function pointer
uint32_t gridSize = 0;
uint32_t blockSize = 0;
int gridSize = 0;
int blockSize = 0;
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
uint32_t numBlock = 0;
int numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
assert(numBlock != 0);
// test case for using kernel function pointer with template
gridSize = 0;
blockSize = 0;
@@ -64,15 +59,5 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
assert(numBlock != 0);
// test case for using kernel with hipFunction_t type
numBlock = 0;
hipModule_t Module;
hipFunction_t Function;
HIPCHECK(hipModuleLoad(&Module, fileName));
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
assert(numBlock != 0);
passed();
}
+2 -18
Просмотреть файл
@@ -30,22 +30,16 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>
__global__ void f2(T *a) { *a = 1; }
int main(int argc, char* argv[]) {
// test case for using kernel function pointer
uint32_t gridSize = 0;
uint32_t blockSize = 0;
int gridSize = 0;
int blockSize = 0;
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
@@ -55,15 +49,5 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxPotentialBlockSize<void(*)(int *)>(&gridSize, &blockSize, f2, 0, 0);
assert(gridSize != 0 && blockSize != 0);
// test case for using kernel with hipFunction_t type
gridSize = 0;
blockSize = 0;
hipModule_t Module;
hipFunction_t Function;
HIPCHECK(hipModuleLoad(&Module, fileName));
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0));
assert(gridSize != 0 && blockSize != 0);
passed();
}
+4 -1
Просмотреть файл
@@ -99,10 +99,13 @@ THE SOFTWARE.
#ifdef _WIN64
#include <tchar.h>
#define aligned_alloc _aligned_malloc
#define aligned_alloc(x,y) _aligned_malloc(y,x)
#define aligned_free(x) _aligned_free(x)
#define popen(x,y) _popen(x,y)
#define pclose(x) _pclose(x)
#define setenv(x,y,z) _putenv_s(x,y)
#else
#define aligned_free(x) free(x)
#endif
// standard command-line variables: