diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ad40a10ea..57369a9039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hip_hcc PRIVATE hc_am) target_link_libraries(hip_hcc_static PRIVATE hc_am) - add_library(hiprtc SHARED src/hiprtc.cpp) + add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp) target_include_directories( hiprtc SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include) @@ -504,7 +504,7 @@ endif() find_program(CPPCHECK_EXE cppcheck) if(CPPCHECK_EXE) add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() ############################# @@ -545,4 +545,17 @@ else() message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location") endif() +############################# +# Code analysis +############################# +# Target: clang +if(HIP_HIPCC_EXECUTABLE) + add_custom_target(analyze + COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./ + WORKING_DIRECTORY ${HIP_SRC_PATH}) + if(CPPCHECK_EXE) + add_dependencies(analyze cppcheck) + endif() +endif() + # vim: ts=4:sw=4:expandtab:smartindent diff --git a/bin/hipcc b/bin/hipcc index 35fbb54397..9d325c2bf9 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") { if ($HIP_RUNTIME eq "HCC" ) { $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIPCXXFLAGS .= " -isystem $HSA_PATH/include"; + } else { + $HIPCXXFLAGS .= " -fhip-new-launch-api"; } } elsif ($HIP_PLATFORM eq "hcc") { @@ -352,7 +354,6 @@ my $runCmd = 1; my $buildDeps = 0; my $linkType = 1; my $setLinkType = 0; -my $coFormatv3 = 1; my @options = (); my @inputs = (); @@ -472,22 +473,6 @@ foreach $arg (@ARGV) $swallowArg = 1; } - # code object format parsing - if ($trimarg eq '-mcode-object-v3') { - $coFormatv3 = 1; - # hip-clang already recognizes -mcode-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if ($trimarg eq '-mno-code-object-v3') { - $coFormatv3 = 0; - # hip-clang already recognizes -mno-code-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) { $arg = "--cuda-device-only"; } @@ -526,6 +511,13 @@ foreach $arg (@ARGV) $optArg = $arg; } + ## This is a temporary workaround for CMake detection of OpenMP support. + ## It should be removed when the OpenMP detection c++ test in CMake is updated + ## and corrected CMake version is available. + if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) { + $HIPCXXFLAGS .= " -D_OPENMP " + } + ## process linker response file for hip-clang ## extract object files from static library and pass them directly to ## hip-clang in command line. @@ -863,12 +855,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){ } } -# hcc defaults to v2, so we need to convert to the appropriate flag -# hip-clang defaults to v3, so we don't need to do anything -if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') { - $HIPLDFLAGS .= " -Wl,-hcc-cov3 "; -} - if ($hasC and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -x cu"; } diff --git a/bin/hipify-perl b/bin/hipify-perl index 2e391ab8d2..80f721c58e 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -983,7 +983,6 @@ sub simpleSubstitutions { $ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g; $ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g; $ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g; - $ft{'type'} += s/\bwarpSize\b/hipWarpSize/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g; @@ -1676,6 +1675,12 @@ sub transformKernelLaunch { } } +sub transformCubNamespace { + my $k = 0; + $k += s/using\s*namespace\s*cub/using namespace hipcub/g; + return $k; +} + sub transformHostFunctions { my $k = 0; foreach $func ( @@ -2368,6 +2373,7 @@ while (@ARGV) { simpleSubstitutions(); transformExternShared(); transformKernelLaunch(); + transformCubNamespace(); if ($print_stats) { while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) { $convertedTags{$1}++; diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index 8a44368680..28ed37e321 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording #### How to enable profiling at HIP build time -Recent pre-built packages of HIP are always built with profiling support enabled. -For developer builds, you must enable marker support manually when compiling HIP. +Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP. 1. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default. diff --git a/hip_prof_gen.py b/hip_prof_gen.py index d1203a64d6..9e90c1558c 100755 --- a/hip_prof_gen.py +++ b/hip_prof_gen.py @@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map): # Generating the callbacks data structure f.write('\n// HIP API callbacks data structure\n') f.write( - 'struct hip_api_data_t {\n' + + 'typedef struct hip_api_data_t {\n' + ' uint64_t correlation_id;\n' + ' uint32_t phase;\n' + ' union {\n' @@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map): f.write(' } ' + name + ';\n') f.write( ' } args;\n' + - '};\n' + '} hip_api_data_t;\n' ) # Generating the callbacks args data filling macros diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index 5eb71b3a3e..2f24c6c6fe 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CUDA_VERSION VERSION_GREATER "9.2") - cmake_minimum_required(VERSION 3.12.3) -else() - cmake_minimum_required(VERSION 3.7.2) -endif() +cmake_minimum_required(VERSION 3.5.1) project(hipify-clang) @@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE LLVMOption LLVMCore) -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1") target_link_libraries(hipify-clang PRIVATE clangToolingInclusions) endif() @@ -72,7 +68,7 @@ else() set(StdCpp "-std=c++") endif() -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0") string(APPEND StdCpp "14") # MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest elseif (MSVC) @@ -84,7 +80,38 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"") -install(TARGETS hipify-clang DESTINATION bin) +set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang") +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + elseif(CMAKE_BUILD_TYPE MATCHES Release) + if (BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + else() + set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + endif() + else() + message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release") + endif() +elseif(BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) +endif() + +install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX}) + +install( + DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX} + COMPONENT clang-resource-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.modulemap" + PATTERN "algorithm" + PATTERN "complex" + PATTERN "new" + PATTERN "ppc_wrappers" EXCLUDE + PATTERN "openmp_wrappers" EXCLUDE + ) if (HIPIFY_CLANG_TESTS) find_package(PythonInterp 2.7 REQUIRED) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index 07466dbe62..da5abc19da 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -63,7 +63,7 @@ In most cases, you can get a suitable version of LLVM+CLANG with your package manager. Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set -[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` +[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` ## Build and install @@ -154,7 +154,7 @@ To run it: * Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option: - - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB` + - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB` - Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub` @@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38 Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 -Build system for the above configurations: +Minimum build system requirements for the above configurations: -Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min). +Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -208,7 +208,7 @@ cmake -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \ -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \ - -DCUDA_CUB_ROOT_DIR=/srv/CUB \ + -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \ -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \ .. ``` @@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS 64 - hipify-clang binary bitness 64 - python 2.7.12 binary bitness ======================================== --- Testing: 64 tests, 12 threads -- -PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64) -PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64) -PASS: hipify :: unit_tests/device/atomics.cu (3 of 64) -PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64) -PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64) -PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64) -PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64) -PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64) -PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64) -PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64) -PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64) -PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64) -PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64) -PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64) -PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64) -PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64) -PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64) -PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64) -PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64) -PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64) -PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64) -PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64) -PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64) -PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64) -PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64) -PASS: hipify :: unit_tests/samples/intro.cu (61 of 64) -PASS: hipify :: unit_tests/samples/square.cu (62 of 64) -PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64) -PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64) -Testing Time: 2.98s - Expected Passes : 64 +-- Testing: 65 tests, 12 threads -- +PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65) +PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65) +PASS: hipify :: unit_tests/device/atomics.cu (3 of 65) +PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65) +PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65) +PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65) +PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65) +PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65) +PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65) +PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65) +PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65) +PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65) +PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65) +PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65) +PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65) +PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65) +PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65) +PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65) +PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65) +PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65) +PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65) +PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65) +PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65) +PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65) +PASS: hipify :: unit_tests/samples/intro.cu (61 of 65) +PASS: hipify :: unit_tests/samples/square.cu (62 of 65) +PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65) +PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65) +Testing Time: 3.01s + Expected Passes : 65 [100%] Built target test-hipify ``` ### Windows On Windows 10 the following configurations are tested: -LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18 +LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18 -LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38 -LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38 -Build system for the above configurations: +Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2: -Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). +Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: @@ -371,7 +372,7 @@ cmake -- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm -- - Include path : F:/LLVM/9.0.0/dist/include -- - Binary path : F:/LLVM/9.0.0/dist/bin --- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6") +-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6") -- Found lit: C:/Program Files/Python36/Scripts/lit.exe -- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1") diff --git a/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp new file mode 100644 index 0000000000..0ef1912b54 --- /dev/null +++ b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp @@ -0,0 +1,28 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "CUDA2HIP.h" + +// Maps the names of CUDA CUB API types to the corresponding HIP types +const std::map CUDA_CUB_TYPE_NAME_MAP{ + {"cub", {"hipcub", "", CONV_TYPE, API_CUB}}, +}; diff --git a/hipify-clang/src/CUDA2HIP_Perl.cpp b/hipify-clang/src/CUDA2HIP_Perl.cpp index df83bf8d57..8d59089d4f 100644 --- a/hipify-clang/src/CUDA2HIP_Perl.cpp +++ b/hipify-clang/src/CUDA2HIP_Perl.cpp @@ -251,6 +251,11 @@ namespace perl { *streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl; } + void generateCubNamespace(unique_ptr& streamPtr) { + *streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl; + *streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl; + } + void generateHostFunctions(unique_ptr& streamPtr) { *streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl; set &funcSet = DeviceSymbolFunctions0; @@ -358,6 +363,7 @@ namespace perl { generateSimpleSubstitutions(streamPtr); generateExternShared(streamPtr); generateKernelLaunch(streamPtr); + generateCubNamespace(streamPtr); generateHostFunctions(streamPtr); generateDeviceFunctions(streamPtr); *streamPtr.get() << endl << "# Count of transforms in all files" << endl; @@ -401,6 +407,7 @@ namespace perl { *streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl; *streamPtr.get() << tab_2 << "transformExternShared();" << endl; *streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl; + *streamPtr.get() << tab_2 << "transformCubNamespace();" << endl; *streamPtr.get() << tab_2 << "if ($print_stats) {" << endl; *streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl; *streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl; diff --git a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp index fda9b34b2c..7b34d97ab4 100644 --- a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp +++ b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp @@ -31,8 +31,6 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { {"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}}, - // NOTE: int warpSize is a field of cudaDeviceProp - {"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index c6d10f0cdc..510d91978a 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -33,16 +33,21 @@ THE SOFTWARE. #include "StringUtils.h" #include "ArgParse.h" -namespace ct = clang::tooling; -namespace mat = clang::ast_matchers; - +const std::string sHIP = "HIP"; +const std::string sROC = "ROC"; +const std::string sCub = "cub"; +const std::string sHipcub = "hipcub"; const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED"; +const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string sHIP_SYMBOL = "HIP_SYMBOL"; -std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string s_reinterpret_cast = "reinterpret_cast"; -const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL("; +const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL"; const std::string sDim3 = "dim3("; - +const std::string s_hiprand_kernel_h = "hiprand_kernel.h"; +const std::string s_hiprand_h = "hiprand.h"; +const std::string sOnce = "once"; +const std::string s_string_literal = "[string literal]"; +// CUDA identifiers, used in matchers const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol"; const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync"; const std::string sCudaGetSymbolSize = "cudaGetSymbolSize"; @@ -51,6 +56,14 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol"; const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync"; const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig"; const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes"; +// Matchers' names +const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar"; +const StringRef sCudaLaunchKernel = "cudaLaunchKernel"; +const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; +const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; +const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; +const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl"; +const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -78,7 +91,7 @@ std::set ReinterpretFunctions1{ }; void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); size_t begin = 0; while ((begin = s.find("cu", begin)) != StringRef::npos) { const size_t end = s.find_first_of(" ", begin + 4); @@ -86,7 +99,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { const auto found = CUDA_RENAMES_MAP().find(name); if (found != CUDA_RENAMES_MAP().end()) { StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; + hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; Statistics::current().incrementCounter(counter, name.str()); if (!Statistics::isUnsupported(counter)) { clang::SourceLocation sl = start.getLocWithOffset(begin + 1); @@ -95,13 +108,24 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { insertReplacement(Rep, fullSL); } } - if (end == StringRef::npos) { - break; - } + if (end == StringRef::npos) break; begin = end + 1; } } +clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) { + clang::SourceLocation sl(sr.getBegin()); + clang::SourceLocation end(sr.getEnd()); + auto &SM = getCompilerInstance().getSourceManager(); + size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); + StringRef sfull = StringRef(SM.getCharacterData(sl), length); + size_t offset = sfull.find(str); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + return sl; +} + /** * Look at, and consider altering, a given token. * @@ -109,7 +133,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { * If it's an unsupported CUDA identifier, a warning is emitted. * Otherwise, the source file is updated with the corresponding hipification. */ -void HipifyAction::RewriteToken(const clang::Token& t) { +void HipifyAction::RewriteToken(const clang::Token &t) { // String literals containing CUDA references need fixing. if (t.is(clang::tok::string_literal)) { StringRef s(t.getLiteralData(), t.getLength()); @@ -124,20 +148,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) { FindAndReplace(name, sl, CUDA_RENAMES_MAP()); } -void HipifyAction::FindAndReplace(llvm::StringRef name, +void HipifyAction::FindAndReplace(StringRef name, clang::SourceLocation sl, - const std::map& repMap, bool bReplace) { + const std::map &repMap, + bool bReplace) { const auto found = repMap.find(name); if (found == repMap.end()) { // So it's an identifier, but not CUDA? Boring. return; } Statistics::current().incrementCounter(found->second, name.str()); - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); // Warn the user about unsupported identifier. if (Statistics::isUnsupported(found->second)) { std::string sWarn; - Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP"; + Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP; sWarn = "" + sWarn; const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0."); DE.Report(sl, ID) << sWarn; @@ -147,7 +172,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, return; } StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); ct::Replacement Rep(SM, sl, name.size(), repName.str()); clang::FullSourceLoc fullSL(sl, SM); insertReplacement(Rep, fullSL); @@ -155,7 +180,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, namespace { -clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{}); @@ -167,7 +192,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan } } -clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); // If the range is contained within a macro, update the macro definition. @@ -178,7 +203,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; } -StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) { return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr); } @@ -186,53 +211,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp * Get a string representation of the expression `arg`, unless it's a defaulting function * call argument, in which case get a 0. Used for building argument lists to kernel calls. */ -std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) { - if (clang::isa(arg)) { - return "0"; - } else { - return readSourceText(SM, arg->getSourceRange()); - } +std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) { + if (clang::isa(arg)) return "0"; + else return readSourceText(SM, arg->getSourceRange()); } } // anonymous namespace -bool HipifyAction::Exclude(const hipCounter & hipToken) { +bool HipifyAction::Exclude(const hipCounter &hipToken) { switch (hipToken.type) { case CONV_INCLUDE_CUDA_MAIN_H: switch (hipToken.apiType) { case API_DRIVER: case API_RUNTIME: - if (insertedRuntimeHeader) { return true; } + if (insertedRuntimeHeader) return true; insertedRuntimeHeader = true; return false; case API_BLAS: - if (insertedBLASHeader) { return true; } + if (insertedBLASHeader) return true; insertedBLASHeader = true; return false; case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; return false; - } else if (hipToken.hipName == "hiprand.h") { - if (insertedRANDHeader) { return true; } + } else if (hipToken.hipName == s_hiprand_h) { + if (insertedRANDHeader) return true; insertedRANDHeader = true; return false; } case API_DNN: - if (insertedDNNHeader) { return true; } + if (insertedDNNHeader) return true; insertedDNNHeader = true; return false; case API_FFT: - if (insertedFFTHeader) { return true; } + if (insertedFFTHeader) return true; insertedFFTHeader = true; return false; case API_COMPLEX: - if (insertedComplexHeader) { return true; } + if (insertedComplexHeader) return true; insertedComplexHeader = true; return false; case API_SPARSE: - if (insertedSPARSEHeader) { return true; } + if (insertedSPARSEHeader) return true; insertedSPARSEHeader = true; return false; default: @@ -240,13 +262,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) { } return false; case CONV_INCLUDE: - if (hipToken.hipName.empty()) { - return true; - } + if (hipToken.hipName.empty()) return true; switch (hipToken.apiType) { case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; } return false; @@ -267,24 +287,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, clang::CharSourceRange filename_range, const clang::FileEntry*, StringRef, StringRef, const clang::Module*) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(hash_loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(hash_loc)) return; if (!firstHeader) { firstHeader = true; firstHeaderLoc = hash_loc; } const auto found = CUDA_INCLUDE_MAP.find(file_name); - if (found == CUDA_INCLUDE_MAP.end()) { - return; - } + if (found == CUDA_INCLUDE_MAP.end()) return; bool exclude = Exclude(found->second); Statistics::current().incrementCounter(found->second, file_name.str()); - clang::SourceLocation sl = filename_range.getBegin(); if (Statistics::isUnsupported(found->second)) { - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header.")); return; } @@ -293,11 +308,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, if (!exclude) { clang::SmallString<128> includeBuffer; llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - if (is_angled) { - newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); - } else { - newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); - } + if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); + else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); } else { // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'. sl = hash_loc; @@ -309,48 +321,52 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, } void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) { - if (pragmaOnce) { - return; - } - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); + if (pragmaOnce) return; + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); clang::Token tok; PP.Lex(tok); StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength()); - if (Text == "once") { + if (Text == sOnce) { pragmaOnce = true; pragmaOnceLoc = tok.getEndLoc(); } } -bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaLaunchKernel"; - const auto* launchKernel = Result.Nodes.getNodeAs(refName); - if (!launchKernel) { - return false; - } - const clang::Expr* calleeExpr = launchKernel->getCallee(); - if (!calleeExpr) { - return false; - } - const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee(); - if (!caleeDecl) { - return false; - } - const clang::CallExpr* config = launchKernel->getConfig(); - if (!config) { - return false; - } +bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) { + auto *launchKernel = Result.Nodes.getNodeAs(sCudaLaunchKernel); + if (!launchKernel) return false; + auto *calleeExpr = launchKernel->getCallee(); + if (!calleeExpr) return false; + auto *caleeDecl = launchKernel->getDirectCallee(); + if (!caleeDecl) return false; + auto *config = launchKernel->getConfig(); + if (!config) return false; clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::LangOptions DefaultLangOptions; - clang::SourceManager* SM = Result.SourceManager; - OS << sHipLaunchKernelGGL; - if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "("; - OS << readSourceText(*SM, calleeExpr->getSourceRange()); + auto *SM = Result.SourceManager; + clang::SourceRange sr = calleeExpr->getSourceRange(); + std::string kern = readSourceText(*SM, sr).str(); + OS << sHipLaunchKernelGGL << "("; + if (caleeDecl->isTemplateInstantiation()) { + OS << sHIP_KERNEL_NAME << "("; + std::string cub = sCub + "::"; + std::string hipcub; + const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub); + if (found != CUDA_CUB_TYPE_NAME_MAP.end()) { + hipcub = found->second.hipName.str() + "::"; + } else { + hipcub = sHipcub + "::"; + } + size_t pos = kern.find(cub); + while (pos != std::string::npos) { + kern.replace(pos, cub.size(), hipcub); + pos = kern.find(cub, pos + hipcub.size()); + } + } + OS << kern; if (caleeDecl->isTemplateInstantiation()) OS << ")"; OS << ", "; // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero. @@ -381,29 +397,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc ct::Replacement Rep(*SM, launchStart, length, OS.str()); clang::FullSourceLoc fullSL(launchStart, *SM); insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; + Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str()); return true; } -bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaSharedIncompleteArrayVar"; - auto* sharedVar = Result.Nodes.getNodeAs(refName); - if (!sharedVar) { - return false; - } +bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) { + auto *sharedVar = Result.Nodes.getNodeAs(sCudaSharedIncompleteArrayVar); + if (!sharedVar) return false; // Example: extern __shared__ uint sRadix1[]; - if (!sharedVar->hasExternalFormalLinkage()) { - return false; - } + if (!sharedVar->hasExternalFormalLinkage()) return false; clang::QualType QT = sharedVar->getType(); std::string typeName; if (QT->isIncompleteArrayType()) { - const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); + const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); if (QT.getTypePtr()->isBuiltinType()) { QT = QT.getCanonicalType(); - const auto* BT = clang::dyn_cast(QT); + auto *BT = clang::dyn_cast(QT); if (BT) { clang::LangOptions LO; LO.CUDA = true; @@ -417,7 +428,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match if (!typeName.empty()) { clang::SourceLocation slStart = sharedVar->getOuterLocStart(); clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc()); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; std::string varName = sharedVar->getNameAsString(); std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")"; @@ -425,74 +436,84 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match clang::FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str()); return true; } return false; } -bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr *call = Result.Nodes.getNodeAs("cudaDeviceFuncCall")) { - const clang::FunctionDecl *funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; - } +bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (const clang::CallExpr *call = Result.Nodes.getNodeAs(sCudaDeviceFuncCall)) { + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false); return true; } return false; } -bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs("cubNamespacePrefix")) { - if (!decl) { - return false; - } +bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubNamespacePrefix)) { clang::QualType QT = decl->getUnderlyingType(); - const clang::Type* t = QT.getTypePtr(); - if (!t) { - return false; - } - const clang::ElaboratedType* et = t->getAs(); - if (!et) { - return false; - } + auto *t = QT.getTypePtr(); + if (!t) return false; + const clang::ElaboratedType *et = t->getAs(); + if (!et) return false; const clang::NestedNameSpecifier *nns = et->getQualifier(); - if (!nns) { - return false; - } + if (!nns) return false; const clang::NamespaceDecl *nsd = nns->getAsNamespace(); - if (!nsd) { - return false; - } + if (!nsd) return false; const clang::TypeSourceInfo *si = decl->getTypeSourceInfo(); const clang::TypeLoc tloc = si->getTypeLoc(); const clang::SourceRange sr = tloc.getSourceRange(); - clang::SourceLocation sl(sr.getBegin()); - clang::SourceLocation end(sr.getEnd()); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); - StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); - size_t offset = sfull.find(name); - if (offset > 0) { - sl = sl.getLocWithOffset(offset); - } - FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); return true; } return false; } -bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr * call = Result.Nodes.getNodeAs("cudaHostFuncCall")) { - if (!call->getNumArgs()) { - return false; +bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubUsingNamespaceDecl)) { + if (auto nsd = decl->getNominatedNamespace()) { + FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP); + return true; } - const clang::FunctionDecl* funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; + } + return false; +} + +bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) { + auto *Tparams = decl->getTemplateParameters(); + bool ret = false; + for (size_t I = 0; I < Tparams->size(); ++I) { + const clang::ValueDecl *valueDecl = dyn_cast(Tparams->getParam(I)); + if (!valueDecl) continue; + clang::QualType QT = valueDecl->getType(); + auto *t = QT.getTypePtr(); + if (!t) continue; + const clang::ElaboratedType *et = t->getAs(); + if (!et) continue; + const clang::NestedNameSpecifier *nns = et->getQualifier(); + if (!nns) continue; + const clang::NamespaceDecl *nsd = nns->getAsNamespace(); + if (!nsd) continue; + const clang::SourceRange sr = valueDecl->getSourceRange(); + std::string name = nsd->getDeclName().getAsString(); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); + ret = true; } + return ret; + } + return false; +} + +bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) { + if (!call->getNumArgs()) return false; + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; std::string sName = funcDcl->getDeclName().getAsString(); unsigned int argNum = 0; bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false; @@ -506,7 +527,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::SourceRange sr = call->getArg(argNum)->getSourceRange(); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")"; clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() }); clang::SourceLocation s = replacementRange.getBegin(); @@ -521,7 +542,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc return false; } -void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) { +void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) { llcompat::insertReplacement(*replacements, rep); if (PrintStats) { rep.getLength(); @@ -530,10 +551,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu } } -std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) { - Finder.reset(new clang::ast_matchers::MatchFinder); +std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) { + Finder.reset(new mat::MatchFinder); // Replace the <<<...>>> language extension with a hip kernel launch - Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this); + Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this); Finder->addMatcher( mat::varDecl( mat::isExpansionInMainFile(), @@ -541,7 +562,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::hasAttr(clang::attr::CUDAShared), mat::hasType(mat::incompleteArrayType()) ) - ).bind("cudaSharedIncompleteArrayVar"), + ).bind(sCudaSharedIncompleteArrayVar), this ); Finder->addMatcher( @@ -561,7 +582,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ) ) ) - ).bind("cudaHostFuncCall"), + ).bind(sCudaHostFuncCall), this ); Finder->addMatcher( @@ -576,7 +597,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::unless(mat::hasAttr(clang::attr::CUDAHost)) ) ) - ).bind("cudaDeviceFuncCall"), + ).bind(sCudaDeviceFuncCall), this ); Finder->addMatcher( @@ -586,12 +607,26 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::elaboratedType( mat::hasQualifier( mat::specifiesNamespace( - mat::hasName("cub") + mat::hasName(sCub) ) ) ) ) - ).bind("cubNamespacePrefix"), + ).bind(sCubNamespacePrefix), + this + ); + // TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl + Finder->addMatcher( + mat::functionTemplateDecl( + mat::isExpansionInMainFile() + ).bind(sCubFunctionTemplateDecl), + this + ); + // TODO: Maybe worth to make it more concrete + Finder->addMatcher( + mat::usingDirectiveDecl( + mat::isExpansionInMainFile() + ).bind(sCubUsingNamespaceDecl), this ); // Ownership is transferred to the caller. @@ -599,10 +634,8 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi } void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength()); Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc())); } @@ -615,12 +648,12 @@ void HipifyAction::EndSourceFileAction() { // one copy of the hip include into every file. bool placeForIncludeCalculated = false; clang::SourceLocation sl, controllingMacroLoc; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::HeaderSearch& HS = PP.getHeaderSearchInfo(); - clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup(); - const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID()); - const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); + auto &SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + clang::HeaderSearch &HS = PP.getHeaderSearchInfo(); + clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup(); + const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID()); + const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); if (controllingMacro) { auto found = Ifndefs.find(controllingMacro->getName().str()); if (found != Ifndefs.end()) { @@ -629,19 +662,13 @@ void HipifyAction::EndSourceFileAction() { } } if (pragmaOnce) { - if (placeForIncludeCalculated) { - sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; - } else { - sl = pragmaOnceLoc; - } + if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; + else sl = pragmaOnceLoc; placeForIncludeCalculated = true; } if (!placeForIncludeCalculated) { - if (firstHeader) { - sl = firstHeaderLoc; - } else { - sl = SM.getLocForStartOfFile(SM.getMainFileID()); - } + if (firstHeader) sl = firstHeaderLoc; + else sl = SM.getLocForStartOfFile(SM.getMainFileID()); } clang::FullSourceLoc fullSL(sl, SM); ct::Replacement Rep(SM, sl, 0, "\n#include \n"); @@ -656,15 +683,15 @@ namespace { * A silly little class to proxy PPCallbacks back to the HipifyAction class. */ class PPCallbackProxy : public clang::PPCallbacks { - HipifyAction& hipifyAction; + HipifyAction &hipifyAction; public: - explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {} + explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {} - void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token, + void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, StringRef file_name, bool is_angled, clang::CharSourceRange filename_range, - const clang::FileEntry* file, StringRef search_path, StringRef relative_path, - const clang::Module* imported + const clang::FileEntry *file, StringRef search_path, StringRef relative_path, + const clang::Module *imported #if LLVM_VERSION_MAJOR > 6 , clang::SrcMgr::CharacteristicKind FileType #endif @@ -688,10 +715,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) { } void HipifyAction::ExecuteAction() { - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + auto &SM = getCompilerInstance().getSourceManager(); // Start lexing the specified input file. - const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID()); + const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID()); clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts()); RawLex.SetKeepWhitespaceMode(true); // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough @@ -709,10 +736,12 @@ void HipifyAction::ExecuteAction() { clang::ASTFrontendAction::ExecuteAction(); } -void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) { +void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaLaunchKernel(Result)) return; if (cudaSharedIncompleteArrayVar(Result)) return; if (cudaHostFuncCall(Result)) return; if (cudaDeviceFuncCall(Result)) return; if (cubNamespacePrefix(Result)) return; + if (cubFunctionTemplateDecl(Result)) return; + if (cubUsingNamespaceDecl(Result)) return; } diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h index a24404deee..f70d17dd0b 100644 --- a/hipify-clang/src/HipifyAction.h +++ b/hipify-clang/src/HipifyAction.h @@ -31,17 +31,18 @@ THE SOFTWARE. #include "Statistics.h" namespace ct = clang::tooling; +namespace mat = clang::ast_matchers; using namespace llvm; /** * A FrontendAction that hipifies CUDA programs. */ class HipifyAction : public clang::ASTFrontendAction, - public clang::ast_matchers::MatchFinder::MatchCallback { + public mat::MatchFinder::MatchCallback { private: - ct::Replacements* replacements; + ct::Replacements *replacements; std::map Ifndefs; - std::unique_ptr Finder; + std::unique_ptr Finder; // CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did // not, we insert it at the top of the file when we finish processing it. @@ -62,17 +63,20 @@ private: void RewriteString(StringRef s, clang::SourceLocation start); // Replace a CUDA identifier with the corresponding hip identifier, if applicable. void RewriteToken(const clang::Token &t); + // Calculate str's SourceLocation in SourceRange sr + clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr); public: explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(), replacements(replacements) {} // MatchCallback listeners - bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result); + bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result); + bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result); + bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); + bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result); + bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, @@ -91,7 +95,7 @@ public: protected: // Add a Replacement for the current file. These will all be applied after executing the FrontendAction. - void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL); + void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL); // FrontendAction entry point. void ExecuteAction() override; // Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options. @@ -99,8 +103,8 @@ protected: // Called at the start of each new file to process. void EndSourceFileAction() override; // MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for. - void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override; - std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override; - bool Exclude(const hipCounter & hipToken); - void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map& repMap, bool bReplace = true); + void run(const mat::MatchFinder::MatchResult &Result) override; + std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override; + bool Exclude(const hipCounter &hipToken); + void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map &repMap, bool bReplace = true); }; diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp index 2214567df3..64037c43dd 100644 --- a/hipify-clang/src/main.cpp +++ b/hipify-clang/src/main.cpp @@ -199,6 +199,7 @@ int main(int argc, const char **argv) { Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN)); + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN)); if (!CudaPath.empty()) { std::string sCudaPath = "--cuda-path=" + CudaPath; Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN)); diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp index 32b0c0dbc8..f312d2e79b 100644 --- a/include/hip/hcc_detail/code_object_bundle.hpp +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -86,7 +86,7 @@ struct Bundled_code { char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; } header; std::string triple; - std::vector blob; + std::string blob; }; #define magic_string_ "__CLANG_OFFLOAD_BUNDLE__" diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index c493eec933..da57240ebd 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl( } // Namespace hip_impl. -template -inline -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, - dynSharedMemPerBlk, blockSizeLimit); -} - -template -inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); -} - template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h index a5ac94a74b..2c13411319 100644 --- a/include/hip/hcc_detail/hip_atomic.h +++ b/include/hip/hcc_detail/hip_atomic.h @@ -59,12 +59,17 @@ float atomicAdd(float* address, float val) { unsigned int* uaddr{reinterpret_cast(address)}; unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned int r; + unsigned int r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __uint_as_float(r); } @@ -74,13 +79,18 @@ double atomicAdd(double* address, double val) { unsigned long long* uaddr{reinterpret_cast(address)}; unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned long long r; + unsigned long long r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS( uaddr, r, __double_as_longlong(val + __longlong_as_double(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __longlong_as_double(r); } @@ -144,7 +154,13 @@ unsigned long long atomicMin( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (val < tmp) { tmp = atomicCAS(address, tmp, val); } + while (val < tmp) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -167,7 +183,13 @@ unsigned long long atomicMax( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (tmp < val) { tmp = atomicCAS(address, tmp, val); } + while (tmp < val) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -177,7 +199,7 @@ inline unsigned int atomicInc(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_inc( unsigned int*, unsigned int, @@ -194,7 +216,7 @@ inline unsigned int atomicDec(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_dec( unsigned int*, unsigned int, diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index c07d2ad9f1..3a81305ba3 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes); * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the memory area pointed to by dest with the constant integer @@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi * @param [out] gridSize minimum grid size for maximum potential occupancy * @param [out] blockSize block size for maximum potential occupancy * @param [in] f kernel function for which occupancy is calulated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit * * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue */ -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit); +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit); /** * @brief Returns occupancy for a device function. @@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize); /** * @brief Returns occupancy for a device function. @@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags); /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched @@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize); +} +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags); +} + +template +inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + T f, size_t dynamicSMemSize, int blockSizeLimit) { + return hipOccupancyMaxPotentialBlockSize( + gridSize, blockSize, reinterpret_cast(f), dynamicSMemSize, blockSizeLimit); +} + template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { diff --git a/include/hip/hcc_detail/hip_runtime_prof.h b/include/hip/hcc_detail/hip_runtime_prof.h new file mode 100644 index 0000000000..4d4eccb54d --- /dev/null +++ b/include/hip/hcc_detail/hip_runtime_prof.h @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + +// HIP VDI Op IDs enumeration +enum HipVdiOpId { + kHipVdiOpIdDispatch = 0, + kHipVdiOpIdCopy = 1, + kHipVdiOpIdBarrier = 2, + kHipVdiOpIdNumber = 3 +}; + +// Types of VDI commands +enum HipVdiCommandKind { + kHipVdiCommandKernel = 0x11F0, + kHipVdiMemcpyDeviceToHost = 0x11F3, + kHipHipVdiMemcpyHostToDevice = 0x11F4, + kHipVdiMemcpyDeviceToDevice = 0x11F5, + kHipVidMemcpyDeviceToHostRect = 0x1201, + kHipVdiMemcpyHostToDeviceRect = 0x1202, + kHipVdiMemcpyDeviceToDeviceRect = 0x1203, + kHipVdiFillMemory = 0x1207, +}; + +/** + * @brief Initializes activity callback + * + * @param [input] id_callback Event ID callback function + * @param [input] op_callback Event operation callback function + * @param [input] arg Arguments passed into callback + * + * @returns None + */ +void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg); + +/** + * @brief Enables activity callback + * + * @param [input] op Operation, which will trigger a callback (@see HipVdiOpId) + * @param [input] enable Enable state for the callback + * + * @returns True if successful + */ +bool hipEnableActivityCallback(uint32_t op, bool enable); + +/** + * @brief Returns the description string for the operation kind + * + * @param [input] id Command kind id (@see HipVdiCommandKind) + * + * @returns A pointer to a const string with the command description + */ +const char* hipGetCmdName(uint32_t id); + +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index a03a46b8cf..f80745038a 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -47,6 +47,95 @@ THE SOFTWARE. #if defined(__cplusplus) #include + namespace hip_impl { + template + struct Scalar_accessor { + // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor + Vector data; + + __host__ __device__ + operator T() const noexcept { return data[idx]; } + + __host__ __device__ + Scalar_accessor& operator=(T x) noexcept { + data[idx] = x; + + return *this; + } + + __host__ __device__ + Scalar_accessor& operator+=(T x) noexcept { + data[idx] += x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator-=(T x) noexcept { + data[idx] -= x; + return *this; + } + + __host__ __device__ + Scalar_accessor& operator*=(T x) noexcept { + data[idx] *= x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator/=(T x) noexcept { + data[idx] /= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator%=(T x) noexcept { + data[idx] %= x; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator>>=(T x) noexcept { + data[idx] >>= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator<<=(T x) noexcept { + data[idx] <<= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator&=(T x) noexcept { + data[idx] &= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator|=(T x) noexcept { + data[idx] |= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator^=(T x) noexcept { + data[idx] ^= x; + return *this; + } + }; + } // Namespace hip_impl. + template struct HIP_vector_base; template @@ -55,9 +144,7 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - }; + hip_impl::Scalar_accessor x; }; }; @@ -67,10 +154,8 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; }; }; @@ -238,12 +323,10 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - T z; - T w; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; + hip_impl::Scalar_accessor z; + hip_impl::Scalar_accessor w; }; }; diff --git a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp old mode 100644 new mode 100755 index 0aa9ef2913..7c22e19d5d --- a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define fileName "tex2dKernel.code" texture tex; -bool testResult = false; +bool testResult = true; #define HIP_CHECK(cmd) \ { \ @@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) { desc.NumChannels = 1; desc.Width = width; desc.Height = height; - hipArrayCreate(&array, &desc); + HIP_CHECK(hipArrayCreate(&array, &desc)); hip_Memcpy2D copyParam; memset(©Param, 0, sizeof(copyParam)); @@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) { copyParam.srcPitch = width * sizeof(float); copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = height; - hipMemcpyParam2D(©Param); + HIP_CHECK(hipMemcpyParam2D(©Param)); textureReference* texref; - hipModuleGetTexRef(&texref, Module, "tex"); - hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap); - hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap); - hipTexRefSetFilterMode(texref, hipFilterModePoint); - hipTexRefSetFlags(texref, 0); - hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1); - hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT); + HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex")); + HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint)); + HIP_CHECK(hipTexRefSetFlags(texref, 0)); + HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1)); + HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT)); float* dData = NULL; - hipMalloc((void**)&dData, size); + HIP_CHECK(hipMalloc((void**)&dData, size)); struct { void* _Ad; @@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) { float* hOutputData = (float*)malloc(size); memset(hOutputData, 0, size); - hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); + HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost)); for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { @@ -124,10 +124,10 @@ bool runTest(int argc, char** argv) { } } } - hipUnbindTexture(tex); - hipFree(dData); - hipFreeArray(array); - return true; + HIP_CHECK(hipUnbindTexture(tex)); + HIP_CHECK(hipFree(dData)); + HIP_CHECK(hipFreeArray(array)); + return testResult; } int main(int argc, char** argv) { diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..01fa7aafed 100644 --- a/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){ const unsigned threadsperblock = 32; const unsigned blocks = (NUM/threadsperblock)+1; - uint32_t mingridSize = 0; - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int mingridSize = 0; + int gridSize = 0; + int blockSize = 0; if (manual){ blockSize = threadsperblock; @@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; + int numBlock = 0; HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 3ee14577b0..658cfbf576 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { namespace hip_internal { hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); + +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags); + +hipError_t ihipHostFree(TlsData *tls, void* ptr); + +}; + +#define MAX_COOPERATIVE_GPUs 255 + +// do not change these two structs without changing the device library +struct mg_sync { + uint w0; + uint w1; +}; + +struct mg_info { + struct mg_sync *mgs; + uint grid_id; + uint num_grids; + ulong prev_sum; + ulong all_sum; }; //--- diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index ed1422fcda..c8369685ec 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s return ptr; } +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { + hipError_t hip_status = hipSuccess; + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + + auto ctx = ihipGetTlsDefaultCtx(); + if ((ctx == nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; + } + else if (sizeBytes == 0) { + hip_status = hipSuccess; + // TODO - should size of 0 return err or be siliently ignored? + } else { + unsigned trueFlags = flags; + if (flags == hipHostMallocDefault) { + // HCC/ROCM provide a modern system with unified memory and should set both of these + // flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; + } + + + const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | + hipHostMallocWriteCombined | hipHostMallocCoherent | + hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent + hip_status = hipErrorInvalidValue; + } else { + auto device = ctx->getWriteableDevice(); +#if (__hcc_workweek__ >= 19115) + //Avoid mapping host pinned memory to all devices by HCC + unsigned amFlags = amHostUnmapped; +#else + unsigned amFlags = 0; +#endif + if (flags & hipHostMallocCoherent) { + amFlags |= amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags |= amHostNonCoherent; + } else { + // depends on env variables: + amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; + } + + + *ptr = hip_internal::allocAndSharePtr( + (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, + true /*shareWithAll*/, amFlags, flags, 0); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } + } + } + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + return hip_status; +} + +hipError_t ihipHostFree(TlsData *tls, void* ptr) { + + // Synchronize to ensure all work has finished. + ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits + // for all activity to finish. + + hipError_t hipStatus = hipErrorInvalidValue; + if (ptr) { + hc::accelerator acc; +#if (__hcc_workweek__ >= 17332) + hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); +#else + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); +#endif + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if (status == AM_SUCCESS) { + if (amPointerInfo._hostPointer == ptr) { + hc::am_free(ptr); + hipStatus = hipSuccess; + } + } + } else { + // free NULL pointer succeeds and is common technique to initialize runtime + hipStatus = hipSuccess; + } + + return hipStatus; +} + } // end namespace hip_internal @@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag return ihipLogStatus(hip_status); } -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { - hipError_t hip_status = hipSuccess; - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - hip_status = hipSuccess; - // TODO - should size of 0 return err or be siliently ignored? - } else { - unsigned trueFlags = flags; - if (flags == hipHostMallocDefault) { - // HCC/ROCM provide a modern system with unified memory and should set both of these - // flags by default: - trueFlags = hipHostMallocMapped | hipHostMallocPortable; - } - - - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | - hipHostMallocWriteCombined | hipHostMallocCoherent | - hipHostMallocNonCoherent; - - - const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; - - if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { - *ptr = nullptr; - // can't specify unsupported flags, can't specify both Coherent + NonCoherent - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); -#if (__hcc_workweek__ >= 19115) - //Avoid mapping host pinned memory to all devices by HCC - unsigned amFlags = amHostUnmapped; -#else - unsigned amFlags = 0; -#endif - if (flags & hipHostMallocCoherent) { - amFlags |= amHostCoherent; - } else if (flags & hipHostMallocNonCoherent) { - amFlags |= amHostNonCoherent; - } else { - // depends on env variables: - amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; - } - - - *ptr = hip_internal::allocAndSharePtr( - (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, - true /*shareWithAll*/, amFlags, flags, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } - } - } - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - return hip_status; -} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; - hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags); + hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); return ihipLogStatus(hip_status); } @@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { if(flags != hipMemAttachGlobal) hip_status = hipErrorInvalidValue; else - hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); + hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); return ihipLogStatus(hip_status); } @@ -1935,15 +1965,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si return ihipLogStatus(e); }; -hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) { + HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = hipStreamNull; stream = ihipSyncAndResolveStream(stream); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar); stream->locked_wait(); } else { e = hipErrorInvalidValue; @@ -1951,23 +1981,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes return ihipLogStatus(e); } -hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) { + HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar)); } else { return ihipLogStatus(hipErrorInvalidValue); } } -hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){ - HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){ + HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort); if(hipSuccess == e) stream->locked_wait(); } else { @@ -1976,12 +2006,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt return ihipLogStatus(e); } -hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){ - HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){ + HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort)); } else { return ihipLogStatus(hipErrorInvalidValue); } @@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) { hipError_t hipHostFree(void* ptr) { HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr); - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - - hipError_t hipStatus = hipErrorInvalidValue; - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == ptr) { - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } + hipError_t hipStatus = hipSuccess; + hipStatus = hip_internal::ihipHostFree(tls, ptr); return ihipLogStatus(hipStatus); }; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d98b98a378..c15cc34cd5 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -109,6 +109,7 @@ struct ihipModuleSymbol_t { amd_kernel_code_t const* _header{}; string _name; // TODO - review for performance cost. Name is just used for debug. vector> _kernarg_layout{}; + bool _is_code_object_v3{}; }; template <> @@ -137,7 +138,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, + void** impCoopParams = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -181,10 +183,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global return hipErrorInvalidValue; } - // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero. + // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero. size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1); kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0); + if (impCoopParams) { + const auto p{static_cast(*impCoopParams)}; + // The sixth index is for multi-grid synchronization + kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, + p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); + } + /* Kernel argument preparation. */ @@ -208,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto* header = reinterpret_cast(f->_header); aql.group_segment_size = @@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, return ihipLogStatus(hipErrorLaunchFailure); } + size_t impCoopArg = 1; + void* impCoopParams[1]; + impCoopParams[0] = &impCoopArg; + // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, gridDim.x * blockDimX.x, @@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true); + nullptr, 0, true, impCoopParams); stream->criticalData().unlock(); #if (__hcc_workweek__ >= 19213) @@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags); hipError_t result; - if (numDevices > g_deviceCnt || launchParamsList == nullptr) { + if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return ihipLogStatus(hipErrorInvalidValue); } @@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi kargs.getHandle()); } + mg_sync *mg_sync_ptr = 0; + mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); + if (result != hipSuccess) { + return ihipLogStatus(hipErrorInvalidValue); + } + mg_sync_ptr->w0 = 0; + mg_sync_ptr->w1 = 0; + + uint all_sum = 0; + for (int i = 0; i < numDevices; ++i) { + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + if (result != hipSuccess) { + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < i; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorInvalidValue); + } + // calculate the sum of sizes of all grids + const hipLaunchParams& lp = launchParamsList[i]; + all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + } + // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); @@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi #endif } - // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device + // launch the init_gws kernel to initialize the GWS for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); #endif } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorLaunchFailure); } + } + + void* impCoopParams[1]; + ulong prev_sum = 0; + // launch the main kernels for each device + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + + //initialize and setup the implicit kernel argument for multi-grid sync + mg_info_ptr[i]->mgs = mg_sync_ptr; + mg_info_ptr[i]->grid_id = i; + mg_info_ptr[i]->num_grids = numDevices; + mg_info_ptr[i]->all_sum = all_sum; + mg_info_ptr[i]->prev_sum = prev_sum; + prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + + + impCoopParams[0] = &mg_info_ptr[i]; result = ihipModuleLaunchKernel(tls, kds[i], lp.gridDim.x * lp.blockDim.x, @@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true); + true, impCoopParams); + + if (result != hipSuccess) { + for (int j = 0; j < numDevices; ++j) { + launchParamsList[j].stream->criticalData().unlock(); +#if (__hcc_workweek__ >= 19213) + launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); +#endif + } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + + return ihipLogStatus(hipErrorLaunchFailure); + } + } // unlock all streams @@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi free(gwsKds); free(kds); + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(result); } @@ -977,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_ // available per CU, therefore we hardcode it to 64 KiRegisters. prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (kd._is_code_object_v3) { r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; - } else { - r.localSizeBytes = kd._header->workitem_private_segment_byte_size; - r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; - } - r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - if (is_code_object_v3) { r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - } else { - r.numRegs = kd._header->workitem_vgpr_count; - } - r.maxThreadsPerBlock = r.numRegs ? - std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : - prop.maxThreadsPerBlock; - if (is_code_object_v3) { r.binaryVersion = 0; // FIXME: should it be the ISA version or code // object format version? } else { + r.localSizeBytes = kd._header->workitem_private_segment_byte_size; + r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; + r.numRegs = kd._header->workitem_vgpr_count; r.binaryVersion = kd._header->amd_machine_version_major * 10 + kd._header->amd_machine_version_minor; } + r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; + r.maxThreadsPerBlock = r.numRegs ? + std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : + prop.maxThreadsPerBlock; r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. return r; @@ -1099,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima content.data(), content.size(), (*module)->executable, this_agent()); - std::vector blob(content.cbegin(), content.cend()); - program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs); + program_state_impl::read_kernarg_metadata(content, (*module)->kernargs); // compute the hash of the code object (*module)->hash = checksum(content.length(), content.data()); @@ -1152,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) { - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto header = reinterpret_cast(f->_header); // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) @@ -1174,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size } } -hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize, + hipFunction_t f, size_t dynamicSMemSize, + int blockSizeLimit) { using namespace hip_impl; @@ -1257,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, } else { size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor; - size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk); + size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize); wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG; } @@ -1286,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, return hipSuccess; } -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit) { - HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); - + HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); + gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit)); } hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize) { using namespace hip_impl; @@ -1326,45 +1401,50 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); // Calculate blocks occupancy per CU based on VGPR usage *numBlocks = vgprs_alu_occupancy / numWavefronts; const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + size_t total_used_lds = usedLDS + dynamicSMemSize; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipLaunchKernel( diff --git a/src/program_state.inl b/src/program_state.inl index 4f05d2763e..9feabbc2f7 100644 --- a/src/program_state.inl +++ b/src/program_state.inl @@ -89,9 +89,10 @@ struct Symbol { class Kernel_descriptor { std::uint64_t kernel_object_{}; - amd_kernel_code_t const* kernel_header_{nullptr}; - std::string name_{}; + amd_kernel_code_t const* header_{}; + std::string name_; std::vector> kernarg_layout_{}; + bool is_code_object_v3_{}; public: Kernel_descriptor() = default; Kernel_descriptor( @@ -101,7 +102,8 @@ public: : kernel_object_{kernel_object}, name_{name}, - kernarg_layout_{std::move(kernarg_layout)} + kernarg_layout_{std::move(kernarg_layout)}, + is_code_object_v3_{name.find(".kd") != std::string::npos} { bool supported{false}; std::uint16_t min_v{UINT16_MAX}; @@ -123,7 +125,7 @@ public: r = tbl.hsa_ven_amd_loader_query_host_address( reinterpret_cast(kernel_object_), - reinterpret_cast(&kernel_header_)); + reinterpret_cast(&header_)); if (r != HSA_STATUS_SUCCESS) return; } @@ -149,7 +151,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>> code_object_blobs; + std::vector>>> code_object_blobs; std::pair< std::once_flag, @@ -213,7 +215,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>& get_code_object_blobs() { + std::vector>>& get_code_object_blobs() { std::call_once(code_object_blobs.first, [this]() { dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { @@ -584,6 +586,68 @@ public: return functions[agent].second; } + static + std::size_t parse_args_v2( + const std::string& metadata, + std::size_t f, + std::size_t l, + std::vector>& size_align) { + if (f == l) return f; + if (!size_align.empty()) return l; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = std::strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = std::strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); + } + + static + void read_kernarg_metadata_v2( + const std::string& kernels_md, + std::size_t dx, + std::unordered_map< + std::string, + std::vector>>& kernargs) { + do { + dx = kernels_md.find("Name:", dx); + + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) name_sz{5}; + dx = kernels_md.find_first_not_of(" '", dx + name_sz); + + auto fn = + kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx); + dx += fn.size(); + + auto dx1 = kernels_md.find("CodeProps", dx); + dx = kernels_md.find("Args:", dx); + + if (dx1 < dx) { + dx = dx1; + continue; + } + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) args_sz{5}; + dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]); + } while (true); + } + static std::string metadata_to_string(const amd_comgr_metadata_node_t& md) { std::string str; @@ -598,9 +662,8 @@ public: } static - void parse_args( + void parse_args_v3( const amd_comgr_metadata_node_t& args_md, - bool is_code_object_v3, std::vector>& size_align) { size_t arg_count = 0; if (amd_comgr_get_metadata_list_size(args_md, &arg_count) @@ -615,9 +678,7 @@ public: return; amd_comgr_metadata_node_t arg_size_md; - if (amd_comgr_metadata_lookup(arg_md, - is_code_object_v3 ? ".size" : "Size", - &arg_size_md) + if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md) != AMD_COMGR_STATUS_SUCCESS) return; @@ -629,35 +690,21 @@ public: size_t arg_align; - if (is_code_object_v3) { - amd_comgr_metadata_node_t arg_offset_md; - if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + amd_comgr_metadata_node_t arg_offset_md; + if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - size_t arg_offset - = std::stoul(metadata_to_string(arg_offset_md)); + size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md)); - if (amd_comgr_destroy_metadata(arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + if (amd_comgr_destroy_metadata(arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - arg_align = 1; - while (arg_offset && (arg_offset & 1) == 0) { - arg_offset >>= 1; - arg_align <<= 1; - } - } else { - amd_comgr_metadata_node_t arg_align_md; - if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - arg_align = std::stoul(metadata_to_string(arg_align_md)); - - if (amd_comgr_destroy_metadata(arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + arg_align = 1; + while (arg_offset && (arg_offset & 1) == 0) { + arg_offset >>= 1; + arg_align <<= 1; } size_align.emplace_back(arg_size, arg_align); @@ -669,11 +716,11 @@ public: } static - void read_kernarg_metadata( - const std::vector& blob, + void read_kernarg_metadata_v3( + const std::string& blob, std::unordered_map< - std::string, - std::vector>>& kernargs) { + std::string, + std::vector>>& kernargs) { amd_comgr_data_t dataIn; amd_comgr_status_t status; @@ -690,7 +737,6 @@ public: != AMD_COMGR_STATUS_SUCCESS) return; - bool is_code_object_v3 = false; amd_comgr_metadata_node_t kernels_md; if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md) != AMD_COMGR_STATUS_SUCCESS) { @@ -699,7 +745,6 @@ public: &kernels_md) != AMD_COMGR_STATUS_SUCCESS) return; - is_code_object_v3 = true; } size_t kernel_count = 0; @@ -715,9 +760,7 @@ public: continue; amd_comgr_metadata_node_t name_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".name" : "Name", - &name_md) + if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md) != AMD_COMGR_STATUS_SUCCESS) continue; @@ -727,21 +770,15 @@ public: != AMD_COMGR_STATUS_SUCCESS) continue; - if (is_code_object_v3) - kernel_name_str.append(".kd"); - - amd_comgr_metadata_node_t args_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".args" : "Args", - &args_md) + if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md) != AMD_COMGR_STATUS_SUCCESS) continue; auto foundKernel = kernargs.find(kernel_name_str); // parse arguments for a given kernel only once if (foundKernel == kernargs.end()) { - parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]); + parse_args_v3(args_md, kernargs[kernel_name_str]); } if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS @@ -757,7 +794,52 @@ public: amd_comgr_release_data(dataIn); } - const std::unordered_map>>& kernargs) + { + std::istringstream istr{blob}; + ELFIO::elfio reader; + + if (!reader.load(istr)) return; + + // TODO: this is inefficient. + auto it = find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_NOTE; + }); + + if (!it) return; + + const ELFIO::note_section_accessor acc{reader, it}; + auto n{acc.get_notes_num()}; + while (n--) { + ELFIO::Elf_Word type{}; + std::string name{}; + void* desc{}; + ELFIO::Elf_Word desc_size{}; + + acc.get_note(n, type, name, desc, desc_size); + + if (name == "AMDGPU") { + return read_kernarg_metadata_v3(blob, kernargs); + } + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + std::string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == std::string::npos) continue; + + return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:". + } + } + + const std::unordered_map>>& get_kernargs() { std::call_once(kernargs.first, [this]() { diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index 1d092a4327..ea496e6a69 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -12,18 +12,33 @@ import lit.util site_cfg = lit_config.params.get('site_config', None) lit_config.load_config(config, site_cfg) -print(str("========================================")) +config.excludes = ['cmdparser.hpp'] +config.excludes.append('spatial_batch_norm_op.h') +config.excludes.append('common_cudnn.h') + +delimiter = "==============================================================="; +print(delimiter) print("CUDA " + config.cuda_version + " - will be used for testing") print("LLVM " + config.llvm_version + " - will be used for testing") print(platform.machine() + " - Platform architecture") print(platform.system() + " " + platform.release() + " - Platform OS") print(str(config.pointer_size * 8) + " - hipify-clang binary bitness") print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness") -print(str("========================================")) - -config.excludes = ['cmdparser.hpp'] -config.excludes.append('spatial_batch_norm_op.h') -config.excludes.append('common_cudnn.h') +print(delimiter) +warns = None +if not config.cuda_dnn_root: + config.excludes.append('cudnn_convolution_forward.cu') + config.excludes.append('cudnn_softmax.cu') + print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR") + warns = True +if not config.cuda_cub_root: + config.excludes.append('cub_01.cu') + config.excludes.append('cub_02.cu') + config.excludes.append('cub_03.cu') + print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR") + warns = True +if warns: + print(delimiter) if config.cuda_version_major == 7 and config.cuda_version_minor == 0: config.excludes.append('headers_test_09.cu') @@ -90,17 +105,47 @@ else: run_test_ext = ".sh" # CUDA SDK ROOT clang_arguments += " -isystem'%s'/samples/common/inc" -# cuDNN ROOT -clang_arguments += " -I'%s'/include" if config.pointer_size == 8: clang_arguments += " -D__LP64__" +# Set max clang's CudaArch for corresponding CUDA version +# to support maximum CUDA features in offline tests +if config.cuda_version_major == 7: + if config.cuda_version_minor == 5: + clang_arguments += " --cuda-gpu-arch=sm_53" + else: + clang_arguments += " --cuda-gpu-arch=sm_52" +elif config.cuda_version_major == 8: + clang_arguments += " --cuda-gpu-arch=sm_62" +elif config.cuda_version_major == 9: + if config.cuda_version_minor == 2: + clang_arguments += " --cuda-gpu-arch=sm_72" + else: + clang_arguments += " --cuda-gpu-arch=sm_70" +elif config.cuda_version_major == 10: + clang_arguments += " --cuda-gpu-arch=sm_75" + +# cuDNN ROOT +if config.cuda_dnn_root: + clang_arguments += " -I'%s'/include" # CUB ROOT -clang_arguments += " -I'%s'" +if config.cuda_cub_root: + clang_arguments += " -I'%s'" -hipify_arguments = "--cuda-path='%s'" +if config.cuda_dnn_root and config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) +elif config.cuda_dnn_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root))) +elif config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root))) +else: + config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root)) -config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) -config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root))) +if config.llvm_version_major < 4: + hipify_arguments = "-I'%s'/include" +else: + hipify_arguments = "--cuda-path='%s'" + +config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root)) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) diff --git a/tests/hipify-clang/unit_tests/device/atomics.cu b/tests/hipify-clang/unit_tests/device/atomics.cu index 1afd1ab541..3089efe1b8 100644 --- a/tests/hipify-clang/unit_tests/device/atomics.cu +++ b/tests/hipify-clang/unit_tests/device/atomics.cu @@ -276,7 +276,9 @@ int main(int argc, char** argv) { runTest(); runTest(); runTest(); +#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 runTest(); +#endif // CHECK: hipDeviceReset(); cudaDeviceReset(); printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu new file mode 100644 index 0000000000..21898baa03 --- /dev/null +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu @@ -0,0 +1,69 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +#include + +template +__global__ void sort(const T* data_in, T* data_out){ + // CHECK: typedef hipcub::BlockLoad BlockLoadT; + typedef cub::BlockLoad BlockLoadT; + // CHECK: typedef hipcub::BlockRadixSort BlockRadixSortT; + typedef cub::BlockRadixSort BlockRadixSortT; + // CHECK: typedef hipcub::BlockStore BlockStoreT; + typedef cub::BlockStore BlockStoreT; + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + typename BlockStoreT::TempStorage store; + } tmp_storage; + T items[ITEMS_PER_THREAD]; + BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); + __syncthreads(); + BlockRadixSortT(tmp_storage.sort).Sort(items); + __syncthreads(); + BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); +} + +int main() { + double* d_gpu = NULL; + double* result_gpu = NULL; + double* data_sorted = new double[1000*4096]; + // Allocate memory on the GPU + // CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&d_gpu, 1000*4096 * sizeof(double)); + // CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&result_gpu, 1000*4096 * sizeof(double)); + // CHECK: hiprandGenerator_t gen; + curandGenerator_t gen; + // Create generator + // CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT); + curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); + // Fill array with random numbers + // CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + // Destroy generator + // CHECK: hiprandDestroyGenerator(gen); + curandDestroyGenerator(gen); + // Sort data + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); + sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); + sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu); + // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost); + cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost); + // Write the sorted data to standard out + for (int i = 0; i < 4095; ++i) { + std::cout << data_sorted[i] << ", "; + } + std::cout << data_sorted[4095] << std::endl; +} diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu new file mode 100644 index 0000000000..bc914d419d --- /dev/null +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -0,0 +1,33 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include + +// using namespace hipcub; +using namespace cub; + +// Simple CUDA kernel for computing tiled partial sums +template + cub::BlockScanAlgorithm SCAN_ALGO> +__global__ void ScanTilesKernel(int *d_in, int *d_out) { + // Specialize collective types for problem context + // CHECK: typedef ::hipcub::BlockLoad BlockLoadT; + typedef ::cub::BlockLoad BlockLoadT; + typedef BlockScan BlockScanT; + // Allocate on-chip temporary storage + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockScanT::TempStorage reduce; + } temp_storage; + // Load data per thread + int thread_data[ITEMS_PER_THREAD]; + int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); + BlockLoadT(temp_storage.load).Load(d_in + offset, offset); + __syncthreads(); + // Compute the block-wide prefix sum + BlockScanT(temp_storage).Sum(thread_data); +} diff --git a/tests/hipify-clang/unit_tests/samples/vec_add.cu b/tests/hipify-clang/unit_tests/samples/vec_add.cu index bc8219bf8c..a6d8950e0b 100644 --- a/tests/hipify-clang/unit_tests/samples/vec_add.cu +++ b/tests/hipify-clang/unit_tests/samples/vec_add.cu @@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank) cudaError_t cudareturn; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, gpudevice); - // CHECK: if (deviceProp.hipWarpSize <= 1) if (deviceProp.warpSize <= 1) { printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank); diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp index dc064da189..b1b0e8334a 100644 --- a/tests/src/deviceLib/hipMathFunctions.cpp +++ b/tests/src/deviceLib/hipMathFunctions.cpp @@ -29,7 +29,12 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void kernel_abs_int64(long long *input, long long *output) { int tx = threadIdx.x; diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp index b78e1d8c63..751d44e242 100644 --- a/tests/src/deviceLib/hipTestHalf.cpp +++ b/tests/src/deviceLib/hipTestHalf.cpp @@ -28,7 +28,12 @@ THE SOFTWARE. #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __device__ void test_convert() { __half x; diff --git a/tests/src/deviceLib/hipTestNativeHalf.cpp b/tests/src/deviceLib/hipTestNativeHalf.cpp index 55213d446d..8639127c7c 100644 --- a/tests/src/deviceLib/hipTestNativeHalf.cpp +++ b/tests/src/deviceLib/hipTestNativeHalf.cpp @@ -32,7 +32,12 @@ THE SOFTWARE. using namespace std; -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void __halfTest(bool* result, __half a) { diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 8ce02d6164..4aacfa866d 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) { HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); } else { - free(A_h1); - free(A_h2); + aligned_free(A_h1); + aligned_free(A_h2); } } diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp index b3bcf42222..2eb62a859f 100644 --- a/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice) char *A_d; char *A_h; bool testResult = true; - HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) ); + HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16)); A_h = (char*)malloc(sizeElements); HIPASSERT(A_h != NULL); for (size_t i=0; i(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..2838c09cd1 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; } int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; + int numBlock = 0; hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); assert(numBlock != 0); - // test case for using kernel function pointer with template gridSize = 0; blockSize = 0; @@ -64,15 +59,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); assert(numBlock != 0); - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); - assert(numBlock != 0); - passed(); } diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..22a3f05283 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp @@ -30,22 +30,16 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template __global__ void f2(T *a) { *a = 1; } - - int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); @@ -55,15 +49,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0); assert(gridSize != 0 && blockSize != 0); - // test case for using kernel with hipFunction_t type - gridSize = 0; - blockSize = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0)); - assert(gridSize != 0 && blockSize != 0); - passed(); } diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 73a952b0d1..67a8e5e60a 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -99,10 +99,13 @@ THE SOFTWARE. #ifdef _WIN64 #include -#define aligned_alloc _aligned_malloc +#define aligned_alloc(x,y) _aligned_malloc(y,x) +#define aligned_free(x) _aligned_free(x) #define popen(x,y) _popen(x,y) #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) +#else +#define aligned_free(x) free(x) #endif // standard command-line variables: