merge from master

2019-10-25 15:52:09 +05:30
@@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
        target_link_libraries(hip_hcc PRIVATE hc_am)
        target_link_libraries(hip_hcc_static PRIVATE hc_am)

-        add_library(hiprtc SHARED src/hiprtc.cpp)
+        add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
        target_include_directories(
            hiprtc SYSTEM
                PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include)
@@ -504,7 +504,7 @@ endif()
 find_program(CPPCHECK_EXE cppcheck)
 if(CPPCHECK_EXE)
    add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endif()

 #############################
@@ -545,4 +545,17 @@ else()
    message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
 endif()

+#############################
+# Code analysis
+#############################
+# Target: clang
+if(HIP_HIPCC_EXECUTABLE)
+    add_custom_target(analyze 
+        COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext  -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c  src/*.cpp -Iinclude/ -I./
+    WORKING_DIRECTORY ${HIP_SRC_PATH})
+    if(CPPCHECK_EXE)
+        add_dependencies(analyze cppcheck)
+    endif()
+endif()
+
 # vim: ts=4:sw=4:expandtab:smartindent
@@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") {
    if ($HIP_RUNTIME eq "HCC" ) {
      $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
      $HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
+    } else {
+      $HIPCXXFLAGS .= " -fhip-new-launch-api";
    }

 } elsif ($HIP_PLATFORM eq "hcc") {
@@ -352,7 +354,6 @@ my $runCmd = 1;
 my $buildDeps = 0;
 my $linkType = 1;
 my $setLinkType = 0;
-my $coFormatv3 = 1;

 my @options = ();
 my @inputs  = ();
@@ -472,22 +473,6 @@ foreach $arg (@ARGV)
        $swallowArg = 1;
    }

-    # code object format parsing
-    if ($trimarg eq '-mcode-object-v3') {
-        $coFormatv3 = 1;
-        # hip-clang already recognizes -mcode-object-v3, so we just pass it on
-        if ($HIP_PLATFORM eq 'hcc') {
-            $swallowArg = 1;
-        }
-    }
-    if ($trimarg eq '-mno-code-object-v3') {
-        $coFormatv3 = 0;
-        # hip-clang already recognizes -mno-code-object-v3, so we just pass it on
-        if ($HIP_PLATFORM eq 'hcc') {
-            $swallowArg = 1;
-        }
-    }
-
    if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) {
        $arg = "--cuda-device-only";
    }
@@ -526,6 +511,13 @@ foreach $arg (@ARGV)
        $optArg = $arg;
    }

+    ## This is a temporary workaround for CMake detection of OpenMP support.
+    ## It should be removed when the OpenMP detection c++ test in CMake is updated
+    ## and corrected CMake version is available.
+    if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) {
+        $HIPCXXFLAGS .= " -D_OPENMP "
+    }
+
    ## process linker response file for hip-clang
    ## extract object files from static library and pass them directly to
    ## hip-clang in command line.
@@ -863,12 +855,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
    }
 }

-# hcc defaults to v2, so we need to convert to the appropriate flag
-# hip-clang defaults to v3, so we don't need to do anything
-if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
-    $HIPLDFLAGS .= " -Wl,-hcc-cov3 ";
-}
-
 if ($hasC and $HIP_PLATFORM eq 'nvcc') {
    $HIPCXXFLAGS .= " -x cu";
 }
@@ -983,7 +983,6 @@ sub simpleSubstitutions {
    $ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g;
    $ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g;
    $ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g;
-    $ft{'type'} += s/\bwarpSize\b/hipWarpSize/g;
    $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g;
    $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g;
    $ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g;
@@ -1676,6 +1675,12 @@ sub transformKernelLaunch {
    }
 }

+sub transformCubNamespace {
+    my $k = 0;
+    $k += s/using\s*namespace\s*cub/using namespace hipcub/g;
+    return $k;
+}
+
 sub transformHostFunctions {
    my $k = 0;
    foreach $func (
@@ -2368,6 +2373,7 @@ while (@ARGV) {
        simpleSubstitutions();
        transformExternShared();
        transformKernelLaunch();
+        transformCubNamespace();
        if ($print_stats) {
            while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) {
                $convertedTags{$1}++;
@@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording


 #### How to enable profiling at HIP build time
-Recent pre-built packages of HIP are always built with profiling support enabled.
-For developer builds, you must enable marker support manually when compiling HIP.
+Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP.

 1. Build HIP with ATP markers enabled
 HIP pre-built packages are enabled with ATP marker support by default.
@@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map):
  # Generating the callbacks data structure
  f.write('\n// HIP API callbacks data structure\n')
  f.write(
-  'struct hip_api_data_t {\n' +
+  'typedef struct hip_api_data_t {\n' +
  '  uint64_t correlation_id;\n' +
  '  uint32_t phase;\n' +
  '  union {\n'
@@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map):
      f.write('    } ' + name + ';\n')
  f.write(
  '  } args;\n' +
-  '};\n'
+  '} hip_api_data_t;\n'
  )
  
  # Generating the callbacks args data filling macros
@@ -1,8 +1,4 @@
-if (CUDA_VERSION VERSION_GREATER "9.2")
-    cmake_minimum_required(VERSION 3.12.3)
-else()
-    cmake_minimum_required(VERSION 3.7.2)
-endif()
+cmake_minimum_required(VERSION 3.5.1)

 project(hipify-clang)

@@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE
    LLVMOption
    LLVMCore)

-if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7")
+if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1")
    target_link_libraries(hipify-clang PRIVATE clangToolingInclusions)
 endif()

@@ -72,7 +68,7 @@ else()
    set(StdCpp "-std=c++")
 endif()

-if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0")
+if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0")
    string(APPEND StdCpp "14")
 # MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest
 elseif (MSVC)
@@ -84,7 +80,38 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"")

-install(TARGETS hipify-clang DESTINATION bin)
+set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang")
+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    if(CMAKE_BUILD_TYPE MATCHES Debug)
+        set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+    elseif(CMAKE_BUILD_TYPE MATCHES Release)
+        if (BIN_INSTALL_DIR)
+            set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+        else()
+            set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+        endif()
+    else()
+        message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
+    endif()
+elseif(BIN_INSTALL_DIR)
+    set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE)
+endif()
+
+install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX})
+
+install(
+  DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/
+  DESTINATION ${CMAKE_INSTALL_PREFIX}
+  COMPONENT clang-resource-headers
+  FILES_MATCHING
+  PATTERN "*.h"
+  PATTERN "*.modulemap"
+  PATTERN "algorithm"
+  PATTERN "complex"
+  PATTERN "new"
+  PATTERN "ppc_wrappers" EXCLUDE
+  PATTERN "openmp_wrappers" EXCLUDE
+  )

 if (HIPIFY_CLANG_TESTS)
    find_package(PythonInterp 2.7 REQUIRED)
@@ -63,7 +63,7 @@
 In most cases, you can get a suitable version of LLVM+CLANG with your package manager.

 Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
-[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`
+[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist`

 ## <a name="build-and-install"></a> Build and install

@@ -154,7 +154,7 @@ To run it:

    * Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option:

-        - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB`
+        - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`

        - Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`

@@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38

 Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38

-Build system for the above configurations:
+Minimum build system requirements for the above configurations:

-Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min).
+Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.

 Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:

@@ -208,7 +208,7 @@ cmake
 -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \
 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
 -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \
- -DCUDA_CUB_ROOT_DIR=/srv/CUB \
+ -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
 -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \
 ..
 ```
@@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS
 64 - hipify-clang binary bitness
 64 - python 2.7.12 binary bitness
 ========================================
-- Testing: 64 tests, 12 threads --
-PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64)
-PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64)
-PASS: hipify :: unit_tests/device/atomics.cu (3 of 64)
-PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64)
-PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64)
-PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64)
-PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64)
-PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64)
-PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64)
-PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64)
-PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64)
-PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64)
-PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64)
-PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64)
-PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64)
-PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64)
-PASS: hipify :: unit_tests/samples/intro.cu (61 of 64)
-PASS: hipify :: unit_tests/samples/square.cu (62 of 64)
-PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64)
-PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64)
-Testing Time: 2.98s
-  Expected Passes    : 64
+-- Testing: 65 tests, 12 threads --
+PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65)
+PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65)
+PASS: hipify :: unit_tests/device/atomics.cu (3 of 65)
+PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65)
+PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65)
+PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65)
+PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65)
+PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65)
+PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65)
+PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65)
+PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65)
+PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65)
+PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65)
+PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65)
+PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65)
+PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65)
+PASS: hipify :: unit_tests/samples/intro.cu (61 of 65)
+PASS: hipify :: unit_tests/samples/square.cu (62 of 65)
+PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65)
+PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65)
+PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65)
+Testing Time: 3.01s
+  Expected Passes    : 65
 [100%] Built target test-hipify
 ```
 ### <a name="windows"></a >Windows

 On Windows 10 the following configurations are tested:

-LLVM 5.0.0 - 5.0.2, CUDA 8.0,  cudnn-5.1.10 - cudnn-7.1.4.18
+LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18

-LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38
+LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38

-LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38
+LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38

-Build system for the above configurations:
+Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2:

-Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4).
+Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5).

 Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:

@@ -371,7 +372,7 @@ cmake
 --    - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm
 --    - Include path     : F:/LLVM/9.0.0/dist/include
 --    - Binary path      : F:/LLVM/9.0.0/dist/bin
-- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6")
+-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6")
 -- Found lit: C:/Program Files/Python36/Scripts/lit.exe
 -- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe
 -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "CUDA2HIP.h"
+
+// Maps the names of CUDA CUB API types to the corresponding HIP types
+const std::map<llvm::StringRef, hipCounter> CUDA_CUB_TYPE_NAME_MAP{
+  {"cub",  {"hipcub",  "", CONV_TYPE, API_CUB}},
+};
@@ -251,6 +251,11 @@ namespace perl {
    *streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl;
  }

+  void generateCubNamespace(unique_ptr<ostream>& streamPtr) {
+    *streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl;
+    *streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl;
+  }
+
  void generateHostFunctions(unique_ptr<ostream>& streamPtr) {
    *streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl;
    set<string> &funcSet = DeviceSymbolFunctions0;
@@ -358,6 +363,7 @@ namespace perl {
    generateSimpleSubstitutions(streamPtr);
    generateExternShared(streamPtr);
    generateKernelLaunch(streamPtr);
+    generateCubNamespace(streamPtr);
    generateHostFunctions(streamPtr);
    generateDeviceFunctions(streamPtr);
    *streamPtr.get() << endl << "# Count of transforms in all files" << endl;
@@ -401,6 +407,7 @@ namespace perl {
    *streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl;
    *streamPtr.get() << tab_2 << "transformExternShared();" << endl;
    *streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl;
+    *streamPtr.get() << tab_2 << "transformCubNamespace();" << endl;
    *streamPtr.get() << tab_2 << "if ($print_stats) {" << endl;
    *streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl;
    *streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl;
@@ -31,8 +31,6 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
  {"cudaChannelFormatDesc",                                            {"hipChannelFormatDesc",                                     "", CONV_TYPE, API_RUNTIME}},
  // no analogue
  {"cudaDeviceProp",                                                   {"hipDeviceProp_t",                                          "", CONV_TYPE, API_RUNTIME}},
-  // NOTE: int warpSize is a field of cudaDeviceProp
-  {"warpSize",                                                         {"hipWarpSize",                                              "", CONV_TYPE, API_RUNTIME}},

  // no analogue
  {"cudaEglFrame",                                                     {"hipEglFrame",                                              "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
@@ -33,16 +33,21 @@ THE SOFTWARE.
 #include "StringUtils.h"
 #include "ArgParse.h"

-namespace ct = clang::tooling;
-namespace mat = clang::ast_matchers;
-
+const std::string sHIP = "HIP";
+const std::string sROC = "ROC";
+const std::string sCub = "cub";
+const std::string sHipcub = "hipcub";
 const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED";
+const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
 std::string sHIP_SYMBOL = "HIP_SYMBOL";
-std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
 std::string s_reinterpret_cast = "reinterpret_cast<const void*>";
-const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL(";
+const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL";
 const std::string sDim3 = "dim3(";
-
+const std::string s_hiprand_kernel_h = "hiprand_kernel.h";
+const std::string s_hiprand_h = "hiprand.h";
+const std::string sOnce = "once";
+const std::string s_string_literal = "[string literal]";
+// CUDA identifiers, used in matchers
 const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol";
 const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync";
 const std::string sCudaGetSymbolSize = "cudaGetSymbolSize";
@@ -51,6 +56,14 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol";
 const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync";
 const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig";
 const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes";
+// Matchers' names
+const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar";
+const StringRef sCudaLaunchKernel = "cudaLaunchKernel";
+const StringRef sCudaHostFuncCall = "cudaHostFuncCall";
+const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall";
+const StringRef sCubNamespacePrefix = "cubNamespacePrefix";
+const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl";
+const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl";

 std::set<std::string> DeviceSymbolFunctions0 {
  {sCudaMemcpyToSymbol},
@@ -78,7 +91,7 @@ std::set<std::string> ReinterpretFunctions1{
 };

 void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  auto &SM = getCompilerInstance().getSourceManager();
  size_t begin = 0;
  while ((begin = s.find("cu", begin)) != StringRef::npos) {
    const size_t end = s.find_first_of(" ", begin + 4);
@@ -86,7 +99,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
    const auto found = CUDA_RENAMES_MAP().find(name);
    if (found != CUDA_RENAMES_MAP().end()) {
      StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
-      hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
+      hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree};
      Statistics::current().incrementCounter(counter, name.str());
      if (!Statistics::isUnsupported(counter)) {
        clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
@@ -95,13 +108,24 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
        insertReplacement(Rep, fullSL);
      }
    }
-    if (end == StringRef::npos) {
-      break;
-    }
+    if (end == StringRef::npos) break;
    begin = end + 1;
  }
 }

+clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) {
+  clang::SourceLocation sl(sr.getBegin());
+  clang::SourceLocation end(sr.getEnd());
+  auto &SM = getCompilerInstance().getSourceManager();
+  size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
+  StringRef sfull = StringRef(SM.getCharacterData(sl), length);
+  size_t offset = sfull.find(str);
+  if (offset > 0) {
+    sl = sl.getLocWithOffset(offset);
+  }
+  return sl;
+}
+
 /**
  * Look at, and consider altering, a given token.
  *
@@ -109,7 +133,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
  * If it's an unsupported CUDA identifier, a warning is emitted.
  * Otherwise, the source file is updated with the corresponding hipification.
  */
-void HipifyAction::RewriteToken(const clang::Token& t) {
+void HipifyAction::RewriteToken(const clang::Token &t) {
  // String literals containing CUDA references need fixing.
  if (t.is(clang::tok::string_literal)) {
    StringRef s(t.getLiteralData(), t.getLength());
@@ -124,20 +148,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) {
  FindAndReplace(name, sl, CUDA_RENAMES_MAP());
 }

-void HipifyAction::FindAndReplace(llvm::StringRef name,
+void HipifyAction::FindAndReplace(StringRef name,
                                  clang::SourceLocation sl,
-                                  const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace) {
+                                  const std::map<StringRef, hipCounter> &repMap, 
+                                  bool bReplace) {
  const auto found = repMap.find(name);
  if (found == repMap.end()) {
    // So it's an identifier, but not CUDA? Boring.
    return;
  }
  Statistics::current().incrementCounter(found->second, name.str());
-  clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+  clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
  // Warn the user about unsupported identifier.
  if (Statistics::isUnsupported(found->second)) {
    std::string sWarn;
-    Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP";
+    Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
    sWarn = "" + sWarn;
    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0.");
    DE.Report(sl, ID) << sWarn;
@@ -147,7 +172,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,
    return;
  }
  StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  auto &SM = getCompilerInstance().getSourceManager();
  ct::Replacement Rep(SM, sl, name.size(), repName.str());
  clang::FullSourceLoc fullSL(sl, SM);
  insertReplacement(Rep, fullSL);
@@ -155,7 +180,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name,

 namespace {

-clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
  clang::SourceLocation begin = exprRange.getBegin();
  clang::SourceLocation end = exprRange.getEnd();
  bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
@@ -167,7 +192,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan
  }
 }

-clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
  clang::SourceLocation begin = exprRange.getBegin();
  clang::SourceLocation end = exprRange.getEnd();
  // If the range is contained within a macro, update the macro definition.
@@ -178,7 +203,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa
  return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
 }

-StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
+StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) {
  return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
 }

@@ -186,53 +211,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp
  * Get a string representation of the expression `arg`, unless it's a defaulting function
  * call argument, in which case get a 0. Used for building argument lists to kernel calls.
  */
-std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
-  if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
-    return "0";
-  } else {
-    return readSourceText(SM, arg->getSourceRange());
-  }
+std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
+  if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
+  else return readSourceText(SM, arg->getSourceRange());
 }

 } // anonymous namespace

-bool HipifyAction::Exclude(const hipCounter & hipToken) {
+bool HipifyAction::Exclude(const hipCounter &hipToken) {
  switch (hipToken.type) {
    case CONV_INCLUDE_CUDA_MAIN_H:
      switch (hipToken.apiType) {
        case API_DRIVER:
        case API_RUNTIME:
-          if (insertedRuntimeHeader) { return true; }
+          if (insertedRuntimeHeader) return true;
          insertedRuntimeHeader = true;
          return false;
        case API_BLAS:
-          if (insertedBLASHeader) { return true; }
+          if (insertedBLASHeader) return true;
          insertedBLASHeader = true;
          return false;
        case API_RAND:
-          if (hipToken.hipName == "hiprand_kernel.h") {
-            if (insertedRAND_kernelHeader) { return true; }
+          if (hipToken.hipName == s_hiprand_kernel_h) {
+            if (insertedRAND_kernelHeader) return true;
            insertedRAND_kernelHeader = true;
            return false;
-          } else if (hipToken.hipName == "hiprand.h") {
-            if (insertedRANDHeader) { return true; }
+          } else if (hipToken.hipName == s_hiprand_h) {
+            if (insertedRANDHeader) return true;
            insertedRANDHeader = true;
            return false;
          }
        case API_DNN:
-          if (insertedDNNHeader) { return true; }
+          if (insertedDNNHeader) return true;
          insertedDNNHeader = true;
          return false;
        case API_FFT:
-          if (insertedFFTHeader) { return true; }
+          if (insertedFFTHeader) return true;
          insertedFFTHeader = true;
          return false;
        case API_COMPLEX:
-          if (insertedComplexHeader) { return true; }
+          if (insertedComplexHeader) return true;
          insertedComplexHeader = true;
          return false;
        case API_SPARSE:
-          if (insertedSPARSEHeader) { return true; }
+          if (insertedSPARSEHeader) return true;
          insertedSPARSEHeader = true;
          return false;
        default:
@@ -240,13 +262,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) {
      }
      return false;
    case CONV_INCLUDE:
-      if (hipToken.hipName.empty()) {
-        return true;
-      }
+      if (hipToken.hipName.empty()) return true;
      switch (hipToken.apiType) {
        case API_RAND:
-          if (hipToken.hipName == "hiprand_kernel.h") {
-            if (insertedRAND_kernelHeader) { return true; }
+          if (hipToken.hipName == s_hiprand_kernel_h) {
+            if (insertedRAND_kernelHeader) return true;
            insertedRAND_kernelHeader = true;
          }
          return false;
@@ -267,24 +287,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
                                      clang::CharSourceRange filename_range,
                                      const clang::FileEntry*, StringRef,
                                      StringRef, const clang::Module*) {
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-  if (!SM.isWrittenInMainFile(hash_loc)) {
-    return;
-  }
+  auto &SM = getCompilerInstance().getSourceManager();
+  if (!SM.isWrittenInMainFile(hash_loc)) return;
  if (!firstHeader) {
    firstHeader = true;
    firstHeaderLoc = hash_loc;
  }
  const auto found = CUDA_INCLUDE_MAP.find(file_name);
-  if (found == CUDA_INCLUDE_MAP.end()) {
-    return;
-  }
+  if (found == CUDA_INCLUDE_MAP.end()) return;
  bool exclude = Exclude(found->second);
  Statistics::current().incrementCounter(found->second, file_name.str());
-
  clang::SourceLocation sl = filename_range.getBegin();
  if (Statistics::isUnsupported(found->second)) {
-    clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+    clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
    DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header."));
    return;
  }
@@ -293,11 +308,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
  if (!exclude) {
    clang::SmallString<128> includeBuffer;
    llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName;
-    if (is_angled) {
-      newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
-    } else {
-      newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
-    }
+    if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer);
+    else           newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer);
  } else {
    // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
    sl = hash_loc;
@@ -309,48 +321,52 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
 }

 void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
-  if (pragmaOnce) {
-    return;
-  }
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-  if (!SM.isWrittenInMainFile(Loc)) {
-    return;
-  }
-  clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
+  if (pragmaOnce) return;
+  auto &SM = getCompilerInstance().getSourceManager();
+  if (!SM.isWrittenInMainFile(Loc)) return;
+  clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
  clang::Token tok;
  PP.Lex(tok);
  StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
-  if (Text == "once") {
+  if (Text == sOnce) {
    pragmaOnce = true;
    pragmaOnceLoc = tok.getEndLoc();
  }
 }

-bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-  StringRef refName = "cudaLaunchKernel";
-  const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
-  if (!launchKernel) {
-    return false;
-  }
-  const clang::Expr* calleeExpr = launchKernel->getCallee();
-  if (!calleeExpr) {
-    return false;
-  }
-  const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
-  if (!caleeDecl) {
-    return false;
-  }
-  const clang::CallExpr* config = launchKernel->getConfig();
-  if (!config) {
-    return false;
-  }
+bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) {
+  auto *launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(sCudaLaunchKernel);
+  if (!launchKernel) return false;
+  auto *calleeExpr = launchKernel->getCallee();
+  if (!calleeExpr) return false;
+  auto *caleeDecl = launchKernel->getDirectCallee();
+  if (!caleeDecl) return false;
+  auto *config = launchKernel->getConfig();
+  if (!config) return false;
  clang::SmallString<40> XStr;
  llvm::raw_svector_ostream OS(XStr);
  clang::LangOptions DefaultLangOptions;
-  clang::SourceManager* SM = Result.SourceManager;
-  OS << sHipLaunchKernelGGL;
-  if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "(";
-  OS << readSourceText(*SM, calleeExpr->getSourceRange());
+  auto *SM = Result.SourceManager;
+  clang::SourceRange sr = calleeExpr->getSourceRange();
+  std::string kern = readSourceText(*SM, sr).str();
+  OS << sHipLaunchKernelGGL << "(";
+  if (caleeDecl->isTemplateInstantiation()) {
+    OS << sHIP_KERNEL_NAME << "(";
+    std::string cub = sCub + "::";
+    std::string hipcub;
+    const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub);
+    if (found != CUDA_CUB_TYPE_NAME_MAP.end()) {
+      hipcub = found->second.hipName.str() + "::";
+    } else {
+      hipcub = sHipcub + "::";
+    }
+    size_t pos = kern.find(cub);
+    while (pos != std::string::npos) {
+      kern.replace(pos, cub.size(), hipcub);
+      pos = kern.find(cub, pos + hipcub.size());
+    }
+  }
+  OS << kern;
  if (caleeDecl->isTemplateInstantiation()) OS << ")";
  OS << ", ";
  // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
@@ -381,29 +397,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
  ct::Replacement Rep(*SM, launchStart, length, OS.str());
  clang::FullSourceLoc fullSL(launchStart, *SM);
  insertReplacement(Rep, fullSL);
-  hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
-  Statistics::current().incrementCounter(counter, refName.str());
+  hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME};
+  Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str());
  return true;
 }

-bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-  StringRef refName = "cudaSharedIncompleteArrayVar";
-  auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
-  if (!sharedVar) {
-    return false;
-  }
+bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) {
+  auto *sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(sCudaSharedIncompleteArrayVar);
+  if (!sharedVar) return false;
  // Example: extern __shared__ uint sRadix1[];
-  if (!sharedVar->hasExternalFormalLinkage()) {
-    return false;
-  }
+  if (!sharedVar->hasExternalFormalLinkage()) return false;
  clang::QualType QT = sharedVar->getType();
  std::string typeName;
  if (QT->isIncompleteArrayType()) {
-    const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
+    const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
    QT = AT->getElementType();
    if (QT.getTypePtr()->isBuiltinType()) {
      QT = QT.getCanonicalType();
-      const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
+      auto *BT = clang::dyn_cast<clang::BuiltinType>(QT);
      if (BT) {
        clang::LangOptions LO;
        LO.CUDA = true;
@@ -417,7 +428,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
  if (!typeName.empty()) {
    clang::SourceLocation slStart = sharedVar->getOuterLocStart();
    clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc());
-    clang::SourceManager* SM = Result.SourceManager;
+    auto *SM = Result.SourceManager;
    size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
    std::string varName = sharedVar->getNameAsString();
    std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")";
@@ -425,74 +436,84 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match
    clang::FullSourceLoc fullSL(slStart, *SM);
    insertReplacement(Rep, fullSL);
    hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME};
-    Statistics::current().incrementCounter(counter, refName.str());
+    Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str());
    return true;
  }
  return false;
 }

-bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-  if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaDeviceFuncCall")) {
-    const clang::FunctionDecl *funcDcl = call->getDirectCallee();
-    if (!funcDcl) {
-      return false;
-    }
+bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) {
+  if (const clang::CallExpr *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaDeviceFuncCall)) {
+    auto *funcDcl = call->getDirectCallee();
+    if (!funcDcl) return false;
    FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false);
    return true;
  }
  return false;
 }

-bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-  if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>("cubNamespacePrefix")) {
-     if (!decl) {
-      return false;
-    }
+bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) {
+  if (auto *decl = Result.Nodes.getNodeAs<clang::TypedefNameDecl>(sCubNamespacePrefix)) {
    clang::QualType QT = decl->getUnderlyingType();
-    const clang::Type* t = QT.getTypePtr();
-    if (!t) {
-      return false;
-    }
-    const clang::ElaboratedType* et = t->getAs<clang::ElaboratedType>();
-    if (!et) {
-      return false;
-    }
+    auto *t = QT.getTypePtr();
+    if (!t) return false;
+    const clang::ElaboratedType *et = t->getAs<clang::ElaboratedType>();
+    if (!et) return false;
    const clang::NestedNameSpecifier *nns = et->getQualifier();
-    if (!nns) {
-      return false;
-    }
+    if (!nns) return false;
    const clang::NamespaceDecl *nsd = nns->getAsNamespace();
-    if (!nsd) {
-      return false;
-    }
+    if (!nsd) return false;
    const clang::TypeSourceInfo *si = decl->getTypeSourceInfo();
    const clang::TypeLoc tloc = si->getTypeLoc();
    const clang::SourceRange sr = tloc.getSourceRange();
-    clang::SourceLocation sl(sr.getBegin());
-    clang::SourceLocation end(sr.getEnd());
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-    size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl);
-    StringRef sfull = StringRef(SM.getCharacterData(sl), length);
    std::string name = nsd->getDeclName().getAsString();
-    size_t offset = sfull.find(name);
-    if (offset > 0) {
-      sl = sl.getLocWithOffset(offset);
-    }
-    FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP);
+    FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
    return true;
  }
  return false;
 }

-bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-  if (const clang::CallExpr * call = Result.Nodes.getNodeAs<clang::CallExpr>("cudaHostFuncCall")) {
-    if (!call->getNumArgs()) {
-      return false;
+bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) {
+  if (auto *decl = Result.Nodes.getNodeAs<clang::UsingDirectiveDecl>(sCubUsingNamespaceDecl)) {
+    if (auto nsd = decl->getNominatedNamespace()) {
+      FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP);
+      return true;
    }
-    const clang::FunctionDecl* funcDcl = call->getDirectCallee();
-    if (!funcDcl) {
-      return false;
+  }
+  return false;
+}
+
+bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) {
+  if (auto *decl = Result.Nodes.getNodeAs<clang::FunctionTemplateDecl>(sCubFunctionTemplateDecl)) {
+    auto *Tparams = decl->getTemplateParameters();
+    bool ret = false;
+    for (size_t I = 0; I < Tparams->size(); ++I) {
+      const clang::ValueDecl *valueDecl = dyn_cast<clang::ValueDecl>(Tparams->getParam(I));
+      if (!valueDecl) continue;
+      clang::QualType QT = valueDecl->getType();
+      auto *t = QT.getTypePtr();
+      if (!t) continue;
+      const clang::ElaboratedType *et = t->getAs<clang::ElaboratedType>();
+      if (!et) continue;
+      const clang::NestedNameSpecifier *nns = et->getQualifier();
+      if (!nns) continue;
+      const clang::NamespaceDecl *nsd = nns->getAsNamespace();
+      if (!nsd) continue;
+      const clang::SourceRange sr = valueDecl->getSourceRange();
+      std::string name = nsd->getDeclName().getAsString();
+      FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP);
+      ret = true;
    }
+    return ret;
+  }
+  return false;
+}
+
+bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) {
+  if (auto *call = Result.Nodes.getNodeAs<clang::CallExpr>(sCudaHostFuncCall)) {
+    if (!call->getNumArgs()) return false;
+    auto *funcDcl = call->getDirectCallee();
+    if (!funcDcl) return false;
    std::string sName = funcDcl->getDeclName().getAsString();
    unsigned int argNum = 0;
    bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false;
@@ -506,7 +527,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
    clang::SmallString<40> XStr;
    llvm::raw_svector_ostream OS(XStr);
    clang::SourceRange sr = call->getArg(argNum)->getSourceRange();
-    clang::SourceManager* SM = Result.SourceManager;
+    auto *SM = Result.SourceManager;
    OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")";
    clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() });
    clang::SourceLocation s = replacementRange.getBegin();
@@ -521,7 +542,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc
  return false;
 }

-void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
+void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) {
  llcompat::insertReplacement(*replacements, rep);
  if (PrintStats) {
    rep.getLength();
@@ -530,10 +551,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu
  }
 }

-std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
-  Finder.reset(new clang::ast_matchers::MatchFinder);
+std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) {
+  Finder.reset(new mat::MatchFinder);
  // Replace the <<<...>>> language extension with a hip kernel launch
-  Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
+  Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this);
  Finder->addMatcher(
    mat::varDecl(
      mat::isExpansionInMainFile(),
@@ -541,7 +562,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
        mat::hasAttr(clang::attr::CUDAShared),
        mat::hasType(mat::incompleteArrayType())
      )
-    ).bind("cudaSharedIncompleteArrayVar"),
+    ).bind(sCudaSharedIncompleteArrayVar),
    this
  );
  Finder->addMatcher(
@@ -561,7 +582,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
          )
        )
      )
-    ).bind("cudaHostFuncCall"),
+    ).bind(sCudaHostFuncCall),
    this
  );
  Finder->addMatcher(
@@ -576,7 +597,7 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
          mat::unless(mat::hasAttr(clang::attr::CUDAHost))
        )
      )
-    ).bind("cudaDeviceFuncCall"),
+    ).bind(sCudaDeviceFuncCall),
    this
  );
  Finder->addMatcher(
@@ -586,12 +607,26 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
        mat::elaboratedType(
          mat::hasQualifier(
            mat::specifiesNamespace(
-              mat::hasName("cub")
+              mat::hasName(sCub)
            )
          )
        )
       )
-    ).bind("cubNamespacePrefix"),
+    ).bind(sCubNamespacePrefix),
+    this
+  );
+  // TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl
+  Finder->addMatcher(
+    mat::functionTemplateDecl(
+      mat::isExpansionInMainFile()
+    ).bind(sCubFunctionTemplateDecl),
+    this
+  );
+  // TODO: Maybe worth to make it more concrete
+  Finder->addMatcher(
+    mat::usingDirectiveDecl(
+      mat::isExpansionInMainFile()
+    ).bind(sCubUsingNamespaceDecl),
    this
  );
  // Ownership is transferred to the caller.
@@ -599,10 +634,8 @@ std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::Compi
 }

 void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) {
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-  if (!SM.isWrittenInMainFile(Loc)) {
-    return;
-  }
+  auto &SM = getCompilerInstance().getSourceManager();
+  if (!SM.isWrittenInMainFile(Loc)) return;
  StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength());
  Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc()));
 }
@@ -615,12 +648,12 @@ void HipifyAction::EndSourceFileAction() {
    // one copy of the hip include into every file.
    bool placeForIncludeCalculated = false;
    clang::SourceLocation sl, controllingMacroLoc;
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
-    clang::HeaderSearch& HS = PP.getHeaderSearchInfo();
-    clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup();
-    const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID());
-    const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
+    auto &SM = getCompilerInstance().getSourceManager();
+    clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
+    clang::HeaderSearch &HS = PP.getHeaderSearchInfo();
+    clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup();
+    const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID());
+    const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL);
    if (controllingMacro) {
      auto found = Ifndefs.find(controllingMacro->getName().str());
      if (found != Ifndefs.end()) {
@@ -629,19 +662,13 @@ void HipifyAction::EndSourceFileAction() {
      }
    }
    if (pragmaOnce) {
-      if (placeForIncludeCalculated) {
-        sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
-      } else {
-        sl = pragmaOnceLoc;
-      }
+      if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc;
+      else                           sl = pragmaOnceLoc;
      placeForIncludeCalculated = true;
    }
    if (!placeForIncludeCalculated) {
-      if (firstHeader) {
-        sl = firstHeaderLoc;
-      } else {
-        sl = SM.getLocForStartOfFile(SM.getMainFileID());
-      }
+      if (firstHeader)               sl = firstHeaderLoc;
+      else                           sl = SM.getLocForStartOfFile(SM.getMainFileID());
    }
    clang::FullSourceLoc fullSL(sl, SM);
    ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
@@ -656,15 +683,15 @@ namespace {
  * A silly little class to proxy PPCallbacks back to the HipifyAction class.
  */
 class PPCallbackProxy : public clang::PPCallbacks {
-  HipifyAction& hipifyAction;
+  HipifyAction &hipifyAction;

 public:
-  explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
+  explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {}

-  void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
+  void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token,
                          StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
-                          const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
-                          const clang::Module* imported
+                          const clang::FileEntry *file, StringRef search_path, StringRef relative_path,
+                          const clang::Module *imported
 #if LLVM_VERSION_MAJOR > 6
                        , clang::SrcMgr::CharacteristicKind FileType
 #endif
@@ -688,10 +715,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) {
 }

 void HipifyAction::ExecuteAction() {
-  clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
-  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
+  auto &SM = getCompilerInstance().getSourceManager();
  // Start lexing the specified input file.
-  const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
+  const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID());
  clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
  RawLex.SetKeepWhitespaceMode(true);
  // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
@@ -709,10 +736,12 @@ void HipifyAction::ExecuteAction() {
  clang::ASTFrontendAction::ExecuteAction();
 }

-void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
+void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) {
  if (cudaLaunchKernel(Result)) return;
  if (cudaSharedIncompleteArrayVar(Result)) return;
  if (cudaHostFuncCall(Result)) return;
  if (cudaDeviceFuncCall(Result)) return;
  if (cubNamespacePrefix(Result)) return;
+  if (cubFunctionTemplateDecl(Result)) return;
+  if (cubUsingNamespaceDecl(Result)) return;
 }
@@ -31,17 +31,18 @@ THE SOFTWARE.
 #include "Statistics.h"

 namespace ct = clang::tooling;
+namespace mat = clang::ast_matchers;
 using namespace llvm;

 /**
  * A FrontendAction that hipifies CUDA programs.
  */
 class HipifyAction : public clang::ASTFrontendAction,
-                     public clang::ast_matchers::MatchFinder::MatchCallback {
+                     public mat::MatchFinder::MatchCallback {
 private:
-  ct::Replacements* replacements;
+  ct::Replacements *replacements;
  std::map<std::string, clang::SourceLocation> Ifndefs;
-  std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
+  std::unique_ptr<mat::MatchFinder> Finder;
  // CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
  // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
  // not, we insert it at the top of the file when we finish processing it.
@@ -62,17 +63,20 @@ private:
  void RewriteString(StringRef s, clang::SourceLocation start);
  // Replace a CUDA identifier with the corresponding hip identifier, if applicable.
  void RewriteToken(const clang::Token &t);
+  // Calculate str's SourceLocation in SourceRange sr
+  clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr);

 public:
  explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(),
    replacements(replacements) {}
  // MatchCallback listeners
-  bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-  bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-  bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-  bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-  bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-  bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result);
+  bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result);
+  bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result);
+  bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result);
+  bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result);
+  bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result);
+  bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result);
+  bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result);
  // Called by the preprocessor for each include directive during the non-raw lexing pass.
  void InclusionDirective(clang::SourceLocation hash_loc,
                          const clang::Token &include_token,
@@ -91,7 +95,7 @@ public:

 protected:
  // Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
-  void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
+  void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL);
  // FrontendAction entry point.
  void ExecuteAction() override;
  // Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options.
@@ -99,8 +103,8 @@ protected:
  // Called at the start of each new file to process.
  void EndSourceFileAction() override;
  // MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for.
-  void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
-  std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
-  bool Exclude(const hipCounter & hipToken);
-  void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map<llvm::StringRef, hipCounter>& repMap, bool bReplace = true);
+  void run(const mat::MatchFinder::MatchResult &Result) override;
+  std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override;
+  bool Exclude(const hipCounter &hipToken);
+  void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map<StringRef, hipCounter> &repMap, bool bReplace = true);
 };
@@ -199,6 +199,7 @@ int main(int argc, const char **argv) {
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN));
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN));
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
+    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN));
    if (!CudaPath.empty()) {
      std::string sCudaPath = "--cuda-path=" + CudaPath;
      Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN));
@@ -86,7 +86,7 @@ struct Bundled_code {
        char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
    } header;
    std::string triple;
-    std::vector<char> blob;
+    std::string blob;
 };

 #define magic_string_  "__CLANG_OFFLOAD_BUNDLE__"
@@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl(
 } // Namespace hip_impl.


-template <typename F>
-inline
-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
-    F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
-
-    using namespace hip_impl;
-
-    hip_impl::hip_init();
-    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
-                                                   target_agent(0));
-
-    return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
-                                      dynSharedMemPerBlk, blockSizeLimit);
-}
-
-template <typename F>
-inline
-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
-    uint32_t blockSize, size_t dynSharedMemPerBlk) {
-
-    using namespace hip_impl;
-
-    hip_impl::hip_init();
-    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
-                                                   target_agent(0));
-
-    return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
-}
-
 template <typename... Args, typename F = void (*)(Args...)>
 inline
 void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
@@ -59,12 +59,17 @@ float atomicAdd(float* address, float val)
 {
    unsigned int* uaddr{reinterpret_cast<unsigned int*>(address)};
    unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
-    unsigned int r;

+    unsigned int r;
    do {
-        r = old;
+        r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+
+        if (r != old) { r = old; continue; }
+
        old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
-    } while (r != old);
+
+        if (r == old) break;
+    } while (true);

    return __uint_as_float(r);
 }
@@ -74,13 +79,18 @@ double atomicAdd(double* address, double val)
 {
    unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(address)};
    unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
-    unsigned long long r;

+    unsigned long long r;
    do {
-        r = old;
+        r = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+
+        if (r != old) { r = old; continue; }
+
        old = atomicCAS(
            uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
-    } while (r != old);
+
+        if (r == old) break;
+    } while (true);

    return __longlong_as_double(r);
 }
@@ -144,7 +154,13 @@ unsigned long long atomicMin(
    unsigned long long* address, unsigned long long val)
 {
    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-    while (val < tmp) { tmp = atomicCAS(address, tmp, val); }
+    while (val < tmp) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }

    return tmp;
 }
@@ -167,7 +183,13 @@ unsigned long long atomicMax(
    unsigned long long* address, unsigned long long val)
 {
    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-    while (tmp < val) { tmp = atomicCAS(address, tmp, val); }
+    while (tmp < val) {
+        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
+
+        if (tmp1 != tmp) { tmp = tmp1; continue; }
+
+        tmp = atomicCAS(address, tmp, val);
+    }

    return tmp;
 }
@@ -177,7 +199,7 @@ inline
 unsigned int atomicInc(unsigned int* address, unsigned int val)
 {
    __device__
-    extern 
+    extern
    unsigned int __builtin_amdgcn_atomic_inc(
        unsigned int*,
        unsigned int,
@@ -194,7 +216,7 @@ inline
 unsigned int atomicDec(unsigned int* address, unsigned int val)
 {
    __device__
-    extern 
+    extern
    unsigned int __builtin_amdgcn_atomic_dec(
        unsigned int*,
        unsigned int,
@@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
 *
 *  @param[out] dst Data ptr to be filled
 *  @param[in]  constant value to be set
- *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  number of values to be set
 *  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 */
-hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes);
+hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);

 /**
 *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte
 *
 *  @param[out] dst Data ptr to be filled
 *  @param[in]  constant value to be set
- *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  number of values to be set
 *  @param[in]  stream - Stream identifier
 *  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 */
-hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0));
+hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));

 /**
 *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz
 *
 *  @param[out] dst Data ptr to be filled
 *  @param[in]  constant value to be set
- *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  number of values to be set
 *  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 */
-hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes);
+hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);

 /**
 *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant
@@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy
 *
 *  @param[out] dst Data ptr to be filled
 *  @param[in]  constant value to be set
- *  @param[in]  sizeBytes Data size in bytes
+ *  @param[in]  number of values to be set
 *  @param[in]  stream - Stream identifier
 *  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
 */
-hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0));
+hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));

 /**
 *  @brief Fills the memory area pointed to by dest with the constant integer
@@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
 * @param [out] gridSize           minimum grid size for maximum potential occupancy
 * @param [out] blockSize          block size for maximum potential occupancy
 * @param [in]  f                  kernel function for which occupancy is calulated
- * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  dynamicSMemSize    Per - block dynamic shared memory usage intended, in bytes
 * @param [in]  blockSizeLimit     the maximum block size for the kernel, use 0 for no limit
 *
 * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue
 */
-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             uint32_t blockSizeLimit);
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                             const void* f, size_t dynamicSMemSize,
+                                             int blockSizeLimit);

 /**
 * @brief Returns occupancy for a device function.
@@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
 * @param [out] numBlocks        Returned occupancy
 * @param [in]  func             Kernel function for which occupancy is calulated
 * @param [in]  blockSize        Block size the kernel is intended to be launched with
- * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  dynamicSMemSize  Per - block dynamic shared memory usage intended, in bytes
 */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
+   int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize);

 /**
 * @brief Returns occupancy for a device function.
@@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
 * @param [out] numBlocks        Returned occupancy
 * @param [in]  func             Kernel function for which occupancy is calulated
 * @param [in]  blockSize        Block size the kernel is intended to be launched with
- * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  dynamicSMemSize  Per - block dynamic shared memory usage intended, in bytes
 * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
 */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+   int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags);

 /**
 * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
@@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
    return hipSuccess;
 }

+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize);
+}

+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynamicSMemSize, flags);
+}
+
+template <class T>
+inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+    T f, size_t dynamicSMemSize, int blockSizeLimit) {
+    return hipOccupancyMaxPotentialBlockSize(
+    gridSize, blockSize, reinterpret_cast<const void*>(f), dynamicSMemSize, blockSizeLimit);
+}
+ 
 template <class T>
 inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
                                             void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
+// HIP VDI Op IDs enumeration
+enum HipVdiOpId {
+  kHipVdiOpIdDispatch = 0,
+  kHipVdiOpIdCopy     = 1,
+  kHipVdiOpIdBarrier  = 2,
+  kHipVdiOpIdNumber   = 3
+};
+
+// Types of VDI commands
+enum HipVdiCommandKind {
+  kHipVdiCommandKernel            = 0x11F0,
+  kHipVdiMemcpyDeviceToHost       = 0x11F3,
+  kHipHipVdiMemcpyHostToDevice    = 0x11F4,
+  kHipVdiMemcpyDeviceToDevice     = 0x11F5,
+  kHipVidMemcpyDeviceToHostRect   = 0x1201,
+  kHipVdiMemcpyHostToDeviceRect   = 0x1202,
+  kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
+  kHipVdiFillMemory               = 0x1207,
+}; 
+
+/**
+ * @brief Initializes activity callback
+ *
+ * @param [input] id_callback Event ID callback function
+ * @param [input] op_callback Event operation callback function
+ * @param [input] arg         Arguments passed into callback
+ *
+ * @returns None
+ */
+void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
+
+/**
+ * @brief Enables activity callback
+ *
+ * @param [input] op      Operation, which will trigger a callback (@see HipVdiOpId)
+ * @param [input] enable  Enable state for the callback
+ *
+ * @returns True if successful
+ */
+bool hipEnableActivityCallback(uint32_t op, bool enable);
+
+/**
+ * @brief Returns the description string for the operation kind
+ *
+ * @param [input] id      Command kind id (@see HipVdiCommandKind)
+ *
+ * @returns A pointer to a const string with the command description
+ */
+const char* hipGetCmdName(uint32_t id);
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
@@ -47,6 +47,95 @@ THE SOFTWARE.
 #if defined(__cplusplus)
    #include <type_traits>

+    namespace hip_impl {
+        template<typename T, typename Vector, unsigned int idx>
+        struct Scalar_accessor {
+            // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
+            Vector data;
+
+            __host__ __device__
+            operator T() const noexcept { return data[idx]; }
+
+            __host__ __device__
+            Scalar_accessor& operator=(T x) noexcept {
+                data[idx] = x;
+
+                return *this;
+            }
+
+            __host__ __device__
+            Scalar_accessor& operator+=(T x) noexcept {
+                data[idx] += x;
+                return *this;
+            }
+            __host__ __device__
+            Scalar_accessor& operator-=(T x) noexcept {
+                data[idx] -= x;
+                return *this;
+            }
+
+            __host__ __device__
+            Scalar_accessor& operator*=(T x) noexcept {
+                data[idx] *= x;
+                return *this;
+            }
+            __host__ __device__
+            Scalar_accessor& operator/=(T x) noexcept {
+                data[idx] /= x;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator%=(T x) noexcept {
+                data[idx] %= x;
+                return *this;
+            }
+
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator>>=(T x) noexcept {
+                data[idx] >>= x;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator<<=(T x) noexcept {
+                data[idx] <<= x;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator&=(T x) noexcept {
+                data[idx] &= x;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator|=(T x) noexcept {
+                data[idx] |= x;
+                return *this;
+            }
+            template<
+                typename U = T,
+                typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+            __host__ __device__
+            Scalar_accessor& operator^=(T x) noexcept {
+                data[idx] ^= x;
+                return *this;
+            }
+        };
+    } // Namespace hip_impl.
+
    template<typename T, unsigned int n> struct HIP_vector_base;

    template<typename T>
@@ -55,9 +144,7 @@ THE SOFTWARE.

        union {
            Native_vec_ data;
-            struct {
-                T x;
-            };
+            hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
        };
    };

@@ -67,10 +154,8 @@ THE SOFTWARE.

        union {
            Native_vec_ data;
-            struct {
-                T x;
-                T y;
-            };
+            hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
+            hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
        };
    };

@@ -238,12 +323,10 @@ THE SOFTWARE.

        union {
            Native_vec_ data;
-            struct {
-                T x;
-                T y;
-                T z;
-                T w;
-            };
+            hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
+            hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
+            hip_impl::Scalar_accessor<T, Native_vec_, 2> z;
+            hip_impl::Scalar_accessor<T, Native_vec_, 3> w;
        };
    };

@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define fileName "tex2dKernel.code"

 texture<float, 2, hipReadModeElementType> tex;
-bool testResult = false;
+bool testResult = true;

 #define HIP_CHECK(cmd)                                                                             \
    {                                                                                              \
@@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) {
    desc.NumChannels = 1;
    desc.Width = width;
    desc.Height = height;
-    hipArrayCreate(&array, &desc);
+    HIP_CHECK(hipArrayCreate(&array, &desc));

    hip_Memcpy2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
@@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) {
    copyParam.srcPitch = width * sizeof(float);
    copyParam.WidthInBytes = copyParam.srcPitch;
    copyParam.Height = height;
-    hipMemcpyParam2D(&copyParam);
+    HIP_CHECK(hipMemcpyParam2D(&copyParam));

    textureReference* texref;
-    hipModuleGetTexRef(&texref, Module, "tex");
-    hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
-    hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
-    hipTexRefSetFilterMode(texref, hipFilterModePoint);
-    hipTexRefSetFlags(texref, 0);
-    hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
-    hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
+    HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex"));
+    HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap));
+    HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap));
+    HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint));
+    HIP_CHECK(hipTexRefSetFlags(texref, 0));
+    HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1));
+    HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));

    float* dData = NULL;
-    hipMalloc((void**)&dData, size);
+    HIP_CHECK(hipMalloc((void**)&dData, size));

    struct {
        void* _Ad;
@@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) {

    float* hOutputData = (float*)malloc(size);
    memset(hOutputData, 0, size);
-    hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
+    HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));

    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
@@ -124,10 +124,10 @@ bool runTest(int argc, char** argv) {
            }
        }
    }
-    hipUnbindTexture(tex);
-    hipFree(dData);
-    hipFreeArray(array);
-    return true;
+    HIP_CHECK(hipUnbindTexture(tex));
+    HIP_CHECK(hipFree(dData));
+    HIP_CHECK(hipFreeArray(array));
+    return testResult;
 }

 int main(int argc, char** argv) {
@@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){
     const unsigned threadsperblock = 32;
     const unsigned blocks = (NUM/threadsperblock)+1;

-     uint32_t mingridSize = 0;
-     uint32_t gridSize = 0;
-     uint32_t blockSize = 0;
+     int mingridSize = 0;
+     int gridSize = 0;
+     int blockSize = 0;
     
     if (manual){
 	blockSize = threadsperblock; 
@@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){
     printf("kernel Execution time = %6.3fms\n", eventMs);

     //Calculate Occupancy
-     uint32_t numBlock = 0;
+     int numBlock = 0;
     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
     
     if(devProp.maxThreadsPerMultiProcessor){
@@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
 namespace hip_internal {
 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
                       hipStream_t stream);
+
+hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
+
+hipError_t ihipHostFree(TlsData *tls, void* ptr);
+
+};
+
+#define MAX_COOPERATIVE_GPUs 255
+
+// do not change these two structs without changing the device library
+struct mg_sync {
+    uint w0;
+    uint w1;
+};
+
+struct mg_info {
+    struct mg_sync *mgs;
+    uint grid_id;
+    uint num_grids;
+    ulong prev_sum;
+    ulong all_sum;
 };

 //---
@@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
    return ptr;
 }

+hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
+    hipError_t hip_status = hipSuccess;
+
+    if (HIP_SYNC_HOST_ALLOC) {
+        hipDeviceSynchronize();
+    }
+
+    auto ctx = ihipGetTlsDefaultCtx();
+    if ((ctx == nullptr) || (ptr == nullptr)) {
+        hip_status = hipErrorInvalidValue;
+    }
+    else if (sizeBytes == 0) {
+        hip_status = hipSuccess;
+        // TODO - should size of 0 return err or be siliently ignored?
+    } else {
+        unsigned trueFlags = flags;
+        if (flags == hipHostMallocDefault) {
+            // HCC/ROCM provide a modern system with unified memory and should set both of these
+            // flags by default:
+            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
+        }
+
+
+        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
+                                        hipHostMallocWriteCombined | hipHostMallocCoherent |
+                                        hipHostMallocNonCoherent;
+
+
+        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
+
+        if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
+            *ptr = nullptr;
+            // can't specify unsupported flags, can't specify both Coherent + NonCoherent
+            hip_status = hipErrorInvalidValue;
+        } else {
+            auto device = ctx->getWriteableDevice();
+#if (__hcc_workweek__ >= 19115)
+            //Avoid mapping host pinned memory to all devices by HCC
+            unsigned amFlags = amHostUnmapped;
+#else
+            unsigned amFlags = 0;
+#endif
+            if (flags & hipHostMallocCoherent) {
+                amFlags |= amHostCoherent;
+            } else if (flags & hipHostMallocNonCoherent) {
+                amFlags |= amHostNonCoherent;
+            } else {
+                // depends on env variables:
+                amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
+            }
+
+
+            *ptr = hip_internal::allocAndSharePtr(
+                (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
+                true  /*shareWithAll*/, amFlags, flags, 0);
+
+            if (sizeBytes && (*ptr == NULL)) {
+                hip_status = hipErrorMemoryAllocation;
+            }
+        }
+    }
+
+    if (HIP_SYNC_HOST_ALLOC) {
+        hipDeviceSynchronize();
+    }
+    return hip_status;
+}
+
+hipError_t ihipHostFree(TlsData *tls, void* ptr) {
+
+    // Synchronize to ensure all work has finished.
+    ihipGetTlsDefaultCtx()->locked_waitAllStreams();  // ignores non-blocking streams, this waits
+                                                      // for all activity to finish.
+
+    hipError_t hipStatus = hipErrorInvalidValue;
+    if (ptr) {
+        hc::accelerator acc;
+#if (__hcc_workweek__ >= 17332)
+        hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
+#else
+        hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
+#endif
+        am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
+        if (status == AM_SUCCESS) {
+            if (amPointerInfo._hostPointer == ptr) {
+                hc::am_free(ptr);
+                hipStatus = hipSuccess;
+            }
+        }
+    } else {
+        // free NULL pointer succeeds and is common technique to initialize runtime
+        hipStatus = hipSuccess;
+    }
+
+    return hipStatus;
+}
+

 }  // end namespace hip_internal

@@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
    return ihipLogStatus(hip_status);
 }

-hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
-    hipError_t hip_status = hipSuccess;
-
-    if (HIP_SYNC_HOST_ALLOC) {
-        hipDeviceSynchronize();
-    }
-
-    auto ctx = ihipGetTlsDefaultCtx();
-    if ((ctx == nullptr) || (ptr == nullptr)) {
-        hip_status = hipErrorInvalidValue;
-    }
-    else if (sizeBytes == 0) {
-        hip_status = hipSuccess;
-        // TODO - should size of 0 return err or be siliently ignored?
-    } else {
-        unsigned trueFlags = flags;
-        if (flags == hipHostMallocDefault) {
-            // HCC/ROCM provide a modern system with unified memory and should set both of these
-            // flags by default:
-            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
-        }
-
-
-        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped |
-                                        hipHostMallocWriteCombined | hipHostMallocCoherent |
-                                        hipHostMallocNonCoherent;
-
-
-        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
-
-        if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) {
-            *ptr = nullptr;
-            // can't specify unsupported flags, can't specify both Coherent + NonCoherent
-            hip_status = hipErrorInvalidValue;
-        } else {
-            auto device = ctx->getWriteableDevice();
-#if (__hcc_workweek__ >= 19115)
-            //Avoid mapping host pinned memory to all devices by HCC
-            unsigned amFlags = amHostUnmapped;
-#else
-            unsigned amFlags = 0;
-#endif
-            if (flags & hipHostMallocCoherent) {
-                amFlags |= amHostCoherent;
-            } else if (flags & hipHostMallocNonCoherent) {
-                amFlags |= amHostNonCoherent;
-            } else {
-                // depends on env variables:
-                amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent;
-            }
-
-
-            *ptr = hip_internal::allocAndSharePtr(
-                (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx,
-                true  /*shareWithAll*/, amFlags, flags, 0);
-
-            if (sizeBytes && (*ptr == NULL)) {
-                hip_status = hipErrorMemoryAllocation;
-            }
-        }
-    }
-
-    if (HIP_SYNC_HOST_ALLOC) {
-        hipDeviceSynchronize();
-    }
-    return hip_status;
-}

 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
    HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
    HIP_SET_DEVICE();
    hipError_t hip_status = hipSuccess;
-    hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags);
+    hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
    return ihipLogStatus(hip_status);
 }

@@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
    if(flags != hipMemAttachGlobal)
        hip_status = hipErrorInvalidValue;
    else
-        hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
+        hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault);
    return ihipLogStatus(hip_status);
 }

@@ -1935,15 +1965,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si
    return ihipLogStatus(e);
 };

-hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) {
-    HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes);
+hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) {
+    HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count);

    hipError_t e = hipSuccess;

    hipStream_t stream = hipStreamNull;
    stream = ihipSyncAndResolveStream(stream);
    if (stream) {
-        e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar);
+        e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar);
        stream->locked_wait();
    } else {
        e = hipErrorInvalidValue;
@@ -1951,23 +1981,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes
    return ihipLogStatus(e);
 }

-hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) {
-    HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
+hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) {
+    HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream);

    stream = ihipSyncAndResolveStream(stream);
    if (stream) {
-        return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar));
+        return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar));
    } else {
        return ihipLogStatus(hipErrorInvalidValue);
    }
 }

-hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){
-    HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes);
+hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){
+    HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count);
    hipError_t e = hipSuccess;
    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
    if (stream) {
-        e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort);
+        e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort);
        if(hipSuccess == e)
            stream->locked_wait();
    } else {
@@ -1976,12 +2006,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt
    return ihipLogStatus(e);
 }

-hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){
-    HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream);
+hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){
+    HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream);

    stream = ihipSyncAndResolveStream(stream);
    if (stream) {
-        return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort));
+        return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort));
    } else {
        return ihipLogStatus(hipErrorInvalidValue);
    }
@@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) {
 hipError_t hipHostFree(void* ptr) {
    HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr);

-    // Synchronize to ensure all work has finished.
-    ihipGetTlsDefaultCtx()->locked_waitAllStreams();  // ignores non-blocking streams, this waits
-                                                      // for all activity to finish.
-
-
-    hipError_t hipStatus = hipErrorInvalidValue;
-    if (ptr) {
-        hc::accelerator acc;
-#if (__hcc_workweek__ >= 17332)
-        hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0);
-#else
-        hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
-#endif
-        am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
-        if (status == AM_SUCCESS) {
-            if (amPointerInfo._hostPointer == ptr) {
-                hc::am_free(ptr);
-                hipStatus = hipSuccess;
-            }
-        }
-    } else {
-        // free NULL pointer succeeds and is common technique to initialize runtime
-        hipStatus = hipSuccess;
-    }
+    hipError_t hipStatus = hipSuccess;
+    hipStatus = hip_internal::ihipHostFree(tls, ptr);

    return ihipLogStatus(hipStatus);
 };
@@ -109,6 +109,7 @@ struct ihipModuleSymbol_t {
    amd_kernel_code_t const* _header{};
    string _name;  // TODO - review for performance cost.  Name is just used for debug.
    vector<pair<size_t, size_t>> _kernarg_layout{};
+    bool _is_code_object_v3{};
 };

 template <>
@@ -137,7 +138,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
                                  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
-                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) {
+                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
+                                  void** impCoopParams = 0) {
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
@@ -181,10 +183,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
            return hipErrorInvalidValue;
        }

-        // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero.
+        // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero.
        size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1);
        kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0);

+        if (impCoopParams) {
+            const auto p{static_cast<const char*>(*impCoopParams)};
+            // The sixth index is for multi-grid synchronization
+            kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
+                            p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
+        }
+
        /*
          Kernel argument preparation.
        */
@@ -208,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
        aql.grid_size_x = globalWorkSizeX;
        aql.grid_size_y = globalWorkSizeY;
        aql.grid_size_z = globalWorkSizeZ;
-        bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
-        if (is_code_object_v3) {
+        if (f->_is_code_object_v3) {
            const auto* header =
                reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
            aql.group_segment_size =
@@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
        return ihipLogStatus(hipErrorLaunchFailure);
    }

+    size_t impCoopArg = 1;
+    void* impCoopParams[1];
+    impCoopParams[0] = &impCoopArg;
+
    // launch the main kernel
    result = ihipModuleLaunchKernel(tls, kd,
            gridDim.x * blockDimX.x,
@@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
            gridDim.z * blockDimX.z,
            blockDimX.x, blockDimX.y, blockDimX.z,
            sharedMemBytes, stream, kernelParams, nullptr, nullptr,
-            nullptr, 0, true);
+            nullptr, 0, true, impCoopParams);

    stream->criticalData().unlock();
 #if (__hcc_workweek__ >= 19213)
@@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
    HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
    hipError_t result;

-    if (numDevices > g_deviceCnt || launchParamsList == nullptr) {
+    if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
        return ihipLogStatus(hipErrorInvalidValue);
    }

@@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
                kargs.getHandle());
    }

+    mg_sync *mg_sync_ptr = 0;
+    mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
+
+    result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
+    if (result != hipSuccess) {
+        return ihipLogStatus(hipErrorInvalidValue);
+    }
+    mg_sync_ptr->w0 = 0;
+    mg_sync_ptr->w1 = 0;
+
+    uint all_sum = 0;
+    for (int i = 0; i < numDevices; ++i) {
+        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
+        if (result != hipSuccess) {
+            hip_internal::ihipHostFree(tls, mg_sync_ptr);
+            for (int j = 0; j < i; ++j) {
+                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+            }
+            return ihipLogStatus(hipErrorInvalidValue);
+        }
+        // calculate the sum of sizes of all grids
+        const hipLaunchParams& lp = launchParamsList[i];
+        all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
+                   lp.gridDim.x  * lp.gridDim.y  * lp.gridDim.z;
+    }
+
    // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
    for (int i = 0; i < numDevices; ++i) {
        LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
@@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
 #endif
    }

-    // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device
+    // launch the init_gws kernel to initialize the GWS for each device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

@@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
 #endif
            }
+            hip_internal::ihipHostFree(tls, mg_sync_ptr);
+            for (int j = 0; j < numDevices; ++j) {
+                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+            }
+
            return ihipLogStatus(hipErrorLaunchFailure);
        }
+    }
+
+    void* impCoopParams[1];
+    ulong prev_sum = 0;
+    // launch the main kernels for each device
+    for (int i = 0; i < numDevices; ++i) {
+        const hipLaunchParams& lp = launchParamsList[i];
+
+        //initialize and setup the implicit kernel argument for multi-grid sync
+        mg_info_ptr[i]->mgs       = mg_sync_ptr;
+        mg_info_ptr[i]->grid_id   = i;
+        mg_info_ptr[i]->num_grids = numDevices;
+        mg_info_ptr[i]->all_sum   = all_sum;
+        mg_info_ptr[i]->prev_sum  = prev_sum;
+        prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
+                    lp.gridDim.x  * lp.gridDim.y  * lp.gridDim.z;
+
+
+        impCoopParams[0] = &mg_info_ptr[i];

        result = ihipModuleLaunchKernel(tls, kds[i],
                lp.gridDim.x * lp.blockDim.x,
@@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
-                true);
+                true, impCoopParams);
+
+        if (result != hipSuccess) {
+            for (int j = 0; j < numDevices; ++j) {
+                launchParamsList[j].stream->criticalData().unlock();
+#if (__hcc_workweek__ >= 19213)
+                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
+#endif
+            }
+            hip_internal::ihipHostFree(tls, mg_sync_ptr);
+            for (int j = 0; j < numDevices; ++j) {
+                hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+            }
+
+            return ihipLogStatus(hipErrorLaunchFailure);
+        }
+
    }

    // unlock all streams
@@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi
    free(gwsKds);
    free(kds);

+    hip_internal::ihipHostFree(tls, mg_sync_ptr);
+    for (int j = 0; j < numDevices; ++j) {
+        hip_internal::ihipHostFree(tls, mg_info_ptr[j]);
+    }
+
    return ihipLogStatus(result);
 }

@@ -977,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
    //       available per CU, therefore we hardcode it to 64 KiRegisters.
    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

-    bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
-    if (is_code_object_v3) {
+    if (kd._is_code_object_v3) {
        r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
        r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
-    } else {
-        r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
-        r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
-    }
-    r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
-    if (is_code_object_v3) {
        r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
-    } else {
-        r.numRegs = kd._header->workitem_vgpr_count;
-    }
-    r.maxThreadsPerBlock = r.numRegs ?
-        std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
-        prop.maxThreadsPerBlock;
-    if (is_code_object_v3) {
        r.binaryVersion = 0; // FIXME: should it be the ISA version or code
                             //        object format version?
    } else {
+        r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
+        r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
+        r.numRegs = kd._header->workitem_vgpr_count;
        r.binaryVersion =
            kd._header->amd_machine_version_major * 10 +
            kd._header->amd_machine_version_minor;
    }
+    r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
+    r.maxThreadsPerBlock = r.numRegs ?
+        std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
+        prop.maxThreadsPerBlock;
    r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.

    return r;
@@ -1099,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima
                                            content.data(), content.size(), (*module)->executable,
                                            this_agent());

-    std::vector<char> blob(content.cbegin(), content.cend());
-    program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
+    program_state_impl::read_kernarg_metadata(content, (*module)->kernargs);

    // compute the hash of the code object
    (*module)->hash = checksum(content.length(), content.data());
@@ -1152,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const

 void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
 {
-    bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
-    if (is_code_object_v3) {
+    if (f->_is_code_object_v3) {
        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
        // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
        // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
@@ -1174,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size
    }
 }

-hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
-                                              hipFunction_t f, size_t dynSharedMemPerBlk,
-                                              uint32_t blockSizeLimit)
+hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize,
+                                              hipFunction_t f, size_t dynamicSMemSize,
+                                              int blockSizeLimit)
 {
    using namespace hip_impl;

@@ -1257,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
        }
        else {
            size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor;
-            size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk);
+            size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize);
            wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG;
        }

@@ -1286,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize,
    return hipSuccess;
 }

-hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             uint32_t blockSizeLimit)
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                             const void* f, size_t dynamicSMemSize,
+                                             int blockSizeLimit)
 {
-    HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
-
+    HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit);
+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
    return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls,
-        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
+        gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit));
 }

 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize)
 {
    using namespace hip_impl;

@@ -1326,45 +1401,50 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;

    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
-    size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
+    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
+        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));

    // Calculate blocks occupancy per CU based on VGPR usage
    *numBlocks = vgprs_alu_occupancy / numWavefronts;

    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
-    size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd
+    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

    // Calculate blocks occupancy per CU based on SGPR usage
-    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); 
+    *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));

-    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
+    size_t total_used_lds = usedLDS + dynamicSMemSize;
    if (total_used_lds != 0) {
      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
-      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+      *numBlocks = std::min(*numBlocks, (int) lds_occupancy);
    }

    return hipSuccess;
 }

 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize)
 {
-    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
+    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize);

+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynamicSMemSize));
 }

 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t  blockSize, size_t dynSharedMemPerBlk,
+   int* numBlocks, const void* f, int  blockSize, size_t dynamicSMemSize,
   unsigned int flags)
 {
-    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
+    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags);

+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynamicSMemSize));
 }

 hipError_t hipLaunchKernel(
@@ -89,9 +89,10 @@ struct Symbol {

 class Kernel_descriptor {
    std::uint64_t kernel_object_{};
-    amd_kernel_code_t const* kernel_header_{nullptr};
-    std::string name_{};
+    amd_kernel_code_t const* header_{};
+    std::string name_;
    std::vector<std::pair<std::size_t, std::size_t>> kernarg_layout_{};
+    bool is_code_object_v3_{};
 public:
    Kernel_descriptor() = default;
    Kernel_descriptor(
@@ -101,7 +102,8 @@ public:
        :
        kernel_object_{kernel_object},
        name_{name},
-        kernarg_layout_{std::move(kernarg_layout)}
+        kernarg_layout_{std::move(kernarg_layout)},
+        is_code_object_v3_{name.find(".kd") != std::string::npos}
    {
        bool supported{false};
        std::uint16_t min_v{UINT16_MAX};
@@ -123,7 +125,7 @@ public:

        r = tbl.hsa_ven_amd_loader_query_host_address(
            reinterpret_cast<void*>(kernel_object_),
-            reinterpret_cast<const void**>(&kernel_header_));
+            reinterpret_cast<const void**>(&header_));

        if (r != HSA_STATUS_SUCCESS) return;
    }
@@ -149,7 +151,7 @@ public:
            std::string,
                std::unordered_map<
                    hsa_isa_t,
-                    std::vector<std::vector<char>>>>> code_object_blobs;
+                    std::vector<std::string>>>> code_object_blobs;

    std::pair<
        std::once_flag,
@@ -213,7 +215,7 @@ public:
        std::string,
            std::unordered_map<
                hsa_isa_t,
-                std::vector<std::vector<char>>>>& get_code_object_blobs() {
+                std::vector<std::string>>>& get_code_object_blobs() {

        std::call_once(code_object_blobs.first, [this]() {
            dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) {
@@ -584,6 +586,68 @@ public:
        return functions[agent].second;
    }

+    static
+    std::size_t parse_args_v2(
+            const std::string& metadata,
+            std::size_t f,
+            std::size_t l,
+            std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
+        if (f == l) return f;
+        if (!size_align.empty()) return l;
+
+        do {
+            static constexpr size_t size_sz{5};
+            f = metadata.find("Size:", f) + size_sz;
+
+            if (l <= f) return f;
+
+            auto size = std::strtoul(&metadata[f], nullptr, 10);
+
+            static constexpr size_t align_sz{6};
+            f = metadata.find("Align:", f) + align_sz;
+
+            char* l{};
+            auto align = std::strtoul(&metadata[f], &l, 10);
+
+            f += (l - &metadata[f]) + 1;
+
+            size_align.emplace_back(size, align);
+        } while (true);
+    }
+
+    static
+    void read_kernarg_metadata_v2(
+        const std::string& kernels_md,
+        std::size_t dx,
+        std::unordered_map<
+            std::string,
+            std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
+        do {
+            dx = kernels_md.find("Name:", dx);
+
+            if (dx == std::string::npos) break;
+
+            static constexpr decltype(kernels_md.size()) name_sz{5};
+            dx = kernels_md.find_first_not_of(" '", dx + name_sz);
+
+            auto fn =
+                kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx);
+            dx += fn.size();
+
+            auto dx1 = kernels_md.find("CodeProps", dx);
+            dx = kernels_md.find("Args:", dx);
+
+            if (dx1 < dx) {
+                dx = dx1;
+                continue;
+            }
+            if (dx == std::string::npos) break;
+
+            static constexpr decltype(kernels_md.size()) args_sz{5};
+            dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]);
+        } while (true);
+    }
+
    static
    std::string metadata_to_string(const amd_comgr_metadata_node_t& md) {
        std::string str;
@@ -598,9 +662,8 @@ public:
    }

    static
-    void parse_args(
+    void parse_args_v3(
            const amd_comgr_metadata_node_t& args_md,
-            bool is_code_object_v3,
            std::vector<std::pair<std::size_t, std::size_t>>& size_align) {
        size_t arg_count = 0;
        if (amd_comgr_get_metadata_list_size(args_md, &arg_count)
@@ -615,9 +678,7 @@ public:
                return;

            amd_comgr_metadata_node_t arg_size_md;
-            if (amd_comgr_metadata_lookup(arg_md,
-                                          is_code_object_v3 ? ".size" : "Size",
-                                          &arg_size_md)
+            if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md)
                != AMD_COMGR_STATUS_SUCCESS)
                return;

@@ -629,35 +690,21 @@ public:

            size_t arg_align;

-            if (is_code_object_v3) {
-                amd_comgr_metadata_node_t arg_offset_md;
-                if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
-                    != AMD_COMGR_STATUS_SUCCESS)
-                    return;
+            amd_comgr_metadata_node_t arg_offset_md;
+            if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md)
+                != AMD_COMGR_STATUS_SUCCESS)
+                return;

-                size_t arg_offset
-                    = std::stoul(metadata_to_string(arg_offset_md));
+            size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md));

-                if (amd_comgr_destroy_metadata(arg_offset_md)
-                    != AMD_COMGR_STATUS_SUCCESS)
-                    return;
+            if (amd_comgr_destroy_metadata(arg_offset_md)
+                != AMD_COMGR_STATUS_SUCCESS)
+                return;

-                arg_align = 1;
-                while (arg_offset && (arg_offset & 1) == 0) {
-                    arg_offset >>= 1;
-                    arg_align <<= 1;
-                }
-            } else {
-                amd_comgr_metadata_node_t arg_align_md;
-                if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md)
-                    != AMD_COMGR_STATUS_SUCCESS)
-                    return;
-
-                arg_align = std::stoul(metadata_to_string(arg_align_md));
-
-                if (amd_comgr_destroy_metadata(arg_align_md)
-                    != AMD_COMGR_STATUS_SUCCESS)
-                    return;
+            arg_align = 1;
+            while (arg_offset && (arg_offset & 1) == 0) {
+                arg_offset >>= 1;
+                arg_align <<= 1;
            }

            size_align.emplace_back(arg_size, arg_align);
@@ -669,11 +716,11 @@ public:
    }

    static
-    void read_kernarg_metadata(
-            const std::vector<char>& blob,
+    void read_kernarg_metadata_v3(
+            const std::string& blob,
            std::unordered_map<
-            std::string,
-            std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
+                std::string,
+                std::vector<std::pair<std::size_t, std::size_t>>>& kernargs) {
        amd_comgr_data_t dataIn;
        amd_comgr_status_t status;

@@ -690,7 +737,6 @@ public:
            != AMD_COMGR_STATUS_SUCCESS)
            return;

-        bool is_code_object_v3 = false;
        amd_comgr_metadata_node_t kernels_md;
        if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md)
            != AMD_COMGR_STATUS_SUCCESS) {
@@ -699,7 +745,6 @@ public:
                                          &kernels_md)
                != AMD_COMGR_STATUS_SUCCESS)
                return;
-            is_code_object_v3 = true;
        }

        size_t kernel_count = 0;
@@ -715,9 +760,7 @@ public:
                continue;

            amd_comgr_metadata_node_t name_md;
-            if (amd_comgr_metadata_lookup(kernel_md,
-                                          is_code_object_v3 ? ".name" : "Name",
-                                          &name_md)
+            if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md)
                != AMD_COMGR_STATUS_SUCCESS)
                continue;

@@ -727,21 +770,15 @@ public:
                != AMD_COMGR_STATUS_SUCCESS)
                continue;

-            if (is_code_object_v3)
-                kernel_name_str.append(".kd");
-
-
            amd_comgr_metadata_node_t args_md;
-            if (amd_comgr_metadata_lookup(kernel_md,
-                                          is_code_object_v3 ? ".args" : "Args",
-                                          &args_md)
+            if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md)
                != AMD_COMGR_STATUS_SUCCESS)
                continue;

            auto foundKernel = kernargs.find(kernel_name_str);
            // parse arguments for a given kernel only once
            if (foundKernel == kernargs.end()) {
-                parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]);
+                parse_args_v3(args_md, kernargs[kernel_name_str]);
            }

            if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS
@@ -757,7 +794,52 @@ public:
        amd_comgr_release_data(dataIn);
    }

-    const std::unordered_map<std::string, 
+    static
+    void read_kernarg_metadata(
+        const std::string& blob,
+        std::unordered_map<
+            std::string,
+            std::vector<std::pair<std::size_t, std::size_t>>>& kernargs)
+    {
+        std::istringstream istr{blob};
+        ELFIO::elfio reader;
+
+        if (!reader.load(istr)) return;
+
+        // TODO: this is inefficient.
+        auto it = find_section_if(reader, [](const ELFIO::section* x) {
+            return x->get_type() == SHT_NOTE;
+        });
+
+        if (!it) return;
+
+        const ELFIO::note_section_accessor acc{reader, it};
+        auto n{acc.get_notes_num()};
+        while (n--) {
+            ELFIO::Elf_Word type{};
+            std::string name{};
+            void* desc{};
+            ELFIO::Elf_Word desc_size{};
+
+            acc.get_note(n, type, name, desc, desc_size);
+
+            if (name == "AMDGPU") {
+                return read_kernarg_metadata_v3(blob, kernargs);
+            }
+            if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA.
+
+            std::string tmp{
+                static_cast<char*>(desc), static_cast<char*>(desc) + desc_size};
+
+            auto dx = tmp.find("Kernels:");
+
+            if (dx == std::string::npos) continue;
+
+            return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:".
+        }
+    }
+
+    const std::unordered_map<std::string,
        std::vector<std::pair<std::size_t, std::size_t>>>& get_kernargs() {

        std::call_once(kernargs.first, [this]() {
@@ -12,18 +12,33 @@ import lit.util
 site_cfg = lit_config.params.get('site_config', None)
 lit_config.load_config(config, site_cfg)

-print(str("========================================"))
+config.excludes = ['cmdparser.hpp']
+config.excludes.append('spatial_batch_norm_op.h')
+config.excludes.append('common_cudnn.h')
+
+delimiter = "===============================================================";
+print(delimiter)
 print("CUDA " + config.cuda_version + " - will be used for testing")
 print("LLVM " + config.llvm_version + " - will be used for testing")
 print(platform.machine() + " - Platform architecture")
 print(platform.system() + " " + platform.release() + " - Platform OS")
 print(str(config.pointer_size * 8) + " - hipify-clang binary bitness")
 print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness")
-print(str("========================================"))
-
-config.excludes = ['cmdparser.hpp']
-config.excludes.append('spatial_batch_norm_op.h')
-config.excludes.append('common_cudnn.h')
+print(delimiter)
+warns = None
+if not config.cuda_dnn_root:
+    config.excludes.append('cudnn_convolution_forward.cu')
+    config.excludes.append('cudnn_softmax.cu')
+    print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR")
+    warns = True
+if not config.cuda_cub_root:
+    config.excludes.append('cub_01.cu')
+    config.excludes.append('cub_02.cu')
+    config.excludes.append('cub_03.cu')
+    print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR")
+    warns = True
+if warns:
+    print(delimiter)

 if config.cuda_version_major == 7 and config.cuda_version_minor == 0:
    config.excludes.append('headers_test_09.cu')
@@ -90,17 +105,47 @@ else:
    run_test_ext = ".sh"
    # CUDA SDK ROOT
    clang_arguments += " -isystem'%s'/samples/common/inc"
-# cuDNN ROOT
-clang_arguments += " -I'%s'/include"
 if config.pointer_size == 8:
    clang_arguments += " -D__LP64__"

+# Set max clang's CudaArch for corresponding CUDA version
+# to support maximum CUDA features in offline tests
+if config.cuda_version_major == 7:
+    if config.cuda_version_minor == 5:
+        clang_arguments += " --cuda-gpu-arch=sm_53"
+    else:
+        clang_arguments += " --cuda-gpu-arch=sm_52"
+elif config.cuda_version_major == 8:
+    clang_arguments += " --cuda-gpu-arch=sm_62"
+elif config.cuda_version_major == 9:
+    if config.cuda_version_minor == 2:
+        clang_arguments += " --cuda-gpu-arch=sm_72"
+    else:
+        clang_arguments += " --cuda-gpu-arch=sm_70"
+elif config.cuda_version_major == 10:
+    clang_arguments += " --cuda-gpu-arch=sm_75"
+
+# cuDNN ROOT
+if config.cuda_dnn_root:
+    clang_arguments += " -I'%s'/include"
 # CUB ROOT
-clang_arguments += " -I'%s'"
+if config.cuda_cub_root:
+    clang_arguments += " -I'%s'"

-hipify_arguments = "--cuda-path='%s'"
+if config.cuda_dnn_root and config.cuda_cub_root:
+    config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
+elif config.cuda_dnn_root:
+    config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root)))
+elif config.cuda_cub_root:
+    config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root)))
+else:
+    config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root))

-config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root)))
-config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root)))
+if config.llvm_version_major < 4:
+    hipify_arguments = "-I'%s'/include"
+else:
+    hipify_arguments = "--cuda-path='%s'"
+
+config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root))
 config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
 config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
@@ -276,7 +276,9 @@ int main(int argc, char** argv) {
    runTest<unsigned int>();
    runTest<unsigned long long>();
    runTest<float>();
+#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
    runTest<double>();
+#endif
    // CHECK: hipDeviceReset();
    cudaDeviceReset();
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
@@ -0,0 +1,69 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
+// CHECK: #include <hip/hip_runtime.h>
+#include <iostream>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+// CHECK: #include <hipcub/hipcub.hpp>
+#include <cub/cub.cuh>
+
+#include <iostream>
+
+template <int BLOCK_WIDTH, int ITEMS_PER_THREAD,
+          // CHECK: hipcub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
+          cub::BlockLoadAlgorithm BLOCK_LOAD_ALGO,
+          // CHECK: hipcub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
+          cub::BlockStoreAlgorithm BLOCK_STORE_ALGO,
+          typename T>
+__global__ void sort(const T* data_in, T* data_out){
+    // CHECK: typedef hipcub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
+    typedef cub::BlockLoad<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_LOAD_ALGO> BlockLoadT;
+    // CHECK: typedef hipcub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
+    typedef cub::BlockRadixSort<T, BLOCK_WIDTH, ITEMS_PER_THREAD> BlockRadixSortT;
+    // CHECK: typedef hipcub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
+    typedef cub::BlockStore<T, BLOCK_WIDTH, ITEMS_PER_THREAD, BLOCK_STORE_ALGO> BlockStoreT;
+    __shared__ union {
+        typename BlockLoadT::TempStorage load;
+        typename BlockRadixSortT::TempStorage sort;
+        typename BlockStoreT::TempStorage store;
+    } tmp_storage;
+    T items[ITEMS_PER_THREAD];
+    BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
+    __syncthreads();
+    BlockRadixSortT(tmp_storage.sort).Sort(items);
+    __syncthreads();
+    BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items);
+}
+
+int main() {
+    double* d_gpu = NULL;
+    double* result_gpu = NULL;
+    double* data_sorted = new double[1000*4096];
+    // Allocate memory on the GPU
+    // CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double));
+    cudaMalloc(&d_gpu, 1000*4096 * sizeof(double));
+    // CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double));
+    cudaMalloc(&result_gpu, 1000*4096 * sizeof(double));
+    // CHECK: hiprandGenerator_t gen;
+    curandGenerator_t gen;
+    // Create generator
+    // CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT);
+    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+    // Fill array with random numbers
+    // CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
+    curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0);
+    // Destroy generator
+    // CHECK: hiprandDestroyGenerator(gen);
+    curandDestroyGenerator(gen);
+    // Sort data
+    // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu);
+    sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu);
+    // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu);
+    sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu);
+    // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost);
+    cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost);
+    // Write the sorted data to standard out
+    for (int i = 0; i < 4095; ++i) {
+        std::cout << data_sorted[i] << ", ";
+    }
+    std::cout << data_sorted[4095] << std::endl;
+}
@@ -0,0 +1,33 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
+// CHECK: #include <hip/hip_runtime.h>
+#include <iostream>
+// CHECK: #include <hipcub/hipcub.hpp>
+#include <cub/cub.cuh>
+
+// using namespace hipcub;
+using namespace cub;
+
+// Simple CUDA kernel for computing tiled partial sums
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD,
+          // CHECK: hipcub::BlockLoadAlgorithm LOAD_ALGO,
+          cub::BlockLoadAlgorithm LOAD_ALGO,
+          // CHECK: hipcub::BlockScanAlgorithm SCAN_ALGO>
+          cub::BlockScanAlgorithm SCAN_ALGO>
+__global__ void ScanTilesKernel(int *d_in, int *d_out) {
+  // Specialize collective types for problem context
+  // CHECK: typedef ::hipcub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
+  typedef ::cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGO> BlockLoadT;
+  typedef BlockScan<int, BLOCK_THREADS, SCAN_ALGO> BlockScanT;
+  // Allocate on-chip temporary storage
+  __shared__ union {
+    typename BlockLoadT::TempStorage load;
+    typename BlockScanT::TempStorage reduce;
+  } temp_storage;
+  // Load data per thread
+  int thread_data[ITEMS_PER_THREAD];
+  int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
+  BlockLoadT(temp_storage.load).Load(d_in + offset, offset);
+  __syncthreads();
+  // Compute the block-wide prefix sum
+  BlockScanT(temp_storage).Sum(thread_data);
+}
@@ -67,7 +67,6 @@ int devcheck(int  gpudevice, int rank)
  cudaError_t cudareturn;
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, gpudevice);
-  // CHECK: if (deviceProp.hipWarpSize <= 1)
  if (deviceProp.warpSize <= 1)
  {
    printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
@@ -29,7 +29,12 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__  || \
+    __HIP_ARCH_GFX900__  || \
+    __HIP_ARCH_GFX906__  || \
+    __HIP_ARCH_GFX908__  || \
+    __HIP_ARCH_GFX1010__ || \
+    __HIP_ARCH_GFX1012__

 __global__ void kernel_abs_int64(long long *input, long long *output) {
    int tx = threadIdx.x;
@@ -28,7 +28,12 @@ THE SOFTWARE.

 #include "test_common.h"

-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__  || \
+    __HIP_ARCH_GFX900__  || \
+    __HIP_ARCH_GFX906__  || \
+    __HIP_ARCH_GFX908__  || \
+    __HIP_ARCH_GFX1010__ || \
+    __HIP_ARCH_GFX1012__

 __device__ void test_convert() {
  __half x;
@@ -32,7 +32,12 @@ THE SOFTWARE.

 using namespace std;

-#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__
+#if __HIP_ARCH_GFX803__  || \
+    __HIP_ARCH_GFX900__  || \
+    __HIP_ARCH_GFX906__  || \
+    __HIP_ARCH_GFX908__  || \
+    __HIP_ARCH_GFX1010__ || \
+    __HIP_ARCH_GFX1012__

 __global__
 void __halfTest(bool* result, __half a) {
@@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) {
        HIPCHECK(hipHostFree(A_h1));
        HIPCHECK(hipHostFree(A_h2));
    } else {
-        free(A_h1);
-        free(A_h2);
+        aligned_free(A_h1);
+        aligned_free(A_h2);
    }
 }

@@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice)
    char *A_d;
    char *A_h;
    bool testResult = true;
-    HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) );
+    HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16));
    A_h = (char*)malloc(sizeElements);
    HIPASSERT(A_h != NULL);
    for (size_t i=0; i<elements; i++) {
@@ -89,8 +89,9 @@ bool testhipMemset2DAsync(int memsetval,int p_gpuDevice)
    }
    hipStream_t stream;
    HIPCHECK(hipStreamCreate(&stream));
-    HIPCHECK ( hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
-    HIPCHECK ( hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
+    HIPCHECK(hipStreamSynchronize(stream));
+    HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));

    for (int i=0; i<elements; i++) {
        if (A_h[i] != memsetval) {
@@ -112,9 +113,9 @@ int main(int argc, char *argv[])
    hipCtx_t context;
    hipCtxCreate(&context, 0, p_gpuDevice);

-    bool testResult = false;
-    testResult = testhipMemset2D(memsetval, p_gpuDevice);
-    testResult = testhipMemset2DAsync(memsetval, p_gpuDevice);
+    bool testResult = true;
+    testResult &= testhipMemset2D(memsetval, p_gpuDevice);
+    testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice);
    hipCtxDestroy(context);
    if(testResult){
       passed();
@@ -41,27 +41,27 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
    size_t elements = numW* numH* depth;


-	printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
+    printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
    char *A_h;
    bool testResult = true;
    hipExtent extent = make_hipExtent(width, numH, depth);
    hipPitchedPtr devPitchedPtr;

    HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
-	A_h = (char*)malloc(sizeElements);
-	HIPASSERT(A_h != NULL);
-	for (size_t i=0; i<elements; i++) {
+    A_h = (char*)malloc(sizeElements);
+    HIPASSERT(A_h != NULL);
+    for (size_t i=0; i<elements; i++) {
        A_h[i] = 1;
    }
-	HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) );
-	hipMemcpy3DParms myparms = {0};
-	myparms.srcPos = make_hipPos(0,0,0);
-	myparms.dstPos = make_hipPos(0,0,0);
-	myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
-	myparms.srcPtr = devPitchedPtr;
-	myparms.extent = extent;
+    HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
+    hipMemcpy3DParms myparms = {0};
+    myparms.srcPos = make_hipPos(0,0,0);
+    myparms.dstPos = make_hipPos(0,0,0);
+    myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
+    myparms.srcPtr = devPitchedPtr;
+    myparms.extent = extent;
 #ifdef __HIP_PLATFORM_NVCC__
-	myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
+    myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
 #else
    myparms.kind = hipMemcpyDeviceToHost;
 #endif
@@ -69,7 +69,58 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)

    for (int i=0; i<elements; i++) {
        if (A_h[i] != memsetval) {
-			testResult = false;
+            testResult = false;
+            printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
+            break;
+        }
+    }
+    HIPCHECK(hipFree(devPitchedPtr.ptr));
+    free(A_h);
+    return testResult;
+}
+
+bool testhipMemset3DAsync(int memsetval,int p_gpuDevice)
+{
+    size_t numH = 256;
+    size_t numW = 256;
+    size_t depth = 10;
+    size_t width = numW * sizeof(char);
+    size_t sizeElements = width * numH * depth;
+    size_t elements = numW* numH* depth;
+
+
+    printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
+    char *A_h;
+    bool testResult = true;
+    hipExtent extent = make_hipExtent(width, numH, depth);
+    hipPitchedPtr devPitchedPtr;
+
+    HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
+    A_h = (char*)malloc(sizeElements);
+    HIPASSERT(A_h != NULL);
+    for (size_t i=0; i<elements; i++) {
+        A_h[i] = 1;
+    }
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreate(&stream));
+    HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
+    HIPCHECK(hipStreamSynchronize(stream));
+    hipMemcpy3DParms myparms = {0};
+    myparms.srcPos = make_hipPos(0,0,0);
+    myparms.dstPos = make_hipPos(0,0,0);
+    myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
+    myparms.srcPtr = devPitchedPtr;
+    myparms.extent = extent;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    HIPCHECK(hipMemcpy3D(&myparms));
+
+    for (int i=0; i<elements; i++) {
+        if (A_h[i] != memsetval) {
+            testResult = false;
            printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
            break;
        }
@@ -82,9 +133,10 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice)
 int main(int argc, char *argv[])
 {
    HipTest::parseStandardArguments(argc, argv, true);
-    bool testResult = false;
    HIPCHECK(hipSetDevice(p_gpuDevice));
-    testResult = testhipMemset3D(memsetval, p_gpuDevice);
+    bool testResult = true;
+    testResult &= testhipMemset3D(memsetval, p_gpuDevice);
+    testResult &= testhipMemset3DAsync(memsetval, p_gpuDevice);
    if (testResult) {
        passed();
    } else {
@@ -116,7 +116,7 @@ int main() {

    dimBlock.x = workgroups[i];
    // Calculate the device occupancy to know how many blocks can be run concurrently
-    hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
        test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));

    dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
@@ -30,10 +30,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
 __global__ void f1(float *a) { *a = 1.0; }

 template <typename T>
@@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; }
 int main(int argc, char* argv[]) {

    // test case for using kernel function pointer
-    uint32_t gridSize = 0;
-    uint32_t blockSize = 0;
+    int gridSize = 0;
+    int blockSize = 0;
    hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
    assert(gridSize != 0 && blockSize != 0);

-    uint32_t numBlock = 0;
+    int numBlock = 0;
    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
    assert(numBlock != 0);

-
    // test case for using kernel function pointer with template
    gridSize = 0;
    blockSize = 0;
@@ -64,15 +59,5 @@ int main(int argc, char* argv[]) {
    hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
    assert(numBlock != 0);

-
-    // test case for using kernel with hipFunction_t type
-    numBlock = 0;
-    hipModule_t Module;
-    hipFunction_t Function;
-    HIPCHECK(hipModuleLoad(&Module, fileName));
-    HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
-    assert(numBlock != 0);
-
    passed();
 }
@@ -30,22 +30,16 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
 __global__ void f1(float *a) { *a = 1.0; }

 template <typename T>
 __global__ void f2(T *a) { *a = 1; }

-
-
 int main(int argc, char* argv[]) {

    // test case for using kernel function pointer 
-    uint32_t gridSize = 0;
-    uint32_t blockSize = 0;
+    int gridSize = 0;
+    int blockSize = 0;
    hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
    assert(gridSize != 0 && blockSize != 0);

@@ -55,15 +49,5 @@ int main(int argc, char* argv[]) {
    hipOccupancyMaxPotentialBlockSize<void(*)(int *)>(&gridSize, &blockSize, f2, 0, 0);
    assert(gridSize != 0 && blockSize != 0);

-    // test case for using kernel with hipFunction_t type
-    gridSize = 0;
-    blockSize = 0;
-    hipModule_t Module;
-    hipFunction_t Function;
-    HIPCHECK(hipModuleLoad(&Module, fileName));
-    HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
-    HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0));
-    assert(gridSize != 0 && blockSize != 0);
-
    passed();
 }
@@ -99,10 +99,13 @@ THE SOFTWARE.

 #ifdef _WIN64
 #include <tchar.h>
-#define aligned_alloc _aligned_malloc
+#define aligned_alloc(x,y) _aligned_malloc(y,x)
+#define aligned_free(x) _aligned_free(x)
 #define popen(x,y) _popen(x,y)
 #define pclose(x) _pclose(x)
 #define setenv(x,y,z) _putenv_s(x,y)
+#else
+#define aligned_free(x) free(x)
 #endif

 // standard command-line variables: