Merge branch 'amd-master' into amd-master-next

Change-Id: I3094c15008093f2072bcd38aca4ea90aeae2d97b [ROCm/hip commit: 2af31479e2]
2020-04-07 06:57:42 -04:00
@@ -207,19 +207,6 @@ if (NOT CPACK_SET_DESTDIR)
  set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package")
 endif (NOT CPACK_SET_DESTDIR)

-# Check if we need to enable ATP marker
-if(NOT DEFINED COMPILE_HIP_ATP_MARKER)
-    if(NOT DEFINED ENV{COMPILE_HIP_ATP_MARKER})
-        set(COMPILE_HIP_ATP_MARKER 0)
-    else()
-        set(COMPILE_HIP_ATP_MARKER $ENV{COMPILE_HIP_ATP_MARKER})
-        message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
-    endif()
-else()
-    message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.")
-endif()
-add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER)
-
 #############################
 # Profiling API support
 #############################
@@ -309,10 +296,6 @@ message(STATUS "\nHSA runtime in: " ${HSA_PATH})
 if(HIP_PLATFORM STREQUAL "hcc")
    include_directories(${PROJECT_SOURCE_DIR}/include)
    set(HIP_HCC_BUILD_FLAGS)
-    if(COMPILE_HIP_ATP_MARKER)
-        include_directories(/opt/rocm/profiler/CXLActivityLogger/include)
-        set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1")
-    endif()

    # Add HIP_VERSION to CMAKE_<LANG>_FLAGS
    set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}")
@@ -328,7 +311,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")

    set(SOURCE_FILES_RUNTIME
-        src/code_object_bundle.cpp
        src/program_state.cpp
        src/hip_clang.cpp
        src/hip_hcc.cpp
@@ -363,9 +345,6 @@ if(HIP_PLATFORM STREQUAL "hcc")
        set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
        set (CMAKE_SKIP_BUILD_RPATH TRUE )
    endif ()
-    if(COMPILE_HIP_ATP_MARKER)
-        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger")
-    endif()
    add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
    add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})

@@ -376,7 +355,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
    target_link_libraries(hip_hcc PRIVATE hc_am)
    target_link_libraries(hip_hcc_static PRIVATE hc_am)

-    add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
+    add_library(hiprtc SHARED src/hiprtc.cpp)
    target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY)
    set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" )
    set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" )
@@ -387,6 +366,9 @@ if(HIP_PLATFORM STREQUAL "hcc")

    set_target_properties(hip_hcc PROPERTIES CXX_VISIBILITY_PRESET hidden)
    set_target_properties(hip_hcc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
+    set_target_properties(hiprtc PROPERTIES CXX_VISIBILITY_PRESET hidden)
+    set_target_properties(hiprtc PROPERTIES VISIBILITY_INLINES_HIDDEN 1)
+

    if(HIP_PLATFORM STREQUAL "hcc")
        find_package(amd_comgr REQUIRED CONFIG
@@ -1,6 +1,7 @@
 ## What is this repository for? ###

-HIP allows developers to convert CUDA code to portable C++.  The same source code can be compiled to run on NVIDIA or AMD GPUs. 
+**HIP is a C++ Runtime API and Kernel Language that allows developers to create portable applications for AMD and NVIDIA GPUs from single source code.**
+
 Key features include:

 * HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
@@ -134,6 +134,7 @@ if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "VDI" and !defined $HIP_VDI_HOME) {
        $HIP_VDI_HOME = $HIP_PATH; # use HIP_PATH
    }
    $HIPCXXFLAGS .= "-D__HIP_VDI__";
+    $HIPCFLAGS .= "-D__HIP_VDI__";
 }

 if (defined $HIP_VDI_HOME) {
@@ -207,7 +208,8 @@ if ($HIP_PLATFORM eq "clang") {
    } else {
      $HIPCXXFLAGS .= " -std=c++11";
    }
-    $HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH";
+    $HIPCXXFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
+    $HIPCFLAGS .= " -isystem $HIP_CLANG_INCLUDE_PATH/..";
    $HIPLDFLAGS .= " -L$HIP_LIB_PATH";
    if (not $isWindows) {
      $HIPLDFLAGS .= " -Wl,--rpath-link=$HIP_LIB_PATH";
@@ -222,8 +224,10 @@ if ($HIP_PLATFORM eq "clang") {

    $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
    $HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
+    $HIPCFLAGS .= " -isystem $HSA_PATH/include";
    if (!($HIP_RUNTIME eq "HCC")) {
      $HIPCXXFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
+      $HIPCFLAGS .= " -D__HIP_VDI__ -fhip-new-launch-api";
    }

 } elsif ($HIP_PLATFORM eq "hcc") {
@@ -282,8 +286,11 @@ if ($HIP_PLATFORM eq "clang") {
    }

    $HIPCXXFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
+    $HIPCFLAGS .= " -isystem $HIP_PATH/include/hip/hcc_detail/cuda";
    $HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
+    $HIPCFLAGS .= " -isystem $HSA_PATH/include";
    $HIPCXXFLAGS .= " -Wno-deprecated-register";
+    $HIPCFLAGS .= " -Wno-deprecated-register";

    $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am ";
 #    $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport ";
@@ -321,6 +328,7 @@ if ($HIP_PLATFORM eq "clang") {
    $HIPCC="$CUDA_PATH/bin/nvcc";
    $HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets ";
    $HIPCXXFLAGS .= " -isystem $CUDA_PATH/include";
+    $HIPCFLAGS .= " -isystem $CUDA_PATH/include";

    $HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L$CUDA_PATH/lib64";
 } else {
@@ -330,11 +338,14 @@ if ($HIP_PLATFORM eq "clang") {

 # Add paths to common HIP includes:
 $HIPCXXFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;
+$HIPCFLAGS .= " -isystem $HIP_INCLUDE_PATH" ;

 my $compileOnly = 0;
 my $needCXXFLAGS = 0;  # need to add CXX flags to compile step
+my $needCFLAGS = 0;    # need to add C flags to compile step
 my $needLDFLAGS = 1;   # need to add LDFLAGS to compile step.
-my $hasC = 0;          # options contain a c-style file (NVCC must force recognition as GPU file)
+my $hasC = 0;          # options contain a c-style file
+my $hasCXX = 0;        # options contain a cpp-style file (NVCC must force recognition as GPU file)
 my $hasCU = 0;         # options contain a cu-style file (HCC must force recognition as GPU file)
 my $needHipHcc = ($HIP_PLATFORM eq 'hcc');      # set if we need to link hip_hcc.o from src tree. (some builds, ie cmake, provide their own)
 my $printHipVersion = 0;    # print HIP version
@@ -343,6 +354,7 @@ my $buildDeps = 0;
 my $linkType = 1;
 my $setLinkType = 0;
 my $coFormatv3 = 1;
+my $funcSupp = 0;      # enable function support

 my @options = ();
 my @inputs  = ();
@@ -400,7 +412,6 @@ foreach $arg (@ARGV)
    my $swallowArg = 0;
    if ($arg eq '-c' or $arg eq '--genco') {
        $compileOnly = 1;
-        $needCXXFLAGS = 1;
        $needLDFLAGS  = 0;
    }

@@ -463,6 +474,7 @@ foreach $arg (@ARGV)
    }
    if($trimarg eq '-use_fast_math') {
        $HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
+        $HIPCFLAGS .= " -DHIP_FAST_MATH ";
    }
    if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
    {
@@ -599,20 +611,30 @@ foreach $arg (@ARGV)
            #if $arg eq "--hipcc_profile") {  # Example argument here, hipcc
            #
            #}
+            if ($arg eq "--hipcc-func-supp") {
+              $funcSupp = 1;
+            } elsif ($arg eq "--hipcc-no-func-supp") {
+              $funcSupp = 0;
+            }
        } else {
            push (@options, $arg);
        }
        #print "O: <$arg>\n";
    } else {
        # input files and libraries
-        if (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.c$/) or ($arg =~ /\.cc$/) ) {
+        if ($arg =~ /\.c$/) {
            $hasC = 1;
+            $needCFLAGS = 1;
+            $toolArgs .= " -x c"
+        }
+        elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) {
+            $hasCXX = 1;
            $needCXXFLAGS = 1;
-            if ($HIP_PLATFORM eq 'clang') {
+            if ($HIP_PLATFORM eq 'clang' and not $arg =~ /\.c$/) {
                $toolArgs .= " -x hip"
            }
        }
-        if (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
+        elsif (($arg =~ /\.cu$/) or ($arg =~ /\.cuh$/) or ($arg =~ /\.hip$/)) {
            $hasCU = 1;
            $needCXXFLAGS = 1;
            if ($HIP_PLATFORM eq 'clang') {
@@ -657,7 +679,7 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){
            my $archMacro = ' -D__HIP_ARCH_' . uc($val) . '__=1 ';
            # Add the arch option and macro to the compiler options.
            $GPU_ARCH_ARG = $GPU_ARCH_OPT . $val;
-            $HIPLDFLAGS .= $GPU_ARCH_ARG;
+            $HIPLDARCHFLAGS .= $GPU_ARCH_ARG;
            $HIPCXXFLAGS .= $archMacro;
            if ($HIP_PLATFORM eq 'clang') {
                $HIPCXXFLAGS .= $GPU_ARCH_ARG;
@@ -685,7 +707,7 @@ if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') {
    $HIPCXXFLAGS .= " -mcode-object-v3";
 }

-if ($hasC and $HIP_PLATFORM eq 'nvcc') {
+if ($hasCXX and $HIP_PLATFORM eq 'nvcc') {
    $HIPCXXFLAGS .= " -x cu";
 }
 if ($hasCU and $HIP_PLATFORM eq 'hcc') {
@@ -694,6 +716,7 @@ if ($hasCU and $HIP_PLATFORM eq 'hcc') {

 if ($buildDeps and $HIP_PLATFORM eq 'nvcc') {
    $HIPCXXFLAGS .= " -M -D__CUDACC__";
+    $HIPCFLAGS .= " -M -D__CUDACC__";
 }

 if ($buildDeps and $HIP_PLATFORM eq 'clang') {
@@ -701,10 +724,14 @@ if ($buildDeps and $HIP_PLATFORM eq 'clang') {
 }

 # Add --hip-link only if there are no source files.
-if (!$needCXXFLAGS and $HIP_PLATFORM eq 'clang') {
+if (!$needCXXFLAGS and !$needCFLAGS and $HIP_PLATFORM eq 'clang') {
    $HIPLDFLAGS .= " --hip-link";
 }

+if (!$needCFLAGS and $HIP_PLATFORM eq 'clang') {
+    $HIPLDFLAGS .= $HIPLDARCHFLAGS;
+}
+
 if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc')
 {
    $HIPCXXFLAGS .= $HCC_WA_FLAGS;
@@ -727,11 +754,12 @@ if ($HIP_PLATFORM eq "clang") {
    # Set default optimization level to -O3 for hip-clang.
    if ($optArg eq "") {
        $HIPCXXFLAGS .= " -O3";
+        $HIPCFLAGS .= " -O3";
        $HIPLDFLAGS .= " -O3";
    }
    # Do not pass -mllvm on Windows since there is a clang bug causing duplicate -mllvm options in clang -cc1 on Windows.
    # ToDo : remove restriction for Windows after clang bug is fixed.
-    if ($optArg ne "-O0" and not $isWindows) {
+    if (!$funcSupp and $optArg ne "-O0" and not $isWindows) {
        $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
        if ($needLDFLAGS and not $needCXXFLAGS) {
            $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
@@ -747,18 +775,22 @@ if ($HIP_PLATFORM eq "clang") {

 if ($HIPCC_COMPILE_FLAGS_APPEND) {
    $HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
+    $HIPCFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
 }
 if ($HIPCC_LINK_FLAGS_APPEND) {
    $HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
 }

 my $CMD="$HIPCC";
-if ($needCXXFLAGS) {
-    $CMD .= " $HIPCXXFLAGS";
-}
 if ($needLDFLAGS and not $compileOnly) {
    $CMD .= " $HIPLDFLAGS";
 }
+if ($needCFLAGS) {
+    $CMD .= " $HIPCFLAGS";
+}
+if ($needCXXFLAGS) {
+    $CMD .= " $HIPCXXFLAGS";
+}
 $CMD .= " $toolArgs";

 if ($verbose & 0x1) {
@@ -341,8 +341,8 @@ sub simpleSubstitutions {
    $ft{'execution'} += s/\bcudaLaunchCooperativeKernelMultiDevice\b/hipLaunchCooperativeKernelMultiDevice/g;
    $ft{'execution'} += s/\bcudaLaunchKernel\b/hipLaunchKernel/g;
    $ft{'execution'} += s/\bcudaSetupArgument\b/hipSetupArgument/g;
-    $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
-    $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
+    $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessor\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessor/g;
+    $ft{'occupancy'} += s/\bcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
    $ft{'occupancy'} += s/\bcuOccupancyMaxPotentialBlockSize\b/hipOccupancyMaxPotentialBlockSize/g;
    $ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessor\b/hipOccupancyMaxActiveBlocksPerMultiprocessor/g;
    $ft{'occupancy'} += s/\bcudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\b/hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags/g;
@@ -754,6 +754,9 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseCcsrmm\b/hipsparseCcsrmm/g;
    $ft{'library'} += s/\bcusparseCcsrmm2\b/hipsparseCcsrmm2/g;
    $ft{'library'} += s/\bcusparseCcsrmv\b/hipsparseCcsrmv/g;
+    $ft{'library'} += s/\bcusparseCcsrsm2_analysis\b/hipsparseCcsrsm2_analysis/g;
+    $ft{'library'} += s/\bcusparseCcsrsm2_bufferSizeExt\b/hipsparseCcsrsm2_bufferSizeExt/g;
+    $ft{'library'} += s/\bcusparseCcsrsm2_solve\b/hipsparseCcsrsm2_solve/g;
    $ft{'library'} += s/\bcusparseCcsrsv2_analysis\b/hipsparseCcsrsv2_analysis/g;
    $ft{'library'} += s/\bcusparseCcsrsv2_bufferSize\b/hipsparseCcsrsv2_bufferSize/g;
    $ft{'library'} += s/\bcusparseCcsrsv2_bufferSizeExt\b/hipsparseCcsrsv2_bufferSizeExt/g;
@@ -763,9 +766,11 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseCgthr\b/hipsparseCgthr/g;
    $ft{'library'} += s/\bcusparseCgthrz\b/hipsparseCgthrz/g;
    $ft{'library'} += s/\bcusparseChybmv\b/hipsparseChybmv/g;
+    $ft{'library'} += s/\bcusparseCnnz\b/hipsparseCnnz/g;
    $ft{'library'} += s/\bcusparseCreate\b/hipsparseCreate/g;
    $ft{'library'} += s/\bcusparseCreateCsrgemm2Info\b/hipsparseCreateCsrgemm2Info/g;
    $ft{'library'} += s/\bcusparseCreateCsrilu02Info\b/hipsparseCreateCsrilu02Info/g;
+    $ft{'library'} += s/\bcusparseCreateCsrsm2Info\b/hipsparseCreateCsrsm2Info/g;
    $ft{'library'} += s/\bcusparseCreateCsrsv2Info\b/hipsparseCreateCsrsv2Info/g;
    $ft{'library'} += s/\bcusparseCreateHybMat\b/hipsparseCreateHybMat/g;
    $ft{'library'} += s/\bcusparseCreateIdentityPermutation\b/hipsparseCreateIdentityPermutation/g;
@@ -784,6 +789,9 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseDcsrmm\b/hipsparseDcsrmm/g;
    $ft{'library'} += s/\bcusparseDcsrmm2\b/hipsparseDcsrmm2/g;
    $ft{'library'} += s/\bcusparseDcsrmv\b/hipsparseDcsrmv/g;
+    $ft{'library'} += s/\bcusparseDcsrsm2_analysis\b/hipsparseDcsrsm2_analysis/g;
+    $ft{'library'} += s/\bcusparseDcsrsm2_bufferSizeExt\b/hipsparseDcsrsm2_bufferSizeExt/g;
+    $ft{'library'} += s/\bcusparseDcsrsm2_solve\b/hipsparseDcsrsm2_solve/g;
    $ft{'library'} += s/\bcusparseDcsrsv2_analysis\b/hipsparseDcsrsv2_analysis/g;
    $ft{'library'} += s/\bcusparseDcsrsv2_bufferSize\b/hipsparseDcsrsv2_bufferSize/g;
    $ft{'library'} += s/\bcusparseDcsrsv2_bufferSizeExt\b/hipsparseDcsrsv2_bufferSizeExt/g;
@@ -792,12 +800,14 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseDestroy\b/hipsparseDestroy/g;
    $ft{'library'} += s/\bcusparseDestroyCsrgemm2Info\b/hipsparseDestroyCsrgemm2Info/g;
    $ft{'library'} += s/\bcusparseDestroyCsrilu02Info\b/hipsparseDestroyCsrilu02Info/g;
+    $ft{'library'} += s/\bcusparseDestroyCsrsm2Info\b/hipsparseDestroyCsrsm2Info/g;
    $ft{'library'} += s/\bcusparseDestroyCsrsv2Info\b/hipsparseDestroyCsrsv2Info/g;
    $ft{'library'} += s/\bcusparseDestroyHybMat\b/hipsparseDestroyHybMat/g;
    $ft{'library'} += s/\bcusparseDestroyMatDescr\b/hipsparseDestroyMatDescr/g;
    $ft{'library'} += s/\bcusparseDgthr\b/hipsparseDgthr/g;
    $ft{'library'} += s/\bcusparseDgthrz\b/hipsparseDgthrz/g;
    $ft{'library'} += s/\bcusparseDhybmv\b/hipsparseDhybmv/g;
+    $ft{'library'} += s/\bcusparseDnnz\b/hipsparseDnnz/g;
    $ft{'library'} += s/\bcusparseDroti\b/hipsparseDroti/g;
    $ft{'library'} += s/\bcusparseDsctr\b/hipsparseDsctr/g;
    $ft{'library'} += s/\bcusparseGetMatDiagType\b/hipsparseGetMatDiagType/g;
@@ -820,6 +830,9 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseScsrmm\b/hipsparseScsrmm/g;
    $ft{'library'} += s/\bcusparseScsrmm2\b/hipsparseScsrmm2/g;
    $ft{'library'} += s/\bcusparseScsrmv\b/hipsparseScsrmv/g;
+    $ft{'library'} += s/\bcusparseScsrsm2_analysis\b/hipsparseScsrsm2_analysis/g;
+    $ft{'library'} += s/\bcusparseScsrsm2_bufferSizeExt\b/hipsparseScsrsm2_bufferSizeExt/g;
+    $ft{'library'} += s/\bcusparseScsrsm2_solve\b/hipsparseScsrsm2_solve/g;
    $ft{'library'} += s/\bcusparseScsrsv2_analysis\b/hipsparseScsrsv2_analysis/g;
    $ft{'library'} += s/\bcusparseScsrsv2_bufferSize\b/hipsparseScsrsv2_bufferSize/g;
    $ft{'library'} += s/\bcusparseScsrsv2_bufferSizeExt\b/hipsparseScsrsv2_bufferSizeExt/g;
@@ -834,6 +847,7 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseSgthr\b/hipsparseSgthr/g;
    $ft{'library'} += s/\bcusparseSgthrz\b/hipsparseSgthrz/g;
    $ft{'library'} += s/\bcusparseShybmv\b/hipsparseShybmv/g;
+    $ft{'library'} += s/\bcusparseSnnz\b/hipsparseSnnz/g;
    $ft{'library'} += s/\bcusparseSroti\b/hipsparseSroti/g;
    $ft{'library'} += s/\bcusparseSsctr\b/hipsparseSsctr/g;
    $ft{'library'} += s/\bcusparseXbsrilu02_zeroPivot\b/hipsparseXbsrilu02_zeroPivot/g;
@@ -847,6 +861,7 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseXcsrgemm2Nnz\b/hipsparseXcsrgemm2Nnz/g;
    $ft{'library'} += s/\bcusparseXcsrgemmNnz\b/hipsparseXcsrgemmNnz/g;
    $ft{'library'} += s/\bcusparseXcsrilu02_zeroPivot\b/hipsparseXcsrilu02_zeroPivot/g;
+    $ft{'library'} += s/\bcusparseXcsrsm2_zeroPivot\b/hipsparseXcsrsm2_zeroPivot/g;
    $ft{'library'} += s/\bcusparseXcsrsort\b/hipsparseXcsrsort/g;
    $ft{'library'} += s/\bcusparseXcsrsort_bufferSizeExt\b/hipsparseXcsrsort_bufferSizeExt/g;
    $ft{'library'} += s/\bcusparseXcsrsv2_zeroPivot\b/hipsparseXcsrsv2_zeroPivot/g;
@@ -863,6 +878,9 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseZcsrmm\b/hipsparseZcsrmm/g;
    $ft{'library'} += s/\bcusparseZcsrmm2\b/hipsparseZcsrmm2/g;
    $ft{'library'} += s/\bcusparseZcsrmv\b/hipsparseZcsrmv/g;
+    $ft{'library'} += s/\bcusparseZcsrsm2_analysis\b/hipsparseZcsrsm2_analysis/g;
+    $ft{'library'} += s/\bcusparseZcsrsm2_bufferSizeExt\b/hipsparseZcsrsm2_bufferSizeExt/g;
+    $ft{'library'} += s/\bcusparseZcsrsm2_solve\b/hipsparseZcsrsm2_solve/g;
    $ft{'library'} += s/\bcusparseZcsrsv2_analysis\b/hipsparseZcsrsv2_analysis/g;
    $ft{'library'} += s/\bcusparseZcsrsv2_bufferSize\b/hipsparseZcsrsv2_bufferSize/g;
    $ft{'library'} += s/\bcusparseZcsrsv2_bufferSizeExt\b/hipsparseZcsrsv2_bufferSizeExt/g;
@@ -872,6 +890,7 @@ sub simpleSubstitutions {
    $ft{'library'} += s/\bcusparseZgthr\b/hipsparseZgthr/g;
    $ft{'library'} += s/\bcusparseZgthrz\b/hipsparseZgthrz/g;
    $ft{'library'} += s/\bcusparseZhybmv\b/hipsparseZhybmv/g;
+    $ft{'library'} += s/\bcusparseZnnz\b/hipsparseZnnz/g;
    $ft{'library'} += s/\bcusparseZsctr\b/hipsparseZsctr/g;
    $ft{'device_library'} += s/\bcurand\b/hiprand/g;
    $ft{'device_library'} += s/\bcurand_discrete\b/hiprand_discrete/g;
@@ -997,6 +1016,8 @@ sub simpleSubstitutions {
    $ft{'type'} += s/\bcsrgemm2Info\b/csrgemm2Info/g;
    $ft{'type'} += s/\bcsrgemm2Info_t\b/csrgemm2Info_t/g;
    $ft{'type'} += s/\bcsrilu02Info_t\b/csrilu02Info_t/g;
+    $ft{'type'} += s/\bcsrsm2Info\b/csrsm2Info/g;
+    $ft{'type'} += s/\bcsrsm2Info_t\b/csrsm2Info_t/g;
    $ft{'type'} += s/\bcsrsv2Info_t\b/csrsv2Info_t/g;
    $ft{'type'} += s/\bcuComplex\b/hipComplex/g;
    $ft{'type'} += s/\bcuDoubleComplex\b/hipDoubleComplex/g;
@@ -1130,6 +1151,7 @@ sub simpleSubstitutions {
    $ft{'type'} += s/\bcurandStatus_t\b/hiprandStatus_t/g;
    $ft{'type'} += s/\bcusparseAction_t\b/hipsparseAction_t/g;
    $ft{'type'} += s/\bcusparseDiagType_t\b/hipsparseDiagType_t/g;
+    $ft{'type'} += s/\bcusparseDirection_t\b/hipsparseDirection_t/g;
    $ft{'type'} += s/\bcusparseFillMode_t\b/hipsparseFillMode_t/g;
    $ft{'type'} += s/\bcusparseHandle_t\b/hipsparseHandle_t/g;
    $ft{'type'} += s/\bcusparseHybMat_t\b/hipsparseHybMat_t/g;
@@ -1398,6 +1420,8 @@ sub simpleSubstitutions {
    $ft{'numeric_literal'} += s/\bCUSPARSE_ACTION_SYMBOLIC\b/HIPSPARSE_ACTION_SYMBOLIC/g;
    $ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_NON_UNIT\b/HIPSPARSE_DIAG_TYPE_NON_UNIT/g;
    $ft{'numeric_literal'} += s/\bCUSPARSE_DIAG_TYPE_UNIT\b/HIPSPARSE_DIAG_TYPE_UNIT/g;
+    $ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_COLUMN\b/HIPSPARSE_DIRECTION_COLUMN/g;
+    $ft{'numeric_literal'} += s/\bCUSPARSE_DIRECTION_ROW\b/HIPSPARSE_DIRECTION_ROW/g;
    $ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_LOWER\b/HIPSPARSE_FILL_MODE_LOWER/g;
    $ft{'numeric_literal'} += s/\bCUSPARSE_FILL_MODE_UPPER\b/HIPSPARSE_FILL_MODE_UPPER/g;
    $ft{'numeric_literal'} += s/\bCUSPARSE_HYB_PARTITION_AUTO\b/HIPSPARSE_HYB_PARTITION_AUTO/g;
@@ -75,7 +75,6 @@ if(UNIX AND NOT APPLE AND NOT CYGWIN)
        endif()
        # And push it back to the cache
        set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
-        message("Found HIP at ${HIP_ROOT_DIR}")
    endif()

    # Find HIPCC executable
@@ -12,9 +12,9 @@
 | enum         |***`cusparseAction_t`***                                       |                  |***`hipsparseAction_t`***                                   |
 |            0 |*`CUSPARSE_ACTION_SYMBOLIC`*                                   |                  |*`HIPSPARSE_ACTION_SYMBOLIC`*                               |
 |            1 |*`CUSPARSE_ACTION_NUMERIC`*                                    |                  |*`HIPSPARSE_ACTION_NUMERIC`*                                |
-| enum         |***`cusparseDirection_t`***                                    |                  |                                                            |
-|            0 |*`CUSPARSE_DIRECTION_ROW`*                                     |                  |                                                            |
-|            1 |*`CUSPARSE_DIRECTION_COLUMN`*                                  |                  |                                                            |
+| enum         |***`cusparseDirection_t`***                                    |                  |***`hipsparseDirection_t`***                                |
+|            0 |*`CUSPARSE_DIRECTION_ROW`*                                     |                  |*`HIPSPARSE_DIRECTION_ROW`*                                 |
+|            1 |*`CUSPARSE_DIRECTION_COLUMN`*                                  |                  |*`HIPSPARSE_DIRECTION_COLUMN`*                              |
 | enum         |***`cusparseHybPartition_t`***                                 |                  |***`hipsparseHybPartition_t`***                             |
 |            0 |*`CUSPARSE_HYB_PARTITION_AUTO`*                                |                  |*`HIPSPARSE_HYB_PARTITION_AUTO`*                            |
 |            1 |*`CUSPARSE_HYB_PARTITION_USER`*                                |                  |*`HIPSPARSE_HYB_PARTITION_USER`*                            |
@@ -69,8 +69,8 @@
 | typedef      |`cusparseSolveAnalysisInfo_t`                                  |                  |                                                            |
 | struct       |`csrsv2Info`                                                   |                  |                                                            |
 | typedef      |`csrsv2Info_t`                                                 |                  |`csrsv2Info_t`                                              |
-| struct       |`csrsm2Info`                                                   | 9.2              |                                                            |
-| typedef      |`csrsm2Info_t`                                                 |                  |                                                            |
+| struct       |`csrsm2Info`                                                   | 9.2              |`csrsm2Info`                                                |
+| typedef      |`csrsm2Info_t`                                                 |                  |`csrsm2Info_t`                                              |
 | struct       |`bsrsv2Info`                                                   |                  |                                                            |
 | typedef      |`bsrsv2Info_t`                                                 |                  |                                                            |
 | struct       |`bsrsm2Info`                                                   |                  |                                                            |
@@ -151,8 +151,8 @@
 |`cusparseGetStream`                                        |`hipsparseGetStream`                             | 8.0              |
 |`cusparseCreateCsrsv2Info`                                 |`hipsparseCreateCsrsv2Info`                      |
 |`cusparseDestroyCsrsv2Info`                                |`hipsparseDestroyCsrsv2Info`                     |
-|`cusparseCreateCsrsm2Info`                                 |                                                 | 9.2              |
-|`cusparseDestroyCsrsm2Info`                                |                                                 | 9.2              |
+|`cusparseCreateCsrsm2Info`                                 |`hipsparseCreateCsrsm2Info`                      | 9.2              |
+|`cusparseDestroyCsrsm2Info`                                |`hipsparseDestroyCsrsm2Info`                     | 9.2              |
 |`cusparseCreateCsric02Info`                                |                                                 |
 |`cusparseDestroyCsric02Info`                               |                                                 |
 |`cusparseCreateCsrilu02Info`                               |`hipsparseCreateCsrilu02Info`                    |
@@ -306,19 +306,19 @@
 |`cusparseDcsrsm_solve`                                     |                                                 |
 |`cusparseCcsrsm_solve`                                     |                                                 |
 |`cusparseZcsrsm_solve`                                     |                                                 |
-|`cusparseScsrsm2_bufferSizeExt`                            |                                                 | 9.2              |
-|`cusparseDcsrsm2_bufferSizeExt`                            |                                                 | 9.2              |
-|`cusparseCcsrsm2_bufferSizeExt`                            |                                                 | 9.2              |
-|`cusparseZcsrsm2_bufferSizeExt`                            |                                                 | 9.2              |
-|`cusparseScsrsm2_analysis`                                 |                                                 | 9.2              |
-|`cusparseDcsrsm2_analysis`                                 |                                                 | 9.2              |
-|`cusparseCcsrsm2_analysis`                                 |                                                 | 9.2              |
-|`cusparseZcsrsm2_analysis`                                 |                                                 | 9.2              |
-|`cusparseScsrsm2_solve`                                    |                                                 | 9.2              |
-|`cusparseDcsrsm2_solve`                                    |                                                 | 9.2              |
-|`cusparseCcsrsm2_solve`                                    |                                                 | 9.2              |
-|`cusparseZcsrsm2_solve`                                    |                                                 | 9.2              |
-|`cusparseXcsrsm2_zeroPivot`                                |                                                 | 9.2              |
+|`cusparseScsrsm2_bufferSizeExt`                            |`hipsparseScsrsm2_bufferSizeExt`                 | 9.2              |
+|`cusparseDcsrsm2_bufferSizeExt`                            |`hipsparseDcsrsm2_bufferSizeExt`                 | 9.2              |
+|`cusparseCcsrsm2_bufferSizeExt`                            |`hipsparseCcsrsm2_bufferSizeExt`                 | 9.2              |
+|`cusparseZcsrsm2_bufferSizeExt`                            |`hipsparseZcsrsm2_bufferSizeExt`                 | 9.2              |
+|`cusparseScsrsm2_analysis`                                 |`hipsparseScsrsm2_analysis`                      | 9.2              |
+|`cusparseDcsrsm2_analysis`                                 |`hipsparseDcsrsm2_analysis`                      | 9.2              |
+|`cusparseCcsrsm2_analysis`                                 |`hipsparseCcsrsm2_analysis`                      | 9.2              |
+|`cusparseZcsrsm2_analysis`                                 |`hipsparseZcsrsm2_analysis`                      | 9.2              |
+|`cusparseScsrsm2_solve`                                    |`hipsparseScsrsm2_solve`                         | 9.2              |
+|`cusparseDcsrsm2_solve`                                    |`hipsparseDcsrsm2_solve`                         | 9.2              |
+|`cusparseCcsrsm2_solve`                                    |`hipsparseCcsrsm2_solve`                         | 9.2              |
+|`cusparseZcsrsm2_solve`                                    |`hipsparseZcsrsm2_solve`                         | 9.2              |
+|`cusparseXcsrsm2_zeroPivot`                                |`hipsparseXcsrsm2_zeroPivot`                     | 9.2              |
 |`cusparseSbsrmm`                                           |                                                 |
 |`cusparseDbsrmm`                                           |                                                 |
 |`cusparseCbsrmm`                                           |                                                 |
@@ -662,10 +662,10 @@
 |`cusparseDhyb2dense`                                       |                                                 |
 |`cusparseChyb2dense`                                       |                                                 |
 |`cusparseZhyb2dense`                                       |                                                 |
-|`cusparseSnnz`                                             |                                                 |
-|`cusparseDnnz`                                             |                                                 |
-|`cusparseCnnz`                                             |                                                 |
-|`cusparseZnnz`                                             |                                                 |
+|`cusparseSnnz`                                             |`hipsparseSnnz`                                  |
+|`cusparseDnnz`                                             |`hipsparseDnnz`                                  |
+|`cusparseCnnz`                                             |`hipsparseCnnz`                                  |
+|`cusparseZnnz`                                             |`hipsparseZnnz`                                  |
 |`cusparseCreateIdentityPermutation`                        |`hipsparseCreateIdentityPermutation`             |
 |`cusparseXcoosort_bufferSizeExt`                           |`hipsparseXcoosort_bufferSizeExt`                |
 |`cusparseXcoosortByRow`                                    |`hipsparseXcoosortByRow`                         |
@@ -1,13 +1,13 @@
 Table of Contents
 =================

-  * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true"><span aria-hidden="true)
-      * [Using HIP_DB](#using-hip_db" aria-hidden="true"><span aria-hidden="true)
-      * [Using ltrace](#using-ltrace" aria-hidden="true"><span aria-hidden="true)
-      * [Chicken bits](#chicken-bits" aria-hidden="true"><span aria-hidden="true)
-      * [Debugging HIP Applications](#debugging-hip-applications" aria-hidden="true"><span aria-hidden="true)
-      * [General Debugging Tips](#general-debugging-tips" aria-hidden="true"><span aria-hidden="true)
-        * [Print env var state](#print-env-var-state" aria-hidden="true"><span aria-hidden="true)
+  * [Profiling HIP Code](#profiling-hip-code)
+      * [Using HIP_DB](#using-hip_db)
+      * [Using ltrace](#using-ltrace)
+      * [Chicken bits](#chicken-bits)
+      * [Debugging HIP Applications](#debugging-hip-applications)
+      * [General Debugging Tips](#general-debugging-tips)
+        * [Print env var state](#print-env-var-state)

 ### Using HIP_DB

@@ -388,30 +388,29 @@ def generate_prof_header(f, api_map, opts_map):
  f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')
  
  # Generating the method for the API string, name and parameters
-  if False:
-    f.write('\n')
-    f.write('#if 0\n')
-    f.write('#include <sstream>\n');
-    f.write('#include <string>\n');
-    f.write('// HIP API string method, method name and parameters\n')
-    f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
-    f.write('  std::ostringstream oss;\n')
-    f.write('  switch (id) {\n')
-    for name, args in api_map.items():
-      f.write('    case HIP_API_ID_' + name + ':\n')
-      f.write('      oss << "' + name + '("')
-      for ind in range(0, len(args)):
-        arg_tuple = args[ind]
-        arg_name = arg_tuple[1]
-        if ind != 0: f.write(' << ","')
-        f.write('\n          << " ' + arg_name  + '=" << data->args.' + name + '.' + arg_name)
-      f.write('\n          << ")";\n')
-      f.write('    break;\n')
-    f.write('    default: oss << "unknown";\n')
-    f.write('  };\n')
-    f.write('  return strdup(oss.str().c_str());\n')
-    f.write('};\n')
-    f.write('#endif\n')
+  f.write('\n')
+  f.write('#if ENABLE_HIP_API_STRING\n')
+  f.write('#include <sstream>\n');
+  f.write('#include <string>\n');
+  f.write('// HIP API string method, method name and parameters\n')
+  f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
+  f.write('  std::ostringstream oss;\n')
+  f.write('  switch (id) {\n')
+  for name, args in api_map.items():
+    f.write('    case HIP_API_ID_' + name + ':\n')
+    f.write('      oss << "' + name + '("')
+    for ind in range(0, len(args)):
+      arg_tuple = args[ind]
+      arg_name = arg_tuple[1]
+      if ind != 0: f.write(' << ","')
+      f.write('\n          << " ' + arg_name  + '=" << data->args.' + name + '.' + arg_name)
+    f.write('\n          << ")";\n')
+    f.write('    break;\n')
+  f.write('    default: oss << "unknown";\n')
+  f.write('  };\n')
+  f.write('  return strdup(oss.str().c_str());\n')
+  f.write('};\n')
+  f.write('#endif  // ENABLE_HIP_API_STRING\n')
  
  f.write('#endif  // _HIP_PROF_STR_H\n');

@@ -42,10 +42,9 @@ After applying all the matchers, the output HIP source is produced.

 `hipify-clang` requires:

-1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
+1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**10.0.0**](http://releases.llvm.org/download.html#10.0.0).

-2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
-To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).
+2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.2**](https://developer.nvidia.com/cuda-downloads).

 | **LLVM release version**                                   | **CUDA latest supported version**                                        | **Windows** | **Linux** |
 |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -66,15 +65,15 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
 | [8.0.0](http://releases.llvm.org/download.html#8.0.0)      | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive)          | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.0_bug_38811.zip)*<br/> | + |
 | [8.0.1](http://releases.llvm.org/download.html#8.0.1)      | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive)          | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*<br/> | + |
 | [9.0.0](http://releases.llvm.org/download.html#9.0.0)      | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base)     | +           | +         |
-| [**9.0.1**](http://releases.llvm.org/download.html#9.0.1)  | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
-| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |
+| [9.0.1](http://releases.llvm.org/download.html#9.0.1)      | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base)     | +           | +         |
+| [**10.0.0**](http://releases.llvm.org/download.html#10.0.0)| [**10.2**](https://developer.nvidia.com/cuda-downloads)                  | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |

 `*` Download the patch and unpack it into your `LLVM` distribution directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.

 In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.

 Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
-[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`
+[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=d:\LLVM\10.0.0\dist`

 ### <a name="hipify-clang-usage"></a> hipify-clang: usage

@@ -83,14 +82,14 @@ To process a file, `hipify-clang` needs access to the same headers that would be
 For example:

 ```shell
-./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
+./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.2 -I /usr/local/cuda-10.2/samples/common/inc
 ```

 `hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
 were compiling the input file. For example:

 ```bash
-./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
+./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.2 -- -std=c++17
 ```

 The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
@@ -158,7 +157,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro

 **LLVM 10.0.0 or newer:**

-1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources;
+1. download [`LLVM project`](https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.0/llvm-project-10.0.0.tar.xz) sources;
 2. build [`LLVM project`](http://llvm.org/docs/CMake.html):

 **Linux**:
@@ -193,19 +192,19 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro

    * When multiple CUDA installations are present, specify the `CUDA_TOOLKIT_ROOT_DIR` option to choose a particular version:

-        - ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1`
+        - ***Linux***: `-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2`

-        - ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1"`
+        - ***Windows***: `-DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2"`

-          `-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1"`
+          `-DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2"`

 4. Ensure [`cuDNN`](https://developer.nvidia.com/rdp/cudnn-archive) of the version corresponding to CUDA's version is installed.

    * Path to cuDNN should be specified by the `CUDA_DNN_ROOT_DIR` option:

-        - ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32`
+        - ***Linux***: `-DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32`

-        - ***Windows***: `-DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32`
+        - ***Windows***: `-DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32`

 5. Ensure [`CUB`](https://github.com/NVlabs/cub) of the version corresponding to CUDA's version is installed.

@@ -213,7 +212,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro

        - ***Linux***: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB`

-        - ***Windows***: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub`
+        - ***Windows***: `-DCUDA_CUB_ROOT_DIR=d:/GIT/cub`

 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.

@@ -221,21 +220,21 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro

    * Install `lit` into `python`:

-        - ***Linux***: `python /srv/git/LLVM/9.0.1/llvm/utils/lit/setup.py install`
+        - ***Linux***: `python /srv/git/LLVM/10.0.0/llvm/utils/lit/setup.py install`

-        - ***Windows***: `python f:/LLVM/9.0.1/llvm/utils/lit/setup.py install`
+        - ***Windows***: `python d:/LLVM/10.0.0/llvm/utils/lit/setup.py install`

    * Starting with LLVM 6.0.1 path to `llvm-lit` python script should be specified by the `LLVM_EXTERNAL_LIT` option:

-        - ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit`
+        - ***Linux***: `-DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit`

-        - ***Windows***: `-DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py`
+        - ***Windows***: `-DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py`

    * `FileCheck`:

-        - ***Linux***: copy from `/srv/git/LLVM/9.0.1/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`
+        - ***Linux***: copy from `/srv/git/LLVM/10.0.0/build/bin/` to `CMAKE_INSTALL_PREFIX/dist/bin`

-        - ***Windows***: copy from `f:/LLVM/9.0.1/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`
+        - ***Windows***: copy from `d:/LLVM/10.0.0/build/Release/bin` to `CMAKE_INSTALL_PREFIX/dist/bin`

        - Or specify the path to `FileCheck` in `CMAKE_INSTALL_PREFIX` option

@@ -249,7 +248,7 @@ On Linux the following configurations are tested:

 Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32

-Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
+Ubuntu 16-18: LLVM 8.0.0 - 10.0.0, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32

 Minimum build system requirements for the above configurations:

@@ -262,11 +261,11 @@ cmake
 -DHIPIFY_CLANG_TESTS=1 \
 -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_INSTALL_PREFIX=../dist \
- -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.1/dist \
- -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \
- -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.5.32 \
+ -DCMAKE_PREFIX_PATH=/srv/git/LLVM/10.0.0/dist \
+ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.2 \
+ -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.2-v7.6.5.32 \
 -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \
- -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.1/build/bin/llvm-lit \
+ -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/10.0.0/build/bin/llvm-lit \
 ..
 ```
 *A corresponding successful output:*
@@ -285,14 +284,14 @@ cmake
 -- Detecting CXX compiler ABI info - done
 -- Detecting CXX compile features
 -- Detecting CXX compile features - done
-- Found LLVM 9.0.1:
--    - CMake module path: /srv/git/LLVM/9.0.1/dist/lib/cmake/llvm
--    - Include path     : /srv/git/LLVM/9.0.1/dist/include
--    - Binary path      : /srv/git/LLVM/9.0.1/dist/bin
+-- Found LLVM 10.0.0:
+--    - CMake module path: /srv/git/LLVM/10.0.0/dist/lib/cmake/llvm
+--    - Include path     : /srv/git/LLVM/10.0.0/dist/include
+--    - Binary path      : /srv/git/LLVM/10.0.0/dist/bin
 -- Linker detection: GNU ld
 -- Found PythonInterp: /usr/bin/python2.7 (found suitable version "2.7.12", minimum required is "2.7")
 -- Found lit: /usr/local/bin/lit
-- Found FileCheck: /srv/git/LLVM/9.0.1/dist/bin/FileCheck
+-- Found FileCheck: /srv/git/LLVM/10.0.0/dist/bin/FileCheck
 -- Looking for pthread.h
 -- Looking for pthread.h - found
 -- Looking for pthread_create
@@ -302,7 +301,7 @@ cmake
 -- Looking for pthread_create in pthread
 -- Looking for pthread_create in pthread - found
 -- Found Threads: TRUE
-- Found CUDA: /usr/local/cuda-10.1 (found version "10.1")
+-- Found CUDA: /usr/local/cuda-10.2 (found version "10.2")
 -- Configuring done
 -- Generating done
 -- Build files have been written to: /srv/git/HIP/hipify-clang/build
@@ -314,83 +313,85 @@ make test-hipify
 ```shell
 Running HIPify regression tests
 ========================================
-CUDA 10.1 - will be used for testing
-LLVM 9.0.1 - will be used for testing
+CUDA 10.2 - will be used for testing
+LLVM 10.0.0 - will be used for testing
 x86_64 - Platform architecture
 Linux 5.2.0 - Platform OS
 64 - hipify-clang binary bitness
 64 - python 2.7.12 binary bitness
 ========================================
-- Testing: 67 tests, 12 threads --
-PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 67)
-PASS: hipify :: unit_tests/device/math_functions.cu (2 of 67)
-PASS: hipify :: unit_tests/device/atomics.cu (3 of 67)
-PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_01.cu (5 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_06.cu (10 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_07.cu (11 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_10.cu (12 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_11.cu (13 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 67)
-PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 67)
-PASS: hipify :: unit_tests/headers/headers_test_09.cu (16 of 67)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (17 of 67)
-PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 67)
-PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 67)
-PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 67)
-PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (24 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (25 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 67)
-PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 67)
-PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 67)
-PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (29 of 67)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (30 of 67)
-PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 67)
-PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (33 of 67)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (34 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 67)
-PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (40 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (41 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 67)
-PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (45 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (46 of 67)
-PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 67)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 67)
-PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (50 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (51 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (52 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (53 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (54 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (55 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (56 of 67)
-PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (57 of 67)
-PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 67)
-PASS: hipify :: unit_tests/samples/allocators.cu (59 of 67)
-PASS: hipify :: unit_tests/samples/coalescing.cu (60 of 67)
-PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (61 of 67)
-PASS: hipify :: unit_tests/samples/axpy.cu (62 of 67)
-PASS: hipify :: unit_tests/samples/intro.cu (63 of 67)
-PASS: hipify :: unit_tests/samples/cudaRegister.cu (64 of 67)
-PASS: hipify :: unit_tests/samples/square.cu (65 of 67)
-PASS: hipify :: unit_tests/samples/static_shared_memory.cu (66 of 67)
-PASS: hipify :: unit_tests/samples/vec_add.cu (67 of 67)
-Testing Time: 3.07s
-  Expected Passes    : 67
+-- Testing: 69 tests, 12 threads --
+PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 69)
+PASS: hipify :: unit_tests/device/math_functions.cu (2 of 69)
+PASS: hipify :: unit_tests/device/atomics.cu (3 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_01.cu (4 of 69)
+PASS: hipify :: unit_tests/device/device_symbols.cu (5 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_02.cu (6 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_03.cu (7 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_05.cu (8 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_04.cu (9 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_10.cu (13 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_08.cu (14 of 69)
+PASS: hipify :: unit_tests/kernel_launch/kernel_launch_01.cu (15 of 69)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 69)
+PASS: hipify :: unit_tests/headers/headers_test_09.cu (17 of 69)
+PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (18 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (19 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (20 of 69)
+PASS: hipify :: unit_tests/libraries/CUB/cub_03.cu (21 of 69)
+PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (22 of 69)
+PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (23 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (24 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (25 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (26 of 69)
+PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (27 of 69)
+PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (28 of 69)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (29 of 69)
+PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (30 of 69)
+PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (31 of 69)
+PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (32 of 69)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (33 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (34 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (35 of 69)
+PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (36 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (37 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (38 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (39 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (40 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (41 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (42 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (43 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (44 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (45 of 69)
+PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (46 of 69)
+PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (47 of 69)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (48 of 69)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (49 of 69)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01_LLVM_10.cu (50 of 69)
+PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_LLVM_10.cu (51 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (52 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (53 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (54 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (55 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (56 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (57 of 69)
+PASS: hipify :: unit_tests/samples/MallocManaged.cpp (58 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (59 of 69)
+PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (60 of 69)
+PASS: hipify :: unit_tests/samples/allocators.cu (61 of 69)
+PASS: hipify :: unit_tests/samples/coalescing.cu (62 of 69)
+PASS: hipify :: unit_tests/samples/axpy.cu (63 of 69)
+PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (64 of 69)
+PASS: hipify :: unit_tests/samples/cudaRegister.cu (65 of 69)
+PASS: hipify :: unit_tests/samples/intro.cu (66 of 69)
+PASS: hipify :: unit_tests/samples/square.cu (67 of 69)
+PASS: hipify :: unit_tests/samples/static_shared_memory.cu (68 of 69)
+PASS: hipify :: unit_tests/samples/vec_add.cu (69 of 69)
+Testing Time: 3.23s
+  Expected Passes    : 69
 [100%] Built target test-hipify
 ```
 ### <a name="windows"></a > hipify-clang: Windows
@@ -404,8 +405,8 @@ Testing Time: 3.07s
 | 7.0.0 - 7.1.0  | 9.2      | 7.6.5.32            | 2017.15.9.11              | 3.13.3    | 3.7.3      |
 | 8.0.0 - 8.0.1  | 10.0     | 7.6.5.32            | 2017.15.9.15              | 3.14.2    | 3.7.4      |
 | 9.0.0 - 9.0.1  | 10.1     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.4    | 3.8.0      |
-| 10.0.0-rc1-rc3 | 10.2     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.4    | 3.8.1      |
-| 11.0.0git      | 10.2     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.5    | 3.8.2      |
+| 10.0.0         | 10.2     | 7.6.5.32            | 2017.15.9.21, 2019.16.5.1 | 3.17.0    | 3.8.2      |
+| 11.0.0git      | 10.2     | 7.6.5.32            | 2017.15.9.21, 2019.16.5.1 | 3.17.0    | 3.8.2      |

 *Building with testing support on `Windows 10` by `Visual Studio 16 2019`:*

@@ -416,28 +417,28 @@ cmake
 -DHIPIFY_CLANG_TESTS=1 \
 -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_INSTALL_PREFIX=../dist \
- -DCMAKE_PREFIX_PATH=f:/LLVM/9.0.1/dist \
- -DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1" \
- -DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.1" \
- -DCUDA_DNN_ROOT_DIR=f:/CUDNN/cudnn-10.1-windows10-x64-v7.6.5.32 \
- -DCUDA_CUB_ROOT_DIR=f:/GIT/cub \
- -DLLVM_EXTERNAL_LIT=f:/LLVM/9.0.1/build/Release/bin/llvm-lit.py \
+ -DCMAKE_PREFIX_PATH=d:/LLVM/10.0.0/dist \
+ -DCUDA_TOOLKIT_ROOT_DIR="c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2" \
+ -DCUDA_SDK_ROOT_DIR="c:/ProgramData/NVIDIA Corporation/CUDA Samples/v10.2" \
+ -DCUDA_DNN_ROOT_DIR=d:/CUDNN/cudnn-10.2-windows10-x64-v7.6.5.32 \
+ -DCUDA_CUB_ROOT_DIR=d:/GIT/cub \
+ -DLLVM_EXTERNAL_LIT=d:/LLVM/10.0.0/build/Release/bin/llvm-lit.py \
 -Thost=x64
 ..
 ```
 *A corresponding successful output:*
 ```shell
-- Found LLVM 9.0.1:
--    - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm
--    - Include path     : F:/LLVM/9.0.1/dist/include
--    - Binary path      : F:/LLVM/9.0.1/dist/bin
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
-- Found lit: C:/Program Files/Python38/Scripts/lit.exe
-- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe
-- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
+-- Found LLVM 10.0.0:
+--    - CMake module path: d:/LLVM/10.0.0/dist/lib/cmake/llvm
+--    - Include path     : d:/LLVM/10.0.0/dist/include
+--    - Binary path      : d:/LLVM/10.0.0/dist/bin
+-- Found PythonInterp: c:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
+-- Found lit: c:/Program Files/Python38/Scripts/lit.exe
+-- Found FileCheck: d:/LLVM/10.0.0/dist/bin/FileCheck.exe
+-- Found CUDA: c:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2 (found version "10.2")
 -- Configuring done
 -- Generating done
-- Build files have been written to: f:/HIP/hipify-clang/build
+-- Build files have been written to: d:/HIP/hipify-clang/build
 ```

 Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
@@ -545,9 +545,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{

  // 5.21. Occupancy
  // cudaOccupancyMaxActiveBlocksPerMultiprocessor
-  {"cuOccupancyMaxActiveBlocksPerMultiprocessor",          {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor",            "", CONV_OCCUPANCY, API_DRIVER}},
+  {"cuOccupancyMaxActiveBlocksPerMultiprocessor",          {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor",         "", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
-  {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",   "", CONV_OCCUPANCY, API_DRIVER}},
+  {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags","", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxPotentialBlockSize
  {"cuOccupancyMaxPotentialBlockSize",                     {"hipOccupancyMaxPotentialBlockSize",                       "", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxPotentialBlockSizeWithFlags
@@ -49,8 +49,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
  {"cusparseGetStream",                           {"hipsparseGetStream",                           "", CONV_LIB_FUNC, API_SPARSE}},
  {"cusparseCreateCsrsv2Info",                    {"hipsparseCreateCsrsv2Info",                    "", CONV_LIB_FUNC, API_SPARSE}},
  {"cusparseDestroyCsrsv2Info",                   {"hipsparseDestroyCsrsv2Info",                   "", CONV_LIB_FUNC, API_SPARSE}},
-  {"cusparseCreateCsrsm2Info",                    {"hipsparseCreateCsrsm2Info",                    "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseDestroyCsrsm2Info",                   {"hipsparseDestroyCsrsm2Info",                   "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseCreateCsrsm2Info",                    {"hipsparseCreateCsrsm2Info",                    "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseDestroyCsrsm2Info",                   {"hipsparseDestroyCsrsm2Info",                   "", CONV_LIB_FUNC, API_SPARSE}},
  {"cusparseCreateCsric02Info",                   {"hipsparseCreateCsric02Info",                   "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseDestroyCsric02Info",                  {"hipsparseDestroyCsric02Info",                  "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseCreateCsrilu02Info",                  {"hipsparseCreateCsrilu02Info",                  "", CONV_LIB_FUNC, API_SPARSE}},
@@ -218,27 +218,27 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
  {"cusparseCcsrsm_analysis",                     {"hipsparseCcsrsm_analysis",                     "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseZcsrsm_analysis",                     {"hipsparseZcsrsm_analysis",                     "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},

-  {"cusparseScsrsm_solve",                        {"hipsparseScsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseDcsrsm_solve",                        {"hipsparseDcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseCcsrsm_solve",                        {"hipsparseCcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseZcsrsm_solve",                        {"hipsparseZcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseScsrsm_solve",                        {"hipsparseScsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseDcsrsm_solve",                        {"hipsparseDcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseCcsrsm_solve",                        {"hipsparseCcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseZcsrsm_solve",                        {"hipsparseZcsrsm_solve",                        "", CONV_LIB_FUNC, API_SPARSE}},

-  {"cusparseScsrsm2_bufferSizeExt",               {"hipsparseScsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseDcsrsm2_bufferSizeExt",               {"hipsparseDcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseCcsrsm2_bufferSizeExt",               {"hipsparseCcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseZcsrsm2_bufferSizeExt",               {"hipsparseZcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseScsrsm2_bufferSizeExt",               {"hipsparseScsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseDcsrsm2_bufferSizeExt",               {"hipsparseDcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseCcsrsm2_bufferSizeExt",               {"hipsparseCcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseZcsrsm2_bufferSizeExt",               {"hipsparseZcsrsm2_bufferSizeExt",               "", CONV_LIB_FUNC, API_SPARSE}},

-  {"cusparseScsrsm2_analysis",                    {"hipsparseScsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseDcsrsm2_analysis",                    {"hipsparseDcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseCcsrsm2_analysis",                    {"hipsparseCcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseZcsrsm2_analysis",                    {"hipsparseZcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseScsrsm2_analysis",                    {"hipsparseScsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseDcsrsm2_analysis",                    {"hipsparseDcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseCcsrsm2_analysis",                    {"hipsparseCcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseZcsrsm2_analysis",                    {"hipsparseZcsrsm2_analysis",                    "", CONV_LIB_FUNC, API_SPARSE}},

  {"cusparseScsrsm2_solve",                       {"hipsparseScsrsm2_solve",                       "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseDcsrsm2_solve",                       {"hipsparseDcsrsm2_solve",                       "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseCcsrsm2_solve",                       {"hipsparseCcsrsm2_solve",                       "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseZcsrsm2_solve",                       {"hipsparseZcsrsm2_solve",                       "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},

-  {"cusparseXcsrsm2_zeroPivot",                   {"hipsparseXcsrsm2_zeroPivot",                   "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseXcsrsm2_zeroPivot",                   {"hipsparseXcsrsm2_zeroPivot",                   "", CONV_LIB_FUNC, API_SPARSE}},

  {"cusparseSbsrmm",                              {"hipsparseSbsrmm",                              "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseDbsrmm",                              {"hipsparseDbsrmm",                              "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
@@ -619,10 +619,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_FUNCTION_MAP{
  {"cusparseChyb2dense",                          {"hipsparseChyb2dense",                          "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
  {"cusparseZhyb2dense",                          {"hipsparseZhyb2dense",                          "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},

-  {"cusparseSnnz",                                {"hipsparseSnnz",                                "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseDnnz",                                {"hipsparseDnnz",                                "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseCnnz",                                {"hipsparseCnnz",                                "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
-  {"cusparseZnnz",                                {"hipsparseZnnz",                                "", CONV_LIB_FUNC, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseSnnz",                                {"hipsparseSnnz",                                "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseDnnz",                                {"hipsparseDnnz",                                "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseCnnz",                                {"hipsparseCnnz",                                "", CONV_LIB_FUNC, API_SPARSE}},
+  {"cusparseZnnz",                                {"hipsparseZnnz",                                "", CONV_LIB_FUNC, API_SPARSE}},

  {"cusparseCreateIdentityPermutation",           {"hipsparseCreateIdentityPermutation",           "", CONV_LIB_FUNC, API_SPARSE}},

@@ -41,8 +41,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
  {"csrsv2Info",                                {"csrsv2Info",                                 "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
  {"csrsv2Info_t",                              {"csrsv2Info_t",                               "", CONV_TYPE, API_SPARSE}},

-  {"csrsm2Info",                                {"csrsm2Info",                                 "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
-  {"csrsm2Info_t",                              {"csrsm2Info_t",                               "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
+  {"csrsm2Info",                                {"csrsm2Info",                                 "", CONV_TYPE, API_SPARSE}},
+  {"csrsm2Info_t",                              {"csrsm2Info_t",                               "", CONV_TYPE, API_SPARSE}},

  {"bsrsv2Info",                                {"bsrsv2Info",                                 "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
  {"bsrsv2Info_t",                              {"bsrsv2Info_t",                               "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
@@ -88,9 +88,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_SPARSE_TYPE_NAME_MAP{
  {"CUSPARSE_ACTION_SYMBOLIC",                  {"HIPSPARSE_ACTION_SYMBOLIC",                  "", CONV_NUMERIC_LITERAL, API_SPARSE}},
  {"CUSPARSE_ACTION_NUMERIC",                   {"HIPSPARSE_ACTION_NUMERIC",                   "", CONV_NUMERIC_LITERAL, API_SPARSE}},

-  {"cusparseDirection_t",                       {"hipsparseDirection_t",                       "", CONV_TYPE, API_SPARSE, HIP_UNSUPPORTED}},
-  {"CUSPARSE_DIRECTION_ROW",                    {"HIPSPARSE_DIRECTION_ROW",                    "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
-  {"CUSPARSE_DIRECTION_COLUMN",                 {"HIPSPARSE_DIRECTION_COLUMN",                 "", CONV_NUMERIC_LITERAL, API_SPARSE, HIP_UNSUPPORTED}},
+  {"cusparseDirection_t",                       {"hipsparseDirection_t",                       "", CONV_TYPE, API_SPARSE}},
+  {"CUSPARSE_DIRECTION_ROW",                    {"HIPSPARSE_DIRECTION_ROW",                    "", CONV_NUMERIC_LITERAL, API_SPARSE}},
+  {"CUSPARSE_DIRECTION_COLUMN",                 {"HIPSPARSE_DIRECTION_COLUMN",                 "", CONV_NUMERIC_LITERAL, API_SPARSE}},

  {"cusparseHybPartition_t",                    {"hipsparseHybPartition_t",                    "", CONV_TYPE, API_SPARSE}},
  {"CUSPARSE_HYB_PARTITION_AUTO",               {"HIPSPARSE_HYB_PARTITION_AUTO",               "", CONV_NUMERIC_LITERAL, API_SPARSE}},
@@ -128,7 +128,7 @@ __device__ static int __mul24(int x, int y);
 __device__ static long long int __mul64hi(long long int x, long long int y);
 __device__ static int __mulhi(int x, int y);
 __device__ static int __rhadd(int x, int y);
-__device__ static unsigned int __sad(int x, int y, int z);
+__device__ static unsigned int __sad(int x, int y, unsigned int z);
 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
 __device__ static int __umul24(unsigned int x, unsigned int y);
 __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
@@ -199,7 +199,7 @@ __device__ static inline int __rhadd(int x, int y) {
    int value = z & 0x7FFFFFFF;
    return ((value) >> 1 || sign);
 }
-__device__ static inline unsigned int __sad(int x, int y, int z) {
+__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
 }
 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
@@ -230,7 +230,7 @@ __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
 }
 __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
-    return __ockl_sad_u32(x, y, z);
+    return __ockl_sadd_u32(x, y, z);
 }

 __device__ static inline unsigned int __lane_id() { return  __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
@@ -563,7 +563,7 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
-    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_mask, width));
+    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
 }
 __device__
@@ -44,7 +44,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
 extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
 extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
 extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
-extern "C" __device__ __attribute__((const)) uint __ockl_sad_u32(uint, uint, uint);
+extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);

 extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
 extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
@@ -72,6 +72,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(
 extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
 extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);

+extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);

 // Introduce local address space
 #define __local __attribute__((address_space(3)))
@@ -37,14 +37,15 @@ THE SOFTWARE.
 hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
                                               unsigned int flags, hip_impl::program_state& ps);

-hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
-                unsigned int sharedMemBytes, hipStream_t stream, hip_impl::program_state& ps);
-
-hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int  numDevices,
-                unsigned int  flags, hip_impl::program_state& ps);
-
-
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
+                                    dim3 blockDim, void** args,
+                                    size_t sharedMem, hipStream_t stream,
+                                    hip_impl::program_state& ps);

+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices,
+                                                 unsigned int flags,
+                                                 hip_impl::program_state& ps);

 #pragma GCC visibility push(hidden)

@@ -192,22 +193,24 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
 template <typename F>
 inline
 __attribute__((visibility("hidden")))
-hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDimX, void** kernelParams,
-                unsigned int sharedMemBytes, hipStream_t stream) {
-
+hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
+                                      void** args, size_t sharedMem,
+                                      hipStream_t stream) {
    hip_impl::hip_init();
    auto& ps = hip_impl::get_program_state();
-    return ihipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDimX, kernelParams, sharedMemBytes, stream, ps);
+    return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
+                                      blockDim, args, sharedMem, stream, ps);
 }

 inline
 __attribute__((visibility("hidden")))
-hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int  numDevices,
-                unsigned int  flags) {
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices,
+                                                 unsigned int  flags) {

    hip_impl::hip_init();
    auto& ps = hip_impl::get_program_state();
-    return ihipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
+    return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
 }

 #pragma GCC visibility pop
@@ -73,6 +73,14 @@ float atomicAdd(float* address, float val)

    return __uint_as_float(r);
 }
+
+__device__
+inline
+void atomicAddNoRet(float* address, float val)
+{
+    __ockl_atomic_add_noret_f32(address, val);
+}
+
 __device__
 inline
 double atomicAdd(double* address, double val)
@@ -1268,6 +1268,13 @@ THE SOFTWARE.
                    static_cast<__half_raw>(x).data +
                    static_cast<__half_raw>(y).data};
            }
+	    inline
+	    __device__
+	    __half __habs(__half x)
+	    {
+	        return __half_raw{
+		    __ocml_fabs_f16(static_cast<__half_raw>(x).data)};
+	    }
            inline
            __device__
            __half __hsub(__half x, __half y)
@@ -1334,6 +1341,13 @@ THE SOFTWARE.
                    static_cast<__half2_raw>(x).data +
                    static_cast<__half2_raw>(y).data};
            }
+	    inline
+	    __device__
+	    __half2 __habs2(__half2 x)
+	    {
+	        return __half2_raw{
+		    __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
+	    }
            inline
            __device__
            __half2 __hsub2(__half2 x, __half2 y)
@@ -38,6 +38,7 @@ extern "C"
    __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
    __device__ __attribute__((const))
    _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
    __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
    __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
    __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
@@ -58,6 +59,7 @@ extern "C"
    #endif

    __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
    __device__ __2f16 __ocml_cos_2f16(__2f16);
    __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
    __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
@@ -504,9 +504,14 @@ hc_get_workitem_absolute_id(int dim)
 #define __CUDA__
 #include <__clang_cuda_math_forward_declares.h>
 #include <__clang_cuda_complex_builtins.h>
-#include <cuda_wrappers/algorithm>
-#include <cuda_wrappers/complex>
-#include <cuda_wrappers/new>
+// Workaround for using libc++ with HIP-Clang.
+// The following headers require the clang include path to come before the standard C++ include path.
+// However, the libc++ include path must come before the clang include path.
+// To work around this, we pass -isystem with the parent directory of the clang include
+// path instead of the clang include path itself.
+#include <include/cuda_wrappers/algorithm>
+#include <include/cuda_wrappers/complex>
+#include <include/cuda_wrappers/new>
 #undef __CUDA__
 #pragma pop_macro("__CUDA__")
 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
@@ -55,7 +55,7 @@ THE SOFTWARE.
 #define DEPRECATED(msg) __attribute__ ((deprecated(msg)))
 #endif // !defined(_MSC_VER)

-#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases.For more details please refer https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list"
+#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md"

 #if defined(__HCC__) && (__hcc_workweek__ < 16155)
 #error("This version of HIP requires a newer version of HCC.");
@@ -34,7 +34,7 @@ THE SOFTWARE.

 #include "hip/hcc_detail/host_defines.h"

-#if !defined(_MSC_VER) || __clang__
+#if defined(__has_attribute)
    #if __has_attribute(ext_vector_type)
        #define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
    #else
@@ -694,7 +694,7 @@ THE SOFTWARE.
            typename U = T,
            typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
        inline __host__ __device__
-        HIP_vector_type operator-() noexcept
+        HIP_vector_type operator-() const noexcept
        {
            auto tmp(*this);
            tmp.data = -tmp.data;
@@ -705,7 +705,7 @@ THE SOFTWARE.
            typename U = T,
            typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
        inline __host__ __device__
-        HIP_vector_type operator~() noexcept
+        HIP_vector_type operator~() const noexcept
        {
            HIP_vector_type r{*this};
            r.data = ~r.data;
@@ -1241,7 +1241,9 @@ DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);
 DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
 DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
 DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);
-#else // defined(_MSC_VER)
+#else // !defined(__has_attribute)
+
+#if defined(_MSC_VER)
 #include <mmintrin.h>
 #include <xmmintrin.h>
 #include <emmintrin.h>
@@ -1347,5 +1349,92 @@ typedef union { double4 data; } double3;
 typedef union { __m256d data[2]; } double8;
 typedef union { __m256d data[4]; } double16;

+#else // !defined(_MSC_VER)
+
+typedef union { char data; } char1;
+typedef union { char data[2]; } char2;
+typedef union { char data[4]; } char4;
+typedef union { char data[8]; } char8;
+typedef union { char data[16]; } char16;
+typedef union { char4 data; } char3;
+
+typedef union { unsigned char data; } uchar1;
+typedef union { unsigned char data[2]; } uchar2;
+typedef union { unsigned char data[4]; } uchar4;
+typedef union { unsigned char data[8]; } uchar8;
+typedef union { unsigned char data[16]; } uchar16;
+typedef union { uchar4 data; } uchar3;
+
+typedef union { short data; } short1;
+typedef union { short data[2]; } short2;
+typedef union { short data[4]; } short4;
+typedef union { short data[8]; } short8;
+typedef union { short data[16]; } short16;
+typedef union { short4 data; } short3;
+
+typedef union { unsigned short data; } ushort1;
+typedef union { unsigned short data[2]; } ushort2;
+typedef union { unsigned short data[4]; } ushort4;
+typedef union { unsigned short data[8]; } ushort8;
+typedef union { unsigned short data[16]; } ushort16;
+typedef union { ushort4 data; } ushort3;
+
+typedef union { int data; } int1;
+typedef union { int data[2]; } int2;
+typedef union { int data[4]; } int4;
+typedef union { int data[8]; } int8;
+typedef union { int data[16]; } int16;
+typedef union { int4 data; } int3;
+
+typedef union { unsigned int data; } uint1;
+typedef union { unsigned int data[2]; } uint2;
+typedef union { unsigned int data[4]; } uint4;
+typedef union { unsigned int data[8]; } uint8;
+typedef union { unsigned int data[16]; } uint16;
+typedef union { uint4 data; } uint3;
+
+typedef union { long data; } long1;
+typedef union { long data[2]; } long2;
+typedef union { long data[4]; } long4;
+typedef union { long data[8]; } long8;
+typedef union { long data[16]; } long16;
+typedef union { long4 data; } long3;
+
+typedef union { unsigned long data; } ulong1;
+typedef union { unsigned long data[2]; } ulong2;
+typedef union { unsigned long data[4]; } ulong4;
+typedef union { unsigned long data[8]; } ulong8;
+typedef union { unsigned long data[16]; } ulong16;
+typedef union { ulong4 data; } ulong3;
+
+typedef union { long long data; } longlong1;
+typedef union { long long data[2]; } longlong2;
+typedef union { long long data[4]; } longlong4;
+typedef union { long long data[8]; } longlong8;
+typedef union { long long data[16]; } longlong16;
+typedef union { longlong4 data; } longlong3;
+
+typedef union { unsigned long long data; } ulonglong1;
+typedef union { unsigned long long data[2]; } ulonglong2;
+typedef union { unsigned long long data[4]; } ulonglong4;
+typedef union { unsigned long long data[8]; } ulonglong8;
+typedef union { unsigned long long data[16]; } ulonglong16;
+typedef union { ulonglong4 data; } ulonglong3;
+
+typedef union { float data; } float1;
+typedef union { float data[2]; } float2;
+typedef union { float data[4]; } float4;
+typedef union { float data[8]; } float8;
+typedef union { float data[16]; } float16;
+typedef union { float4 data; } float3;
+
+typedef union { double data; } double1;
+typedef union { double data[2]; } double2;
+typedef union { double data[4]; } double4;
+typedef union { double data[8]; } double8;
+typedef union { double data[16]; } double16;
+typedef union { double4 data; } double3;
+
 #endif // defined(_MSC_VER)
+#endif // defined(__has_attribute)
 #endif
@@ -28,6 +28,8 @@ extern "C" {

 #include <stdlib.h>

+#pragma GCC visibility push (default)
+
 enum hiprtcResult {
    HIPRTC_SUCCESS = 0,
    HIPRTC_ERROR_OUT_OF_MEMORY = 1,
@@ -79,6 +81,8 @@ hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);

 hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);

+#pragma GCC visibility pop
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
@@ -60,7 +60,7 @@ THE SOFTWARE.
 */
 // _restrict is supported by the compiler
 #define __shared__ tile_static
-#define __constant__ __attribute__((hc))
+#define __constant__ __attribute__((hc, annotate("__HIP_constant__")))

 #elif defined(__clang__) && defined(__HIP__)

@@ -321,7 +321,6 @@ typedef enum hipDeviceAttribute_t {
    hipDeviceAttributeIntegrated,                        ///< iGPU
    hipDeviceAttributeCooperativeLaunch,                 ///< Support cooperative launch
    hipDeviceAttributeCooperativeMultiDeviceLaunch,      ///< Support cooperative launch on multiple devices
-
    hipDeviceAttributeMaxTexture1DWidth,    ///< Maximum number of elements in 1D images
    hipDeviceAttributeMaxTexture2DWidth,    ///< Maximum dimension width of 2D images in image elements
    hipDeviceAttributeMaxTexture2DHeight,   ///< Maximum dimension height of 2D images in image elements
@@ -186,6 +186,7 @@ typedef struct cudaArray hipArray;
 typedef struct cudaArray* hipArray_t;
 typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
+typedef struct cudaLaunchParams hipLaunchParams;
 #define hipFunction_attribute CUfunction_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
 #define hipMemcpy3DParms cudaMemcpy3DParms
@@ -860,7 +861,7 @@ inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
 }


-inline hipError_t hipMemcpyWithStream(void* dst, const void* src,
+inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
 				      size_t sizeBytes, hipMemcpyKind copyKind,
 				      hipStream_t stream) {
 	cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, 
@@ -1134,6 +1135,10 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
    p_prop->integrated = cdprop.integrated;
    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
+    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;

    p_prop->maxTexture1D    = cdprop.maxTexture1D;
    p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
@@ -1271,6 +1276,12 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
        case hipDeviceAttributeEccEnabled:
            cdattr = cudaDevAttrEccEnabled;
            break;
+        case hipDeviceAttributeCooperativeLaunch:
+            cdattr = cudaDevAttrCooperativeLaunch;
+            break;
+        case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+            cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
+            break;
        default:
            return hipCUDAErrorTohipError(cudaErrorInvalidValue);
    }
@@ -1679,6 +1690,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_
    return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
 }

+inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
+                                      void** kernelParams, unsigned int sharedMemBytes,
+                                      hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+            cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices, unsigned int  flags) {
+    return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
+}

 #ifdef __cplusplus
 }
@@ -1686,6 +1708,17 @@ inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_

 #ifdef __CUDACC__

+template<class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                                      T func,
+                                                                      int blockSize,
+                                                                      size_t dynamicSMemSize) {
+    cudaError_t cerror;
+    cerror =
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
+    return hipCUDAErrorTohipError(cerror);
+}
+
 template <class T>
 inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
                                                           size_t dynamicSMemSize = 0,
@@ -1742,6 +1775,14 @@ template <class T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {
    return cudaCreateChannelDesc<T>();
 }
+
+template <class T>
+inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+                                             void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+            cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
 #endif  //__CUDACC__

 #endif  // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
@@ -14,7 +14,7 @@ install(TARGETS lpl RUNTIME DESTINATION bin)
 #-------------------------------------LPL--------------------------------------#

 #-------------------------------------CA---------------------------------------#
-add_executable(ca ca.cpp ${PROJECT_SOURCE_DIR}/src/code_object_bundle.cpp)
+add_executable(ca ca.cpp)
 set_target_properties(
    ca PROPERTIES
        CXX_STANDARD 11
@@ -2,7 +2,7 @@

 #include "common.hpp"

-#include "../include/hip/hcc_detail/code_object_bundle.hpp"
+#include "../src/code_object_bundle.inl"

 #include "clara/clara.hpp"

@@ -25,16 +25,15 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0), llvm-amdgpu")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_base")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_base")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
-set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0, llvm-amdgpu")
+set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_base")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_base")
 set(CPACK_BINARY_RPM "ON")
@@ -32,9 +32,8 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_doc")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_doc")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
@@ -37,9 +37,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
@@ -19,9 +19,8 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_nvcc")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_nvcc")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
@@ -19,10 +19,9 @@ set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-vdi (= ${CPACK_PACKAGE_VERSION})")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_samples")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_samples")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
@@ -114,8 +114,6 @@ set_target_properties(hip::device PROPERTIES
 else()
 set_target_properties(hip::device PROPERTIES
  INTERFACE_LINK_LIBRARIES "hip::host"
-  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
-  INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
 )
 endif()

@@ -27,11 +27,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-if(@COMPILE_HIP_ATP_MARKER@)
-    set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), rocm-profiler, comgr (>= 1.1)")
-else()
-    set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1)")
-endif()
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_vdi, hip-hcc (= ${CPACK_PACKAGE_VERSION})")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_vdi")
 set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_vdi")
@@ -41,11 +37,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-if(@COMPILE_HIP_ATP_MARKER@)
-    set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION},  rocm-profiler, comgr >= 1.1")
-else()
-    set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1")
-endif()
+set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, hsa-ext-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
 set(CPACK_RPM_PACKAGE_PROVIDES "hip_vdi, hip-hcc = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_vdi")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_vdi")
@@ -80,8 +80,8 @@ int main() {

    hipFree(Ad);
    hipFree(Bd);
-    delete A;
-    delete B;
+    delete[] A;
+    delete[] B;
    hipCtxDestroy(context);
    return 0;
 }
@@ -107,8 +107,8 @@ int main() {

    hipFree(Ad);
    hipFree(Bd);
-    delete A;
-    delete B;
+    delete[] A;
+    delete[] B;
    hipCtxDestroy(context);
    return 0;
 }
@@ -99,8 +99,8 @@ int main() {

    hipFree(Ad);
    hipFree(Bd);
-    delete A;
-    delete B;
+    delete[] A;
+    delete[] B;
    hipCtxDestroy(context);
    return 0;
 }
@@ -154,8 +154,8 @@ int main() {

    hipFree(Ad);
    hipFree(Bd);
-    delete A;
-    delete B;
+    delete[] A;
+    delete[] B;
    hipCtxDestroy(context);
    return 0;
 }
@@ -56,6 +56,7 @@ void printCompilerInfo() {
 #endif
 }

+double bytesToKB(size_t s) { return (double)s / (1024.0); }
 double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); }

 #define printLimit(w1, limit, units)                                                               \
@@ -97,7 +98,7 @@ void printDeviceProp(int deviceId) {
    cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2)
         << bytesToGB(props.totalGlobalMem) << " GB" << endl;
    cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2)
-         << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
+         << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl;
    cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl;
    cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB"
         << endl;
@@ -1,34 +0,0 @@
-#include "../include/hip/hcc_detail/code_object_bundle.hpp"
-
-#include <hsa/hsa.h>
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <string>
-#include <vector>
-
-using namespace std;
-
-// CREATORS
-hip_impl::Bundled_code_header::Bundled_code_header(const vector<char>& x)
-    : Bundled_code_header{x.cbegin(), x.cend()} {}
-
-hip_impl::Bundled_code_header::Bundled_code_header(
-    const void* p) {  // This is a pretty terrible interface, useful only because
-    // hipLoadModuleData is so poorly specified (for no fault of its own).
-    if (!p) return;
-
-    if (!valid(*static_cast<const Bundled_code_header*>(p))) return;
-    auto ph = static_cast<const Header_*>(p);
-
-    size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header);
-    auto pb = static_cast<const char*>(p) + sizeof(Header_);
-    auto n = ph->bundle_cnt_;
-    while (n--) {
-        sz += reinterpret_cast<const Bundled_code::Header*>(pb)->bundle_sz;
-        pb += sizeof(Bundled_code::Header);
-    }
-
-    read(static_cast<const char*>(p), static_cast<const char*>(p) + sz, *this);
-}
@@ -92,10 +92,6 @@ struct Bundled_code {

 #define magic_string_  "__CLANG_OFFLOAD_BUNDLE__"

-#ifdef __GNUC__
-#pragma GCC visibility push (default)
-#endif
-
 class Bundled_code_header {
    // DATA - STATICS
    static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
@@ -167,8 +163,26 @@ class Bundled_code_header {
    Bundled_code_header() = default;
    template <typename RandomAccessIterator>
    Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
-    explicit Bundled_code_header(const std::vector<char>& blob);
-    explicit Bundled_code_header(const void* maybe_blob);
+    explicit Bundled_code_header(const std::vector<char>& blob)
+        : Bundled_code_header{blob.cbegin(), blob.cend()} {}
+    explicit Bundled_code_header(const void* maybe_blob) {
+        // This is a pretty terrible interface, useful only because
+        // hipLoadModuleData is so poorly specified (for no fault of its own).
+        if (!maybe_blob) return;
+
+        if (!valid(*static_cast<const Bundled_code_header*>(maybe_blob))) return;
+        auto ph = static_cast<const Header_*>(maybe_blob);
+
+        size_t sz = sizeof(Header_) + ph->bundle_cnt_ * sizeof(Bundled_code::Header);
+        auto pb = static_cast<const char*>(maybe_blob) + sizeof(Header_);
+        auto n = ph->bundle_cnt_;
+        while (n--) {
+            sz += reinterpret_cast<const Bundled_code::Header*>(pb)->bundle_sz;
+            pb += sizeof(Bundled_code::Header);
+        }
+
+        read(static_cast<const char*>(maybe_blob), static_cast<const char*>(maybe_blob) + sz, *this);      
+    }
    Bundled_code_header(const Bundled_code_header&) = default;
    Bundled_code_header(Bundled_code_header&&) = default;
    ~Bundled_code_header() = default;
@@ -180,10 +194,6 @@ class Bundled_code_header {
    size_t bundled_code_size = 0;
 };

-#ifdef __GNUC__
-#pragma GCC visibility pop
-#endif
-
 // CREATORS
 template <typename RandomAccessIterator>
 Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l)
@@ -51,7 +51,7 @@ __hipRegisterFatBinary(const void* data)
    return nullptr;
  }

-  auto modules = new std::vector<hipModule_t>{g_deviceCnt};
+  auto modules = new std::vector<hipModule_t>(g_deviceCnt);
  if (!modules) {
    return nullptr;
  }
@@ -136,7 +136,7 @@ extern "C" void __hipRegisterFunction(
  int*         wSize)
 {
  HIP_INIT_API(NONE, modules, hostFunction, deviceFunction, deviceName);
-  std::vector<hipFunction_t> functions{g_deviceCnt};
+  std::vector<hipFunction_t> functions(g_deviceCnt);

  assert(modules && modules->size() >= g_deviceCnt);
  for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) {
@@ -310,6 +310,18 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
            case hipDeviceAttributeCooperativeMultiDeviceLaunch:
                *pi = prop->cooperativeMultiDeviceLaunch;
                break;
+            case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc:
+                *pi = prop->cooperativeMultiDeviceUnmatchedFunc;
+                break;
+            case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim:
+                *pi = prop->cooperativeMultiDeviceUnmatchedGridDim;
+                break;
+            case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim:
+                *pi = prop->cooperativeMultiDeviceUnmatchedBlockDim;
+                break;
+            case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
+                *pi = prop->cooperativeMultiDeviceUnmatchedSharedMem;
+                break;
            case hipDeviceAttributeMaxPitch:
                *pi = prop->memPitch;
                break;
@@ -677,7 +677,7 @@ hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
            break;
        case HSA_REGION_SEGMENT_GROUP:
            err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
-                                               &(p_prop->sharedMemPerBlock));
+                                               &(p_prop->maxSharedMemoryPerMultiProcessor));
            break;
        default:
            break;
@@ -835,10 +835,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
    hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
    err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
    DeviceErrorCheck(err);
-    // maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
-    // Group memory will not be paged out, so, the physical memory size is the total shared memory
-    // size, and also equal to the group pool size.
-    prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
+    // Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal.
+    prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor;

    // Get Max memory clock frequency
    err =
@@ -897,9 +895,16 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
        prop->integrated = 1;
    }

-    // Enable the cooperative group for gfx9+
-    prop->cooperativeLaunch = (prop->gcnArch < 900) ? 0 : 1;
-    prop->cooperativeMultiDeviceLaunch = (prop->gcnArch < 900) ? 0 : 1;
+    // Enable the cooperative group for GPUs that support all the required features
+    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES,
+          &prop->cooperativeLaunch);
+    DeviceErrorCheck(err);
+    prop->cooperativeMultiDeviceLaunch = prop->cooperativeLaunch;
+
+    prop->cooperativeMultiDeviceUnmatchedFunc = prop->cooperativeMultiDeviceLaunch;
+    prop->cooperativeMultiDeviceUnmatchedGridDim = prop->cooperativeMultiDeviceLaunch;
+    prop->cooperativeMultiDeviceUnmatchedBlockDim = prop->cooperativeMultiDeviceLaunch;
+    prop->cooperativeMultiDeviceUnmatchedSharedMem = prop->cooperativeMultiDeviceLaunch;

    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS,
          &prop->maxTexture1D);
@@ -1515,20 +1520,6 @@ hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) {
    return e;
 }

-void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
-    hipError_t e = hipSuccess;
-
-    // Synchronize stream
-    tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n",
-            ToString(cb->_stream).c_str());
-    GET_TLS();
-    e = ihipStreamSynchronize(tls, cb->_stream);
-
-    // Call registered callback function
-    cb->_callback(cb->_stream, e, cb->_userData);
-    delete cb;
-}
-
 //---
 // Get the stream to use for a command submission.
 //
@@ -1619,7 +1610,9 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
 // Allows runtime to track some information about the stream.
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
-    stream = ihipSyncAndResolveStream(stream, lockAcquired);
+    if (stream == nullptr || stream != stream->getCtx()->_defaultStream) {
+        stream = ihipSyncAndResolveStream(stream, lockAcquired);
+    }
    lp->grid_dim.x = grid.x;
    lp->grid_dim.y = grid.y;
    lp->grid_dim.z = grid.z;
@@ -654,19 +654,6 @@ class ihipStream_t {
 };


-//----
-// Internal structure for stream callback handler
-class ihipStreamCallback_t {
-   public:
-    ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
-        : _stream(stream), _callback(callback), _userData(userData) {
-    };
-    hipStream_t _stream;
-    hipStreamCallback_t _callback;
-    void* _userData;
-};
-
-
 //----
 // Internal event structure:
 enum hipEventStatus_t {
@@ -980,7 +967,6 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,

 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
-void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);

 /**
 * @brief Copies the memory address and size of symbol @p symbolName
@@ -309,31 +309,52 @@ void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
    if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si);
    if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di);

-    throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
-                                                      nullptr,
-                                                      di.agentBaseAddress),
-                          __FILE__, __func__, __LINE__);
-
-    return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
+    hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
+                                                   nullptr, di.agentBaseAddress);
+    if (res == HSA_STATUS_SUCCESS){
+        return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
+    }
+    // If devices do not have access then fallback mechanism will be used
+    // copy will be slower
+    throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
 }

 inline
 void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
                 hipMemcpyKind k) {
+    auto si{info(src)};
+    auto di{info(dst)};
+
+    if (!is_large_BAR){
+       // Pointer info takes precedence over hipMemcpyKind
+       // if there is mismatch b/w Memcpy kind and dst/src pointer
+       // E.g. dst(host pointer),src(device pointer) and hipMemcpyKind set as hipMemcpyHostToDevice
+       if (di.size == is_cpu_owned && si.size == is_cpu_owned)
+          k = hipMemcpyHostToHost;
+       else if (si.size == is_cpu_owned && di.size != is_cpu_owned)
+          k = hipMemcpyHostToDevice;
+       else if (di.size == is_cpu_owned && si.size != is_cpu_owned)
+          k = hipMemcpyDeviceToHost;
+       else
+          k = hipMemcpyDeviceToDevice;
+    }
    switch (k) {
    case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
-    case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
-    case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
+    case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, di);
+    case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, si);
    case hipMemcpyDeviceToDevice: {
-        const auto di{info(dst)};
-        const auto si{info(src)};
-        throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
-                                                          nullptr,
-                                                          di.agentBaseAddress),
-                              __FILE__, __func__, __LINE__);
-        return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
+        hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner,
+                                                       nullptr, di.agentBaseAddress);
+        if (res == HSA_STATUS_SUCCESS){
+	   return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
+        }
+
+        // If devices do not have access then fallback mechanism will be used
+        // copy will be slower
+        throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__);
+        break;
    }
-    default: return generic_copy(dst, src, n, info(dst), info(src));
+    default: return generic_copy(dst, src, n, di, si);
    }
 }

@@ -478,6 +499,10 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s
 hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) {
    hipError_t hip_status = hipSuccess;

+    if (sizeBytes == 0) {
+        return hipSuccess;
+    }
+
    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }
@@ -485,10 +510,6 @@ hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned i
    auto ctx = ihipGetTlsDefaultCtx();
    if ((ctx == nullptr) || (ptr == nullptr)) {
        hip_status = hipErrorInvalidValue;
-    }
-    else if (sizeBytes == 0) {
-        hip_status = hipSuccess;
-        // TODO - should size of 0 return err or be siliently ignored?
    } else {
        unsigned trueFlags = flags;
        if (flags == hipHostMallocDefault) {
@@ -673,14 +694,15 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) {
    HIP_SET_DEVICE();
    hipError_t hip_status = hipSuccess;

+    if (sizeBytes == 0) {
+        if (ptr) *ptr = NULL;
+        return ihipLogStatus(hipSuccess);
+    }
+
    auto ctx = ihipGetTlsDefaultCtx();
    // return NULL pointer when malloc size is 0
    if ( nullptr == ctx || nullptr == ptr)  {
        hip_status = hipErrorInvalidValue;
-    }
-    else if (sizeBytes == 0) {
-        *ptr = NULL;
-        hip_status = hipSuccess;
    } else {
        auto device = ctx->getWriteableDevice();
        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/,
@@ -700,14 +722,15 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
    HIP_SET_DEVICE();

 #if (__hcc_workweek__ >= 19115)
+    if (sizeBytes == 0) {
+        if (ptr) *ptr = NULL;
+        return ihipLogStatus(hipSuccess);
+    }
+
    hipError_t hip_status = hipSuccess;

    auto ctx = ihipGetTlsDefaultCtx();
-    // return NULL pointer when malloc size is 0
-    if (sizeBytes == 0) {
-        *ptr = NULL;
-        hip_status = hipSuccess;
-    } else if ((ctx == nullptr) || (ptr == nullptr)) {
+    if ((ctx == nullptr) || (ptr == nullptr)) {
        hip_status = hipErrorInvalidValue;
    } else {
        unsigned amFlags = 0;
@@ -736,6 +759,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
    HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags);
    HIP_SET_DEVICE();
+    if (sizeBytes == 0) {
+        return ihipLogStatus(hipSuccess);
+    }
    hipError_t hip_status = hipSuccess;
    hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags);
    return ihipLogStatus(hip_status);
@@ -744,6 +770,9 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
 hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) {
    HIP_INIT_SPECIAL_API(hipMallocManaged, (TRACE_MEM), devPtr, size, flags);
    HIP_SET_DEVICE();
+    if (size == 0) {
+        return ihipLogStatus(hipSuccess);
+    }
    hipError_t hip_status = hipSuccess;
    if(flags != hipMemAttachGlobal)
        hip_status = hipErrorInvalidValue;
@@ -1224,6 +1253,7 @@ hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count,

    tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);

+    if (count == 0) return ihipLogStatus(hipSuccess);
    if (dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
@@ -1246,6 +1276,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count,

    tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);

+    if (count == 0) return ihipLogStatus(hipSuccess);
    if (src == nullptr || dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
@@ -1269,6 +1300,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,

    tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst);

+    if (count == 0) return ihipLogStatus(hipSuccess);
    if (dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
@@ -1301,6 +1333,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,

    tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src);

+    if (count == 0) return ihipLogStatus(hipSuccess);
    if (src == nullptr || dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
@@ -1592,6 +1625,7 @@ hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bo
        srcXoffset = p->srcPos.x;
        srcYoffset = p->srcPos.y;
        srcZoffset = p->srcPos.z;
+        if (copyWidth == 0) return hipSuccess;
        if (p->dstArray != nullptr) {
            if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){
                return hipErrorInvalidValue;
@@ -1933,6 +1967,7 @@ hipError_t getLockedPointer(void *hostPtr, size_t dataLen, void **devicePtrPtr)
 // TODO - review and optimize
 hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
                        size_t height, hipMemcpyKind kind) {
+    if (height == 0 || width == 0) return hipSuccess;
    if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
@@ -1989,6 +2024,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,

 hipError_t ihipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
                            size_t height, hipMemcpyKind kind, hipStream_t stream) {
+    if (height == 0 || width == 0) return hipSuccess;
    if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue;
    hipError_t e = hipSuccess;
    int isLockedOrD2D = 0;
@@ -2043,6 +2079,7 @@ hipError_t ihip2dOffsetMemcpy(void* dst, size_t dpitch, const void* src, size_t
                            size_t height, size_t srcXOffsetInBytes, size_t srcYOffset,
                            size_t dstXOffsetInBytes, size_t dstYOffset,hipMemcpyKind kind,
                            hipStream_t stream, bool isAsync) {
+    if (height == 0 || width == 0) return hipSuccess;
    if((spitch < width + srcXOffsetInBytes) || (srcYOffset >= height)){
        return hipErrorInvalidValue;
    } else if((dpitch < width + dstXOffsetInBytes) || (dstYOffset >= height)){
@@ -2061,6 +2098,7 @@ hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, hipStream_t stream, bool
    if (pCopy == nullptr) {
        return hipErrorInvalidValue;
    }
+    if (pCopy->Height == 0 || pCopy->WidthInBytes == 0) return hipSuccess;
    void* dst; const void* src;
    size_t spitch = pCopy->srcPitch;
    size_t dpitch = pCopy->dstPitch;
@@ -2140,6 +2178,7 @@ hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src,
 hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream ){
    HIP_INIT_SPECIAL_API(hipMemcpy2DFromArrayAsync, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind, stream);
    size_t byteSize;
+    if (height == 0 || width == 0) return ihipLogStatus(hipSuccess);
    if(src) {
        switch (src->desc.f) {
            case hipChannelFormatKindSigned:
@@ -2239,8 +2278,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
        auto device = ctx->getWriteableDevice();
        if (total) {
            *total = device->_props.totalGlobalMem;
-        } else {
-            e = hipErrorInvalidValue;
        }

        if (free) {
@@ -2263,8 +2300,6 @@ hipError_t hipMemGetInfo(size_t* free, size_t* total) {
 		} else {
 		    return ihipLogStatus(hipErrorInvalidValue);
 		}
-        } else {
-            e = hipErrorInvalidValue;
        }

    } else {
@@ -50,7 +50,7 @@ THE SOFTWARE.
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "../include/hip/hcc_detail/code_object_bundle.hpp"
+#include "code_object_bundle.inl"
 #include "hip_fatbin.h"
 // TODO Use Pool APIs from HCC to get memory regions.

@@ -140,7 +140,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
-                                  void** impCoopParams = 0) {
+                                  void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) {
    using namespace hip_impl;

    auto ctx = ihipGetTlsDefaultCtx();
@@ -192,8 +192,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global
        if (impCoopParams) {
            const auto p{static_cast<const char*>(*impCoopParams)};
            // The sixth index is for multi-grid synchronization
-            kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT,
-                            p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT);
+            copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT,
+                    (kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT);
        }

        /*
@@ -245,6 +245,10 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global

        hc::completion_future cf;

+        if (coopAV) {
+            lp.av = coopAV;
+        }
+
        lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(),
                                   (startEvent || stopEvent) ? &cf : nullptr
 #if (__hcc_workweek__ > 17312)
@@ -326,22 +330,18 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
        return hipErrorInvalidValue;
    }

-    hipFunction_t* kds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
-    if (kds == nullptr) {
-        return hipErrorNotInitialized;
-    }
+    std::vector<hipFunction_t> kds(numDevices,0);

    // prepare all kernel descriptors for each device as all streams will be locked in the next loop
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];
        if (lp.stream == nullptr) {
-            free(kds);
            return hipErrorNotInitialized;
        }
        kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
                hip_impl::target_agent(lp.stream));
+
        if (kds[i] == nullptr) {
-            free(kds);
            return hipErrorInvalidValue;
        }
        if (!kds[i]->_kernarg_layout.empty()) continue;
@@ -396,8 +396,6 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
 #endif
     }

-    free(kds);
-
    return result;
 }

@@ -409,6 +407,90 @@ hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
    return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps);
 }

+void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
+{
+    if (f->_is_code_object_v3) {
+        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
+        // GRANULATED_WORKITEM_VGPR_COUNT occupies bits 0:5 of COMPUTE_PGM_RSRC1; on gfx6-gfx9 it
+        // is encoded as max(0, ceil(vgprs_used / 4) - 1), so (field + 1) * 4 gives allocated VGPRs
+        *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
+        // GRANULATED_WAVEFRONT_SGPR_COUNT occupies bits 6:9 of COMPUTE_PGM_RSRC1; on gfx9+ it is
+        // encoded as 2 * max(0, ceil(sgprs_used / 16) - 1), so (field / 2 + 1) * 16 gives SGPRs
+        *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
+        *usedLDS = header->group_segment_fixed_size;
+    }
+    else {
+        const auto header = f->_header;
+        // VGPRs granularity is 4
+        *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
+        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
+        *usedSGPRS = header->wavefront_sgpr_count + 2;
+        *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
+        *usedLDS = header->workgroup_group_segment_byte_size;
+    }
+}
+
+// Estimates how many blocks of `blockSize` threads of kernel `f` fit on one CU: the minimum of
+// the VGPR, SGPR and LDS (static + dynSharedMemPerBlk) limits is written to *numBlocks.
+// Fails with hipErrorInvalidDevice without a context, hipErrorInvalidValue on null numBlocks/f.
+static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+{
+    using namespace hip_impl;
+    auto ctx = ihipGetTlsDefaultCtx();
+    if (ctx == nullptr) {
+        return hipErrorInvalidDevice;
+    }
+    if (numBlocks == nullptr || f == nullptr) {
+        return hipErrorInvalidValue;
+    }
+
+    hipDeviceProp_t prop{};
+    ihipGetDeviceProperties(&prop, ctx->getDevice()->_deviceId);
+
+    // An empty block would make numWavefronts 0 and divide by zero below, so report 0 blocks.
+    if (blockSize == 0 || blockSize > prop.maxThreadsPerBlock) {
+        *numBlocks = 0;
+        return hipSuccess;
+    }
+
+    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
+    size_t usedVGPRS = 0, usedSGPRS = 0, usedLDS = 0;
+    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
+
+    // Due to SPI and private memory limitations, the max number of wavefronts per CU is 32
+    size_t wavefrontSize = prop.warpSize;
+    size_t maxWavefrontsPerCU = std::min<size_t>(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
+
+    const size_t simdPerCU = 4;
+    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
+
+    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
+
+    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
+    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
+        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
+
+    // Calculate blocks occupancy per CU based on VGPR usage
+    *numBlocks = vgprs_alu_occupancy / numWavefronts;
+
+    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
+    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
+        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
+
+    // Calculate blocks occupancy per CU based on SGPR usage
+    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
+
+    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
+    if (total_used_lds != 0) {
+      // Calculate LDS occupancy per CU. lds_per_cu / (static_lds + dynamic_lds)
+      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
+      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+    }
+
+    return hipSuccess;
+}
+
 namespace {
 // kernel for initializing GWS
 // nwm1 is the total number of work groups minus 1
@@ -417,25 +499,28 @@ __global__ void init_gws(uint nwm1) {
 }
 }

-__attribute__((visibility("default")))
 hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
-        dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes,
+        dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes,
        hipStream_t stream, hip_impl::program_state& ps) {

+#if (__hcc_workweek__ >= 20093)
    hipError_t result;


-    if ((f == nullptr) || (stream == nullptr) || (kernelParams == nullptr)) {
+    if (f == nullptr || kernelParams == nullptr) {
        return hipErrorNotInitialized;
    }

-    if (!stream->getDevice()->_props.cooperativeLaunch) {
+    stream = ihipSyncAndResolveStream(stream);
+
+    if (!stream->getDevice()->_props.cooperativeLaunch ||
+        blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) {
        return hipErrorInvalidConfiguration;
    }

-    size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
-    size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
-    size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
+    size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x;
+    size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y;
+    size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z;
    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
    {
        return hipErrorInvalidConfiguration;
@@ -469,28 +554,49 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
    kd->_kernarg_layout = *reinterpret_cast<const std::vector<
            std::pair<std::size_t, std::size_t>>*>(kargs.getHandle());

+    GET_TLS();
+    uint32_t numBlocksPerSm = 0;
+    result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd,
+                    blockDim.x * blockDim.y * blockDim.z, sharedMemBytes);
+    if (result != hipSuccess) {
+        return hipErrorLaunchFailure;
+    }
+    int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount;
+
+    //check to see if the workload fits on the GPU
+    if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) {
+        return hipErrorCooperativeLaunchTooLarge;
+    }

    void *gwsKernelParam[1];
    // calculate total number of work groups minus 1 for the main kernel
    uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1;
    gwsKernelParam[0] = &nwm1;

-    LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
-#if (__hcc_workweek__ >= 19213)
-    streamCrit->_av.acquire_locked_hsa_queue();
-#endif
+    hc::accelerator acc = stream->getDevice()->_acc;
+    // create a cooperative accelerator view for launching gws and main kernels
+    hc::accelerator_view coopAV = acc.create_cooperative_view();

-    GET_TLS();
-    // launch the init_gws kernel to initialize the GWS
+    LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false);
+
+    // the cooperative queue will wait until this stream completes its operations
+    hc::completion_future streamCF;
+    if (!streamCrit->_av.get_is_empty()) {
+        streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
+        coopAV.create_blocking_marker(streamCF, hc::accelerator_scope);
+    }
+
+    streamCrit->_av.acquire_locked_hsa_queue();
+    coopAV.acquire_locked_hsa_queue();
+
+    // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue
    result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1,
-             0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
+             0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV);

    if (result != hipSuccess) {
        stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
        stream->criticalData()._av.release_locked_hsa_queue();
-#endif
-
+        coopAV.release_locked_hsa_queue();
        return hipErrorLaunchFailure;
    }

@@ -498,60 +604,106 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
    void* impCoopParams[1];
    impCoopParams[0] = &impCoopArg;

-    // launch the main kernel
+    // launch the main kernel in the cooperative queue
    result = ihipModuleLaunchKernel(tls, kd,
-            gridDim.x * blockDimX.x,
-            gridDim.y * blockDimX.y,
-            gridDim.z * blockDimX.z,
-            blockDimX.x, blockDimX.y, blockDimX.z,
+            gridDim.x * blockDim.x,
+            gridDim.y * blockDim.y,
+            gridDim.z * blockDim.z,
+            blockDim.x, blockDim.y, blockDim.z,
            sharedMemBytes, stream, kernelParams, nullptr, nullptr,
-            nullptr, 0, true, impCoopParams);
+            nullptr, 0, true, impCoopParams, &coopAV);
+
+
+    coopAV.release_locked_hsa_queue();
+    stream->criticalData()._av.release_locked_hsa_queue();
+
+    // this stream will wait until the cooperative queue completes its operations
+    hc::completion_future cooperativeCF;
+    if (!coopAV.get_is_empty()) {
+        cooperativeCF = coopAV.create_marker(hc::accelerator_scope);
+        streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope);
+    }

    stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
-    stream->criticalData()._av.release_locked_hsa_queue();
-#endif

    return result;
+#else
+    return hipErrorInvalidConfiguration;
+#endif
+
 }

 __attribute__((visibility("default")))
+hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim,
+                                    dim3 blockDim, void** args,
+                                    size_t sharedMem, hipStream_t stream,
+                                    hip_impl::program_state& ps) {
+
+    // Skipping passing in ps, because the logging function does not like it
+    HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args,
+                 sharedMem, stream);
+
+    return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim,
+                         args, sharedMem, stream, ps));
+}
+
+
 hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
        int  numDevices, unsigned int  flags, hip_impl::program_state& ps) {

+#if (__hcc_workweek__ >= 20093)
    hipError_t result;

    if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) {
        return hipErrorInvalidValue;
    }

+    vector<hipStream_t> streams;
+    vector<uint64_t> deviceIDs;
+    // check to see if we have valid distinct streams/devices, if cooperative multi device
+    // launch is supported and if grid/block dimensions are valid
    for (int i = 0; i < numDevices; ++i) {
-        if (!launchParamsList[i].stream->getDevice()->_props.cooperativeMultiDeviceLaunch) {
+        const hipLaunchParams& lp = launchParamsList[i];
+
+        if (lp.stream == nullptr){
+            return hipErrorInvalidResourceHandle;
+        }
+
+        if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) {
+            streams.push_back(lp.stream);
+        } else {
+            return hipErrorInvalidDevice;
+        }
+
+        const ihipDevice_t* currentDevice = lp.stream->getDevice();
+        if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) {
+            deviceIDs.push_back(currentDevice->_deviceId);
+        } else {
+            return hipErrorInvalidDevice;
+        }
+
+        if (!currentDevice->_props.cooperativeMultiDeviceLaunch) {
+            return hipErrorInvalidConfiguration;
+        }
+
+        if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 ||
+            lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 ||
+            lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){
            return hipErrorInvalidConfiguration;
        }
    }

-    hipFunction_t* gwsKds = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
-    hipFunction_t* kds    = reinterpret_cast<hipFunction_t*>(malloc(sizeof(hipFunction_t) * numDevices));
-
-    if (kds == nullptr || gwsKds == nullptr) {
-        return hipErrorNotInitialized;
-    }
+    vector<hipFunction_t> gwsKds;
+    vector<hipFunction_t> kds;

+    GET_TLS();
    // prepare all kernel descriptors for initializing the GWS and the main kernels per device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];
-        if (lp.stream == nullptr) {
-            free(gwsKds);
-            free(kds);
-            return hipErrorNotInitialized;
-        }

-        gwsKds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
-                hip_impl::target_agent(lp.stream));
+        gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(&init_gws),
+                hip_impl::target_agent(lp.stream)));
        if (gwsKds[i] == nullptr) {
-            free(gwsKds);
-            free(kds);
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align(
@@ -560,23 +712,42 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
                gwsKargs.getHandle());


-        kds[i] = ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
-                hip_impl::target_agent(lp.stream));
+        kds.push_back(ps.kernel_descriptor(reinterpret_cast<std::uintptr_t>(lp.func),
+                hip_impl::target_agent(lp.stream)));
        if (kds[i] == nullptr) {
-            free(gwsKds);
-            free(kds);
            return hipErrorInvalidValue;
        }
        hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align(
                reinterpret_cast<std::uintptr_t>(lp.func));
        kds[i]->_kernarg_layout = *reinterpret_cast<const std::vector<std::pair<std::size_t, std::size_t>>*>(
                kargs.getHandle());
+
+        uint32_t numBlocksPerSm = 0;
+        result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i],
+                        lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem);
+        if (result != hipSuccess) {
+            return hipErrorLaunchFailure;
+        }
+        int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount;
+
+        //check to see if the workload fits on the GPU
+        if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) {
+            return hipErrorCooperativeLaunchTooLarge;
+        }
+    }
+
+    vector<hc::accelerator_view> coopAVs;
+
+    // create cooperative accelerator views for launching gws and main kernels on each device
+    for (int i = 0; i < numDevices; ++i) {
+        hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc;
+        coopAVs.push_back(acc.create_cooperative_view());
    }

    mg_sync *mg_sync_ptr = 0;
-    mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0};
+    vector<mg_info *> mg_info_ptr;
+

-    GET_TLS();
    result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault);
    if (result != hipSuccess) {
        return hipErrorInvalidValue;
@@ -586,7 +757,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

    uint all_sum = 0;
    for (int i = 0; i < numDevices; ++i) {
-        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault);
+        mg_info *mg_info_temp = nullptr;
+        result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault);
        if (result != hipSuccess) {
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < i; ++j) {
@@ -594,6 +766,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
            }
            return hipErrorInvalidValue;
        }
+        mg_info_ptr.push_back(mg_info_temp);
        // calculate the sum of sizes of all grids
        const hipLaunchParams& lp = launchParamsList[i];
        all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z *
@@ -603,9 +776,15 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device
    for (int i = 0; i < numDevices; ++i) {
        LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false);
-#if (__hcc_workweek__ >= 19213)
+
+        hc::completion_future streamCF;
+        if (!streamCrit->_av.get_is_empty()) {
+            streamCF = streamCrit->_av.create_marker(hc::accelerator_scope);
+            coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope);
+        }
+
        streamCrit->_av.acquire_locked_hsa_queue();
-#endif
+        coopAVs[i].acquire_locked_hsa_queue();
    }

    // launch the init_gws kernel to initialize the GWS for each device
@@ -617,14 +796,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
        gwsKernelParam[0] = &nwm1;

        result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1,
-                0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true);
+                0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+                coopAVs[j].release_locked_hsa_queue();  // release each device's own cooperative queue
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
@@ -670,14 +848,13 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
-                true, impCoopParams);
+                true, impCoopParams, &coopAVs[i]);

        if (result != hipSuccess) {
            for (int j = 0; j < numDevices; ++j) {
                launchParamsList[j].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
                launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
+                coopAVs[j].release_locked_hsa_queue();  // release each device's own cooperative queue
            }
            hip_internal::ihipHostFree(tls, mg_sync_ptr);
            for (int j = 0; j < numDevices; ++j) {
@@ -691,14 +868,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

    // unlock all streams
    for (int i = 0; i < numDevices; ++i) {
-        launchParamsList[i].stream->criticalData().unlock();
-#if (__hcc_workweek__ >= 19213)
+        coopAVs[i].release_locked_hsa_queue();
        launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue();
-#endif
-    }

-    free(gwsKds);
-    free(kds);
+        hc::completion_future cooperativeCF;
+        if (!coopAVs[i].get_is_empty()) {
+            cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope);
+            launchParamsList[i].stream->criticalData()._av.create_blocking_marker(
+                    cooperativeCF, hc::accelerator_scope);
+        }
+
+        launchParamsList[i].stream->criticalData().unlock();
+    }

    hip_internal::ihipHostFree(tls, mg_sync_ptr);
    for (int j = 0; j < numDevices; ++j) {
@@ -706,6 +887,24 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    }

    return result;
+#else
+    return hipErrorInvalidConfiguration;
+#endif
+}
+
+__attribute__((visibility("default")))
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices,
+                                                 unsigned int flags,
+                                                 hip_impl::program_state& ps) {
+
+    // Skipping passing in ps, because the logging function does not like it
+    HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList,
+                 numDevices, flags);
+
+    return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList,
+                                                                numDevices,
+                                                                flags, ps));
 }

 namespace hip_impl {
@@ -1120,7 +1319,7 @@ const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
  return reinterpret_cast<const amd_kernel_code_v3_t*>(kd._header);
 }

-hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
+hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
    hipFuncAttributes r{};

    hipDeviceProp_t prop{};
@@ -1130,23 +1329,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

    if (kd._is_code_object_v3) {
-        r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
-        r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
-        r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
        r.binaryVersion = 0; // FIXME: should it be the ISA version or code
                             //        object format version?
+        r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
+        r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
    } else {
        r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
        r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
-        r.numRegs = kd._header->workitem_vgpr_count;
        r.binaryVersion =
            kd._header->amd_machine_version_major * 10 +
            kd._header->amd_machine_version_minor;
    }
    r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
-    r.maxThreadsPerBlock = r.numRegs ?
-        std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
-        prop.maxThreadsPerBlock;
+
+    size_t usedVGPRS = 0;
+    size_t usedSGPRS = 0;
+    size_t usedLDS = 0;
+    getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);
+
+    r.numRegs = usedVGPRS;
+
+    size_t wavefrontSize = prop.warpSize;
+    size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
+    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
+    const size_t numSIMD = 4;
+    const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
+    size_t maxWaves = 0;
+    for (size_t i = 0; i < maxWavefrontsPerBlock; i++) {
+        size_t wavefronts = i + 1;  // candidate block size, in wavefronts
+
+        if (usedVGPRS > 0) {
+            size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
+            size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);
+
+            // Stop growing the block once VGPR-limited CU occupancy cannot hold it
+            if (vgprs_alu_occupancy < wavefronts)
+                break;
+        }
+
+        if (usedSGPRS > 0) {
+            const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
+            // usedSGPRS is known to be non-zero here, so no zero-divisor fallback is needed
+            size_t sgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS);
+
+            // Stop growing the block once SGPR-limited CU occupancy cannot hold it
+            if (sgprs_alu_occupancy < wavefronts)
+                break;
+        }
+        maxWaves = wavefronts;
+    }
+
+    r.maxThreadsPerBlock = maxWaves * wavefrontSize;
    r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.

    return r;
@@ -1294,29 +1527,6 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
    return ihipLogStatus(hipSuccess);
 }

-void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS)
-{
-    if (f->_is_code_object_v3) {
-        const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
-        // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
-        // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
-        *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
-        // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
-        // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1)
-        *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4;
-        *usedLDS = header->group_segment_fixed_size;
-    }
-    else {
-        const auto header = f->_header;
-        // VGPRs granularity is 4
-        *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2;
-        // adding 2 to take into account the 2 VCC registers & handle the granularity of 16
-        *usedSGPRS = header->wavefront_sgpr_count + 2;
-        *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4;
-        *usedLDS = header->workgroup_group_segment_byte_size;
-    }
-}
-
 hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize,
                                              hipFunction_t f, size_t dynSharedMemPerBlk,
                                              uint32_t blockSizeLimit)
@@ -1439,59 +1649,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit));
 }

-hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
-{
-    using namespace hip_impl;
-
-    auto ctx = ihipGetTlsDefaultCtx();
-    if (ctx == nullptr) {
-        return hipErrorInvalidDevice;
-    }
-
-    hipDeviceProp_t prop{};
-    ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId);
-
-    prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
-
-    size_t usedVGPRS = 0;
-    size_t usedSGPRS = 0;
-    size_t usedLDS = 0;
-    getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS);
-
-    // Due to SPI and private memory limitations, the max of wavefronts per CU in 32
-    size_t wavefrontSize = prop.warpSize;
-    size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
-
-    const size_t simdPerCU = 4;
-    const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU;
-
-    size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize;
-
-    size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU);
-    size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd
-        : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS));
-
-    // Calculate blocks occupancy per CU based on VGPR usage
-    *numBlocks = vgprs_alu_occupancy / numWavefronts;
-
-    const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
-    size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd
-        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
-
-    // Calculate blocks occupancy per CU based on SGPR usage
-    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
-
-    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
-    if (total_used_lds != 0) {
-      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
-      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
-      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
-    }
-
-    return hipSuccess;
-}
-
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
 {
@@ -257,11 +257,39 @@ hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
 hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
                                unsigned int flags) {
    HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags);
-    hipError_t e = hipSuccess;

-    // Create a thread in detached mode to handle callback
-    ihipStreamCallback_t* cb = new ihipStreamCallback_t(stream, callback, userData);
-    std::thread(ihipStreamCallbackHandler, cb).detach();
+    auto stream_original{stream};
+    stream = ihipSyncAndResolveStream(stream);

-    return ihipLogStatus(e);
+    if (!stream) return ihipLogStatus(hipErrorInvalidValue);  // log the status like the success path
+
+    LockedAccessor_StreamCrit_t cs{stream->criticalData()};
+
+    // create first marker
+    auto cf = cs->_av.create_marker(hc::no_scope);
+    // get its signal
+    auto signal = *reinterpret_cast<hsa_signal_t*>(cf.get_native_handle());
+    // increment its signal value
+    hsa_signal_add_relaxed(signal, 1);
+
+    // create callback that can be passed to hsa_amd_signal_async_handler
+    // this function will call the user's callback, then sets first packet's signal to 0 to indicate completion
+    auto t{new std::function<void()>{[=]() {
+        callback(stream_original, hipSuccess, userData);
+        hsa_signal_store_relaxed(signal, 0);
+    }}};
+
+    // register above callback with HSA runtime to be called when first packet's signal
+    // is decremented from 2 to 1 by CP (or it is already at 1)
+    hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1,
+        [](hsa_signal_value_t x, void* p) {
+            (*static_cast<decltype(t)>(p))();
+            delete static_cast<decltype(t)>(p);
+            return false;
+        }, t);
+
+    // create additional marker that blocks on the first one
+    cs->_av.create_blocking_marker(cf, hc::no_scope);
+
+    return ihipLogStatus(hipSuccess);
 }
@@ -301,7 +301,12 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
        hsa_ext_sampler_descriptor_t samplerDescriptor;
        fillSamplerDescriptor(samplerDescriptor, pTexDesc->addressMode[0], pTexDesc->filterMode,
                              pTexDesc->normalizedCoords);
-
+        if(hipResourceTypeLinear == pResDesc->resType) {
+           samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
+           samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
+        } else if(!pTexDesc->normalizedCoords) {
+            samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
+        }
        hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;

        if(hipResourceTypePitch2D != pResDesc->resType)
@@ -312,6 +317,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
                                      HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
            HSA_STATUS_SUCCESS !=
                hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
+            free(pTexture);
            return ihipLogStatus(hipErrorRuntimeOther);
        }

@@ -438,7 +444,13 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
        imageDescriptor.format.channel_type = channelType;

        hsa_ext_sampler_descriptor_t samplerDescriptor;
-        fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
+        samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST;
+        samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
+        if (normalizedCoords) {
+            samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED;
+        } else {
+            samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
+        }

        hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;

@@ -449,6 +461,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode r
                                      HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) ||
            HSA_STATUS_SUCCESS !=
                hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
+            free(pTexture);
            return hipErrorRuntimeOther;
        }
        getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
@@ -514,7 +527,9 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode

        hsa_ext_sampler_descriptor_t samplerDescriptor;
        fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
-
+        if(!normalizedCoords) {
+            samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
+        }
        hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;

        if( 0 == pitch)
@@ -525,6 +540,7 @@ hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode
                                      HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) ||
            HSA_STATUS_SUCCESS !=
                hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
+            free(pTexture);
            return hipErrorRuntimeOther;
        }
        getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
@@ -620,7 +636,9 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls_, int dim, enum hipTextureRea

        hsa_ext_sampler_descriptor_t samplerDescriptor;
        fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords);
-
+        if(!normalizedCoords) {
+            samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
+        }
        hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;

        size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT);
@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 #include "../include/hip/hiprtc.h"
-#include "../include/hip/hcc_detail/code_object_bundle.hpp"
+#include "code_object_bundle.inl"
 #include "../include/hip/hcc_detail/elfio/elfio.hpp"
 #include "../include/hip/hcc_detail/program_state.hpp"

@@ -1,6 +1,6 @@
 #include "../include/hip/hcc_detail/program_state.hpp"

-#include "../include/hip/hcc_detail/code_object_bundle.hpp"
+#include "code_object_bundle.inl"
 #include "../include/hip/hcc_detail/hsa_helpers.hpp"

 #if !defined(__cpp_exceptions)
@@ -357,8 +357,11 @@ public:

            const auto it1 = get_symbol_addresses().find(x);
            if (it1 == get_symbol_addresses().cend()) {
-                hip_throw(std::runtime_error{
-                    "Global symbol: " + x + " is undefined."});
+                // For an unknown symbol, initialize it with a magic poison
+                hsa_executable_agent_global_variable_define(
+                    executable, agent, x.c_str(), 
+                    reinterpret_cast<void*>(0xDEADBEEFDEADBEEFull));
+                continue;
            }

            hsa_status_t status;
@@ -385,8 +385,7 @@ double compute_BSR(BCRSArrays& bcsr, double *x , double *y){
    cudaEventCreate(&startTime);
    cudaEventCreate(&stopTime);
    cudaEventRecord(startTime, bcsr.streamId);
-    // NOTE: cusparseDbsrmv and CUSPARSE_DIRECTION_COLUMN (of type cusparseDirection_t) are yet unsupported by HIP
-    // CHECK: cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+    // CHECK: cusparseDbsrmv(bcsr.cusparseHandle, HIPSPARSE_DIRECTION_COLUMN, HIPSPARSE_OPERATION_NON_TRANSPOSE,
    cusparseDbsrmv(bcsr.cusparseHandle, CUSPARSE_DIRECTION_COLUMN, CUSPARSE_OPERATION_NON_TRANSPOSE,
                   bcsr.nbBlockRow, bcsr.m, bcsr.nbBlocks, &alpha, descr,
                   bcsr.cu_bsrValC, bcsr.cu_bsrRowPtrC, bcsr.cu_bsrColIndC, bcsr.blockSize,
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define SIZE 1024
+
+int main(){
+
+    void *Sd;
+    hipError_t e;
+    char S[SIZE]="This is not a device symbol";
+
+    HIPCHECK(hipMalloc(&Sd,SIZE));
+
+    e = hipMemcpyFromSymbol(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+    
+    e = hipMemcpyFromSymbol(S, NULL, SIZE, 0, hipMemcpyDeviceToHost);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+
+    HIPCHECK(hipFree(Sd)); 
+    
+    passed();
+}
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define SIZE 1024
+
+int main(){
+    // Negative test: Sd is a hipMalloc'ed buffer, not a device symbol.
+    void *Sd;
+    hipError_t e;
+    char S[SIZE]="This is not a device symbol";
+
+    HIPCHECK(hipMalloc(&Sd,SIZE));
+
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreate(&stream));
+
+    e = hipMemcpyFromSymbolAsync(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost, stream);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+    // A NULL symbol is expected to be rejected with the same error.
+    e = hipMemcpyFromSymbolAsync(S, NULL, SIZE, 0, hipMemcpyDeviceToHost, stream);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+
+    HIPCHECK(hipStreamDestroy(stream));  // release the stream created above
+    HIPCHECK(hipFree(Sd));
+    passed();
+}
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define SIZE 1024
+
+int main(){
+
+    void *Sd;
+    hipError_t e;
+    char S[SIZE]="This is not a device symbol";
+
+    HIPCHECK(hipMalloc(&Sd,SIZE));
+
+    e = hipMemcpyToSymbol(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+    
+    e = hipMemcpyToSymbol(NULL, S, SIZE, 0, hipMemcpyHostToDevice);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+
+    HIPCHECK(hipFree(Sd)); 
+    
+    passed();
+}
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define SIZE 100
+
+int main(){
+    // Negative test: Sd is a hipMalloc'ed buffer, not a device symbol.
+    void *Sd;
+    hipError_t e;
+    char S[SIZE]="This is not a device symbol";
+
+    HIPCHECK(hipMalloc(&Sd,SIZE));
+
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreate(&stream));
+
+    e = hipMemcpyToSymbolAsync(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice, stream);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+    // A NULL symbol is expected to be rejected with the same error.
+    e = hipMemcpyToSymbolAsync(NULL, S, SIZE, 0, hipMemcpyHostToDevice, stream);
+    HIPASSERT(e==hipErrorInvalidSymbol);
+
+    HIPCHECK(hipStreamDestroy(stream));  // release the stream created above
+    HIPCHECK(hipFree(Sd));
+    passed();
+}
@@ -0,0 +1,43 @@
+/* 
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define SIZE 100
+
+int main(){
+   hipError_t e;
+   char str[SIZE]="Hi, I am Ellesemere. What is ur name?";
+
+   e = hipMemcpy(0, str, SIZE, hipMemcpyHostToDevice);
+   HIPASSERT(e==hipErrorInvalidValue);
+   
+   e = hipMemcpy(NULL, str, SIZE, hipMemcpyHostToDevice);
+   HIPASSERT(e==hipErrorInvalidValue);
+
+   e = hipMemset(0,99,80);
+   HIPASSERT(e==hipErrorInvalidValue);
+
+   passed();
+}
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+
+int main(){
+
+   hipError_t e;
+   hipStream_t stream;
+
+   e = hipStreamCreateWithFlags(&stream, -1);
+   HIPASSERT(e==hipErrorInvalidValue);
+
+   e = hipStreamCreateWithFlags(&stream, 2);
+   HIPASSERT(e==hipErrorInvalidValue);
+
+   passed();
+}
@@ -96,6 +96,18 @@ void kernel_hisinf(__half* input, int* output) {
  output[tx] = __hisinf(input[tx]);
 }

+__global__ void testHalfAbs(float* p) {
+    auto a = __float2half(*p);
+    a = __habs(a);
+    *p = __half2float(a);
+}
+
+__global__ void testHalf2Abs(float2* p) {
+    auto a = __float22half2_rn(*p);
+    a = __habs2(a);
+    *p = __half22float2(a);
+}
+
 #endif


@@ -237,6 +249,31 @@ void checkFunctional() {
  return;
 }

+void checkHalfAbs() {
+    {
+        float *p;
+        hipMalloc(&p, sizeof(float));
+        float pp = -2.1f;
+        hipMemcpy(p, &pp, sizeof(float), hipMemcpyDefault);
+        hipLaunchKernelGGL(testHalfAbs, 1, 1, 0, 0, p);
+        hipMemcpy(&pp, p, sizeof(float), hipMemcpyDefault);
+        hipFree(p);
+        if(pp < 0.0f) { failed("Half Abs failed"); }
+    }
+    {
+        float2 *p;
+        hipMalloc(&p, sizeof(float2));
+        float2 pp;
+        pp.x = -2.1f;
+        pp.y = -1.1f;
+        hipMemcpy(p, &pp, sizeof(float2), hipMemcpyDefault);
+        hipLaunchKernelGGL(testHalf2Abs, 1, 1, 0, 0, p);
+        hipMemcpy(&pp, p, sizeof(float2), hipMemcpyDefault);
+        hipFree(p);
+        if(pp.x < 0.0f || pp.y < 0.0f) { failed("Half2 Abs Test Failed"); }
+    }
+}
+
 int main() {
  bool* result{nullptr};
  hipMemAllocHost((void**)&result, sizeof(result));
@@ -260,5 +297,7 @@ int main() {
  // run some functional checks
  checkFunctional();

+  checkHalfAbs();
+
  passed();
 }
@@ -0,0 +1,239 @@
+/*
+Copyright (c) 2015-2019 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc HIPCC_OPTIONS -std=c++14
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include <type_traits>
+#include <random>
+#include "test_common.h"
+
+static std::random_device dev;
+static std::mt19937 rng(dev());
+
+template <typename T, typename M>
+__host__ __device__ inline constexpr int count() {
+    return sizeof(T) / sizeof(M);
+}
+
+inline float getRandomFloat(float min = 10, float max = 100) {
+    std::uniform_real_distribution<float> gen(min, max);
+    return gen(rng);
+}
+
+template <typename T, typename B>
+void fillMatrix(T* a, int size) {
+    for (int i = 0; i < size; i++) {
+        T t;
+        t.x = getRandomFloat();
+        if constexpr (count<T, B>() >= 2) t.y = getRandomFloat();
+        if constexpr (count<T, B>() >= 3) t.z = getRandomFloat();
+        if constexpr (count<T, B>() >= 4) t.w = getRandomFloat();
+
+        a[i] = t;
+    }
+}
+
+// Test operations
+template <typename T, typename B>
+__host__ __device__ void testOperations(T& a, T& b) {
+    a.x += b.x;
+    a.x++;
+    b.x++;
+    if constexpr (count<T, B>() >= 2) {
+        a.y = b.x;
+        a.x = b.y;
+    }
+    if constexpr (count<T, B>() >= 3) {
+        if (a.x > 0) b.x /= a.x;
+        a.x *= b.z;
+        a.y--;
+    }
+    if constexpr (count<T, B>() >= 4) {
+        b.w = a.x;
+        a.w += (-b.y);
+    }
+}
+
+template <typename T, typename B>  // T: vector type under test, B: its 1-component pack
+__global__ void testOperationsGPU(T* d_a, T* d_b, int size) {
+    int id = threadIdx.x;  // one element per thread of the block
+    if (id >= size) return;  // valid element indices are [0, size)
+    T &a = d_a[id];
+    T &b = d_b[id];
+    // Run the shared host/device routine in place on this thread's element pair.
+    testOperations<T, B>(a, b);
+}
+
+
+template <typename T>
+void dcopy(T* a, T* b, int size) {
+    for (int i = 0; i < size; i++) {
+        a[i] = b[i];
+    }
+}
+
+template <typename T>
+bool isEqual(T* a, T* b, int size) {
+    for (int i = 0; i < size; i++) {
+        if (a[i] != b[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Main function that tests type
+// T = what you want to test
+// D = pack of 1 i.e. float1 int1
+template <typename T, typename D>
+void testType(int msize) {
+    T *fa, *fb, *fc, *h_fa, *h_fb;
+    fa = new T[msize];    // input a (uploaded to the device)
+    fb = new T[msize];    // input b (uploaded to the device)
+    fc = new T[msize];    // device result for a, copied back to the host
+    h_fa = new T[msize];  // host reference for a
+    h_fb = new T[msize];  // host reference for b
+
+    T *d_fa, *d_fb;
+
+    constexpr int c = count<T, D>();
+    // Only packs of 1 to 4 components are exercised by testOperations.
+    if (c <= 0 || c >= 5) {
+        failed("Invalid Size\n");
+    }
+
+    fillMatrix<T, D>(fa, msize);
+    dcopy(fb, fa, msize);
+    dcopy(h_fa, fa, msize);
+    dcopy(h_fb, fa, msize);
+    for (int i = 0; i < msize; i++) testOperations<T, D>(h_fa[i], h_fb[i]);
+    // Check every runtime status so an API error is not mistaken for a data mismatch.
+    HIPCHECK(hipMalloc(&d_fa, sizeof(T) * msize));
+    HIPCHECK(hipMalloc(&d_fb, sizeof(T) * msize));
+
+    HIPCHECK(hipMemcpy(d_fa, fa, sizeof(T) * msize, hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(d_fb, fb, sizeof(T) * msize, hipMemcpyHostToDevice));
+    // Single block of msize threads: one thread per element.
+    auto kernel = testOperationsGPU<T, D>;
+    hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize);
+
+    HIPCHECK(hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost));
+
+    bool pass = true;
+    if (!isEqual<T>(h_fa, fc, msize)) {
+        pass = false;
+    }
+    // Release everything before reporting the verdict.
+    delete[] fa;
+    delete[] fb;
+    delete[] fc;
+    delete[] h_fa;
+    delete[] h_fb;
+    HIPCHECK(hipFree(d_fa));
+    HIPCHECK(hipFree(d_fb));
+
+    if (!pass) {
+        failed("Failed");
+    }
+}
+
+int main() {
+    const int msize = 100;
+    // double
+    testType<double1, double1>(msize);
+    testType<double2, double1>(msize);
+    testType<double3, double1>(msize);
+    testType<double4, double1>(msize);
+
+    // floats
+    testType<float1, float1>(msize);
+    testType<float2, float1>(msize);
+    testType<float3, float1>(msize);
+    testType<float4, float1>(msize);
+
+    // ints
+    testType<int1, int1>(msize);
+    testType<int2, int1>(msize);
+    testType<int3, int1>(msize);
+    testType<int4, int1>(msize);
+
+    // chars
+    testType<char1, char1>(msize);
+    testType<char2, char1>(msize);
+    testType<char3, char1>(msize);
+    testType<char4, char1>(msize);
+
+    // long
+    testType<long1, long1>(msize);
+    testType<long2, long1>(msize);
+    testType<long3, long1>(msize);
+    testType<long4, long1>(msize);
+
+    // longlong
+    testType<longlong1, longlong1>(msize);
+    testType<longlong2, longlong1>(msize);
+    testType<longlong3, longlong1>(msize);
+    testType<longlong4, longlong1>(msize);
+
+    // short
+    testType<short1, short1>(msize);
+    testType<short2, short1>(msize);
+    testType<short3, short1>(msize);
+    testType<short4, short1>(msize);
+
+    // uints
+    testType<uint1, uint1>(msize);
+    testType<uint2, uint1>(msize);
+    testType<uint3, uint1>(msize);
+    testType<uint4, uint1>(msize);
+
+    // uchars
+    testType<uchar1, uchar1>(msize);
+    testType<uchar2, uchar1>(msize);
+    testType<uchar3, uchar1>(msize);
+    testType<uchar4, uchar1>(msize);
+
+    // ulong
+    testType<ulong1, ulong1>(msize);
+    testType<ulong2, ulong1>(msize);
+    testType<ulong3, ulong1>(msize);
+    testType<ulong4, ulong1>(msize);
+
+    // ulonglong
+    testType<ulonglong1, ulonglong1>(msize);
+    testType<ulonglong2, ulonglong1>(msize);
+    testType<ulonglong3, ulonglong1>(msize);
+    testType<ulonglong4, ulonglong1>(msize);
+
+    // ushort
+    testType<ushort1, ushort1>(msize);
+    testType<ushort2, ushort1>(msize);
+    testType<ushort3, ushort1>(msize);
+    testType<ushort4, ushort1>(msize);
+
+    passed();
+}
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 /* HIT_START
- * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
 * TEST: %t
 * HIT_END
 */
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 /* HIT_START
- * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc vdi
 * TEST: %t
 * HIT_END
 */
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc vdi
+ * TEST: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+int main() {
+    int* A;   // pinned host buffer
+    int* Ad;  // device buffer
+    int* Bd;  // second device buffer
+
+    // Allocation
+    HIPCHECK(hipMalloc((void**)&Ad, sizeof(int)));
+    HIPCHECK(hipMalloc((void**)&Bd, sizeof(int)));
+    HIPCHECK(hipHostMalloc((void**)&A,sizeof(int)));
+
+    // Kind should be ignored and test should pass even for incorrect kind
+    HIPCHECK(hipMemcpy(Ad, A, sizeof(int), hipMemcpyDeviceToHost));
+    HIPCHECK(hipMemcpy(A,  Ad, sizeof(int), hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(Ad, Bd, sizeof(int), hipMemcpyHostToHost));
+    HIPCHECK(hipMemcpy(A,  A, sizeof(int), hipMemcpyDeviceToDevice));
+
+    // nullptr passed as source or destination pointer
+    HIPASSERT(hipSuccess != hipMemcpy(nullptr, A, sizeof(int), hipMemcpyHostToDevice));
+    HIPASSERT(hipSuccess != hipMemcpy(Ad, nullptr, sizeof(int), hipMemcpyHostToDevice));
+
+    HIPCHECK(hipFree(Ad));
+    HIPCHECK(hipFree(Bd));
+    HIPCHECK(hipHostFree(A));  // A came from hipHostMalloc, so release it with hipHostFree
+    passed();
+}
@@ -107,8 +107,8 @@ void run(const std::vector<char>& buffer) {
   
    hipFree(Ad); 
    hipFree(Bd); 
-    delete A;
-    delete B;
+    delete[] A;
+    delete[] B;
    hipCtxDestroy(context);
    
 }
@@ -0,0 +1,145 @@
+#include <stdio.h>
+#include <hip/hip_runtime.h>
+#include <unistd.h>
+#include "test_common.h"
+#include <atomic>
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+enum class ExecState
+{
+   EXEC_NOT_STARTED,
+   EXEC_STARTED,
+   EXEC_CB_STARTED,
+   EXEC_CB_FINISHED,
+   EXEC_FINISHED
+};
+
+struct UserData
+{
+    size_t size;
+    int* ptr;
+};
+
+// Global variable to check execution order
+std::atomic<ExecState> gData(ExecState::EXEC_NOT_STARTED);
+
+
+void myCallback(hipStream_t stream, hipError_t status, void* user_data)
+{
+    if(gData.load() != ExecState::EXEC_STARTED)
+        return; // Error hence return early
+
+    gData.store(ExecState::EXEC_CB_STARTED);
+
+    UserData* data = reinterpret_cast<UserData*>(user_data);
+    printf("Callback started\n");
+
+    sleep(1);
+
+    printf("Callback ending.\n");
+    gData.store(ExecState::EXEC_CB_FINISHED);
+}
+
+bool test(int count)
+{
+    printf("\n============ Test iteration %d =============\n",count);
+    // Stream
+    hipStream_t stream;
+    bool result = true;
+
+    gData.store(ExecState::EXEC_STARTED);
+
+    HIPCHECK(hipStreamCreate(&stream));
+
+    // Array size
+    size_t size = 10000;
+
+    // Device array
+    int *data = NULL;
+    HIPCHECK(hipMalloc((void**)&data, sizeof(int) * size));
+
+    // Initialize device array to -1
+    HIPCHECK(hipMemset(data, -1, sizeof(int) * size));
+
+    // Host array
+    int *host = NULL;
+    HIPCHECK(hipHostMalloc((void**)&host, sizeof(int) * size));
+
+    // Progress message from the thread that enqueues the work
+    printf("In main thread\n");
+
+    // Initialize user_data for callback
+    UserData arg;
+    arg.size = size;
+    arg.ptr  = host;
+
+    // Synchronize device
+    HIPCHECK(hipDeviceSynchronize());
+
+    // Asynchronous copy from device to host
+    HIPCHECK(hipMemcpyAsync(host, data, sizeof(int) * size, hipMemcpyDeviceToHost, stream));
+
+    // Asynchronous memset on device
+    HIPCHECK(hipMemsetAsync(data, 0, sizeof(int) * size, stream));
+
+    // Add callback - should happen after hipMemsetAsync()
+    HIPCHECK(hipStreamAddCallback(stream, myCallback, &arg, 0));
+
+    printf("Will wait in main thread until callback completes\n");
+
+    // This should synchronize the stream (including the callback)
+    HIPCHECK(hipStreamSynchronize(stream));
+
+    if(gData.load() != ExecState::EXEC_CB_FINISHED) {
+        std::cout<<"Callback is not finished\n";
+        result = false;  // keep going so the buffers and the stream are still released
+    } else {
+        printf("Callback completed will resume main thread execution\n");
+    }
+
+    if(host[size/2] != -1)
+    {
+         // Print some host data that just got copied
+         printf("Pseudo host data printing (should be -1): %d\n", host[size/2]);
+         result = false;
+    }
+
+    HIPCHECK(hipMemcpy(host, data, sizeof(int)*size, hipMemcpyDeviceToHost));
+
+    if(host[size-1] != 0)
+    {
+         printf("Pseudo host data printing (should be 0): %d\n", host[size-1]);
+         result = false;
+    }
+
+    HIPCHECK(hipFree(data));
+    HIPCHECK(hipHostFree(host));
+    HIPCHECK(hipStreamDestroy(stream));
+
+    gData.store(ExecState::EXEC_FINISHED);
+    return result;
+}
+
+int main()
+{
+    // The test involves multithreading, so it is run multiple times
+    // to make sure the behavior is consistent
+    bool status = true;
+
+    for(int i=0; i < 10; i++){
+       status = test(i+1);
+       if(status == false)
+       {
+          failed("Test Failed!\n");
+          break;
+       }
+    }
+
+    if(status == true) passed();
+    return 0;
+}
@@ -0,0 +1,409 @@
+
+#include <hip/hip_runtime.h>
+
+#include <stdexcept>
+#include <memory>
+#include <functional>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <future>
+#include "test_common.h"
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM vdi
+ * TEST: %t
+ * HIT_END
+ */
+
+#define WORKAROUND 0 // Enable (1) this to make stream thread-safe by a workaround
+
+template<bool IsBlocking> // <true> = queue blocks, until task is finished in enqueue(queue,task)
+class QueueHipRt;
+
+// Queue types used in the tests
+using TestQueues = std::tuple<QueueHipRt<true>, QueueHipRt<false>>;
+
+
+// --- Implementation
+
+#define HIP_ASSERT(x) (assert((x)==hipSuccess))
+#define HIP_ASSERT_IGNORE(x,ign) auto err=x; HIP_ASSERT(err==ign ? hipSuccess : err)
+
+#ifdef __HIP_PLATFORM_HCC__
+  #define HIPRT_CB
+#endif
+
+template<bool isBlocking>
+static auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue)  -> void;
+
+template<bool IsBlocking>
+class QueueHipRt
+{
+public:
+  static constexpr bool isBlocking = IsBlocking;
+  //-----------------------------------------------------------------------------
+  QueueHipRt(
+    int dev) :
+    m_dev(dev),
+    m_HipQueue()
+    {
+      HIP_ASSERT(
+        hipSetDevice(
+          m_dev));
+      HIP_ASSERT(
+        hipStreamCreateWithFlags(
+          &m_HipQueue,
+          hipStreamNonBlocking));
+    }
+  //-----------------------------------------------------------------------------
+  QueueHipRt(QueueHipRt const &) = delete;
+  //-----------------------------------------------------------------------------
+  QueueHipRt(QueueHipRt &&) = delete;
+  //-----------------------------------------------------------------------------
+  auto operator=(QueueHipRt const &) -> QueueHipRt & = delete;
+  //-----------------------------------------------------------------------------
+  auto operator=(QueueHipRt &&) -> QueueHipRt & = delete;
+  //-----------------------------------------------------------------------------
+  ~QueueHipRt()
+    {
+      if(isBlocking) {
+#if WORKAROUND  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+                // we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed
+        currentThreadWaitFor(*this);
+#endif
+      }
+      HIP_ASSERT(
+        hipSetDevice(
+          m_dev));
+      HIP_ASSERT(
+        hipStreamDestroy(
+          m_HipQueue));
+    }
+
+public:
+  int m_dev;   //!< The device this queue is bound to.
+  hipStream_t m_HipQueue;
+
+#if WORKAROUND  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+  int m_callees = 0;
+  std::mutex m_mutex;
+#endif
+};
+
+template<typename TTask>
+struct Enqueue
+{
+  //#############################################################################
+  enum class CallbackState
+  {
+    enqueued,
+    notified,
+    finished,
+  };
+
+  //#############################################################################
+  struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
+  {
+    std::mutex m_mutex;
+    std::condition_variable m_event;
+    CallbackState state = CallbackState::enqueued;
+  };
+
+  //-----------------------------------------------------------------------------
+  static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
+    {
+      // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
+      const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
+
+      // Notify the executing thread.
+      {
+        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+        pCallbackSynchronizationData->state = CallbackState::notified;
+      }
+      pCallbackSynchronizationData->m_event.notify_one();
+
+      // Wait for the executing thread to finish the task if it has not already finished.
+      std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+      if(pCallbackSynchronizationData->state != CallbackState::finished)
+      {
+        pCallbackSynchronizationData->m_event.wait(
+          lock,
+          [pCallbackSynchronizationData](){
+            return pCallbackSynchronizationData->state == CallbackState::finished;
+          }
+          );
+      }
+    }
+
+  //-----------------------------------------------------------------------------
+  template<bool isBlocking>
+  static auto enqueue(
+    QueueHipRt<isBlocking> & queue,
+    TTask const & task)
+    -> void
+    {
+
+#if WORKAROUND  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+      {
+        // thread-safe callee incrementing
+        std::lock_guard<std::mutex> guard(queue.m_mutex);
+        queue.m_callees += 1;
+      }
+#endif
+      auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
+      // test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
+      HIP_ASSERT(hipStreamAddCallback(
+                            queue.m_HipQueue,
+                            hipRtCallback,
+                            pCallbackSynchronizationData.get(),
+                            0u));
+
+      // We start a new std::thread which stores the task to be executed.
+      // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
+      // The HIP thread signals the std::thread when it is ready to execute the task.
+      // The HIP thread is waiting for the std::thread to signal that it is finished executing the task
+      // before it executes the next task in the queue (HIP stream).
+      std::thread t(
+        [pCallbackSynchronizationData,
+         task
+#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+         ,&queue // requires queue's destructor to wait for all tasks
+#endif
+          ](){
+
+#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+          // thread-safe task execution and callee decrementing
+          std::lock_guard<std::mutex> guard(queue.m_mutex);
+#endif
+
+          // If the callback has not yet been called, we wait for it.
+          {
+            std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+            if(pCallbackSynchronizationData->state != CallbackState::notified)
+            {
+              pCallbackSynchronizationData->m_event.wait(
+                lock,
+                [pCallbackSynchronizationData](){
+                  return pCallbackSynchronizationData->state == CallbackState::notified;
+                }
+                );
+            }
+
+            task();
+
+            // Notify the waiting HIP thread.
+            pCallbackSynchronizationData->state = CallbackState::finished;
+          }
+          pCallbackSynchronizationData->m_event.notify_one();
+#if WORKAROUND // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+          queue.m_callees -= 1;
+#endif
+        }
+        );
+      if(isBlocking)
+        t.join(); // => waiting for task completion
+      else
+        t.detach(); // => do not wait for task completion
+    }
+};
+//#############################################################################
+//! Emptiness query for a HIP RT queue.
+//! The query is templated on isBlocking and therefore serves both the
+//! blocking and the non-blocking QueueHipRt instantiation.
+struct Empty
+{
+  //-----------------------------------------------------------------------------
+  //! \param queue The queue to inspect.
+  //! \return true when the queue has no pending work, false otherwise.
+  template<bool isBlocking>
+  static auto empty(
+    QueueHipRt<isBlocking> const & queue)
+    -> bool
+    {
+
+#if WORKAROUND  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+      // m_callees is incremented when a task is enqueued and decremented once the task has run,
+      // so zero means that nothing is pending.
+      return (queue.m_callees==0);
+#else
+
+      // Query is allowed even for queues on non current device.
+      hipError_t ret = hipSuccess;
+      // NOTE(review): HIP_ASSERT_IGNORE presumably tolerates hipErrorNotReady and
+      // treats any other error as fatal - confirm against the macro definition.
+      HIP_ASSERT_IGNORE(
+        ret = hipStreamQuery(
+          queue.m_HipQueue),
+        hipErrorNotReady);
+      // Any status other than hipSuccess (in practice hipErrorNotReady) means work is still pending.
+      return (ret == hipSuccess);
+#endif
+    }
+};
+
+//! Blocks the calling thread until all work enqueued into \p queue has completed.
+template<bool isBlocking>
+auto currentThreadWaitFor(QueueHipRt<isBlocking> const & queue)  -> void
+{
+#if WORKAROUND  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
+  // Poll the pending-task counter and sleep between two probes instead of spinning.
+  auto const pollInterval = std::chrono::milliseconds(10u);
+  for(;;)
+  {
+    if(!(queue.m_callees>0))
+    {
+      break;
+    }
+    std::this_thread::sleep_for(pollInterval);
+  }
+#else
+  // A stream may be synchronized even if it belongs to a device that is not the current one.
+  HIP_ASSERT(hipStreamSynchronize(queue.m_HipQueue));
+#endif
+}
+
+
+
+
+// --- Tests
+
+// Declares a test as the function template TestName<TestType>(std::atomic<int> &check)
+// and defines TestName##Runner(), which
+//   1. runs the test for QueueHipRt<true> and then for QueueHipRt<false>,
+//   2. logs the accumulated error count to stderr after each run,
+//   3. returns the final error count (0 means that every CHECK passed).
+// The macro ends with the header of the function template, so the braces that
+// follow the macro invocation become the body of the test.
+#define TEMPLATE_LIST_TEST_CASE(TestName) \
+template<typename TestType> static void TestName (std::atomic<int> &check); \
+static int TestName##Runner () { \
+    std::atomic<int> check{0}; \
+    TestName< QueueHipRt<true> >(check); \
+    fprintf(stderr, "After " #TestName " < QueueHipRt<true> >  errors=%d\n", check.load()); \
+    TestName< QueueHipRt<false> >(check); \
+    fprintf(stderr, "After " #TestName " < QueueHipRt<false> > errors=%d\n", check.load()); \
+    return check.load(); \
+} \
+template<typename TestType> static void TestName (std::atomic<int> &check)
+
+// Evaluates `result` exactly once, logs the stringified expression together with
+// 1 (failed) or 0 (passed) to stderr and adds that value to the error counter.
+// Requires a std::atomic<int> named `check` to be visible at the point of use.
+#define CHECK(result) do{int arg=(!(result)); fprintf(stderr, "Checking " #result " %d\n", arg); check.fetch_add(arg);}while(false)
+
+//-----------------------------------------------------------------------------
+// A freshly constructed queue that never received a task must report itself as empty.
+TEMPLATE_LIST_TEST_CASE( queueIsInitiallyEmpty )
+{
+  // NOTE(review): the constructor argument 0 is presumably the device index - confirm against QueueHipRt.
+  TestType queue{0};
+  CHECK(Empty::empty(queue));
+}
+
+//-----------------------------------------------------------------------------
+// An enqueued task must really be invoked: the task fulfils a promise
+// and the test blocks on the matching future.
+TEMPLATE_LIST_TEST_CASE( queueCallbackIsWorking )
+{
+  std::promise<bool> promise;
+  auto fulfil = [&promise]() { promise.set_value(true); };
+  TestType queue{0};
+  Enqueue<decltype(fulfil)> enqueuer;
+  enqueuer.enqueue(queue, fulfil);
+
+  // Does not return before the task has stored its value in the promise.
+  CHECK(promise.get_future().get());
+}
+
+//-----------------------------------------------------------------------------
+// Once currentThreadWaitFor() has returned, the side effect of the enqueued task must be visible.
+TEMPLATE_LIST_TEST_CASE( queueWaitShouldWork )
+{
+  bool CallbackFinished = false;
+  auto slowTask = [&CallbackFinished]() noexcept
+  {
+    // Take long enough that a wait returning too early would be noticed.
+    std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+    CallbackFinished = true;
+  };
+  TestType queue{0};
+  Enqueue<decltype(slowTask)> enqueuer;
+  enqueuer.enqueue(queue, slowTask);
+
+  currentThreadWaitFor(queue);
+  CHECK(CallbackFinished);
+}
+
+//-----------------------------------------------------------------------------
+// While its last task is still running the queue must report "not empty";
+// after that task has finished the queue must report "empty".
+TEMPLATE_LIST_TEST_CASE( queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished )
+{
+  bool CallbackFinished = false;
+  TestType queue{0};
+  auto probingTask = [&queue, &CallbackFinished, &check]() noexcept
+  {
+    // Probed from inside the task, i.e. while the queue is still busy with it.
+    CHECK(!Empty::empty(queue));
+    std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+    CallbackFinished = true;
+  };
+  Enqueue<decltype(probingTask)> enqueuer;
+  enqueuer.enqueue(queue, probingTask);
+
+  // Enqueueing into a blocking queue only returns after the task has completed,
+  // so only the non-blocking queue needs an explicit wait here.
+  bool const needsExplicitWait = !TestType::isBlocking;
+  if(needsExplicitWait)
+  {
+    currentThreadWaitFor(queue);
+  }
+
+  CHECK(Empty::empty(queue));
+  CHECK(CallbackFinished);
+}
+
+//-----------------------------------------------------------------------------
+// Two tasks that are enqueued into the same queue from two different threads
+// must be executed one after the other, never at the same time.
+TEMPLATE_LIST_TEST_CASE( queueShouldNotExecuteTasksInParallel )
+{
+  // Raised while a task is running; a task that finds it already raised overlaps with the other one.
+  std::atomic<bool> taskIsExecuting{false};
+  std::promise<void> firstTaskFinished;
+  auto firstTaskFinishedFuture = firstTaskFinished.get_future();
+  std::promise<void> secondTaskFinished;
+  auto secondTaskFinishedFuture = secondTaskFinished.get_future();
+
+  TestType queue{0};
+
+  std::thread firstProducer([&queue, &taskIsExecuting, &firstTaskFinished, &check]()
+  {
+    auto exclusiveTask = [&taskIsExecuting, &firstTaskFinished, &check]() noexcept
+    {
+      CHECK(!taskIsExecuting.exchange(true));
+      std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+      CHECK(taskIsExecuting.exchange(false));
+      firstTaskFinished.set_value();
+    };
+    Enqueue<decltype(exclusiveTask)> enqueuer;
+    enqueuer.enqueue(queue, exclusiveTask);
+  });
+
+  std::thread secondProducer([&queue, &taskIsExecuting, &secondTaskFinished, &check]()
+  {
+    auto exclusiveTask = [&taskIsExecuting, &secondTaskFinished, &check]() noexcept
+    {
+      CHECK(!taskIsExecuting.exchange(true));
+      std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+      CHECK(taskIsExecuting.exchange(false));
+      secondTaskFinished.set_value();
+    };
+    Enqueue<decltype(exclusiveTask)> enqueuer;
+    enqueuer.enqueue(queue, exclusiveTask);
+  });
+
+  // Make sure that both producers have handed their task to the queue.
+  firstProducer.join();
+  secondProducer.join();
+
+  currentThreadWaitFor(queue);
+
+  // Both tasks must have signalled their completion.
+  firstTaskFinishedFuture.get();
+  secondTaskFinishedFuture.get();
+}
+
+// Calls the runner function `name`, logs to stderr whether it reported errors
+// and hands a failure message to failed() when the returned error count is non-zero.
+#define TESTER(name) do { \
+    int result = name (); \
+    fprintf(stderr, #name " %s\n", result?"Errors":"No Errors"); \
+    if (result) { failed(#name " failed\n"); } \
+} while (false)
+
+// Executes the runner of every test case; each runner covers the blocking
+// and the non-blocking queue. passed() is reached after all runners were invoked.
+// NOTE(review): failed() presumably terminates the process, so passed() is only
+// reached when no runner reported errors - confirm in test_common.
+int main()
+{
+    TESTER(queueIsInitiallyEmptyRunner);
+    TESTER(queueCallbackIsWorkingRunner);
+    TESTER(queueWaitShouldWorkRunner);
+    TESTER(queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinishedRunner);
+    TESTER(queueShouldNotExecuteTasksInParallelRunner);
+    passed();
+}
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "../test_common.h"
+
+#define N 16
+#define offset 3
+// Every thread whose global index is below N fetches the texel located `offset`
+// elements behind its own index and stores it at its own index in `val`.
+__global__ void tex1dKernel(float *val, hipTextureObject_t obj) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) {
+        return;  // surplus thread, nothing to write
+    }
+    val[idx] = tex1Dfetch<float>(obj, idx + offset);
+}
+
+int runTest(hipTextureAddressMode, hipTextureFilterMode);
+
+// Runs the texture fetch test for every address mode / filter mode combination.
+// runTest() returns 1 on success and 0 on failure. The results are AND-ed so that
+// a failure of any combination fails the whole test, instead of only the last
+// call deciding the outcome.
+int main(int argc, char **argv) {
+    int testResult = 1;
+    testResult &= runTest(hipAddressModeClamp, hipFilterModePoint);
+    testResult &= runTest(hipAddressModeClamp, hipFilterModeLinear);
+    testResult &= runTest(hipAddressModeWrap, hipFilterModePoint);
+    testResult &= runTest(hipAddressModeWrap, hipFilterModeLinear);
+    if (testResult) {
+        passed();
+    } else {
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Fetches a linear float buffer of N elements through a texture object that is
+// configured with the given address and filter mode, and verifies the result.
+//
+// The kernel reads element (k + offset) for every k in [0, N). The test expects
+//   - output[k] == val[k + offset] while k + offset is inside the buffer,
+//   - output[k] == 0 for the last `offset` elements, whose fetch lies past the end.
+//
+// Returns 1 when the output matches these expectations and 0 otherwise.
+// Any failing HIP call is reported through HIPCHECK.
+int runTest(hipTextureAddressMode addressMode, hipTextureFilterMode filterMode) {
+
+    int testResult = 1;
+
+    // Every call creates its own context on device 0; it is destroyed again
+    // before returning so that repeated calls do not pile up contexts.
+    hipCtx_t HipContext;
+    hipDevice_t HipDevice;
+    int deviceID = 0;
+    HIPCHECK(hipDeviceGet(&HipDevice, deviceID));
+    HIPCHECK(hipCtxCreate(&HipContext, 0, HipDevice));
+
+    // Host side input (1, 2, ..., N) and zero-initialised output
+    float *texBuf, *texBufOut;
+    float val[N], output[N];
+
+    for (int i = 0; i < N; i++) {
+        val[i] = i+1;
+        output[i] = 0.0;
+    }
+
+    // Allocating the required buffers on the gpu device
+    HIPCHECK(hipMalloc(&texBuf, N * sizeof(float)));
+    HIPCHECK(hipMalloc(&texBufOut, N * sizeof(float)));
+    HIPCHECK(hipMemcpy(texBuf, val, N * sizeof(float), hipMemcpyHostToDevice));
+    HIPCHECK(hipMemset(texBufOut, 0, N * sizeof(float)));
+
+    // Describe the input buffer as a linear resource with one 32 bit float channel
+    hipResourceDesc resDescLinear;
+    memset(&resDescLinear, 0, sizeof(resDescLinear));
+    resDescLinear.resType = hipResourceTypeLinear;
+    resDescLinear.res.linear.devPtr = texBuf;
+    resDescLinear.res.linear.desc = hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
+    resDescLinear.res.linear.sizeInBytes = N * sizeof(float);
+
+    // Sampling configuration under test
+    hipTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.readMode = hipReadModeElementType;
+    texDesc.addressMode[0] = addressMode;
+    texDesc.addressMode[1] = addressMode;
+    texDesc.filterMode = filterMode;
+    texDesc.normalizedCoords = false;
+
+    // Creating texture object
+    hipTextureObject_t texObj = 0;
+    HIPCHECK(hipCreateTextureObject(&texObj, &resDescLinear, &texDesc, NULL));
+
+    // One thread per block, N blocks: exactly one thread per output element
+    dim3 dimBlock(1, 1, 1);
+    dim3 dimGrid(N , 1, 1);
+
+    hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
+                       texBufOut, texObj);
+    HIPCHECK(hipDeviceSynchronize());
+
+    HIPCHECK(hipMemcpy(output, texBufOut, N * sizeof(float), hipMemcpyDeviceToHost));
+
+    // In-range fetches must return the input shifted by `offset`
+    for (int i = offset; i < N; i++) {
+        if (output[i-offset] != val[i]) {
+            testResult = 0;
+            break;
+        }
+    }
+    // Fetches past the end of the buffer are expected to yield 0
+    if (testResult) {
+        for (int i = N-offset; i < N; i++) {
+            if (output[i] != 0) {
+                testResult = 0;
+                break;
+            }
+        }
+    }
+
+    // Release everything that was created above, the context last
+    HIPCHECK(hipDestroyTextureObject(texObj));
+    HIPCHECK(hipFree(texBuf));
+    HIPCHECK(hipFree(texBufOut));
+    HIPCHECK(hipCtxDestroy(HipContext));
+    return testResult;
+}