Merge branch 'master' into amd-master-next

Conflicts: CMakeLists.txt tests/src/texture/simpleTexture2DLayered.cpp tests/src/texture/simpleTexture3D.cpp Change-Id: I4aa4754d391b5f37ddf15fa0bcfc84d9da020119
2020-03-06 14:10:04 -05:00
@@ -377,6 +377,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
    target_link_libraries(hip_hcc_static PRIVATE hc_am)

    add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp)
+    target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY)
    set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" )
    set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" )

@@ -177,7 +177,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
            cd ${build_dir_rel}
            make install -j\$(nproc)
            make build_tests -i -j\$(nproc)
-            ctest -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)"
+            ctest --output-on-failure -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)"
          """
        // If unit tests output a junit or xunit file in the future, jenkins can parse that file
        // to display test results on the dashboard
@@ -295,13 +295,13 @@ def docker_upload_dockerhub( String local_org, String image_name, String remote_
 String build_config = 'Release'
 String job_name = env.JOB_NAME.toLowerCase( )

-// The following launches 3 builds in parallel: rocm-head, rocm-3.0.x and cuda-10.x
-parallel rocm_3_0:
+// The following launches 3 builds in parallel: rocm-head, rocm-3.1.x and cuda-10.x
+parallel rocm_3_1:
 {
  node('hip-rocm')
  {
-    String hcc_ver = 'rocm-3.0.x'
-    String from_image = 'ci_test_nodes/rocm-3.0.x/ubuntu-16.04:latest'
+    String hcc_ver = 'rocm-3.1.x'
+    String from_image = 'ci_test_nodes/rocm-3.1.x/ubuntu-16.04:latest'
    String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'

    // Checkout source code, dependencies and version files
@@ -726,6 +726,12 @@ if ($HIP_PLATFORM eq "clang") {
        $HIPCXXFLAGS .= " -O3";
        $HIPLDFLAGS .= " -O3";
    }
+    if ($optArg ne "-O0") {
+       $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
+       if ($needLDFLAGS and not $needCXXFLAGS) {
+           $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
+        }
+    }
    $HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH";
    $HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS";
    if (not $isWindows) {
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w

 $HIP_BASE_VERSION_MAJOR = "3";
-$HIP_BASE_VERSION_MINOR = "1";
+$HIP_BASE_VERSION_MINOR = "2";

 # Need perl > 5.10 to use logic-defined or
 use 5.006; use v5.10.1;
@@ -1172,6 +1172,7 @@ sub simpleSubstitutions {
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_ASSERT\b/hipErrorAssert/g;
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_CURRENT\b/hipErrorContextAlreadyCurrent/g;
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_IN_USE\b/hipErrorContextAlreadyInUse/g;
+    $ft{'numeric_literal'} += s/\bCUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE\b/hipErrorCooperativeLaunchTooLarge/g;
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_DEINITIALIZED\b/hipErrorDeinitialized/g;
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_ECC_UNCORRECTABLE\b/hipErrorECCNotCorrectable/g;
    $ft{'numeric_literal'} += s/\bCUDA_ERROR_FILE_NOT_FOUND\b/hipErrorFileNotFound/g;
@@ -1528,6 +1529,7 @@ sub simpleSubstitutions {
    $ft{'numeric_literal'} += s/\bCU_MEMORYTYPE_UNIFIED\b/hipMemoryTypeUnified/g;
    $ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_GLOBAL\b/hipMemAttachGlobal/g;
    $ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_HOST\b/hipMemAttachHost/g;
+    $ft{'numeric_literal'} += s/\bCU_OCCUPANCY_DEFAULT\b/hipOccupancyDefault/g;
    $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_ARRAY\b/hipResourceTypeArray/g;
    $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_LINEAR\b/hipResourceTypeLinear/g;
    $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_MIPMAPPED_ARRAY\b/hipResourceTypeMipmappedArray/g;
@@ -1636,6 +1638,7 @@ sub simpleSubstitutions {
    $ft{'numeric_literal'} += s/\bcudaErrorAlreadyMapped\b/hipErrorAlreadyMapped/g;
    $ft{'numeric_literal'} += s/\bcudaErrorArrayIsMapped\b/hipErrorArrayIsMapped/g;
    $ft{'numeric_literal'} += s/\bcudaErrorAssert\b/hipErrorAssert/g;
+    $ft{'numeric_literal'} += s/\bcudaErrorCooperativeLaunchTooLarge\b/hipErrorCooperativeLaunchTooLarge/g;
    $ft{'numeric_literal'} += s/\bcudaErrorCudartUnloading\b/hipErrorDeinitialized/g;
    $ft{'numeric_literal'} += s/\bcudaErrorDeviceAlreadyInUse\b/hipErrorContextAlreadyInUse/g;
    $ft{'numeric_literal'} += s/\bcudaErrorDeviceUninitialized\b/hipErrorInvalidContext/g;
@@ -1749,6 +1752,8 @@ sub simpleSubstitutions {
    $ft{'define'} += s/\bCUDA_ARRAY3D_LAYERED\b/hipArrayLayered/g;
    $ft{'define'} += s/\bCUDA_ARRAY3D_SURFACE_LDST\b/hipArraySurfaceLoadStore/g;
    $ft{'define'} += s/\bCUDA_ARRAY3D_TEXTURE_GATHER\b/hipArrayTextureGather/g;
+    $ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPostSync/g;
+    $ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPreSync/g;
    $ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_POINTER\b/HIP_LAUNCH_PARAM_BUFFER_POINTER/g;
    $ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_SIZE\b/HIP_LAUNCH_PARAM_BUFFER_SIZE/g;
    $ft{'define'} += s/\bCU_LAUNCH_PARAM_END\b/HIP_LAUNCH_PARAM_END/g;
@@ -1769,6 +1774,8 @@ sub simpleSubstitutions {
    $ft{'define'} += s/\bcudaArrayLayered\b/hipArrayLayered/g;
    $ft{'define'} += s/\bcudaArraySurfaceLoadStore\b/hipArraySurfaceLoadStore/g;
    $ft{'define'} += s/\bcudaArrayTextureGather\b/hipArrayTextureGather/g;
+    $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPostSync\b/hipCooperativeLaunchMultiDeviceNoPostSync/g;
+    $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPreSync\b/hipCooperativeLaunchMultiDeviceNoPreSync/g;
    $ft{'define'} += s/\bcudaDeviceBlockingSync\b/hipDeviceScheduleBlockingSync/g;
    $ft{'define'} += s/\bcudaDeviceLmemResizeToMax\b/hipDeviceLmemResizeToMax/g;
    $ft{'define'} += s/\bcudaDeviceMapHost\b/hipDeviceMapHost/g;
@@ -1792,6 +1799,7 @@ sub simpleSubstitutions {
    $ft{'define'} += s/\bcudaIpcMemLazyEnablePeerAccess\b/hipIpcMemLazyEnablePeerAccess/g;
    $ft{'define'} += s/\bcudaMemAttachGlobal\b/hipMemAttachGlobal/g;
    $ft{'define'} += s/\bcudaMemAttachHost\b/hipMemAttachHost/g;
+    $ft{'define'} += s/\bcudaOccupancyDefault\b/hipOccupancyDefault/g;
    $ft{'define'} += s/\bcudaStreamDefault\b/hipStreamDefault/g;
    $ft{'define'} += s/\bcudaStreamNonBlocking\b/hipStreamNonBlocking/g;
    $ft{'define'} += s/\bcudaTextureType1D\b/hipTextureType1D/g;
@@ -447,6 +447,7 @@
 |          717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`*                                |                                                            |
 |          718 |*`CUDA_ERROR_INVALID_PC`*                                           |                                                            |
 |          719 |*`CUDA_ERROR_LAUNCH_FAILED`*                                        |*`hipErrorLaunchFailure`*                                   |
+|          720 |*`CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`*                         |*`hipErrorCooperativeLaunchTooLarge`*                       |
 |          800 |*`CUDA_ERROR_NOT_PERMITTED`*                                        |                                                            |
 |          801 |*`CUDA_ERROR_NOT_SUPPORTED`*                                        |*`hipErrorNotSupported`*                                    |
 |          802 |*`CUDA_ERROR_SYSTEM_NOT_READY`*                                     |                                                            | 10.0             |
@@ -1125,10 +1126,10 @@

 |   **CUDA**                                                |   **HIP**                                               |**CUDA version\***|
 |-----------------------------------------------------------|---------------------------------------------------------|------------------|
-| `cuOccupancyMaxActiveBlocksPerMultiprocessor`             |`hipOccupancyMaxActiveBlocksPerMultiprocessor`           |
-| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`    |`hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |
-| `cuOccupancyMaxPotentialBlockSize`                        |`hipOccupancyMaxPotentialBlockSize`                      |
-| `cuOccupancyMaxPotentialBlockSizeWithFlags`               |                                                         |
+| `cuOccupancyMaxActiveBlocksPerMultiprocessor`             |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessor`           |
+| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`    |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |
+| `cuOccupancyMaxPotentialBlockSize`                        |`hipOccupancyMaxPotentialBlockSize`                         |
+| `cuOccupancyMaxPotentialBlockSizeWithFlags`               |                                                            |

 ## **22. Texture Reference Management [DEPRECATED]**

@@ -800,7 +800,7 @@
 |          717 |*`cudaErrorInvalidAddressSpace`*                     |                  |                                                            |
 |          718 |*`cudaErrorInvalidPc`*                               |                  |                                                            |
 |          719 |*`cudaErrorLaunchFailure`*                           |                  |*`hipErrorLaunchFailure`*                                   |
-|          720 |*`cudaErrorCooperativeLaunchTooLarge`*               | 9.0              |                                                            |
+|          720 |*`cudaErrorCooperativeLaunchTooLarge`*               | 9.0              |*`hipErrorCooperativeLaunchTooLarge`*                       |
 |          800 |*`cudaErrorNotPermitted`*                            |                  |                                                            |
 |          801 |*`cudaErrorNotSupported`*                            |                  |*`hipErrorNotSupported`*                                    |
 |          802 |*`cudaErrorSystemNotReady`*                          | 10.0             |                                                            |
@@ -1045,7 +1045,7 @@
 | define       |`cudaMemAttachGlobal`                                |                  |`hipMemAttachGlobal`                                        |
 | define       |`cudaMemAttachHost`                                  |                  |`hipMemAttachHost`                                          |
 | define       |`cudaMemAttachSingle`                                |                  |                                                            |
-| define       |`cudaOccupancyDefault`                               |                  |                                                            |
+| define       |`cudaOccupancyDefault`                               |                  |`hipOccupancyDefault`                                    |
 | define       |`cudaOccupancyDisableCachingOverride`                |                  |                                                            |
 | define       |`cudaPeerAccessDefault`                              |                  |                                                            |
 | define       |`cudaStreamDefault`                                  |                  |`hipStreamDefault`                                          |
@@ -0,0 +1,72 @@
+# Profiling HIP Code
+
+This section describes the tracing and debugging capabilities that HIP provides.  
+<!-- toc -->
+
+- [Tracing and Debug](#tracing-and-debug)
+  * [Tracing HIP APIs](#tracing-hip-apis)
+    + [Color](#color)
+
+<!-- tocstop -->
+
+## Tracing and Debug
+
+### Tracing HIP APIs
+The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API environment variable.
+The trace prints two messages for each API - one at the beginning of the API call (line starts with "<<") and one at the end of the API call (line ends with ">>").
+Here's an example for one API followed by a description for the sections of the trace:
+
+```
+<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
+  hip-api tid:1.6 hipMemcpy                      ret= 0 (hipSuccess)>>
+```
+
+- `<<hip-api` is the header used for all HIP API debug messages.  The message is also shown in a specific color.  This can be used to distinguish this API from other HIP or application messages.
+-  `tid:1.6` indicates that this API call came from thread #1 and is the 6th API call in that thread.   When the first API in a new thread is called, HIP will associates a short sequential ID with that thread.  You can see the full thread ID (reported by C++) as 0x7f6183b097c0 in the example below.  
+- `hipMemcpy` is the name of the API.
+- The first line then prints a comma-separated list of the arguments to the function.  APIs which return values to the caller by writing to pointers will show the pointer addresses rather than the pointer contents.  This behavior may change in the future.
+- The second line shows the completion of the API, including the numeric return value (`ret= 0`) as well as an string representation for the error code (`hipSuccess`).  If the returned error code is non-zero, then the csecond line message is shown in red (unless HIP_TRACE_API_COLOR is "none" - see below).
+
+
+Heres a specific example showing the output of the [square](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square) program running on HIP:
+
+```
+$ HIP_TRACE_API=1  ./square.hip.out 
+  hip-api tid:1:HIP initialized short_tid#1 (maps to full_tid: 0x7f6183b097c0)
+<<hip-api tid:1.1 hipGetDeviceProperties (0x7ffddb673e08, 0)
+  hip-api tid:1.1 hipGetDeviceProperties         ret= 0 (hipSuccess)>>
+info: running on device gfx803
+info: allocate host mem (  7.63 MB)
+info: allocate device mem (  7.63 MB)
+<<hip-api tid:1.2 hipMalloc (0x7ffddb673fb8, 4000000)
+  hip-api tid:1.2 hipMalloc                      ret= 0 (hipSuccess)>>
+<<hip-api tid:1.3 hipMalloc (0x7ffddb673fb0, 4000000)
+  hip-api tid:1.3 hipMalloc                      ret= 0 (hipSuccess)>>
+info: copy Host2Device
+<<hip-api tid:1.4 hipMemcpy (0x50409d000, 0x7f32158ac010, 4000000, hipMemcpyHostToDevice)
+  hip-api tid:1.4 hipMemcpy                      ret= 0 (hipSuccess)>>
+info: launch 'vector_square' kernel
+1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0
+info: copy Device2Host
+<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
+  hip-api tid:1.6 hipMemcpy                      ret= 0 (hipSuccess)>>
+info: check result
+PASSED!
+```
+
+HIP_TRACE_API supports multiple levels of debug information:
+   - 0x1 = print all HIP APIs.  This is the most verbose setting; the flags below allow selecting a subset.
+   - 0x2 = print HIP APIs which initiate GPU kernel commands.  Includes hipLaunchKernel, hipLaunchModuleKernel
+   - 0x4 = print HIP APIs which initiate GPU memory commands.  Includes hipMemcpy*, hipMemset*.
+   - 0x8 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+
+These can be combined.  For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU.
+
+
+#### Color
+Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
+You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable.  Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White.
+None will disable use of color control codes for both the opening and closing and may be useful when saving the trace file or when a pure text trace is desired.
+
+
+
@@ -447,7 +447,8 @@ if len(sys.argv) > 3: OUTPUT = sys.argv[3]

 # API declaration map
 api_map = {
-  'hipHccModuleLaunchKernel': ''
+  'hipHccModuleLaunchKernel': '',
+  'hipExtModuleLaunchKernel': ''
 }
 # API options map
 opts_map = {}
@@ -42,10 +42,10 @@ After applying all the matchers, the output HIP source is produced.

 `hipify-clang` requires:

-1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
+1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).

 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
-To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).
+To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3).

 | **LLVM release version**                                   | **CUDA latest supported version**                                        | **Windows** | **Linux** |
 |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -67,7 +67,7 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download
 | [8.0.1](http://releases.llvm.org/download.html#8.0.1)      | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive)          | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
 | [9.0.0](http://releases.llvm.org/download.html#9.0.0)      | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base)     | +           | +         |
 | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1)  | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
-| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |
+| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |

 `*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.

@@ -158,7 +158,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro

 **LLVM 10.0.0 or newer:**

-1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources;
+1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources;
 2. build [`LLVM project`](http://llvm.org/docs/CMake.html):

 **Linux**:
@@ -168,6 +168,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
         -DLLVM_SOURCE_DIR=../llvm-project \
         -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON
         -DCMAKE_BUILD_TYPE=Release \
         ../llvm-project/llvm
        make -j install
@@ -181,6 +182,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro
         -DLLVM_SOURCE_DIR=../llvm-project \
         -DLLVM_TARGETS_TO_BUILD="NVPTX" \
         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON
         -DCMAKE_BUILD_TYPE=Release \
         -Thost=x64 \
         ../llvm-project/llvm
@@ -247,7 +249,7 @@ On Linux the following configurations are tested:

 Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32

-Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32
+Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32

 Minimum build system requirements for the above configurations:

@@ -393,19 +395,19 @@ Testing Time: 3.07s
 ```
 ### <a name="windows"></a > hipify-clang: Windows

-On Windows 10 the following configurations are tested:
+*Tested configurations:*

-LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18
+|    **LLVM**    | **CUDA** |      **cuDNN**      |     **Visual Studio**     | **cmake** | **Python** |
+|:--------------:|---------:|--------------------:|--------------------------:|----------:|-----------:|
+| 5.0.0 - 5.0.2  | 8.0      | 5.1.10 - 7.1.4.18   | 2017.15.5.2               | 3.5.1     | 3.6.4      |
+| 6.0.0 - 6.0.1  | 9.0      | 7.0.5.15 - 7.6.5.32 | 2017.15.5.5               | 3.6.0     | 3.7.2      |
+| 7.0.0 - 7.1.0  | 9.2      | 7.6.5.32            | 2017.15.9.11              | 3.13.3    | 3.7.3      |
+| 8.0.0 - 8.0.1  | 10.0     | 7.6.5.32            | 2017.15.9.15              | 3.14.2    | 3.7.4      |
+| 9.0.0 - 9.0.1  | 10.1     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.4    | 3.8.0      |
+| 10.0.0-rc1-rc3 | 10.2     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.4    | 3.8.1      |
+| 11.0.0git      | 10.2     | 7.6.5.32            | 2017.15.9.20, 2019.16.4.5 | 3.16.5    | 3.8.2      |

-LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32
-
-LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32
-
-Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2:
-
-Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).
-
-Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:
+*Building with testing support on `Windows 10` by `Visual Studio 16 2019`:*

 ```shell
 cmake
@@ -429,7 +431,7 @@ cmake
 --    - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm
 --    - Include path     : F:/LLVM/9.0.1/dist/include
 --    - Binary path      : F:/LLVM/9.0.1/dist/bin
-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.1", minimum required is "3.6")
+-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6")
 -- Found lit: C:/Program Files/Python38/Scripts/lit.exe
 -- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe
 -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")
@@ -545,9 +545,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP{

  // 5.21. Occupancy
  // cudaOccupancyMaxActiveBlocksPerMultiprocessor
-  {"cuOccupancyMaxActiveBlocksPerMultiprocessor",          {"hipOccupancyMaxActiveBlocksPerMultiprocessor",            "", CONV_OCCUPANCY, API_DRIVER}},
+  {"cuOccupancyMaxActiveBlocksPerMultiprocessor",          {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor",            "", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
-  {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",   "", CONV_OCCUPANCY, API_DRIVER}},
+  {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags",   "", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxPotentialBlockSize
  {"cuOccupancyMaxPotentialBlockSize",                     {"hipOccupancyMaxPotentialBlockSize",                       "", CONV_OCCUPANCY, API_DRIVER}},
  // cudaOccupancyMaxPotentialBlockSizeWithFlags
@@ -1047,7 +1047,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
  {"CUoccupancy_flags_enum",                                           {"hipOccupancyFlags",                                        "", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
  // CUoccupancy_flags enum values
  // cudaOccupancyDefault
-  {"CU_OCCUPANCY_DEFAULT",                                             {"hipOccupancyDefault",                                      "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x00
+  {"CU_OCCUPANCY_DEFAULT",                                             {"hipOccupancyDefault",                                      "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 0x00
  // cudaOccupancyDisableCachingOverride
  {"CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE",                            {"hipOccupancyDisableCachingOverride",                       "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01

@@ -1284,6 +1284,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
  {"CUDA_ERROR_INVALID_PC",                                            {"hipErrorInvalidPc",                                        "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 718
  // cudaErrorLaunchFailure
  {"CUDA_ERROR_LAUNCH_FAILED",                                         {"hipErrorLaunchFailure",                                    "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 719
+  // cudaErrorCooperativeLaunchTooLarge
+  {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE",                          {"hipErrorCooperativeLaunchTooLarge",                        "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 720
  // cudaErrorNotPermitted
  {"CUDA_ERROR_NOT_PERMITTED",                                         {"hipErrorNotPermitted",                                     "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 800
  // cudaErrorNotSupported
@@ -1598,9 +1600,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_TYPE_NAME_MAP{
  // cudaArrayColorAttachment
  {"CUDA_ARRAY3D_COLOR_ATTACHMENT",                                    {"hipArrayColorAttachment",                                  "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x20
  // cudaCooperativeLaunchMultiDeviceNoPreSync
-  {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC",          {"hipCooperativeLaunchMultiDeviceNoPreSync",                 "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01
+  {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC",          {"hipCooperativeLaunchMultiDeviceNoPreSync",                 "", CONV_DEFINE, API_DRIVER}}, // 0x01
  // cudaCooperativeLaunchMultiDeviceNoPostSync
-  {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC",         {"hipCooperativeLaunchMultiDeviceNoPostSync",                "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x02
+  {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC",         {"hipCooperativeLaunchMultiDeviceNoPostSync",                "", CONV_DEFINE, API_DRIVER}}, // 0x02
  // cudaExternalMemoryDedicated
  {"CUDA_EXTERNAL_MEMORY_DEDICATED",                                   {"hipExternalMemoryDedicated",                               "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x1
  // cudaExternalSemaphoreSignalSkipNvSciBufMemSync
@@ -787,8 +787,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
  {"cudaErrorInvalidPc",                                               {"hipErrorInvalidPc",                                        "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 718
  // CUDA_ERROR_LAUNCH_FAILED
  {"cudaErrorLaunchFailure",                                           {"hipErrorLaunchFailure",                                    "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 719
-  // no analogue
-  {"cudaErrorCooperativeLaunchTooLarge",                               {"hipErrorCooperativeLaunchTooLarge",                        "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 720
+  // CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE
+  {"cudaErrorCooperativeLaunchTooLarge",                               {"hipErrorCooperativeLaunchTooLarge",                        "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 720
  // CUDA_ERROR_NOT_PERMITTED
  {"cudaErrorNotPermitted",                                            {"hipErrorNotPermitted",                                     "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 800
  // CUDA_ERROR_NOT_SUPPORTED
@@ -1327,9 +1327,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
  // CUDA_ARRAY3D_COLOR_ATTACHMENT
  {"cudaArrayColorAttachment",                                         {"hipArrayColorAttachment",                                  "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x20
  // CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC
-  {"cudaCooperativeLaunchMultiDeviceNoPreSync",                        {"hipCooperativeLaunchMultiDeviceNoPreSync",                 "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01
+  {"cudaCooperativeLaunchMultiDeviceNoPreSync",                        {"hipCooperativeLaunchMultiDeviceNoPreSync",                 "", CONV_DEFINE, API_RUNTIME}}, // 0x01
  // CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC
-  {"cudaCooperativeLaunchMultiDeviceNoPostSync",                       {"hipCooperativeLaunchMultiDeviceNoPostSync",                "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x02
+  {"cudaCooperativeLaunchMultiDeviceNoPostSync",                       {"hipCooperativeLaunchMultiDeviceNoPostSync",                "", CONV_DEFINE, API_RUNTIME}}, // 0x02
  // CU_DEVICE_CPU ((CUdevice)-1)
  {"cudaCpuDeviceId",                                                  {"hipCpuDeviceId",                                           "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // ((int)-1)
  // CU_DEVICE_INVALID ((CUdevice)-2)
@@ -1412,7 +1412,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_TYPE_NAME_MAP {
  // no analogue
  {"cudaTextureTypeCubemapLayered",                                    {"hipTextureTypeCubemapLayered",                             "", CONV_DEFINE, API_RUNTIME}}, // 0xFC
  // CU_OCCUPANCY_DEFAULT
-  {"cudaOccupancyDefault",                                             {"hipOccupancyDefault",                                      "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x00
+  {"cudaOccupancyDefault",                                             {"hipOccupancyDefault",                                      "", CONV_DEFINE, API_RUNTIME}}, // 0x00
  // CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
  {"cudaOccupancyDisableCachingOverride",                              {"hipOccupancyDisableCachingOverride",                       "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01
  // CU_STREAM_DEFAULT
@@ -31,9 +31,11 @@ THE SOFTWARE.
 #include <string>
 #include <utility>
 #include <vector>
-
+#include <unordered_set>
 namespace hip_impl {
-
+#if !defined(DISABLE_REDUCED_GPU_BLOB_COPY)
+std::unordered_set<std::string>& get_all_gpuarch();
+#endif
 inline
 std::string transmogrify_triple(const std::string& triple)
 {
@@ -43,7 +45,6 @@ std::string transmogrify_triple(const std::string& triple)
    if (triple.find(old_prefix) == 0) {
        return new_prefix + triple.substr(sizeof(old_prefix) - 1);
    }
-
    return (triple.find(new_prefix) == 0) ? triple : "";
 }

@@ -114,9 +115,7 @@ class Bundled_code_header {
    friend inline bool read(RandomAccessIterator f, RandomAccessIterator l,
                            Bundled_code_header& x) {
        if (f == l) return false;
-
        std::copy_n(f, sizeof(x.header_.cbuf_), x.header_.cbuf_);
-
        if (valid(x)) {
            x.bundles_.resize(x.header_.bundle_cnt_);

@@ -126,11 +125,16 @@ class Bundled_code_header {
                it += sizeof(y.header.cbuf);

                y.triple.assign(it, it + y.header.triple_sz);
-
+                #ifdef DISABLE_REDUCED_GPU_BLOB_COPY
                std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob));
-
+                #else
+                auto& gpuArch = get_all_gpuarch();
+                auto itgpuArch = std::find(gpuArch.begin(),gpuArch.end(),y.triple);
+                if (itgpuArch != gpuArch.end()){
+                    std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob));
+                }
+                #endif
                it += y.header.triple_sz;
-
                x.bundled_code_size = std::max(x.bundled_code_size, 
                                               y.header.offset + y.header.bundle_sz);
            }
@@ -319,6 +319,41 @@ double __shfl(double var, int src_lane, int width = warpSize) {
    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+long __shfl(long var, int src_lane, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
+long long __shfl(long long var, int src_lane, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}

 __device__
 inline
@@ -356,6 +391,39 @@ double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}

 __device__
 inline
@@ -393,6 +461,39 @@ double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}

 __device__
 inline
@@ -430,6 +531,39 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+long __shfl_xor(long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
+long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}

 #define MASK1 0x00ff00ff
 #define MASK2 0xff00ff00
@@ -255,14 +255,14 @@ typedef struct hipMemcpy3DParms {
    hipArray_t srcArray;
    struct hipPos srcPos;
    struct hipPitchedPtr srcPtr;
-
    hipArray_t dstArray;
    struct hipPos dstPos;
    struct hipPitchedPtr dstPtr;
-
    struct hipExtent extent;
    enum hipMemcpyKind kind;
+} hipMemcpy3DParms;

+typedef struct HIP_MEMCPY3D {
    size_t Depth;
    size_t Height;
    size_t WidthInBytes;
@@ -283,10 +283,7 @@ typedef struct hipMemcpy3DParms {
    size_t srcLOD;
    hipMemoryType srcMemoryType;
    size_t srcPitch;
-    size_t srcXInBytes;
-    size_t srcY;
-    size_t srcZ;
-}hipMemcpy3DParms;
+} HIP_MEMCPY3D;

 static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
                                                          size_t ysz) {
@@ -212,6 +212,11 @@ enum hipLimit_t {
 #define hipArrayCubemap 0x04
 #define hipArrayTextureGather 0x08

+#define hipOccupancyDefault 0x00
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
+#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
+
 /*
 * @brief hipJitOption
 * @enum
@@ -2903,7 +2908,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne
 * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case th
 * default stream is used with associated synchronization rules.
 *
- * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
 */
 hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
                                      void** kernelParams, unsigned int sharedMemBytes,
@@ -2917,7 +2922,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim
 * @param [in] numDevices               Size of the launchParamsList array.
 * @param [in] flags                    Flags to control launch behavior.
 *
- * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
 */
 hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
                                                 int  numDevices, unsigned int  flags);
@@ -2954,13 +2959,36 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
 * @brief Returns occupancy for a device function.
 *
 * @param [out] numBlocks        Returned occupancy
- * @param [in]  func             Kernel function for which occupancy is calulated
+ * @param [in]  func             Kernel function (hipFunction) for which occupancy is calulated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ */
+hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor(
+   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function for which occupancy is calulated
 * @param [in]  blockSize        Block size the kernel is intended to be launched with
 * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
 * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
 */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault));
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks        Returned occupancy
+ * @param [in]  f                Kernel function(hipFunction_t) for which occupancy is calulated
+ * @param [in]  blockSize        Block size the kernel is intended to be launched with
+ * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
+ */
+hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);

 /**
 * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
@@ -19,10 +19,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
-#pragma once
+#ifndef HIPRTC_H
+#define HIPRTC_H

-#include <cstddef>
-#include <string>
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>

 enum hiprtcResult {
    HIPRTC_SUCCESS = 0,
@@ -41,29 +45,22 @@ enum hiprtcResult {

 const char* hiprtcGetErrorString(hiprtcResult result);

-inline
-hiprtcResult hiprtcVersion(int* major, int* minor) noexcept
-{   // TODO: NVRTC versioning is somewhat unclear.
-    if (!major || !minor) return HIPRTC_ERROR_INVALID_INPUT;

-    // TODO: this should be generic / set by the build infrastructure.
-    *major = 9;
-    *minor = 0;
+hiprtcResult hiprtcVersion(int* major, int* minor);

-    return HIPRTC_SUCCESS;
-}
-
-struct _hiprtcProgram;
-using hiprtcProgram = _hiprtcProgram*;
+typedef struct _hiprtcProgram* hiprtcProgram;

 hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog,
                                     const char* name_expression);

-hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions,
+hiprtcResult hiprtcCompileProgram(hiprtcProgram prog,
+                                  int numOptions,
                                  const char** options);

-hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src,
-                                 const char* name, int numHeaders,
+hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog,
+                                 const char* src,
+                                 const char* name,
+                                 int numHeaders,
                                 const char** headers,
                                 const char** includeNames);

@@ -76,37 +73,14 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog,
 hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);

 hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog,
-                                     std::size_t* logSizeRet);
+                                     size_t* logSizeRet);

 hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);

-hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, std::size_t* codeSizeRet);
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);

-namespace hip_impl
-{
-    char* demangle(const char* mangled_expression);
+#ifdef __cplusplus
 }
+#endif /* __cplusplus */

-#if defined(HIPRTC_GET_TYPE_NAME)
-    #include <typeinfo>
-
-    #if defined(_WIN32)
-        #include <dbghelp.h>
-
-        template<typename>
-        hiprtcResult hiprtcGetTypeName(std::string*) = delete;
-    #else
-        template<typename T>
-        inline
-        hiprtcResult hiprtcGetTypeName(std::string* result)
-        {
-            if (!result) return HIPRTC_ERROR_INVALID_INPUT;
-
-            char * res= hip_impl::demangle(typeid(T).name());
-            result->assign(res == nullptr ? "" : res);
-            std::free(res);
-            return (result->empty()) ? HIPRTC_ERROR_INTERNAL_ERROR :
-                                       HIPRTC_SUCCESS;
-        }
-    #endif
-#endif
+#endif //HIPRTC_H
@@ -1,16 +1,13 @@
 /*
 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
-
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -23,23 +20,8 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H
 #define HIP_INCLUDE_HIP_HIP_PROFILE_H

-#warning "HIP Profiling through markers is deprecated, please check roctrace/rocTX support."
-
-#if not defined(ENABLE_HIP_PROFILE)
-#define ENABLE_HIP_PROFILE 1
-#endif
-
-#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE == 1)
-#warning "HIP Markers are deprecated and would be removed soon."
-#include <CXLActivityLogger.h>
-#define HIP_SCOPED_MARKER(markerName, group)                                                       \
-    amdtScopedMarker __scopedMarker(markerName, group, nullptr);
-#define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr);
-#define HIP_END_MARKER() amdtEndMarker();
-#else
 #define HIP_SCOPED_MARKER(markerName, group)
 #define HIP_BEGIN_MARKER(markerName, group)
 #define HIP_END_MARKER()
-#endif

 #endif
@@ -255,6 +255,10 @@ typedef enum __HIP_NODISCARD hipError_t {
        713,  ///< Produced when trying to unlock a non-page-locked memory.
    hipErrorLaunchFailure =
        719,  ///< An exception occurred on the device while executing a kernel.
+    hipErrorCooperativeLaunchTooLarge =
+        720,  ///< This error indicates that the number of blocks launched per grid for a kernel
+              ///< that was launched via cooperative launch APIs exceeds the maximum number of
+              ///< allowed blocks for the current device
    hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
    hipErrorUnknown = 999,  //< Unknown error.
    // HSA Runtime Error Codes start here.
@@ -140,6 +140,14 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind;
 #define hipLimitMallocHeapSize cudaLimitMallocHeapSize
 #define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess

+#define hipOccupancyDefault cudaOccupancyDefault
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync    \
+        cudaCooperativeLaunchMultiDeviceNoPreSync
+#define hipCooperativeLaunchMultiDeviceNoPostSync   \
+        cudaCooperativeLaunchMultiDeviceNoPostSync
+
+
 // enum CUjit_option redefines
 #define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
 #define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
@@ -267,6 +275,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
            return hipErrorNotInitialized;
        case cudaErrorLaunchFailure:
            return hipErrorLaunchFailure;
+        case cudaErrorCooperativeLaunchTooLarge:
+            return hipErrorCooperativeLaunchTooLarge;
        case cudaErrorPriorLaunchFailure:
            return hipErrorPriorLaunchFailure;
        case cudaErrorLaunchOutOfResources:
@@ -445,6 +455,8 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
            return hipErrorHostMemoryNotRegistered;
        case CUDA_ERROR_LAUNCH_FAILED:
            return hipErrorLaunchFailure;
+        case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
+            return hipErrorCooperativeLaunchTooLarge;
        case CUDA_ERROR_NOT_SUPPORTED:
            return hipErrorNotSupported;
        case CUDA_ERROR_UNKNOWN:
@@ -601,6 +613,8 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
            return cudaErrorSetOnActiveProcess;
        case hipErrorLaunchFailure:
            return cudaErrorLaunchFailure;
+        case hipErrorCooperativeLaunchTooLarge:
+            return cudaErrorCooperativeLaunchTooLarge;
        case hipErrorNotSupported:
            return cudaErrorNotSupported;
        // HSA: does not exist in CUDA
@@ -26,7 +26,7 @@ target_include_directories(ca PUBLIC ${PROJECT_SOURCE_DIR}/src)
 find_library(
    hsart NAMES libhsa-runtime64.so libhsa-runtime64.so.1 HINTS ${HSA_PATH}/lib)
 target_link_libraries(ca PUBLIC ${hsart})
-target_compile_options(ca PUBLIC -Wall)
+target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)

 install(TARGETS ca RUNTIME DESTINATION bin)
-#-------------------------------------CA---------------------------------------#
+#-------------------------------------CA---------------------------------------#
@@ -36,11 +36,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-if(@COMPILE_HIP_ATP_MARKER@)
-    set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), rocm-profiler, comgr (>= 1.1)")
-else()
-    set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
-endif()
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
 set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc")
@@ -50,11 +46,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-if(@COMPILE_HIP_ATP_MARKER@)
-    set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, comgr >= 1.1")
-else()
-    set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
-endif()
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
@@ -4,16 +4,17 @@ ifeq (,$(HIP_PATH))
 endif
 HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11

-EXE=hipDispatchLatency
-
 CXXFLAGS = -O3

-all: test_kernel.code ${EXE}
+all: test_kernel.code hipDispatchLatency.out hipDispatchEnqueueRateMT.out

-$(EXE): hipDispatchLatency.cpp
+hipDispatchLatency.out: hipDispatchLatency.cpp
 	$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@

+hipDispatchEnqueueRateMT.out: hipDispatchEnqueueRateMT.cpp
+	$(HIPCC) $(CXXFLAGS) hipDispatchEnqueueRateMT.cpp -o $@
+
 test_kernel.code: test_kernel.cpp
 	$(HIP_PATH)/bin/hipcc --genco  $(GENCO_FLAGS) $^ -o $@
 clean:
-	rm -f *.o $(EXE)
+	rm -f *.o *.out
@@ -0,0 +1,167 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include "hip/hip_runtime.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/hip_ext.h"
+#endif
+#include <iostream>
+#include <chrono>
+#include <algorithm>
+#include <atomic>
+#include <thread>
+#include <future>
+#include <functional>
+
+#define NUM_GROUPS 1
+#define GROUP_SIZE 1
+#define WARMUP_RUN_COUNT 10
+#define TIMING_RUN_COUNT 100
+#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
+
+__global__ void EmptyKernel() {}
+
+// Helper to print various timing metrics
+void print_timing(std::string test, std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1)
+{
+
+    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
+
+    // remove top outliers due to nature of variability across large number of multi-threaded runs
+    std::sort(results.begin(), results.end(), std::greater<float>());
+    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
+    auto end_iter = results.end();
+
+    // mean
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        total_us += (run_ms * 1000) / batch;
+    });
+    mean_us = total_us  / TIMING_RUN_COUNT;
+
+   // stddev
+    total_us = 0;
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        float dev_us = ((run_ms * 1000) / batch) - mean_us;
+        total_us += dev_us * dev_us;
+    });
+    stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
+
+    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
+}
+
+// Measure time taken to enqueue a kernel on the GPU using hipModuleLaunchKernel
+void hipModuleLaunchKernel_enqueue_rate(std::atomic_int* shared, int max_threads)
+{
+    //resources necessary for this thread
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    hipModule_t module;
+    hipFunction_t function;
+    hipModuleLoad(&module, "test_kernel.code");
+    hipModuleGetFunction(&function, module, "test");
+    void* kernel_params = nullptr;
+    std::array<float, TOTAL_RUN_COUNT> results;
+
+    //synchronize all threads, before running
+    int tid = shared->fetch_add(1, std::memory_order_release);
+    while (max_threads != shared->load(std::memory_order_acquire)) {}
+
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, stream, &kernel_params, nullptr);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
+    }
+    print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipModuleLaunchKernel enqueue rate", results);
+}
+
+// Measure time taken to enqueue a kernel on the GPU using hipLaunchKernelGGL
+void hipLaunchKernelGGL_enqueue_rate(std::atomic_int* shared, int max_threads)
+{
+    //resources necessary for this thread
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    std::array<float, TOTAL_RUN_COUNT> results;
+
+    //synchronize all threads, before running
+    int tid = shared->fetch_add(1, std::memory_order_release);
+    while (max_threads != shared->load(std::memory_order_acquire)) {}
+
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
+    }
+    print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipLaunchKernelGGL enqueue rate", results);
+}
+
+// Simple thread pool
+struct thread_pool {
+    thread_pool(int total_threads) : max_threads(total_threads) {}
+    void start(std::function<void(std::atomic_int*, int)> f) {
+        for (int i = 0; i < max_threads; ++i) {
+            threads.push_back(std::async(std::launch::async, f, &shared, max_threads));
+        }
+    }
+    void finish() {
+        for (auto&&thread : threads) {
+            thread.get();
+        }
+        threads.clear();
+        shared = {0};
+    }
+    ~thread_pool() {
+        finish();
+    }
+private:
+    std::atomic_int shared {0};
+    std::vector<std::future<void>> threads;
+    int max_threads = 1;
+};
+
+
+int main(int argc, char* argv[])
+{
+    if (argc != 3) {
+        std::cerr << "Run test as 'hipDispatchEnqueueRateMT <num_threads> <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n";
+        return -1;
+    }
+
+    int max_threads = atoi(argv[1]);
+    int run_module_test = atoi(argv[2]);
+    if(max_threads < 1 || run_module_test < 0 || run_module_test > 1) {
+        std::cerr << "Invalid Input.\n";
+        std::cerr << "Run test as 'hipDispatchEnqueueRateMT <num_threads> <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n";
+        return -1;
+    }
+    thread_pool task(max_threads);
+
+    if(run_module_test == 0) {
+        task.start(hipModuleLaunchKernel_enqueue_rate);
+        task.finish();
+    } else {
+        task.start(hipLaunchKernelGGL_enqueue_rate);
+        task.finish();
+    }
+
+    return 0;
+}
+
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #include <algorithm>
 #include <atomic>
 #include <mutex>
+#include <unordered_set>

 #include <hc.hpp>
 #include <hc_am.hpp>
@@ -71,7 +72,6 @@ int HIP_API_BLOCKING = 0;
 int HIP_PRINT_ENV = 0;
 int HIP_TRACE_API = 0;
 std::string HIP_TRACE_API_COLOR("green");
-int HIP_PROFILE_API = 0;

 // TODO - DB_START/STOP need more testing.
 std::string HIP_DB_START_API;
@@ -149,12 +149,10 @@ uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& a

    if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) {
        printf("info: resume profiling at %lu\n", apiSeqNum);
-        RESUME_PROFILING;
        g_dbStartTriggers.pop_back();
    };
    if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) {
        printf("info: stop profiling at %lu\n", apiSeqNum);
-        STOP_PROFILING;
        g_dbStopTriggers.pop_back();
    };

@@ -1294,9 +1292,6 @@ void HipReadEnv() {
               "executes.");
    READ_ENV_S(release, HIP_TRACE_API_COLOR, 0,
               "Color to use for HIP_API.  None/Red/Green/Yellow/Blue/Magenta/Cyan/White");
-    READ_ENV_I(release, HIP_PROFILE_API, 0,
-               "Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, "
-               "0x2=full API name including args.");
    READ_ENV_S(release, HIP_DB_START_API, 0,
               "Comma-separated list of tid.api_seq_num for when to start debug and profiling.");
    READ_ENV_S(release, HIP_DB_STOP_API, 0,
@@ -1372,14 +1367,6 @@ void HipReadEnv() {
        HIP_DB |= 0x1;
    }

-    if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) {
-        fprintf(stderr,
-                "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0.  (perhaps "
-                "enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n",
-                HIP_PROFILE_API);
-        HIP_PROFILE_API = 0;
-    }
-
    if (HIP_DB) {
        fprintf(stderr, "HIP_DB=0x%x [%s]\n", HIP_DB, HIP_DB_string(HIP_DB).c_str());
    }
@@ -1423,11 +1410,6 @@ void HipReadEnv() {
 // This function creates a vector with only the GPU accelerators.
 // It is called with C++11 call_once, which provided thread-safety.
 void ihipInit() {
-#if COMPILE_HIP_ATP_MARKER
-    amdtInitializeActivityLogger();
-    amdtScopedMarker("ihipInit", "HIP", NULL);
-#endif
-

    HipReadEnv();

@@ -1617,7 +1599,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) {

 void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
                           const hipStream_t stream) {
-    if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API ||
+    if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
        (COMPILE_HIP_DB & HIP_TRACE_API)) {
        GET_TLS();
        std::stringstream os;
@@ -1630,14 +1612,6 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
            std::string fullStr;
            recordApiTrace(tls, &fullStr, os.str());
        }
-
-        if (HIP_PROFILE_API == 0x1) {
-            std::string shortAtpString("hipLaunchKernel:");
-            shortAtpString += kernelName;
-            MARKER_BEGIN(shortAtpString.c_str(), "HIP");
-        } else if (HIP_PROFILE_API == 0x2) {
-            MARKER_BEGIN(os.str().c_str(), "HIP");
-        }
    }
 }

@@ -1696,9 +1670,6 @@ void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launc
    tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");

    stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed);
-    if (HIP_PROFILE_API) {
-        MARKER_END();
-    }
 }

 //=================================================================================================
@@ -1796,6 +1767,8 @@ const char* ihipErrorString(hipError_t hip_error) {
            return "hipErrorMissingConfiguration";
        case hipErrorLaunchFailure:
            return "hipErrorLaunchFailure";
+        case hipErrorCooperativeLaunchTooLarge:
+            return "hipErrorCooperativeLaunchTooLarge";
        case hipErrorPriorLaunchFailure:
            return "hipErrorPriorLaunchFailure";
        case hipErrorLaunchTimeOut:
@@ -2478,29 +2451,17 @@ bool ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width,
    return retStatus;
 }

-//-------------------------------------------------------------------------------------------------
-//-------------------------------------------------------------------------------------------------
-// Profiler, really these should live elsewhere:
 hipError_t hipProfilerStart() {
    HIP_INIT_API(hipProfilerStart);
-#if COMPILE_HIP_ATP_MARKER
-    amdtResumeProfiling(AMDT_ALL_PROFILING);
-#endif
-
    return ihipLogStatus(hipSuccess);
 };


 hipError_t hipProfilerStop() {
    HIP_INIT_API(hipProfilerStop);
-#if COMPILE_HIP_ATP_MARKER
-    amdtStopProfiling(AMDT_ALL_PROFILING);
-#endif
-
    return ihipLogStatus(hipSuccess);
 };

-
 //-------------------------------------------------------------------------------------------------
 //-------------------------------------------------------------------------------------------------
 // HCC-specific accessor functions:
@@ -2540,6 +2501,16 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a
 // TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)

 namespace hip_impl {
+    std::unordered_set<std::string>& get_all_gpuarch() {
+        static std::unordered_set<std::string> r{};
+        static std::once_flag init;
+        std::call_once(init, []() {
+            for (int i=0; i < g_deviceCnt; i++){
+                r.insert("hcc-amdgcn-amd-amdhsa--gfx"+std::to_string(g_deviceArray[i]->_props.gcnArch));
+        }});
+        return r;
+    }
+
    std::vector<hsa_agent_t> all_hsa_agents() {
        std::vector<hsa_agent_t> r{};
        std::vector<hc::accelerator> visible_accelerators;
@@ -63,7 +63,6 @@ extern int HIP_LAUNCH_BLOCKING;
 extern int HIP_API_BLOCKING;

 extern int HIP_PRINT_ENV;
-extern int HIP_PROFILE_API;
 // extern int HIP_TRACE_API;
 extern int HIP_ATP;
 extern int HIP_DB;
@@ -250,34 +249,6 @@ extern const char* API_COLOR_END;
 // Must be enabled at runtime with HIP_TRACE_API
 #define COMPILE_HIP_TRACE_API 0x3

-
-// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
-// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
-#ifndef COMPILE_HIP_ATP_MARKER
-#define COMPILE_HIP_ATP_MARKER 0
-#endif
-
-
-// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
-// boundary.
-// TODO - currently we print the trace message at the beginning. if we waited, we could also
-// tls->tidInfo return codes, and any values returned through ptr-to-args (ie the pointers allocated
-// by hipMalloc).
-#if COMPILE_HIP_ATP_MARKER
-#include "CXLActivityLogger.h"
-#define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
-#define MARKER_END() amdtEndMarker();
-#define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
-#define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
-#else
-// Swallow scoped markers:
-#define MARKER_BEGIN(markerName, group)
-#define MARKER_END()
-#define RESUME_PROFILING
-#define STOP_PROFILING
-#endif
-
-
 //---
 // HIP Trace modes - use with HIP_TRACE_API=...
 #define TRACE_ALL 0    // 0x01
@@ -336,22 +307,17 @@ static inline uint64_t getTicks() { return hc::get_system_ticks(); }
 //---
 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);

-#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
+#if (COMPILE_HIP_TRACE_API & 0x1)
 #define API_TRACE(forceTrace, ...)                                                                 \
    GET_TLS();                                                                                     \
    uint64_t hipApiStartTick = 0;                                                                  \
    {                                                                                              \
        tls->tidInfo.incApiSeqNum();                                                               \
        if (forceTrace ||                                                                          \
-            (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) {         \
+            (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) {         \
            std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';       \
            std::string fullStr;                                                                   \
            hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr);                               \
-            if (HIP_PROFILE_API == 0x1) {                                                          \
-                MARKER_BEGIN(__func__, "HIP")                                                      \
-            } else if (HIP_PROFILE_API == 0x2) {                                                   \
-                MARKER_BEGIN(fullStr.c_str(), "HIP");                                              \
-            }                                                                                      \
        }                                                                                          \
    }

@@ -398,9 +364,6 @@ extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::st
                    tls->tidInfo.apiSeqNum(), __func__, localHipStatus,                               \
                    ihipErrorString(localHipStatus), ticks, API_COLOR_END);                           \
        }                                                                                             \
-        if (HIP_PROFILE_API) {                                                                        \
-            MARKER_END();                                                                             \
-        }                                                                                             \
        localHipStatus;                                                                               \
    })

@@ -37,7 +37,6 @@ __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
 namespace hip_internal {

 namespace {
-
    inline
    const char* hsa_to_string(hsa_status_t err) noexcept
    {
@@ -140,6 +139,8 @@ namespace {
        return r;
    }()};

+    constexpr std::uint32_t is_cpu_owned{UINT32_MAX};
+
    inline
    hsa_amd_pointer_info_t info(const void* p)
    {
@@ -149,13 +150,14 @@ namespace {
                const_cast<void*>(p), &r, nullptr, nullptr, nullptr),
            __FILE__, __func__, __LINE__);

-        r.size = is_large_BAR || (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) ?
-            UINT32_MAX : sizeof(hsa_amd_pointer_info_t);
+        if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = is_cpu_owned;

        return r;
    }

-    constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
+    constexpr size_t staging_sz{4 * 1024 * 1024};     // 2 Pages.
+    constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
+    constexpr size_t max_d2h_std_memcpy_sz{64};       // 1 cacheline.

    thread_local const std::unique_ptr<void, void (*)(void *)> staging_buffer{
        []() {
@@ -202,7 +204,7 @@ namespace {
 } // Unnamed namespace.

 inline
-void do_copy(void* __restrict dst, const void* __restrict src, std::size_t n,
+void do_copy(void* __restrict dst, const void* __restrict src, size_t n,
             hsa_agent_t da, hsa_agent_t sa) {
    hsa_signal_silent_store_relaxed(copy_signal, 1);
    throwing_result_check(
@@ -224,14 +226,20 @@ void do_std_memcpy(
 inline
 void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
              hsa_amd_pointer_info_t si) {
-    // TODO: characterise direct largeBAR reads from agent-allocated memory.
-    // if (si.size == UINT32_MAX) {
-    //     return do_std_memcpy(dst, src, n);
-    // }
-
    const auto di{info(dst)};
+    const auto is_locked{di.type == HSA_EXT_POINTER_TYPE_LOCKED};

-    if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
+    if (!is_locked && si.size == is_cpu_owned) {
+        return do_std_memcpy(dst, src, n);
+    }
+    if (!is_locked && is_large_BAR && n <= max_d2h_std_memcpy_sz) {
+        return do_std_memcpy(dst, src, n);
+    }
+    if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
+        return do_copy(dst, src, n, si.agentOwner, si.agentOwner);
+    }
+    
+    if (is_locked) {
        dst = static_cast<char*>(di.agentBaseAddress) +
              (static_cast<char*>(dst) -
               static_cast<char*>(di.hostBaseAddress));
@@ -247,7 +255,7 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,

        throwing_result_check(hsa_amd_memory_lock(dst, n, &si.agentOwner, 1,
                                                  const_cast<void**>(&dst)),
-            __FILE__, __func__, __LINE__);
+                              __FILE__, __func__, __LINE__);

        do_copy(dst, src, n, si.agentOwner, si.agentOwner);
    }
@@ -256,16 +264,23 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
 inline
 void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
              hsa_amd_pointer_info_t di) {
-    if (di.size == UINT32_MAX) {
+    const auto si{info(const_cast<void*>(src))};
+    const auto is_locked{si.type == HSA_EXT_POINTER_TYPE_LOCKED};
+
+    if (!is_locked && di.size == is_cpu_owned) {
        return do_std_memcpy(dst, src, n);
    }
+    if (!is_locked && is_large_BAR && n <= max_h2d_std_memcpy_sz) {
+        return do_std_memcpy(dst, src, n);
+    }
+    if (si.type == HSA_EXT_POINTER_TYPE_HSA) {
+        return do_copy(dst, src, n, di.agentOwner, di.agentOwner);
+    }

-    const auto si{info(const_cast<void*>(src))};
-
-    if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) {
+    if (is_locked) {
        src = static_cast<char*>(si.agentBaseAddress) +
-             (static_cast<const char*>(src) -
-              static_cast<char*>(si.hostBaseAddress));
+            (static_cast<const char*>(src) -
+            static_cast<char*>(si.hostBaseAddress));
        do_copy(dst, src, n, di.agentOwner, di.agentOwner);
    }
    else if (n <= staging_sz) {
@@ -279,7 +294,7 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
        throwing_result_check(hsa_amd_memory_lock(const_cast<void*>(src), n,
                                                  &di.agentOwner, 1,
                                                  const_cast<void**>(&src)),
-            __FILE__, __func__, __LINE__);
+                              __FILE__, __func__, __LINE__);

        do_copy(dst, src, n, di.agentOwner, di.agentOwner);
    }
@@ -288,69 +303,36 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
 inline
 void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
                  hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) {
-    if (di.size == UINT32_MAX && si.size == UINT32_MAX) {
+    if (di.size == is_cpu_owned && si.size == is_cpu_owned) {
        return do_std_memcpy(dst, src, n);
    }
+    if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si);
+    if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di);

-    std::unique_ptr<void, void (*)(void*)> lck0{
-        nullptr, [](void* p) { hsa_amd_memory_unlock(p); }};
-    std::unique_ptr<void, void (*)(void*)> lck1{nullptr, lck0.get_deleter()};
+    throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
+                                                      nullptr,
+                                                      di.agentBaseAddress),
+                          __FILE__, __func__, __LINE__);

-    switch (si.type) {
-    case HSA_EXT_POINTER_TYPE_HSA:
-        if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
-            hsa_memory_copy(dst, src, n);
-            return; // TODO: do_copy(dst, src, n, di.agentOwner, si.agentOwner);
-        }
-
-        if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN ||
-            di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
-            return d2h_copy(dst, src, n, si);
-        }
-        break;
-    case HSA_EXT_POINTER_TYPE_LOCKED:
-        if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
-            std::memcpy(dst, si.hostBaseAddress, n);
-
-            return;
-        }
-        if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
-            std::memcpy(di.hostBaseAddress, si.hostBaseAddress, n);
-
-            return;
-        }
-        src = si.agentBaseAddress;
-        si.agentOwner = di.agentOwner;
-        break;
-    case HSA_EXT_POINTER_TYPE_UNKNOWN:
-        if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
-            std::memcpy(dst, src, n);
-
-            return;
-        }
-        if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
-            std::memcpy(di.hostBaseAddress, src, n);
-
-            return;
-        }
-        return h2d_copy(dst, src, n, di);
-    default: do_copy(dst, src, n, di.agentOwner, si.agentOwner); break;
-    }
+    return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
 }

 inline
 void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
-                 hipMemcpyKind k) noexcept {
+                 hipMemcpyKind k) {
    switch (k) {
    case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
-    case hipMemcpyHostToDevice:
-        return is_large_BAR ? do_std_memcpy(dst, src, n)
-                            : h2d_copy(dst, src, n, info(dst));
-    case hipMemcpyDeviceToHost:
-        // TODO: characterise direct largeBAR reads from agent-allocated memory.
-        return /*is_large_BAR ? do_std_memcpy(dst, src, n)
-                            : */d2h_copy(dst, src, n, info(src));
-    case hipMemcpyDeviceToDevice: hsa_memory_copy(dst, src, n); break;
+    case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
+    case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
+    case hipMemcpyDeviceToDevice: {
+        const auto di{info(dst)};
+        const auto si{info(src)};
+        throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner,
+                                                          nullptr,
+                                                          di.agentBaseAddress),
+                              __FILE__, __func__, __LINE__);
+        return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
+    }
    default: return generic_copy(dst, src, n, info(dst), info(src));
    }
 }
@@ -1290,7 +1272,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count,
    if (dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
-    
+
    if (kind == hipMemcpyDeviceToHost || kind == hipMemcpyHostToHost) {
     	return ihipLogStatus(hipErrorInvalidMemcpyDirection);
    } else if (kind == hipMemcpyDeviceToDevice) {
@@ -1322,7 +1304,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count,
    if (src == nullptr || dst == nullptr) {
        return ihipLogStatus(hipErrorInvalidSymbol);
    }
-    
+
    if (kind == hipMemcpyHostToDevice || kind == hipMemcpyHostToHost) {
     	return ihipLogStatus(hipErrorInvalidMemcpyDirection);
    } else if (kind == hipMemcpyDeviceToDevice) {
@@ -1540,111 +1522,144 @@ hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t
    return ihipLogStatus(e);
 }

+int getByteSizeFromFormat(const hipChannelFormatDesc& desc){
+    int byteSize =0;
+    switch (desc.f) {
+        case hipChannelFormatKindUnsigned:
+            switch (desc.x) {
+                case 32:
+                    byteSize = sizeof(uint32_t);
+                    break;
+                case 16:
+                    byteSize = sizeof(uint16_t);
+                    break;
+                case 8:
+                    byteSize = sizeof(uint8_t);
+                    break;
+                default:
+                    byteSize = sizeof(uint32_t);
+            }
+            break;
+        case hipChannelFormatKindSigned:
+            switch (desc.x) {
+                case 32:
+                    byteSize = sizeof(int32_t);
+                    break;
+                case 16:
+                    byteSize = sizeof(int16_t);
+                    break;
+                case 8:
+                    byteSize = sizeof(int8_t);
+                    break;
+                default:
+                    byteSize = sizeof(int32_t);
+            }
+            break;
+        case hipChannelFormatKindFloat:
+            switch (desc.x) {
+                case 32:
+                    byteSize = sizeof(float);
+                    break;
+                case 16:
+                    byteSize = sizeof(_Float16);
+                    break;
+                default:
+                    byteSize = sizeof(float);
+            }
+            break;
+        case hipChannelFormatKindNone:
+        default:
+            break;
+    }
+    return byteSize;
+}
+
 hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bool isAsync) {
    hipError_t e = hipSuccess;
    if(p) {
-        size_t byteSize, width, height, depth, widthInBytes, srcPitch, dstPitch, ySize;
-        hipChannelFormatDesc desc;
-        void* srcPtr;void* dstPtr;
+        size_t dstByteSize, srcByteSize, copyWidth, copyHeight, copyDepth, widthInBytes, srcPitch, dstPitch, srcYsize, dstYsize;
+        size_t srcXoffset, srcYoffset, srcZoffset, dstXoffset, dstYoffset, dstZoffset;
+        size_t srcWidth, srcHeight, srcDepth, dstWidth, dstHeight, dstDepth;
+
+        void* srcPtr, *dstPtr;
+        bool copyWidthUpdate= false;
+        copyDepth = p->extent.depth;
+        copyHeight = p->extent.height;
+        copyWidth =  p->extent.width; // in bytes ?
+        dstXoffset = p->dstPos.x;
+        dstYoffset = p->dstPos.y;
+        dstZoffset = p->dstPos.z;
+        srcXoffset = p->srcPos.x;
+        srcYoffset = p->srcPos.y;
+        srcZoffset = p->srcPos.z;
        if (p->dstArray != nullptr) {
-            if (p->dstArray->isDrv == false) {
-                switch (p->dstArray->desc.f) {
-                    case hipChannelFormatKindSigned:
-                        byteSize = sizeof(int);
-                        break;
-                    case hipChannelFormatKindUnsigned:
-                        byteSize = sizeof(unsigned int);
-                        break;
-                    case hipChannelFormatKindFloat:
-                        byteSize = sizeof(float);
-                        break;
-                    case hipChannelFormatKindNone:
-                        byteSize = sizeof(size_t);
-                        break;
-                    default:
-                        byteSize = 0;
-                        break;
-                }
-                depth = p->extent.depth;
-                height = p->extent.height;
-                width =  p->extent.width;
-                widthInBytes = p->extent.width * byteSize;
-                srcPitch = p->srcPtr.pitch;
-                srcPtr = p->srcPtr.ptr;
-                ySize = p->srcPtr.ysize;
-                desc = p->dstArray->desc;
-                dstPtr = p->dstArray->data;
-                hsa_ext_image_data_info_t imageInfo;
-                if(hipTextureType2DLayered == p->dstArray->textureType)
-                    GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, desc, imageInfo, depth);
-                else
-                    GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, desc, imageInfo);
-                dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth);
-            } else {
-                depth = p->Depth;
-                height = p->Height;
-                widthInBytes = p->WidthInBytes;
-                width =  p->dstArray->width;
-                hsa_ext_image_channel_order_t channelOrder;
-                switch(p->dstArray->NumChannels) {
-                    case 2:
-                       channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
-                       break;
-                    case 3:
-                       channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB;
-                       break;
-                    case 4:
-                       channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-                       break;
-                    case 1:
-                    default:
-                       channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
-                       break;
-                }
-                hsa_ext_image_channel_type_t channelType;
-                e = ihipArrayToImageFormat(p->dstArray->Format,channelType);
-                srcPitch = p->srcPitch;
-                srcPtr = (void*)p->srcHost;
-                ySize = p->srcHeight;
-                dstPtr = p->dstArray->data;
-                hsa_ext_image_data_info_t imageInfo;
-                if(hipTextureType2DLayered == p->dstArray->textureType)
-                    GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, channelOrder, channelType, imageInfo, depth);
-                else
-                    GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, channelOrder, channelType, imageInfo);
-                dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth);
+            if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){
+                return hipErrorInvalidValue;
+            }
+            // Array destination
+            dstByteSize = getByteSizeFromFormat(p->dstArray->desc);
+            hipChannelFormatDesc desc;
+            desc = p->dstArray->desc;
+            dstPtr = p->dstArray->data;
+            dstWidth = p->dstArray->width;
+            dstHeight = p->dstArray->height;
+            dstDepth = p->dstArray->depth;
+            dstPitch = dstByteSize * alignUp(dstWidth, IMAGE_PITCH_ALIGNMENT);
+            if(!copyWidthUpdate) {
+                copyWidth = copyWidth * dstByteSize;
+                copyWidthUpdate = true;
            }
        } else {
-            // Non array destination
-            depth = p->extent.depth;
-            height = p->extent.height;
-            widthInBytes = p->extent.width;
-            srcPitch = p->srcPtr.pitch;
-            srcPtr = p->srcPtr.ptr;
+            //Non Array destination
            dstPtr = p->dstPtr.ptr;
-            ySize = p->srcPtr.ysize;
+            dstWidth = p->dstPtr.xsize;
+            dstHeight = p->dstPtr.ysize;
            dstPitch = p->dstPtr.pitch;
        }

+        if (p->srcArray != nullptr) {
+            if ((p->srcArray->isDrv == true) ||( p->srcPtr.ptr!= nullptr)){
+                return hipErrorInvalidValue;
+            }
+            // Array source
+            srcByteSize = getByteSizeFromFormat(p->srcArray->desc);
+            hipChannelFormatDesc desc;
+            desc = p->srcArray->desc;
+            srcPtr = p->srcArray->data;
+            srcWidth = p->srcArray->width;
+            srcHeight = p->srcArray->height;
+            srcDepth = p->srcArray->depth;
+            srcPitch = srcByteSize * alignUp(srcWidth, IMAGE_PITCH_ALIGNMENT);
+            if(!copyWidthUpdate) {
+                copyWidth = copyWidth * srcByteSize;
+                copyWidthUpdate = true;
+            }
+        } else {
+            //Non Array source
+            srcPtr = p->srcPtr.ptr;
+            srcWidth = p->srcPtr.xsize;
+            srcHeight = p->srcPtr.ysize;
+            srcPitch = p->srcPtr.pitch;
+        }
+
        stream = ihipSyncAndResolveStream(stream);
        try {
-            if((widthInBytes == dstPitch) && (widthInBytes == srcPitch)) {
+            if((copyWidth == dstPitch) && (copyWidth == srcPitch)&& (copyHeight == dstHeight) &&(copyHeight == srcHeight)) {
                if(isAsync)
-                    stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind);
+                    stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind);
                else
-                    stream->locked_copySync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind, false);
+                    stream->locked_copySync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind, false);
            } else {
-                for (int i = 0; i < depth; i++) {
-                    for (int j = 0; j < height; j++) {
-                        // TODO: p->srcPos or p->dstPos are not 0.
+                for (int i = 0; i < copyDepth; i++) {
+                    for (int j = 0; j < copyHeight; j++) {
                        unsigned char* src =
-                             (unsigned char*)srcPtr + i * ySize * srcPitch + j * srcPitch;
+                             (unsigned char*)srcPtr + (i + srcZoffset) * srcHeight * srcPitch + (j + srcYoffset) * srcPitch + srcXoffset;
                        unsigned char* dst =
-                             (unsigned char*)dstPtr + i * height * dstPitch + j * dstPitch;
+                             (unsigned char*)dstPtr + (i + dstZoffset) * dstHeight * dstPitch + (j + dstYoffset) * dstPitch + dstXoffset;
                        if(isAsync)
-                            stream->locked_copyAsync(dst, src, widthInBytes, p->kind);
+                            stream->locked_copyAsync(dst, src, copyWidth, p->kind);
                        else
-                            stream->locked_copySync(dst, src, widthInBytes, p->kind);
+                            stream->locked_copySync(dst, src, copyWidth, p->kind);
                     }
                }
           }
@@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device);
        return ihipLogStatus(hipStatus);                                                           \
    }

-hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX,
-                                  uint32_t gridSizeY, uint32_t gridSizeZ,
-                                  uint32_t blockSizeX, uint32_t blockSizeY,
-                                  uint32_t blockSizeZ, size_t sharedMemBytes,
+hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX,
+                                  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+                                  uint32_t localWorkSizeZ, size_t sharedMemBytes,
                                  hipStream_t hStream, void** kernelParams, void** extra,
                                  hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
                                  void** impCoopParams = 0) {
@@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;

-    size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX;
-    size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY;
-    size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ;
-    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
-    {
-        return hipErrorInvalidConfiguration;
-    }
-
    if (ctx == nullptr) {
        ret = hipErrorInvalidDevice;

@@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
        lp.dynamic_group_mem_bytes =
            sharedMemBytes;  // TODO - this should be part of preLaunchKernel.
        hStream = ihipPreLaunchKernel(
-            hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ),
-            dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked);
+            hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
+            dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);

        hsa_kernel_dispatch_packet_t aql;

@@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
        // aql.completion_signal._handle = 0;
        // aql.kernarg_address = 0;

-        aql.workgroup_size_x = blockSizeX;
-        aql.workgroup_size_y = blockSizeY;
-        aql.workgroup_size_z = blockSizeZ;
+        aql.workgroup_size_x = localWorkSizeX;
+        aql.workgroup_size_y = localWorkSizeY;
+        aql.workgroup_size_z = localWorkSizeZ;
        aql.grid_size_x = globalWorkSizeX;
        aql.grid_size_y = globalWorkSizeY;
        aql.grid_size_z = globalWorkSizeZ;
@@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
                                 void** kernelParams, void** extra) {
    HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
                 hStream, kernelParams, extra);
+
+    size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX;
+    size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY;
+    size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ;
+    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
+    {
+        return hipErrorInvalidConfiguration;
+    }
+
    return ihipLogStatus(ihipModuleLaunchKernel(tls,
-        f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
+        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY,
        blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
 }

@@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
    HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);

-    if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
-        return hipErrorInvalidValue;
-
    return ihipLogStatus(ihipModuleLaunchKernel(tls,
-        f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
+        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
 }

@@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
    HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
                 localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);

-    if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
-        return hipErrorInvalidValue;
-
    return ihipLogStatus(ihipModuleLaunchKernel(tls,
-        f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
+        f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
        localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
 }

@@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
     }

    GET_TLS();
+
+    size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
+
    // launch kernels for each  device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];

+        globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
+        globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
+        globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
+
+        if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
+        {
+            return hipErrorInvalidConfiguration;
+        }
+
        result = ihipModuleLaunchKernel(tls, kds[i],
-                lp.gridDim.x,
-                lp.gridDim.y,
-                lp.gridDim.z,
+                lp.gridDim.x * lp.blockDim.x,
+                lp.gridDim.y * lp.blockDim.y,
+                lp.gridDim.z * lp.blockDim.z,
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
@@ -424,6 +431,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
        return hipErrorInvalidConfiguration;
    }

+    size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
+    size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
+    size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
+    if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
+    {
+        return hipErrorInvalidConfiguration;
+    }
+
    // Prepare the kernel descriptor for initializing the GWS
    hipFunction_t gwsKD = ps.kernel_descriptor(
            reinterpret_cast<std::uintptr_t>(&init_gws),
@@ -483,9 +498,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,

    // launch the main kernel
    result = ihipModuleLaunchKernel(tls, kd,
-            gridDim.x,
-            gridDim.y,
-            gridDim.z,
+            gridDim.x * blockDimX.x,
+            gridDim.y * blockDimX.y,
+            gridDim.z * blockDimX.z,
            blockDimX.x, blockDimX.y, blockDimX.z,
            sharedMemBytes, stream, kernelParams, nullptr, nullptr,
            nullptr, 0, true, impCoopParams);
@@ -620,6 +635,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

    void* impCoopParams[1];
    ulong prev_sum = 0;
+
+    size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
    // launch the main kernels for each device
    for (int i = 0; i < numDevices; ++i) {
        const hipLaunchParams& lp = launchParamsList[i];
@@ -636,10 +653,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL

        impCoopParams[0] = &mg_info_ptr[i];

+        globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
+        globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
+        globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
+        if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
+        {
+            return hipErrorInvalidConfiguration;
+        }
+
        result = ihipModuleLaunchKernel(tls, kds[i],
-                lp.gridDim.x,
-                lp.gridDim.y,
-                lp.gridDim.z,
+                lp.gridDim.x * lp.blockDim.x,
+                lp.gridDim.y * lp.blockDim.y,
+                lp.gridDim.z * lp.blockDim.z,
                lp.blockDim.x, lp.blockDim.y,
                lp.blockDim.z, lp.sharedMem,
                lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
@@ -1469,6 +1494,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
 }

+hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor(
+   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
+{
+    HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
+
+    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+        tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk));
+}
+
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
   uint32_t* numBlocks, hipFunction_t f, uint32_t  blockSize, size_t dynSharedMemPerBlk,
   unsigned int flags)
@@ -1479,6 +1513,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
 }

+hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+   int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk,
+   unsigned int flags)
+{
+    HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
+    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
+        tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk));
+}
+
 hipError_t hipLaunchKernel(
    const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args,
    size_t sharedMemBytes, hipStream_t stream)
@@ -128,17 +128,24 @@ hipError_t ihipEnablePeerAccess(TlsData* tls, hipCtx_t peerCtx, unsigned int fla
        if (thisCtx == peerCtx) {
            err = hipErrorInvalidDevice;  // Can't enable peer access to self.
        } else if ((thisCtx != NULL) && (peerCtx != NULL)) {
-            LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
-            // Add thisCtx to peerCtx's access list so that new allocations on peer will be made
-            // visible to this device:
-            bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
-            if (isNewPeer) {
-                tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
-                        thisCtx->toString().c_str(), peerCtx->toString().c_str());
-                am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(),
-                                           peerCrit->peerAgents());
+
+            int canAccess = 0;
+            if ((hipSuccess != ihipDeviceCanAccessPeer(&canAccess,thisCtx,peerCtx)) || (canAccess == 0)){
+                 tprintf(DB_MEM, "device=%s can't access peer=%s\n",thisCtx->toString().c_str(), peerCtx->toString().c_str());
+                 err =  hipErrorInvalidDevice;
            } else {
-                err = hipErrorPeerAccessAlreadyEnabled;
+                 LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
+                 // Add thisCtx to peerCtx's access list so that new allocations on peer will be made
+                 // visible to this device:
+                 bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
+                 if (isNewPeer) {
+                    tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
+                        thisCtx->toString().c_str(), peerCtx->toString().c_str());
+                    am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(),
+                                           peerCrit->peerAgents());
+                 } else {
+                    err = hipErrorPeerAccessAlreadyEnabled;
+                 }
            }
        } else {
            err = hipErrorInvalidDevice;
@@ -50,7 +50,7 @@ THE SOFTWARE.
 #include <iostream>
 #include <sys/stat.h>

-const char* hiprtcGetErrorString(hiprtcResult x)
+extern "C" const char* hiprtcGetErrorString(hiprtcResult x)
 {
    switch (x) {
    case HIPRTC_SUCCESS:
@@ -95,6 +95,21 @@ inline bool fileExists (const std::string& name) {
 }
 }  // namespace hip_impl

+namespace
+{
+    char* demangle(const char* x)
+    {
+        if (!x) return nullptr;
+
+        int s{};
+        char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s);
+
+        if (s != 0) return nullptr;
+
+        return tmp;
+    }
+} // Unnamed namespace.
+
 namespace
 {
    struct Symbol {
@@ -158,7 +173,7 @@ struct _hiprtcProgram {
    {
        using namespace std;

-        char* demangled = hip_impl::demangle(name.c_str());
+        char* demangled = demangle(name.c_str());
        name.assign(demangled == nullptr ? "" : demangled);
        free(demangled);

@@ -352,7 +367,7 @@ namespace
    }
 } // Unnamed namespace.

-hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n)
+extern "C" hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n)
 {
    if (!n) return HIPRTC_ERROR_INVALID_INPUT;
    if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -413,21 +428,6 @@ namespace
    };
 } // Unnamed namespace.

-namespace hip_impl
-{
-    char* demangle(const char* x)
-    {
-        if (!x) return nullptr;
-
-        int s{};
-        char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s);
-
-        if (s != 0) return nullptr;
-
-        return tmp;
-    }
-} // Namespace hip_impl.
-
 namespace
 {
    const std::string& defaultTarget()
@@ -492,7 +492,7 @@ namespace
    }
 } // Unnamed namespace.

-hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
+extern "C" hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
 {
    using namespace std;

@@ -530,7 +530,7 @@ hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o)
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
+extern "C" hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
                                 const char* name, int n, const char** hdrs,
                                 const char** incs)
 {
@@ -548,14 +548,14 @@ hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src,
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p)
+extern "C" hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p)
 {
    if (!p) return HIPRTC_SUCCESS;

    return _hiprtcProgram::destroy(*p);
 }

-hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
+extern "C" hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
                                  const char** ln)
 {
    using namespace std;
@@ -576,7 +576,7 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n,
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
+extern "C" hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
 {
    if (!l) return HIPRTC_ERROR_INVALID_INPUT;
    if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -588,7 +588,7 @@ hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l)
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
+extern "C" hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
 {
    if (!sz) return HIPRTC_ERROR_INVALID_INPUT;
    if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -599,7 +599,7 @@ hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz)
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
+extern "C" hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
 {
    if (!c) return HIPRTC_ERROR_INVALID_INPUT;
    if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -610,7 +610,7 @@ hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c)
    return HIPRTC_SUCCESS;
 }

-hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)
+extern "C" hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)
 {
    if (!sz) return HIPRTC_ERROR_INVALID_INPUT;
    if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM;
@@ -620,3 +620,15 @@ hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz)

    return HIPRTC_SUCCESS;
 }
+
+extern "C" hiprtcResult hiprtcVersion(int* major, int* minor)
+{
+  if (major == nullptr || minor == nullptr) {
+    return HIPRTC_ERROR_INVALID_INPUT;
+  }
+
+  *major = 9;
+  *minor = 0;
+
+  return HIPRTC_SUCCESS;
+}
@@ -26,6 +26,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <deque>
 #include <memory>
 #include <mutex>
 #include <stdexcept>
@@ -202,7 +203,7 @@ public:
                        std::function<void(hsa_code_object_reader_t*)>>;
    std::pair<
        std::mutex,
-        std::vector<std::pair<std::string, RAII_code_reader>>> code_readers;
+        std::deque<std::pair<std::string, RAII_code_reader>>> code_readers;

    program_state_impl() {
        // Create placeholder for each agent for the per-agent members.
@@ -244,7 +245,8 @@ public:
                    if (!valid(tmp)) break;

                    for (auto&& bundle : bundles(tmp)) {
-                        impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
+                        if(bundle.blob.size())
+                            impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob);
                    }

                    blob_it += tmp.bundled_code_size;
@@ -418,13 +420,17 @@ public:
        decltype(code_readers.second)::iterator it;
        {
          std::lock_guard<std::mutex> lck{code_readers.first};
-          it = code_readers.second.emplace(code_readers.second.end(),
-                                           move(file), move(tmp));
+          code_readers.second.emplace_back(move(file), move(tmp));
+          it = std::prev(code_readers.second.end());
        }

        auto check_hsa_error = [](hsa_status_t s) {
            if (s != HSA_STATUS_SUCCESS) {
-                hip_throw(std::runtime_error{"error when loading code object"});
+                const char* hsa_err_msg;
+                hsa_status_string(s, &hsa_err_msg);
+                hip_throw(std::runtime_error{
+                              std::string("error when loading code object: ") +
+                              hsa_err_msg});
            }
        };

@@ -34,42 +34,52 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA

 using namespace std;

-const string directed_dir = "directed_tests" + string(PATH_SEPERATOR_STR) + "hipEnvVar";
-const string dir = "." + string(PATH_SEPERATOR_STR) + "hipEnvVar";
+const string directed_dir = string(".") + PATH_SEPERATOR_STR + "directed_tests" + PATH_SEPERATOR_STR + "hipEnvVar";
+const string dir = string(".") + PATH_SEPERATOR_STR + "hipEnvVar";

-int getDeviceNumber() {
-    char buff[512];
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-    FILE* in = popen((directed_dir + " -c").c_str(), "r");
-    if(fgets(buff, 512, in) == NULL){
-        pclose(in);
-        //Check at same level
-        in = popen((dir + " -c").c_str(), "r");
+int readHipEnvVar(string flags, char* buff){
+
+    std::cout << "\nFinding hipEnvVar in " << directed_dir << "...\n";
+    FILE* directed_in = popen((directed_dir + flags).c_str(), "r");
+    
+    if(fgets(buff, 512, directed_in) == NULL){
+        std::cout << "Finding hipEnvVar in " << dir << "...\n";
+        FILE* in = popen((dir + flags).c_str(), "r");
        if(fgets(buff, 512, in) == NULL){
+            pclose(directed_in);
            pclose(in);
            return 1;
        }
+        pclose(in);
+    }
+    std::cout << "hipEnvVar Found!\n";
+    pclose(directed_in);
+    return 0;
+}
+
+int getDeviceNumber(bool print_err=true) {
+    char buff[512];
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+    if (readHipEnvVar(string(" -c"), buff)){
+        strncpy(buff, "1", 512);
+        if (print_err){
+            std::cerr << "The system cannot find hipEnvVar, using 1 as number of devices\n";
+        }
+    }
+    if (print_err) {
+        std::cout << buff;
    }
-    cout << buff;
-    pclose(in);
    return atoi(buff);
 }

 // Query the current device ID remotely to hipEnvVar
 void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {    
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-    FILE* in = popen((directed_dir + " -d " + std::to_string(deviceID)).c_str(), "r");
-    if(fgets(pciBusID, 100, in) == NULL){
-        pclose(in);
-        //Check at same level
-        in = popen((dir + " -d").c_str(), "r");
-        if(fgets(pciBusID, 100, in) == NULL){
-            pclose(in);
-            return;
-        }
+    if (readHipEnvVar((" -d " + std::to_string(deviceID)), pciBusID)){
+        std::cerr << "The system cannot find hipEnvVar\n";
    }
    cout << pciBusID;
-    pclose(in);
    return;
 }

@@ -78,15 +88,15 @@ void getDevicePCIBusNum(int deviceID, char* pciBusID) {
    hipDevice_t deviceT;
    hipDeviceGet(&deviceT, deviceID);

-    memset(pciBusID, 0, 100);
-    hipDeviceGetPCIBusId(pciBusID, 100, deviceT);
+    memset(pciBusID, 0, 512);
+    hipDeviceGetPCIBusId(pciBusID, 512, deviceT);
 }

 int main() {
    unsetenv(HIP_VISIBLE_DEVICES_STR);
    unsetenv(CUDA_VISIBLE_DEVICES_STR);
    std::vector<std::string> devPCINum;
-    char pciBusID[100];
+    char pciBusID[512];
    // collect the device pci bus ID for all devices
    int totalDeviceNum = getDeviceNumber();
    std::cout << "The total number of available devices is " << totalDeviceNum << std::endl
@@ -116,27 +126,27 @@ int main() {
    // check when set an invalid device number
    setenv("HIP_VISIBLE_DEVICES", "1000,0,1", 1);
    setenv("CUDA_VISIBLE_DEVICES", "1000,0,1", 1);
-    assert(getDeviceNumber() == 0);
+    assert(getDeviceNumber(false) == 0);

    if (totalDeviceNum > 2) {
        setenv("HIP_VISIBLE_DEVICES", "0,1,1000,2", 1);
        setenv("CUDA_VISIBLE_DEVICES", "0,1,1000,2", 1);
-        assert(getDeviceNumber() == 2);
+        assert(getDeviceNumber(false) == 2);

        setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1);
        setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1);
-        assert(getDeviceNumber() == 3);
+        assert(getDeviceNumber(false) == 3);
        // test if CUDA_VISIBLE_DEVICES will be accepted by the runtime
        unsetenv(HIP_VISIBLE_DEVICES_STR);
        unsetenv(CUDA_VISIBLE_DEVICES_STR);
        setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1);
-        assert(getDeviceNumber() == 3);
+        assert(getDeviceNumber(false) == 3);
    }

    setenv("HIP_VISIBLE_DEVICES", "-100,0,1", 1);
    setenv("CUDA_VISIBLE_DEVICES", "-100,0,1", 1);
-    assert(getDeviceNumber() == 0);
+    assert(getDeviceNumber(false) == 0);

    std::cout << "PASSED" << std::endl;
    return 0;
-}
+}
@@ -1,147 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-/* HIT_START
- * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc
- * TEST: %t
- * HIT_END
- */
-
-#include <test_common.h>
-
-#define HIPRTC_GET_TYPE_NAME
-#include <hip/hiprtc.h>
-#include <hip/hip_runtime.h>
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-static constexpr auto gpu_program{
-R"(
-#include <hip/hip_runtime.h>
-
-namespace N1 { struct S1_t { int i; double d; }; }
-template<typename T>
-__global__ void f3(int *result) { *result = sizeof(T); }
-)"};
-
-// note: this structure is also defined in GPU code string. Should ideally
-// be in a header file included by both GPU code string and by CPU code.
-namespace N1 { struct S1_t { int i; double d; }; };
-
-template <typename T>
-std::string getKernelNameForType(void)
-{
-    std::string type_name;
-    hiprtcGetTypeName<T>(&type_name);
-    return std::string{"f3<"} + type_name + '>';
-}
-
-int main()
-{
-    using namespace std;
-
-    hiprtcProgram prog;
-    hiprtcCreateProgram(&prog, gpu_program, "gpu_program.cu", 0, nullptr,
-                        nullptr);
-
-    vector<string> name_vec;
-    vector<int> expected_result;
-
-    name_vec.push_back(getKernelNameForType<int>());
-    expected_result.push_back(sizeof(int));
-    name_vec.push_back(getKernelNameForType<double>());
-    expected_result.push_back(sizeof(double));
-    name_vec.push_back(getKernelNameForType<N1::S1_t>());
-    expected_result.push_back(sizeof(N1::S1_t));
-
-    for (auto&& x : name_vec) hiprtcAddNameExpression(prog, x.c_str());
-
-    hipDeviceProp_t props;
-    int device = 0;
-    hipGetDeviceProperties(&props, device);
-    std::string gfxName = "gfx" + std::to_string(props.gcnArch);
-    std::string sarg = "--gpu-architecture=" + gfxName;
-    const char* options[] = {
-            sarg.c_str()
-    };
-
-    hiprtcResult compileResult = hiprtcCompileProgram(prog, 1, options);
-
-    size_t logSize;
-    hiprtcGetProgramLogSize(prog, &logSize);
-
-    if (logSize) {
-        string log(logSize, '\0');
-        hiprtcGetProgramLog(prog, &log[0]);
-
-        cout << log << '\n';
-    }
-
-    if (compileResult != HIPRTC_SUCCESS) { failed("Compilation failed."); }
-
-    size_t codeSize;
-    hiprtcGetCodeSize(prog, &codeSize);
-
-    vector<char> code(codeSize);
-    hiprtcGetCode(prog, code.data());
-
-    hipModule_t module;
-    hipModuleLoadDataEx(&module, code.data(), 0, nullptr, nullptr);
-
-    hipDeviceptr_t dResult;
-    int hResult = 0;
-    hipMalloc(&dResult, sizeof(hResult));
-    hipMemcpyHtoD(dResult, &hResult, sizeof(hResult));
-
-    for (size_t i = 0; i < name_vec.size(); ++i) {
-        const char *name;
-        hiprtcGetLoweredName(prog, name_vec[i].c_str(), &name);
-
-        hipFunction_t kernel;
-        hipModuleGetFunction(&kernel, module, name);
-
-        struct { hipDeviceptr_t a_; } args{dResult};
-
-        auto size = sizeof(args);
-        void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-                          HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
-                          HIP_LAUNCH_PARAM_END};
-
-        hipModuleLaunchKernel(kernel,
-                              1, 1, 1,
-                              1, 1, 1,
-                              0, nullptr,
-                              nullptr, config);
-
-        hipMemcpyDtoH(&hResult, dResult, sizeof(hResult));
-
-        if (expected_result[i] != hResult) { failed("Validation failed."); }
-    }
-
-    hipFree(dResult);
-    hipModuleUnload(module);
-
-    hiprtcDestroyProgram(&prog);
-
-    passed();
-}
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+#define WIDTH 4
+
+#define NUM (WIDTH * WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+template <typename T>
+__global__ void matrixTranspose(T* out, T* in, const int width) {
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    T val = in[x];
+    for (int i = 0; i < width; i++) {
+        for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+template <typename T>
+void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) {
+    for (unsigned int j = 0; j < width; j++) {
+        for (unsigned int i = 0; i < width; i++) {
+            output[i * width + j] = input[j * width + i];
+        }
+    }
+}
+
+template<typename T>
+void runTest() {
+    T* Matrix;
+    T* TransposeMatrix;
+    T* cpuTransposeMatrix;
+
+    T* gpuMatrix;
+    T* gpuTransposeMatrix;
+
+    hipDeviceProp_t devProp;
+    hipGetDeviceProperties(&devProp, 0);
+
+    int i;
+    int errors;
+
+    Matrix = (T*)malloc(NUM * sizeof(T));
+    TransposeMatrix = (T*)malloc(NUM * sizeof(T));
+    cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+        Matrix[i] = (T)i * 10l;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuMatrix, NUM * sizeof(T));
+    hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(T));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernelGGL(matrixTranspose<T>, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+                    gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+    // Memory transfer from device to host
+    hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(T), hipMemcpyDeviceToHost);
+
+    // CPU MatrixTranspose computation
+    matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-6;
+    for (i = 0; i < NUM; i++) {
+        if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
+            errors++;
+        }
+    }
+
+    // free the resources on device side
+    hipFree(gpuMatrix);
+    hipFree(gpuTransposeMatrix);
+
+    // free the resources on host side
+    free(Matrix);
+    free(TransposeMatrix);
+    free(cpuTransposeMatrix);
+
+    if (errors != 0) {
+        failed("Mismatch present");
+    }
+}
+
+int main() {
+    runTest<int>();
+    runTest<float>();
+    runTest<long>();
+    runTest<long long>();
+    passed();
+}
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+template <typename T>
+__global__ void shflDownSum(T* a, int size) {
+    T val = a[threadIdx.x];
+    for (int i = size / 2; i > 0; i /= 2) {
+        val += __shfl_down(val, i, size);
+    }
+    a[threadIdx.x] = val;
+}
+
+template <typename T>
+__global__ void shflUpSum(T* a, int size) {
+    T val = a[threadIdx.x];
+    for (int i = size / 2; i > 0; i /= 2) {
+        val += __shfl_up(val, i, size);
+    }
+    a[threadIdx.x] = val;
+}
+
+template <typename T>
+void runTestShflUp() {
+    const int size = 32;
+    T a[size];
+    T cpuSum = 0;
+    for (int i = 0; i < size; i++) {
+        a[i] = i;
+        cpuSum += a[i];
+    }
+    T* d_a;
+    hipMalloc(&d_a, sizeof(T) * size);
+    hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
+    hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
+    hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
+    if (a[size - 1] != cpuSum) {
+        hipFree(d_a);
+        failed("Shfl Up Sum did not match.");
+    }
+    hipFree(d_a);
+}
+
+template <typename T>
+void runTestShflDown() {
+    const int size = 32;
+    T a[size];
+    T cpuSum = 0;
+    for (int i = 0; i < size; i++) {
+        a[i] = i;
+        cpuSum += a[i];
+    }
+    T* d_a;
+    hipMalloc(&d_a, sizeof(T) * size);
+    hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
+    hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
+    hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
+    if (a[0] != cpuSum) {
+        hipFree(d_a);
+        failed("Shfl Up Sum did not match.");
+    }
+    hipFree(d_a);
+}
+int main() {
+    runTestShflUp<int>();
+    runTestShflUp<float>();
+    runTestShflUp<long>();
+    runTestShflUp<long long>();
+
+    runTestShflDown<int>();
+    runTestShflDown<float>();
+    runTestShflDown<long>();
+    runTestShflDown<long long>();
+    passed();
+}
@@ -0,0 +1,110 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+
+template <typename T>
+void runTest(int width,int height,int depth, hipChannelFormatKind formatKind)
+{
+    unsigned int size = width * height * depth * sizeof(T);
+    T* hData = (T*) malloc(size);
+    memset(hData, 0, size);
+
+    for (int i = 0; i < depth; i++) {
+        for (int j = 0; j < height; j++) {
+            for (int k = 0; k < width; k++) {
+                hData[i*width*height + j*width +k] = i*width*height + j*width + k;
+            }
+        }
+    }
+    printf("test- sizeof(T) =%d\n", sizeof(T));
+    hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8, 0, 0, 0, formatKind);
+    hipArray *arr,*arr1;
+
+    HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
+    HIPCHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
+    hipMemcpy3DParms myparms = {0};
+    myparms.srcPos = make_hipPos(0,0,0);
+    myparms.dstPos = make_hipPos(0,0,0);
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
+    myparms.dstArray = arr;
+    myparms.extent = make_hipExtent(width , height, depth);
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    HIPCHECK(hipMemcpy3D(&myparms));
+    HIPCHECK(hipDeviceSynchronize());
+    //Array to Array
+    memset(&myparms,0x0, sizeof(hipMemcpy3DParms));
+    myparms.srcPos = make_hipPos(0,0,0);
+    myparms.dstPos = make_hipPos(0,0,0);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+    myparms.extent = make_hipExtent(width, height, depth);
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    HIPCHECK(hipMemcpy3D(&myparms));
+    HIPCHECK(hipDeviceSynchronize());
+
+    T *hOutputData = (T*) malloc(size);
+    memset(hOutputData, 0,  size);
+    //Device to host
+    memset(&myparms,0x0, sizeof(hipMemcpy3DParms));
+    myparms.srcPos = make_hipPos(0,0,0);
+    myparms.dstPos = make_hipPos(0,0,0);
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(T), width, height);
+    myparms.srcArray = arr1;
+    myparms.extent = make_hipExtent(width, height, depth);
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    HIPCHECK(hipMemcpy3D(&myparms));
+    HIPCHECK(hipDeviceSynchronize());
+
+    // Check result
+    HipTest::checkArray(hData,hOutputData,width,height,depth);
+    hipFreeArray(arr);
+    hipFreeArray(arr1);
+    free(hData);
+    free(hOutputData);
+}
+
+int main(int argc, char **argv)
+{
+    for(int i=1;i<25;i++)
+    {
+        runTest<float>(i,i,i, hipChannelFormatKindFloat);
+        runTest<int>(i+1,i,i, hipChannelFormatKindSigned);
+        runTest<char>(i,i+1,i, hipChannelFormatKindSigned);
+    }
+    passed();
+}
@@ -0,0 +1,152 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <thread>
+#include <chrono>
+
+#include "test_common.h"
+
+#define LEN 64
+#define SIZE LEN << 2
+#define THREADS 2
+#define MAX_THREADS 16
+
+#define FILENAME "vcpy_kernel.code"
+#define kernel_name "hello_world"
+
+std::vector<char> load_file()
+{
+    std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
+    std::streamsize fsize = file.tellg();
+    file.seekg(0, std::ios::beg);
+
+    std::vector<char> buffer(fsize);
+    if (!file.read(buffer.data(), fsize)) {
+        failed("could not open code object '%s'\n", FILENAME);
+    }
+    return buffer;
+}
+
+void run(const std::vector<char>& buffer) {
+    hipDevice_t device;
+    HIPCHECK(hipDeviceGet(&device, 0));
+    hipCtx_t context;
+    HIPCHECK(hipCtxCreate(&context, 0, device));
+
+    hipModule_t Module;
+    hipFunction_t Function;
+    HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+    HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+    
+    float *A, *B, *Ad, *Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = i * 1.0f;
+        B[i] = 0.0f;
+    }
+
+    HIPCHECK(hipMalloc((void**)&Ad, SIZE));
+    HIPCHECK(hipMalloc((void**)&Bd, SIZE));
+
+    HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreate(&stream));
+
+    struct {
+        void* _Ad;
+        void* _Bd;
+    } args;
+    args._Ad = (void*) Ad;
+    args._Bd = (void*) Bd;
+    size_t size = sizeof(args);
+
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                      HIP_LAUNCH_PARAM_END};
+    HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config));
+
+    HIPCHECK(hipStreamDestroy(stream));
+
+    HIPCHECK(hipModuleUnload(Module));
+
+    HIPCHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
+
+    for (uint32_t i = 0; i < LEN; i++) {
+        assert(A[i] == B[i]);
+    }
+   
+    hipFree(Ad); 
+    hipFree(Bd); 
+    delete A;
+    delete B;
+    hipCtxDestroy(context);
+    
+}
+
+struct joinable_thread : std::thread
+{
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
+    {
+    }
+
+    joinable_thread& operator=(joinable_thread&& other) = default;
+    joinable_thread(joinable_thread&& other)            = default;
+
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+
+void run_multi_threads(uint32_t n, const std::vector<char>& buffer) {
+    
+    std::vector<joinable_thread> threads;
+    
+    for (uint32_t i = 0; i < n; i++) {
+        threads.emplace_back(std::thread{[&, buffer] {
+        run(buffer);
+        }});
+    }
+    
+}
+
+int main() {
+
+    HIPCHECK(hipInit(0));
+    auto buffer = load_file();
+    run_multi_threads(min(THREADS * std::thread::hardware_concurrency(), MAX_THREADS), buffer);
+
+    passed();
+}
@@ -37,10 +37,12 @@ int p_tests = -1; /*which tests to run. Interpretation is left to each test.  de
 const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES=";
 const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES=";
 const char* PATH_SEPERATOR_STR = "\\";
+const char* NULL_DEVICE = "NUL:";
 #else
 const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES";
 const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES";
 const char* PATH_SEPERATOR_STR = "/";
+const char* NULL_DEVICE = "/dev/null";
 #endif

 namespace HipTest {
@@ -105,6 +105,10 @@ THE SOFTWARE.
 #define pclose(x) _pclose(x)
 #define setenv(x,y,z) _putenv_s(x,y)
 #define unsetenv _putenv
+#define fileno(x) _fileno(x)
+#define dup(x) _dup(x)
+#define dup2(x,y) _dup2(x,y)
+#define close(x) _close(x)
 #else
 #define aligned_free(x) free(x)
 #endif
@@ -124,6 +128,7 @@ extern int p_tests;
 extern const char* HIP_VISIBLE_DEVICES_STR;
 extern const char* CUDA_VISIBLE_DEVICES_STR;
 extern const char* PATH_SEPERATOR_STR;
+extern const char* NULL_DEVICE;

 // ********************* CPP section *********************
 #ifdef __cplusplus
@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM vdi
 * TEST: %t
 * HIT_END
 */
@@ -66,7 +66,7 @@ void runTest(int width,int height,int num_layers,texture<T, hipTextureType2DLaye
    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
    myparms.dstArray = arr;
    myparms.extent = make_hipExtent(width , height, num_layers);
-    myparms.kind = hipMemcpyHostToDevice;
+    //myparms.kind = hipMemcpyHostToDevice;
    HIPCHECK(hipMemcpy3D(&myparms));

    // set texture parameters
@@ -84,10 +84,10 @@ void runTest(int width,int height,int depth,texture<T, hipTextureType3D, hipRead
    }

    // Allocate array and copy image data
-    hipChannelFormatDesc channelDesc = tex->channelDesc;
+    hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8, 0, 0, 0, hipChannelFormatKindSigned);
    hipArray *arr;

-    HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault));
+    HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayCubemap));
    hipMemcpy3DParms myparms = {0};
    myparms.srcPos = make_hipPos(0,0,0);
    myparms.dstPos = make_hipPos(0,0,0);
@@ -100,7 +100,6 @@ void runTest(int width,int height,int depth,texture<T, hipTextureType3D, hipRead
    // set texture parameters
    tex->addressMode[0] = hipAddressModeWrap;
    tex->addressMode[1] = hipAddressModeWrap;
-    tex->addressMode[2] = hipAddressModeWrap;
    tex->filterMode = hipFilterModePoint;
    tex->normalized = false;