Merge branch 'master' into amd-master-next

Change-Id: Ib7e7824073f4dfc391fb3833fc90e11b327d3c22
2020-02-05 14:56:08 -08:00
parent b57d64c22e 48d1f766d5
commit 258adce78e
34 changed files with 667 additions and 919 deletions
@@ -25,6 +25,7 @@ endmacro()
 # Setup version information
 #############################
 # Determine HIP_BASE_VERSION
+set(ENV{HIP_PATH} "")
 execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version
    OUTPUT_VARIABLE HIP_BASE_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -394,7 +394,7 @@ cuda_10_x:
    // Block of string constants customizing behavior for cuda
    String nvcc_ver = 'cuda-10.x'
    String from_image = 'ci_test_nodes/cuda-10.x/ubuntu-16.04:latest'
-    String inside_args = '--runtime=nvidia';
+    String inside_args = '--gpus all';

    // Checkout source code, dependencies and version files
    String source_hip_rel = checkout_and_version( nvcc_ver )
@@ -6,7 +6,7 @@ Key features include:
 * HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
 * HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, classes, namespaces, and more.
 * HIP allows developers to use the "best" development environment and tools on each target platform.
-* The "hipify" tool automatically converts source from CUDA to HIP.
+* The [HIPIFY](hipify-clang/README.md) tools automatically convert source from CUDA to HIP.
 * Developers can specialize for the platform (CUDA or hcc) to tune for performance or handle tricky cases 

 New projects can be developed directly in the portable HIP C++ language and can run on either NVIDIA or AMD platforms.  Additionally, HIP provides porting tools which make it easy to port existing CUDA codes to the HIP layer, with no loss of performance as compared to the original CUDA application.  HIP is not intended to be a drop-in replacement for CUDA, and developers should expect to do some manual coding and performance tuning work to complete the port.
@@ -36,7 +36,7 @@ HIP releases are typically of two types. The tag naming convention is different
 - [HIP Profiling ](docs/markdown/hip_profiling.md)
 - [HIP Debugging](docs/markdown/hip_debugging.md)
 - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL)
- [hipify-clang](hipify-clang/README.md)
+- [HIPIFY](hipify-clang/README.md)
 - Supported CUDA APIs:
  * [Runtime API](docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
  * [Driver API](docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
@@ -71,7 +71,7 @@ hipLaunchKernel(vector_square,   /* compute kernel*/
                dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/,     /* launch config*/
                C_d, A_d, N);  /* arguments to the compute kernel */

-hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); 
+hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
 ```


@@ -88,7 +88,7 @@ __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t stride = hipBlockDim_x * hipGridDim_x;

    for (size_t i=offset; i<N; i+=stride) {
        C_d[i] = A_d[i] * A_d[i];
@@ -100,12 +100,12 @@ The HIP Runtime API code and compute kernel definition can exist in the same sou

 ## HIP Portability and Compiler Technology
 HIP C++ code can be compiled with either :
- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs.  The header file contains mostly inlined 
+- On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs.  The header file contains mostly inlined
  functions and thus has very low overhead - developers coding in HIP should expect the same performance as coding in native CUDA.  The code is then
  compiled with nvcc, the standard C++ compiler provided with the CUDA SDK.  Developers can use any tools supported by the CUDA SDK including the CUDA
  profiler and debugger.
- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler.  The HIP runtime implements HIP streams, events, and memory APIs, 
-  and is a object library that is linked with the application.  The source code for all headers and the library implementation is available on GitHub.  
+- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler.  The HIP runtime implements HIP streams, events, and memory APIs,
+  and is a object library that is linked with the application.  The source code for all headers and the library implementation is available on GitHub.
  HIP developers on ROCm can use AMD's CodeXL for debugging and profiling.

 Thus HIP source code can be compiled to run on either platform.  Platform-specific features can be isolated to a specific platform using conditional compilation.  Thus HIP
@@ -114,7 +114,7 @@ provides source portability to either platform.   HIP provides the _hipcc_ compi

 ## Examples and Getting Started:

-* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses hipify to convert a simple app from CUDA to HIP:
+* A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses any of [HIPIFY](hipify-clang/README.md) tools to convert a simple app from CUDA to HIP:

 
 ```shell
@@ -136,19 +136,20 @@ The GitHub repository [HIP-Examples](https://github.com/ROCm-Developer-Tools/HIP
 The README with the procedures and tips the team used during this porting effort is here: [Rodinia Porting Guide](https://github.com/ROCm-Developer-Tools/HIP-Examples/blob/master/rodinia_3.0/hip/README.hip_porting)

 ## Tour of the HIP Directories
-* **include**: 
+* **include**:
    * **hip_runtime_api.h** : Defines HIP runtime APIs and can be compiled with many standard Linux compilers (hcc, GCC, ICC, CLANG, etc), in either C or C++ mode.
    * **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernel and syntax for writing device kernels and device functions.  hip_runtime.h can only be compiled with hcc.
-    * **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms.  HIP applications should not include these files directly.
+    * **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly.
    * **hcc.h** : Includes interop APIs for HIP and HCC
    
 * **bin**: Tools and scripts to help with hip porting
-    * **hipify** : Tool to convert CUDA code to portable CPP.  Converts CUDA APIs and kernel builtins.  
-    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries.
-    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc)
-    * **hipexamine.sh** : Script to scan directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported)
+    * **hipify-perl** : Script based tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins.
+    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform and include appropriate platform-specific headers and libraries.
+    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc.)
+    * **hipexamine-perl.sh** : Script to scan the directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported).
+    * **hipconvertinplace-perl.sh** : Script to scan the directory, find all code, and convert the found CUDA code to HIP reporting all unconverted things.

-* **doc**: Documentation - markdown and doxygen info
+* **doc**: Documentation - markdown and doxygen info.

 ## Reporting an issue
 Use the [GitHub issue tracker](https://github.com/ROCm-Developer-Tools/HIP/issues).
@@ -115,10 +115,8 @@ install(
  PATTERN "complex"
  PATTERN "new"
  PATTERN "ppc_wrappers" EXCLUDE
-  PATTERN "openmp_wrappers" EXCLUDE
-  )
+  PATTERN "openmp_wrappers" EXCLUDE)

-# Package: hipify-clang
 if (UNIX)
    set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hipify-clang)
    configure_file(packaging/hipify-clang.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
@@ -132,8 +130,8 @@ if (UNIX)

    file(GENERATE OUTPUT ${PROJECT_BINARY_DIR}/fixnames
        CONTENT "pwd; for i in *.deb; do mv \"\$i\" \"\${i/.deb/-amd64.deb}\" ; done
-        for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done
-        ")
+        for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done")
+
    add_custom_target(package_hipify-clang
        COMMAND bash ${PROJECT_BINARY_DIR}/fixnames
        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
@@ -155,45 +153,44 @@ if (HIPIFY_CLANG_TESTS)
    require_program(lit)
    require_program(FileCheck)

-    # Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
-    # value of --cuda-path for the test runs.
    find_package(CUDA REQUIRED)
    if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
        (CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
        (CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
        (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
        (CUDA_VERSION VERSION_GREATER "9.2" AND LLVM_PACKAGE_VERSION VERSION_LESS "8.0") OR
-        (CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0"))
-        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
+        (CUDA_VERSION VERSION_GREATER "10.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "9.0") OR
+        (CUDA_VERSION VERSION_GREATER "10.1" AND LLVM_PACKAGE_VERSION VERSION_LESS "10.0"))
+        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by LLVM ${LLVM_PACKAGE_VERSION}.")
        if (CUDA_VERSION_MAJOR VERSION_LESS "7")
            message(STATUS "Please install CUDA 7.0 or higher.")
        elseif (CUDA_VERSION_MAJOR VERSION_LESS "8")
-            message(STATUS "Please install clang 3.8 or higher.")
+            message(STATUS "Please install LLVM + clang 3.8 or higher.")
        elseif (CUDA_VERSION_MAJOR VERSION_LESS "9")
-            message(STATUS "Please install clang 4.0 or higher.")
+            message(STATUS "Please install LLVM + clang 4.0 or higher.")
        elseif (CUDA_VERSION VERSION_EQUAL "9.0")
-            message(STATUS "Please install clang 6.0 or higher.")
+            message(STATUS "Please install LLVM + clang 6.0 or higher.")
        elseif (CUDA_VERSION_MAJOR VERSION_LESS "10")
-            message(STATUS "Please install clang 7.0 or higher.")
+            message(STATUS "Please install LLVM + clang 7.0 or higher.")
        elseif (CUDA_VERSION VERSION_EQUAL "10.0")
-            message(STATUS "Please install clang 8.0 or higher.")
+            message(STATUS "Please install LLVM + clang 8.0 or higher.")
        elseif (CUDA_VERSION VERSION_EQUAL "10.1")
-            message(STATUS "Please install clang 9.0 or higher.")
+            message(STATUS "Please install LLVM + clang 9.0 or higher.")
+        elseif (CUDA_VERSION VERSION_EQUAL "10.2")
+            message(STATUS "Please install LLVM + clang 10.0 or higher.")
        endif()
    endif()

    configure_file(
        ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
        ${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
-        @ONLY
-    )
+        @ONLY)

    add_lit_testsuite(test-hipify "Running HIPify regression tests"
        ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang
        PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
        ARGS -v
-        DEPENDS hipify-clang
-    )
+        DEPENDS hipify-clang)

    add_custom_target(test-hipify-clang)
    add_dependencies(test-hipify-clang test-hipify)
@@ -1,42 +1,51 @@
-# hipify-clang
-
-`hipify-clang` is a clang-based tool to translate CUDA source code into portable HIP C++ automatically.
-
+# HIPIFY
+### Tools to translate CUDA source code into portable HIP C++ automatically
 ## Table of Contents

 <!-- toc -->

- [Supported CUDA APIs](#cuda-apis)
- [Dependencies](#dependencies)
- [Build and install](#build-and-install)
+- [hipify-clang](#clang)
+     * [Dependencies](#dependencies)
+     * [Usage](#hipify-clang-usage)
     * [Building](#building)
     * [Testing](#testing)
     * [Linux](#linux)
     * [Windows](#windows)
- [Running and using hipify-clang](#running-and-using-hipify-clang)
-     * [hipify-perl](#perl)
+- [hipify-perl](#perl)
+     * [Usage](#hipify-perl-usage)
+     * [Building](#hipify-perl-building)
+- [Supported CUDA APIs](#cuda-apis)
 - [Disclaimer](#disclaimer)

 <!-- tocstop -->

-## <a name="cuda-apis"></a> Supported CUDA APIs
+## <a name="clang"></a> hipify-clang

- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)
+`hipify-clang` is a clang-based tool for translation CUDA sources into HIP sources.
+It translates CUDA source into an abstract syntax tree, which is being traversed by transformation matchers.
+After applying all the matchers, the output HIP source is produced.

-## <a name="dependencies"></a> Dependencies
+**Advantages:**
+
+1. It is a translator; thus, any even very complicated constructs will be parsed successfully, or an error will be reported.
+2. It supports clang options like [`-I`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-i-dir), [`-D`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-d-macro), [`--cuda-path`](https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-cuda-path), etc.
+3. Seamless support of new CUDA versions as it is clang's responsibility.
+4. Ease in support.
+
+**Disadvantages:**
+
+1. The main advantage is also the main disadvantage: the input CUDA code should be correct; incorrect code wouldn't be translated to HIP.
+2. CUDA should be installed and provided in case of multiple installations by `--cuda-path` option.
+3. All the includes and defines should be provided to transform code successfully.
+
+### <a name="dependencies"></a> hipify-clang: dependencies

 `hipify-clang` requires:

-1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1).
+1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).

 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base).
+To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1).

 | **LLVM release version**                                   | **CUDA latest supported version**                                        | **Windows** | **Linux** |
 |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:|
@@ -58,21 +67,41 @@
 | [8.0.1](http://releases.llvm.org/download.html#8.0.1)      | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive)          | - <br/> not working due to <br/> the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811) <br/>+<br/>[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*</br> | + |
 | [9.0.0](http://releases.llvm.org/download.html#9.0.0)      | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base)     | +           | +         |
 | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1)  | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + <br/> **LATEST STABLE RELEASE** | + <br/> **LATEST STABLE RELEASE** |
+| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads)             | +           | +         |

-`*` Download the patch and unpack it into your LLVM distributive directory; a few header files will be overwritten; rebuilding of LLVM is not needed.
+`*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed.

-In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
+In most cases, you can get a suitable version of `LLVM+CLANG` with your package manager.

-Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
+Failing that or having multiple versions of `LLVM`, you can [download a release archive](http://releases.llvm.org/), build or install it, and set
 [CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.1\dist`

-## <a name="build-and-install"></a> Build and install
+### <a name="hipify-clang-usage"></a> hipify-clang: usage

-### <a name="building"></a> Build
+To process a file, `hipify-clang` needs access to the same headers that would be required to compile it with clang.
+
+For example:
+
+```shell
+./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
+```
+
+`hipify-clang` arguments are given first, followed by a separator `'--'`, and then the arguments you'd pass to `clang` if you
+were compiling the input file. For example:
+
+```bash
+./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-10.1 -- -std=c++17
+```
+
+The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code) may be useful.
+
+For a list of `hipify-clang` options, run `hipify-clang --help`.
+
+### <a name="building"></a> hipify-clang: building

 Assuming this repository is at `./HIP`:

-```shell
+```bash
 cd hipify-clang
 mkdir build dist
 cd build
@@ -84,30 +113,27 @@ cmake \

 make -j install
 ```
-On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 15 2017` instead of `make.`
+On Windows, the following option should be specified for `cmake` at first place: `-G "Visual Studio 16 2019 Win64"`; the generated `hipify-clang.sln` should be built by `Visual Studio 16 2019` instead of `make.`
+Please, see [hipify-clang: Windows](#windows) for the supported tools for building.

 Debug build type `-DCMAKE_BUILD_TYPE=Debug` is also supported and tested; `LLVM+CLANG` should be built in `Debug` mode as well.
 64-bit build mode (`-Thost=x64` on Windows) is also supported; `LLVM+CLANG` should be built in 64-bit mode as well.

 The binary can then be found at `./dist/bin/hipify-clang`.

-### <a name="testing"></a> Testing
+### <a name="testing"></a> hipify-clang: testing

-`hipify-clang` has unit tests using LLVM [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).
+`hipify-clang` has unit tests using `LLVM` [`lit`](https://llvm.org/docs/CommandGuide/lit.html)/[`FileCheck`](https://llvm.org/docs/CommandGuide/FileCheck.html).

-**LLVM+CLANG should be built from sources, pre-built binaries are not exhaustive for testing.**
+`LLVM+CLANG` should be built from sources, pre-built binaries are not exhaustive for testing.
+
+**LLVM 9.0.1 or older:**

-To run it:
 1. download [`LLVM`](http://releases.llvm.org/9.0.1/llvm-9.0.1.src.tar.xz)+[`CLANG`](http://releases.llvm.org/9.0.1/cfe-9.0.1.src.tar.xz) sources; 
-2. build [`LLVM+CLANG`](http://llvm.org/docs/CMake.html):
-   ```shell
-   cd llvm
-   mkdir build dist
-   cd build
-   ```
+2. build [`LLVM+CLANG`](http://releases.llvm.org/9.0.0/docs/CMake.html):

-     - **Linux**:
-   ```shell
+ **Linux**:
+   ```bash
        cmake \
         -DCMAKE_INSTALL_PREFIX=../dist \
         -DLLVM_SOURCE_DIR=../llvm \
@@ -116,8 +142,7 @@ To run it:
         ../llvm
        make -j install
   ```
-
-     - **Windows**:
+ **Windows**:
   ```shell
        cmake \
         -G "Visual Studio 16 2019" \
@@ -129,9 +154,38 @@ To run it:
         -Thost=x64 \
         ../llvm
   ```
+Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.

-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.
+**LLVM 10.0.0 or newer:**

+1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources;
+2. build [`LLVM project`](http://llvm.org/docs/CMake.html):
+
+ **Linux**:
+   ```bash
+        cmake \
+         -DCMAKE_INSTALL_PREFIX=../dist \
+         -DLLVM_SOURCE_DIR=../llvm-project \
+         -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
+         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DCMAKE_BUILD_TYPE=Release \
+         ../llvm-project/llvm
+        make -j install
+   ```
+ **Windows**:
+   ```shell
+        cmake \
+         -G "Visual Studio 16 2019" \
+         -A x64 \
+         -DCMAKE_INSTALL_PREFIX=../dist \
+         -DLLVM_SOURCE_DIR=../llvm-project \
+         -DLLVM_TARGETS_TO_BUILD="NVPTX" \
+         -DLLVM_ENABLE_PROJECTS="clang" \
+         -DCMAKE_BUILD_TYPE=Release \
+         -Thost=x64 \
+         ../llvm-project/llvm
+   ```
+Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build project `INSTALL`.

 3. Ensure [`CUDA`](https://developer.nvidia.com/cuda-toolkit-archive) of minimum version 7.0 is installed.

@@ -161,7 +215,7 @@ To run it:

 5. Ensure [`python`](https://www.python.org/downloads) of minimum required version 2.7 is installed.

-6. Ensure `lit` and `FileCheck` are installed - these are distributed with LLVM.
+6. Ensure `lit` and `FileCheck` are installed - these are distributed with `LLVM`.

    * Install `lit` into `python`:

@@ -185,23 +239,15 @@ To run it:

 7. Set `HIPIFY_CLANG_TESTS` option turned on: `-DHIPIFY_CLANG_TESTS=1`.

-8. Run `cmake`:
-     * [***Linux***](#linux)
-     * [***Windows***](#windows)
+8. Build and run tests:

-9. Run tests:
-
-     - ***Linux***: `make test-hipify`.
-
-     - ***Windows***: run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.
-
-### <a name="linux"></a >Linux
+### <a name="Linux"></a > hipify-clang: Linux

 On Linux the following configurations are tested:

 Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32

-Ubuntu 16-18: LLVM 8.0.0 - 9.0.1, CUDA 8.0 - 10.1 Update 2, cudnn-5.1.10 - cudnn-7.6.5.32
+Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32

 Minimum build system requirements for the above configurations:

@@ -209,7 +255,7 @@ Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0.

 Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`:

-```shell
+```bash
 cmake
 -DHIPIFY_CLANG_TESTS=1 \
 -DCMAKE_BUILD_TYPE=Release \
@@ -345,7 +391,7 @@ Testing Time: 3.07s
  Expected Passes    : 67
 [100%] Built target test-hipify
 ```
-### <a name="windows"></a >Windows
+### <a name="windows"></a > hipify-clang: Windows

 On Windows 10 the following configurations are tested:

@@ -353,11 +399,11 @@ LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18

 LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32

-LLVM 7.0.0 - 9.0.1, CUDA 7.5 - 10.1 Update 2, cudnn 7.0.5.15 - 7.6.5.32
+LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32

-Build system requirements for the latest configuration LLVM 9.0.1/CUDA 10.1 Update 2:
+Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2:

-Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.2, Visual Studio 2017 (15.5.2) - 2019 (16.4.2).
+Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4).

 Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`:

@@ -392,45 +438,62 @@ cmake
 -- Build files have been written to: f:/HIP/hipify-clang/build
 ```

-## <a name="running-and-using-hipify-clang"></a> Running and using hipify-clang
+Run `Visual Studio 16 2019`, open the generated `hipify-clang.sln`, build project `test-hipify`.

-To process a file, `hipify-clang` needs access to the same headers that would be needed to compile it with clang.
+## <a name="perl"></a> hipify-perl

-For example:
+`hipify-perl` is autogenerated perl-based script which heavily uses regular expressions.
+
+**Advantages:**
+
+1. Ease in use.
+
+2. It doesn't check the input source CUDA code for correctness.
+
+3. It doesn't have dependencies on 3rd party tools, including CUDA.
+
+**Disadvantages:**
+
+1. Current disability (and difficulty in implementing) of transforming the following constructs:
+
+    * macros expansion;
+
+    * namespaces:
+
+        - redefines of CUDA entities in user namespaces;
+
+        - using directive;
+
+    * templates (some cases);
+
+    * device/host function calls distinguishing;
+
+    * header files correct injection;
+
+    * complicated argument lists parsing.
+
+2. Difficulties in supporting.
+
+### <a name="hipify-perl-usage"></a> hipify-perl: usage

 ```shell
-./hipify-clang square.cu --cuda-path=/usr/local/cuda-10.1 -I /usr/local/cuda-10.1/samples/common/inc
+perl hipify-perl square.cu > square.cu.hip
 ```

-`hipify-clang` arguments are given first, followed by a separator, and then the arguments you'd pass to `clang` if you
-were compiling the input file. The [Clang manual for compiling CUDA](https://llvm.org/docs/CompileCudaWithLLVM.html#compiling-cuda-code)
-may be useful.
+### <a name="hipify-perl-building"></a> hipify-perl: building

-For a list of `hipify-clang` options, run `hipify-clang --help`.
+To generate `hipify-perl`, run `hipify-clang --perl`. Output directory for the generated `hipify-perl` file might be specified by `--o-hipify-perl-dir` option.

-### <a name="perl"></a> hipify-perl
+## <a name="cuda-apis"></a> Supported CUDA APIs

-To produce a Perl-based script `hipify-perl`, run `hipify-clang --perl`.
-
-The `hipify-perl` script, unlike the `hipify-clang`, being based on regular expressions, and not on an abstract syntax tree, has several gaps:
-
-1. macros expansion;
-
-2. namespaces:
-
-    - redefines of CUDA entities in user namespaces;
-
-    - using directive;
-
-3. templates (some cases);
-
-4. device/host function calls distinguishing;
-
-5. header files correct injection;
-
-6. complicated argument lists parsing.
-
-Nonetheless, `hipify-perl` is easy in use and doesn't check the input source CUDA code for correctness.
+- [Runtime API](../docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
+- [Driver API](../docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md)
+- [cuComplex API](../docs/markdown/cuComplex_API_supported_by_HIP.md)
+- [cuBLAS](../docs/markdown/CUBLAS_API_supported_by_HIP.md)
+- [cuRAND](../docs/markdown/CURAND_API_supported_by_HIP.md)
+- [cuDNN](../docs/markdown/CUDNN_API_supported_by_HIP.md)
+- [cuFFT](../docs/markdown/CUFFT_API_supported_by_HIP.md)
+- [cuSPARSE](../docs/markdown/CUSPARSE_API_supported_by_HIP.md)

 ## <a name="disclaimer"></a> Disclaimer

@@ -438,5 +501,4 @@ The information contained herein is for informational purposes only, and is subj

 AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.

-Copyright (c) 2014-2019 Advanced Micro Devices, Inc. All rights reserved.
-
+Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved.
@@ -48,7 +48,7 @@ endif()
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda >= 7.0")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda (>= 7.0)")
 set(CPACK_BINARY_RPM "ON")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV "NO")
@@ -213,7 +213,7 @@ StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exp
  */
 std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) {
  if (clang::isa<clang::CXXDefaultArgExpr>(arg)) return "0";
-  else return readSourceText(SM, arg->getSourceRange());
+  else return std::string(readSourceText(SM, arg->getSourceRange()));
 }

 } // anonymous namespace
@@ -427,7 +427,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchRes
        clang::LangOptions LO;
        LO.CUDA = true;
        clang::PrintingPolicy policy(LO);
-        typeName = BT->getName(policy);
+        typeName = std::string(BT->getName(policy));
      }
    } else {
      typeName = QT.getAsString();
@@ -44,7 +44,7 @@ ct::Replacements &getReplacements(ct::RefactoringTool &Tool, StringRef file) {
 #if LLVM_VERSION_MAJOR > 3
  // getReplacements() now returns a map from filename to Replacements - so create an entry
  // for this source file and return a reference to it.
-  return Tool.getReplacements()[file];
+  return Tool.getReplacements()[std::string(file)];
 #else
  return Tool.getReplacements();
 #endif
@@ -108,7 +108,7 @@ void sortInputFiles(int argc, const char **argv, std::vector<std::string> &files
  files.assign(sortedFiles.begin(), sortedFiles.end());
 }

-void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath) {
+void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSourceAbsPath, const char *hipify_exe) {
  if (!IncludeDirs.empty()) {
    for (std::string s : IncludeDirs) {
      Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(s.c_str(), ct::ArgumentInsertPosition::BEGIN));
@@ -122,9 +122,14 @@ void appendArgumentsAdjusters(ct::RefactoringTool &Tool, const std::string &sSou
    }
  }
  // Includes for clang's CUDA wrappers for using by packaged hipify-clang
-  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include", ct::ArgumentInsertPosition::BEGIN));
+  static int Dummy;
+  std::string hipify = llvm::sys::fs::getMainExecutable(hipify_exe, (void *)&Dummy);
+  std::string clang_inc_path = std::string(llvm::sys::path::parent_path(hipify));
+  clang_inc_path.append("/include");
+  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
-  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("./include/cuda_wrappers", ct::ArgumentInsertPosition::BEGIN));
+  clang_inc_path.append("/cuda_wrappers");
+  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(clang_inc_path.c_str(), ct::ArgumentInsertPosition::BEGIN));
  Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-isystem", ct::ArgumentInsertPosition::BEGIN));
  // Ensure at least c++11 is used.
  std::string stdCpp = "-std=c++11";
@@ -315,7 +320,7 @@ int main(int argc, const char **argv) {
    ct::RefactoringTool Tool(OptionsParser.getCompilations(), std::string(tmpFile.c_str()));
    ct::Replacements &replacementsToUse = llcompat::getReplacements(Tool, tmpFile.c_str());
    ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
-    appendArgumentsAdjusters(Tool, sSourceAbsPath);
+    appendArgumentsAdjusters(Tool, sSourceAbsPath, argv[0]);
    Statistics &currentStat = Statistics::current();
    // Hipify _all_ the things!
    if (Tool.runAndSave(&actionFactory)) {
@@ -43,9 +43,10 @@ typedef struct hipChannelFormatDesc {
    enum hipChannelFormatKind f;
 }hipChannelFormatDesc;

-#define HIP_TRSF_NORMALIZED_COORDINATES 0x01
-#define HIP_TRSF_READ_AS_INTEGER 0x00
 #define HIP_TRSA_OVERRIDE_FORMAT 0x01
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+#define HIP_TRSF_SRGB 0x10

 typedef enum hipArray_Format {
    HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
@@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
                                      dynSharedMemPerBlk, blockSizeLimit);
 }

-template <typename F>
-inline
-hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
-    uint32_t blockSize, size_t dynSharedMemPerBlk) {
-
-    using namespace hip_impl;
-
-    hip_impl::hip_init();
-    auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
-                                                   target_agent(0));
-
-    return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
-}
-
 template <typename... Args, typename F = void (*)(Args...)>
 inline
 void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
@@ -106,7 +106,7 @@ __CG_STATIC_QUALIFIER__ uint32_t size() {
 }

 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current threads belongs to
+  // Compute global id of the workgroup to which the current thread belongs to
  uint32_t blkIdx =
           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
                      (hipBlockIdx_y * hipGridDim_x) +
@@ -115,7 +115,7 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute total number of threads being passed to reach current workgroup
  // within grid
  uint32_t num_threads_till_current_workgroup =
-           (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z));
+           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));

  // Compute thread local rank within current workgroup
  uint32_t local_thread_rank =
@@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
 * @param [in]  dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
 */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);

 /**
 * @brief Returns occupancy for a device function.
@@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
 * @param [in]  flags            Extra flags for occupancy calculation (currently ignored)
 */
 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);

 #if __HIP_VDI__ && !defined(__HCC__)
 /**
@@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address,
 } /* extern "c" */
 #endif

-#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
-template <typename F>
-static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor(
-    uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) {
-    return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize,
-                                                          dynSharedMemPerBlk);
-}
-template <typename F>
-static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
-    return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-        numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags);
-}
-#endif  // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
-
 #if USE_PROF_API
 #include <hip/hcc_detail/hip_prof_str.h>
 #endif
@@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
    return hipSuccess;
 }

+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
+}
+
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
+    return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+        numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
+}
+
 #if __HIP_VDI__ && !defined(__HCC__)
 template <class T>
 inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
@@ -124,6 +124,7 @@ typedef struct hipDeviceProp_t {
    unsigned int* hdpRegFlushCntl;      ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register
    size_t memPitch;                 ///<Maximum pitch in bytes allowed by memory copies
    size_t textureAlignment;         ///<Alignment requirement for textures
+    size_t texturePitchAlignment;    ///<Pitch alignment requirement for texture references bound to pitched memory
    int kernelExecTimeoutEnabled;    ///<Run time limit for kernels executed on the device
    int ECCEnabled;                  ///<Device has ECC support enabled
    int tccDriver;                   ///< 1:If device is Tesla device using TCC driver, else 0
@@ -321,6 +322,7 @@ typedef enum hipDeviceAttribute_t {

    hipDeviceAttributeMaxPitch,             ///< Maximum pitch in bytes allowed by memory copies
    hipDeviceAttributeTextureAlignment,     ///<Alignment requirement for textures
+    hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
    hipDeviceAttributeKernelExecTimeout,    ///<Run time limit for kernels executed on the device
    hipDeviceAttributeCanMapHostMemory,     ///<Device can map host memory into device address space
    hipDeviceAttributeEccEnabled            ///<Device has ECC support enabled
@@ -1130,6 +1130,7 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev

    p_prop->memPitch                 = cdprop.memPitch;
    p_prop->textureAlignment         = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
    p_prop->ECCEnabled               = cdprop.ECCEnabled;
    p_prop->tccDriver                = cdprop.tccDriver;
@@ -1244,6 +1245,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
        case hipDeviceAttributeTextureAlignment:
            cdattr = cudaDevAttrTextureAlignment;
            break;
+        case hipDeviceAttributeTexturePitchAlignment:
+            cdattr = cudaDevAttrTexturePitchAlignment;
+            break;
        case hipDeviceAttributeKernelExecTimeout:
            cdattr = cudaDevAttrKernelExecTimeout;
            break;
@@ -3,6 +3,6 @@
 ROCMDIR=@ROCM_PATH@
 HIPDIR=$ROCMDIR/hip

-if [ -d $ROCMDIR]
+if [ -d $ROCMDIR] ; then
    ln -s -f $ROCMDIR /opt/rocm
 fi
@@ -1,5 +1,5 @@
 #!/bin/bash

-if [ -L "/opt/rocm" ]
+if [ -L "/opt/rocm" ] ; then
   unlink /opt/rocm
 fi
@@ -2,19 +2,18 @@ HIP_PATH?= $(wildcard /opt/rocm/hip)
 ifeq (,$(HIP_PATH))
 	HIP_PATH=../../..
 endif
-HIPCC=$(HIP_PATH)/bin/hipcc
+HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11

 EXE=hipDispatchLatency

 CXXFLAGS = -O3

-all: ${EXE}
+all: test_kernel.code ${EXE}

-$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp
-	$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@
-
-install: $(EXE)
-	cp $(EXE) $(HIP_PATH)/bin
+$(EXE): hipDispatchLatency.cpp
+	$(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@

+test_kernel.code: test_kernel.cpp
+	$(HIP_PATH)/bin/hipcc --genco  $(GENCO_FLAGS) $^ -o $@
 clean:
 	rm -f *.o $(EXE)
@@ -1,473 +0,0 @@
-#include "ResultDatabase.h"
-
-#include <cfloat>
-#include <algorithm>
-#include <cmath>
-#include <iomanip>
-
-using namespace std;
-
-#define SORT_BY_NAME 0
-#define SORT_RETAIN_ATTS_ORDER 1
-
-
-bool ResultDatabase::Result::operator<(const Result& rhs) const {
-    if (test < rhs.test) return true;
-    if (test > rhs.test) return false;
-#if (SORT_RETAIN_ATTS_ORDER == 0)
-    // For ties, sort by the value of the attribute:
-    if (atts < rhs.atts) return true;
-    if (atts > rhs.atts) return false;
-#endif
-    return false;  // less-operator returns false on equal
-}
-
-double ResultDatabase::Result::GetMin() const {
-    double r = FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r = min(r, value[i]);
-    }
-    return r;
-}
-
-double ResultDatabase::Result::GetMax() const {
-    double r = -FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r = max(r, value[i]);
-    }
-    return r;
-}
-
-double ResultDatabase::Result::GetMedian() const { return GetPercentile(50); }
-
-double ResultDatabase::Result::GetPercentile(double q) const {
-    int n = value.size();
-    if (n == 0) return FLT_MAX;
-    if (n == 1) return value[0];
-
-    if (q <= 0) return value[0];
-    if (q >= 100) return value[n - 1];
-
-    double index = ((n + 1.) * q / 100.) - 1;
-
-    vector<double> sorted = value;
-    sort(sorted.begin(), sorted.end());
-
-    if (n == 2) return (sorted[0] * (1 - q / 100.) + sorted[1] * (q / 100.));
-
-    int index_lo = int(index);
-    double frac = index - index_lo;
-    if (frac == 0) return sorted[index_lo];
-
-    double lo = sorted[index_lo];
-    double hi = sorted[index_lo + 1];
-    return lo + (hi - lo) * frac;
-}
-
-double ResultDatabase::Result::GetMean() const {
-    double r = 0;
-    for (int i = 0; i < value.size(); i++) {
-        r += value[i];
-    }
-    return r / double(value.size());
-}
-
-double ResultDatabase::Result::GetStdDev() const {
-    double r = 0;
-    double u = GetMean();
-    if (u == FLT_MAX) return FLT_MAX;
-    for (int i = 0; i < value.size(); i++) {
-        r += (value[i] - u) * (value[i] - u);
-    }
-    r = sqrt(r / value.size());
-    return r;
-}
-
-
-void ResultDatabase::AddResults(const string& test, const string& atts, const string& unit,
-                                const vector<double>& values) {
-    for (int i = 0; i < values.size(); i++) {
-        AddResult(test, atts, unit, values[i]);
-    }
-}
-
-static string RemoveAllButLeadingSpaces(const string& a) {
-    string b;
-    int n = a.length();
-    int i = 0;
-    while (i < n && a[i] == ' ') {
-        b += a[i];
-        ++i;
-    }
-    for (; i < n; i++) {
-        if (a[i] != ' ' && a[i] != '\t') b += a[i];
-    }
-    return b;
-}
-
-void ResultDatabase::AddResult(const string& test_orig, const string& atts_orig,
-                               const string& unit_orig, double value) {
-    string test = RemoveAllButLeadingSpaces(test_orig);
-    string atts = RemoveAllButLeadingSpaces(atts_orig);
-    string unit = RemoveAllButLeadingSpaces(unit_orig);
-    int index;
-    for (index = 0; index < results.size(); index++) {
-        if (results[index].test == test && results[index].atts == atts) {
-            if (results[index].unit != unit) throw "Internal error: mixed units";
-
-            break;
-        }
-    }
-
-    if (index >= results.size()) {
-        Result r;
-        r.test = test;
-        r.atts = atts;
-        r.unit = unit;
-        results.push_back(r);
-    }
-
-    results[index].value.push_back(value);
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpDetailed
-//
-//  Purpose:
-//    Writes the full results, including all trials.
-//
-//  Arguments:
-//    out        where to print
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    August 14, 2009
-//
-//  Modifications:
-//    Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010
-//    Renamed to DumpDetailed to make room for a DumpSummary.
-//
-//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
-//    Added note about (*) missing value tag.
-//
-//    Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010
-//    Changed note about missing values to be worded a little better.
-//
-// ****************************************************************************
-void ResultDatabase::DumpDetailed(ostream& out) {
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    const int testNameW = 24;
-    const int attW = 12;
-    const int fieldW = 11;
-    out << std::fixed << right << std::setprecision(4);
-
-    int maxtrials = 1;
-    for (int i = 0; i < sorted.size(); i++) {
-        if (sorted[i].value.size() > maxtrials) maxtrials = sorted[i].value.size();
-    }
-
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "median\t"
-        << "mean\t"
-        << "stddev\t"
-        << "min\t"
-        << "max\t";
-    for (int i = 0; i < maxtrials; i++) out << "trial" << i << "\t";
-    out << endl;
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << setw(testNameW) << r.test + "\t";
-        out << setw(attW) << r.atts + "\t";
-        out << setw(fieldW) << r.unit + "\t";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMedian() << "\t";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMean() << "\t";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetStdDev() << "\t";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMin() << "\t";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMax() << "\t";
-        for (int j = 0; j < r.value.size(); j++) {
-            if (r.value[j] == FLT_MAX)
-                out << "N/A\t";
-            else
-                out << r.value[j] << "\t";
-        }
-
-        out << endl;
-    }
-    if (0) {
-        out << endl
-            << "Note: Any results marked with (*) had missing values." << endl
-            << "      This can occur on systems with a mixture of" << endl
-            << "      device types or architectural capabilities." << endl;
-    }
-}
-
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpDetailed
-//
-//  Purpose:
-//    Writes the summary results (min/max/stddev/med/mean), but not
-//    every individual trial.
-//
-//  Arguments:
-//    out        where to print
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    November 10, 2010
-//
-//  Modifications:
-//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
-//    Added note about (*) missing value tag.
-//
-// ****************************************************************************
-void ResultDatabase::DumpSummary(ostream& out) {
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    const int testNameW = 32;
-    const int attW = 12;
-    const int fieldW = 9;
-    out << std::fixed << right << std::setprecision(2);
-
-    // TODO: in big parallel runs, the "trials" are the procs
-    // and we really don't want to print them all out....
-    out << setw(testNameW) << "test\t" << setw(attW) << "atts\t" << setw(fieldW) << "units\t"
-        << "median\t"
-        << "mean\t"
-        << "stddev\t"
-        << "min\t"
-        << "max\t";
-    out << endl;
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << setw(testNameW) << r.test + "\t";
-        out << setw(attW) << r.atts + "\t";
-        out << setw(fieldW) << r.unit + "\t";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMedian() << "\t";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMean() << "\t";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetStdDev() << "\t";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMin() << "\t";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A\t";
-        else
-            out << r.GetMax() << "\t";
-
-        out << endl;
-    }
-    if (0) {
-        out << endl
-            << "Note: results marked with (*) had missing values such as" << endl
-            << "might occur with a mixture of architectural capabilities." << endl;
-    }
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::ClearAllResults
-//
-//  Purpose:
-//    Clears all existing results from the ResultDatabase; used for multiple passes
-//    of the same test or multiple tests.
-//
-//  Arguments:
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    September 10th, 2014
-//
-//  Modifications:
-//
-//
-// ****************************************************************************
-void ResultDatabase::ClearAllResults() { results.clear(); }
-
-// ****************************************************************************
-//  Method:  ResultDatabase::DumpCsv
-//
-//  Purpose:
-//    Writes either detailed or summary results (min/max/stddev/med/mean), but not
-//    every individual trial.
-//
-//  Arguments:
-//    out        file to print CSV results
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    August 28th, 2014
-//
-//  Modifications:
-//
-// ****************************************************************************
-void ResultDatabase::DumpCsv(string fileName) {
-    bool emptyFile;
-    vector<Result> sorted(results);
-
-#if SORT_BY_NAME
-    stable_sort(sorted.begin(), sorted.end());
-#endif
-
-    // Check to see if the file is empty - if so, add the headers
-    emptyFile = this->IsFileEmpty(fileName);
-
-    // Open file and append by default
-    ofstream out;
-    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);
-
-    // Add headers only for empty files
-    if (emptyFile) {
-        // TODO: in big parallel runs, the "trials" are the procs
-        // and we really don't want to print them all out....
-        out << "test, "
-            << "atts, "
-            << "units, "
-            << "median, "
-            << "mean, "
-            << "stddev, "
-            << "min, "
-            << "max, ";
-        out << endl;
-    }
-
-    for (int i = 0; i < sorted.size(); i++) {
-        Result& r = sorted[i];
-        out << r.test << ", ";
-        out << r.atts << ", ";
-        out << r.unit << ", ";
-        if (r.GetMedian() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMedian() << ", ";
-        if (r.GetMean() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMean() << ", ";
-        if (r.GetStdDev() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetStdDev() << ", ";
-        if (r.GetMin() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMin() << ", ";
-        if (r.GetMax() == FLT_MAX)
-            out << "N/A, ";
-        else
-            out << r.GetMax() << ", ";
-
-        out << endl;
-    }
-    out << endl;
-
-    out.close();
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::IsFileEmpty
-//
-//  Purpose:
-//    Returns whether a file is empty - used as a helper for CSV printing
-//
-//  Arguments:
-//    file  The input file to check for emptiness
-//
-//  Programmer:  Jeffrey Young
-//  Creation:    August 28th, 2014
-//
-//  Modifications:
-//
-// ****************************************************************************
-
-bool ResultDatabase::IsFileEmpty(string fileName) {
-    bool fileEmpty;
-
-    ifstream file(fileName.c_str());
-
-    // If the file doesn't exist it is by definition empty
-    if (!file.good()) {
-        return true;
-    } else {
-        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
-        file.close();
-
-        return fileEmpty;
-    }
-
-    // Otherwise, return false
-    return false;
-}
-
-
-// ****************************************************************************
-//  Method:  ResultDatabase::GetResultsForTest
-//
-//  Purpose:
-//    Returns a vector of results for just one test name.
-//
-//  Arguments:
-//    test       the name of the test results to search for
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    December  3, 2010
-//
-//  Modifications:
-//
-// ****************************************************************************
-vector<ResultDatabase::Result> ResultDatabase::GetResultsForTest(const string& test) {
-    // get only the given test results
-    vector<Result> retval;
-    for (int i = 0; i < results.size(); i++) {
-        Result& r = results[i];
-        if (r.test == test) retval.push_back(r);
-    }
-    return retval;
-}
-
-// ****************************************************************************
-//  Method:  ResultDatabase::GetResults
-//
-//  Purpose:
-//    Returns all the results.
-//
-//  Arguments:
-//
-//  Programmer:  Jeremy Meredith
-//  Creation:    December  3, 2010
-//
-//  Modifications:
-//
-// ****************************************************************************
-const vector<ResultDatabase::Result>& ResultDatabase::GetResults() const { return results; }
@@ -1,89 +0,0 @@
-#ifndef RESULT_DATABASE_H
-#define RESULT_DATABASE_H
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <cfloat>
-using std::ifstream;
-using std::ofstream;
-using std::ostream;
-using std::string;
-using std::vector;
-
-
-// ****************************************************************************
-// Class:  ResultDatabase
-//
-// Purpose:
-//   Track numerical results as they are generated.
-//   Print statistics of raw results.
-//
-// Programmer:  Jeremy Meredith
-// Creation:    June 12, 2009
-//
-// Modifications:
-//    Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
-//    Split timing reports into detailed and summary.  E.g. for serial code,
-//    we might report all trial values, but skip them in parallel.
-//
-//    Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010
-//    Added check for missing value tag.
-//
-//    Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010
-//    Added percentile statistic.
-//
-//    Jeremy Meredith, Fri Dec  3 16:30:31 EST 2010
-//    Added a method to extract a subset of results based on test name.  Also,
-//    the Result class is now public, so that clients can use them directly.
-//    Added a GetResults method as well, and made several functions const.
-//
-// ****************************************************************************
-class ResultDatabase {
-   public:
-    //
-    // A performance result for a single SHOC benchmark run.
-    //
-    struct Result {
-        string test;           // e.g. "readback"
-        string atts;           // e.g. "pagelocked 4k^2"
-        string unit;           // e.g. "MB/sec"
-        vector<double> value;  // e.g. "837.14"
-        double GetMin() const;
-        double GetMax() const;
-        double GetMedian() const;
-        double GetPercentile(double q) const;
-        double GetMean() const;
-        double GetStdDev() const;
-
-        bool operator<(const Result& rhs) const;
-
-        bool HadAnyFLTMAXValues() const {
-            for (int i = 0; i < value.size(); ++i) {
-                if (value[i] >= FLT_MAX) return true;
-            }
-            return false;
-        }
-    };
-
-   protected:
-    vector<Result> results;
-
-   public:
-    void AddResult(const string& test, const string& atts, const string& unit, double value);
-    void AddResults(const string& test, const string& atts, const string& unit,
-                    const vector<double>& values);
-    vector<Result> GetResultsForTest(const string& test);
-    const vector<Result>& GetResults() const;
-    void ClearAllResults();
-    void DumpDetailed(ostream&);
-    void DumpSummary(ostream&);
-    void DumpCsv(string fileName);
-
-   private:
-    bool IsFileEmpty(string fileName);
-};
-
-
-#endif
@@ -1,16 +1,13 @@
 /*
 Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
-
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
-
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -21,142 +18,134 @@ THE SOFTWARE.
 */

 #include "hip/hip_runtime.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/hip_ext.h"
+#endif
 #include <iostream>
-#include <time.h>
-#include "ResultDatabase.h"
-
-#define PRINT_PROGRESS 0
-
-#define check(cmd)                                                                                 \
-    {                                                                                              \
-        hipError_t status = cmd;                                                                   \
-        if (status != hipSuccess) {                                                                \
-            printf("error: '%s'(%d) from %s at %s:%d\n", hipGetErrorString(status), status, #cmd,  \
-                   __FILE__, __LINE__);                                                            \
-            abort();                                                                               \
-        }                                                                                          \
-    }
-
-#define LEN 1024 * 1024
+#include <chrono>
+#include <algorithm>

 #define NUM_GROUPS 1
-#define GROUP_SIZE 64
-#define TEST_ITERS 20
-#define DISPATCHES_PER_TEST 100
+#define GROUP_SIZE 1
+#define WARMUP_RUN_COUNT 10
+#define TIMING_RUN_COUNT 100
+#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
+#define BATCH_SIZE 1000

-const unsigned p_tests = 0xfffffff;
+#define FILE_NAME "test_kernel.code"
+#define KERNEL_NAME "test"

+__global__ void EmptyKernel() { }

-// HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(float* Ad) {
-    if (Ad) {
-        Ad[0] = 42;
-    }
+void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
+    
+    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
+    
+    // skip warm-up runs
+    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
+    auto end_iter = results.end();
+
+    // mean
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        total_us += (run_ms * 1000) / batch;
+    });   
+    mean_us = total_us  / TIMING_RUN_COUNT;
+
+   // stddev
+    total_us = 0;
+    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
+        float dev_us = ((run_ms * 1000) / batch) - mean_us;
+        total_us += dev_us * dev_us;
+    });
+    stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
+
+    // display
+    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
 }

-
-ResultDatabase resultDB;
-
-
-void stopTest(hipEvent_t start, hipEvent_t stop, const char* msg, int iters) {
-    float mS = 0;
-    check(hipEventRecord(stop));
-    check(hipDeviceSynchronize());
-    check(hipEventElapsedTime(&mS, start, stop));
-    resultDB.AddResult(std::string(msg), "", "uS", mS * 1000 / iters);
-    if (PRINT_PROGRESS & 0x1) {
-        std::cout << msg << "\t\t" << mS * 1000 / iters << " uS" << std::endl;
-    }
-    if (PRINT_PROGRESS & 0x2) {
-        resultDB.DumpSummary(std::cout);
-    }
-}
-
-
-int main() {
-    hipError_t err;
-    float* Ad;
-    check(hipMalloc(&Ad, 4));
-
-
-    hipStream_t stream;
-    check(hipStreamCreate(&stream));
-
-
-    hipEvent_t start, sync, stop;
-    check(hipEventCreate(&start));
-    check(hipEventCreateWithFlags(&sync, hipEventBlockingSync));
-    check(hipEventCreate(&stop));
-
-
+int main() {   
    hipStream_t stream0 = 0;
+    hipDevice_t device;
+    hipDeviceGet(&device, 0);
+    hipCtx_t context;     
+    hipCtxCreate(&context, 0, device); 
+    hipModule_t module;
+    hipFunction_t function;
+    hipModuleLoad(&module, FILE_NAME);
+    hipModuleGetFunction(&function, module, KERNEL_NAME);
+    void* params = nullptr;
+    
+    std::array<float, TOTAL_RUN_COUNT> results;
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);

+    /************************************************************************************/
+    /* HIP kernel launch enqueue rate:                                                  */
+    /* Measure time taken to enqueue a kernel on the GPU                                */
+    /************************************************************************************/ 

-    if (p_tests & 0x1) {
-        hipEventRecord(start);
-        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-        stopTest(start, stop, "FirstKernelLaunch", 1);
+    // Timing hipModuleLaunchKernel
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
    }
+    print_timing("hipModuleLaunchKernel enqueue rate", results);

-
-    if (p_tests & 0x2) {
-        hipEventRecord(start);
-        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-        stopTest(start, stop, "SecondKernelLaunch", 1);
+    // Timing hipLaunchKernelGGL
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+        auto stop = std::chrono::high_resolution_clock::now();
+        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
    }
+    print_timing("hipLaunchKernelGGL enqueue rate", results);

+    /***********************************************************************************/
+    /* Single dispatch execution latency using HIP events:                             */   
+    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */ 
+    /***********************************************************************************/

-    if (p_tests & 0x4) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-                hipEventRecord(sync);
-                hipEventSynchronize(sync);
-            }
-            stopTest(start, stop, "NullStreamASyncDispatchWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-
-    if (p_tests & 0x10) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
-                hipEventRecord(sync);
-                hipEventSynchronize(sync);
-            }
-            stopTest(start, stop, "StreamASyncDispatchWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-#if 1
-
-    if (p_tests & 0x40) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
-            }
-            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
-        }
-    }
-
-    if (p_tests & 0x80) {
-        for (int t = 0; t < TEST_ITERS; t++) {
-            hipEventRecord(start);
-            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
-            }
-            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
-        }
+    //Timing directly the dispatch
+#ifdef __HIP_PLATFORM_HCC__
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&results[i], start, stop);
    }
+    print_timing("Timing directly single dispatch latency", results);
 #endif
-    resultDB.DumpSummary(std::cout);

+    //Timing around the dispatch
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        hipEventRecord(start, 0);
+        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&results[i], start, stop);
+    }
+    print_timing("Timing around single dispatch latency", results);

-    check(hipEventDestroy(start));
-    check(hipEventDestroy(sync));
-    check(hipEventDestroy(stop));
+    /*********************************************************************************/
+    /* Batch dispatch execution latency using HIP events:                            */
+    /* Measures latency to start & finish executing each dispatch in a batch    */ 
+    /*********************************************************************************/
+
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+         hipEventRecord(start, 0);
+         for (int j = 0; j < BATCH_SIZE; j++) {
+             hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+         }
+         hipEventRecord(stop, 0);
+         hipEventSynchronize(stop);
+         hipEventElapsedTime(&results[i], start, stop);
+    }
+    print_timing("Batch dispatch latency", results, BATCH_SIZE);
+
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipCtxDestroy(context);
 }
+
@@ -0,0 +1,24 @@
+/*
+Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hip/hip_runtime.h"
+
+extern "C" __global__ void test() {
+}
+
@@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
     printf("kernel Execution time = %6.3fms\n", eventMs);

     //Calculate Occupancy
-     uint32_t numBlock = 0;
-     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
+     int numBlock = 0;
+     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0));
     
     if(devProp.maxThreadsPerMultiProcessor){
 	std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
@@ -312,9 +312,12 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
            case hipDeviceAttributeMaxPitch:
                *pi = prop->memPitch;
                break;
-	    case hipDeviceAttributeTextureAlignment:
+            case hipDeviceAttributeTextureAlignment:
                *pi = prop->textureAlignment;
                break;
+            case hipDeviceAttributeTexturePitchAlignment:
+                *pi = prop->texturePitchAlignment;
+                 break;
            case hipDeviceAttributeKernelExecTimeout:
                *pi = prop->kernelExecTimeoutEnabled;
                break;
@@ -925,6 +925,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {

    prop->memPitch = INT_MAX; //Maximum pitch in bytes allowed by memory copies (hardcoded 128 bytes in hipMallocPitch)
    prop->textureAlignment = 0; //Alignment requirement for textures
+    prop->texturePitchAlignment = IMAGE_PITCH_ALIGNMENT; //Alignment requirment for texture pitch
    prop->kernelExecTimeoutEnabled = 0; //no run time limit for running kernels on device

    hsa_isa_t isa;
@@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
 }

 hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-   TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
 {
    using namespace hip_impl;

@@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
        : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

    // Calculate blocks occupancy per CU based on SGPR usage
-    *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
+    *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));

    size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
    if (total_used_lds != 0) {
      // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
      size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
-      *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
+      *numBlocks = std::min(*numBlocks, (int) lds_occupancy);
    }

    return hipSuccess;
 }

 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
 {
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
+   
+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));

    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
 }

 hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-   uint32_t* numBlocks, hipFunction_t f, uint32_t  blockSize, size_t dynSharedMemPerBlk,
+   int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
   unsigned int flags)
 {
    HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);

+    auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
+                                                   hip_impl::target_agent(0));
+
    return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
-        tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
+        tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
 }

 hipError_t hipLaunchKernel(
@@ -61,7 +61,7 @@ namespace hip_impl {
        if (it == impl->get_globals().end())
            return nullptr;
        else
-            return it->second;
+            return it->second.first;
    }

    hsa_executable_t program_state::load_executable(const char* data,
@@ -18,6 +18,7 @@
 #include <hsa/hsa_ext_amd.h>
 #include <hsa/hsa_ven_amd_loader.h>
 #include <amd_comgr.h>
+#include "hc.hpp"

 #include <link.h>

@@ -193,7 +194,8 @@ public:
    std::tuple<
        std::once_flag,
        std::mutex,
-        std::unordered_map<std::string, void*>> globals;
+        // map from string to pair<global_addr, pinned_addr>
+        std::unordered_map<std::string, std::pair<void*, void*>>> globals;

    using RAII_code_reader =
        std::unique_ptr<hsa_code_object_reader_t, 
@@ -308,7 +310,7 @@ public:
        return symbol_addresses.second;
    }

-    std::unordered_map<std::string, void*>& get_globals() {
+    std::unordered_map<std::string, std::pair<void*, void*>>& get_globals() {
        std::call_once(std::get<0>(globals), [this]() { 
            std::get<2>(globals).reserve(get_symbol_addresses().size()); 
        });
@@ -349,30 +351,52 @@ public:
        auto& g_mutex = get_globals_mutex();
        for (auto&& x : undefined_symbols) {

-            if (g.find(x) != g.cend()) return;
-
            const auto it1 = get_symbol_addresses().find(x);
-
            if (it1 == get_symbol_addresses().cend()) {
                hip_throw(std::runtime_error{
                    "Global symbol: " + x + " is undefined."});
            }

-            std::lock_guard<std::mutex> lck{g_mutex};
+            hsa_status_t status;
+            auto check_hsa_global_var_define_error = [&x](hsa_status_t s) {
+                if (s != HSA_STATUS_SUCCESS) {
+                    const char* es;
+                    hsa_status_string(s, &es);
+                    hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es});
+                }
+            };

-            if (g.find(x) != g.cend()) return;
+            auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) {
+                const auto& global_addr = g.find(x);
+                if (global_addr != g.cend()) {
+                    return global_addr->second.second;
+                }
+                return (void*)nullptr;
+            };

-            g.emplace(x, (void*)(it1->second.first));
-            void* p = nullptr;
-            hsa_amd_memory_lock(
-                reinterpret_cast<void*>(it1->second.first),
-                it1->second.second,
-                nullptr,  // All agents.
-                0,
-                &p);
-
-            hsa_executable_agent_global_variable_define(
-                executable, agent, x.c_str(), p);
+            void* p = retrieve_pinned_address_from_cache(g, x);
+            if (p == nullptr) {
+                std::lock_guard<std::mutex> lck{g_mutex};
+                p = retrieve_pinned_address_from_cache(g, x);
+                if (p == nullptr) {
+                    if (x == "_ZN2hc13printf_bufferE") {
+                        // This is the printf buffer, get the pinned address from HCC
+                        p = Kalmar::getContext()->getPrintfBufferPointerVA();
+                    } 
+                    else {
+                        status = hsa_amd_memory_lock(reinterpret_cast<void*>(it1->second.first),
+                                                     it1->second.second,
+                                                     nullptr,  // All agents.
+                                                     0, &p);
+                        check_hsa_global_var_define_error(status);
+                    }
+                    // cache the global address and its pinned address
+                    g.emplace(x, std::make_pair(reinterpret_cast<void*>(it1->second.first), p));
+                }
+            }
+            status = hsa_executable_agent_global_variable_define(
+                         executable, agent, x.c_str(), p);
+            check_hsa_global_var_define_error(status);
        }
    }

@@ -398,13 +422,19 @@ public:
                                           move(file), move(tmp));
        }

-        hsa_code_object_reader_create_from_memory(
-            it->first.data(), it->first.size(), it->second.get());
+        auto check_hsa_error = [](hsa_status_t s) {
+            if (s != HSA_STATUS_SUCCESS) {
+                hip_throw(std::runtime_error{"error when loading code object"});
+            }
+        };

-        hsa_executable_load_agent_code_object(
-            executable, agent, *it->second, nullptr, nullptr);
+        check_hsa_error(hsa_code_object_reader_create_from_memory(
+            it->first.data(), it->first.size(), it->second.get()));

-        hsa_executable_freeze(executable, nullptr);
+        check_hsa_error(hsa_executable_load_agent_code_object(
+            executable, agent, *it->second, nullptr, nullptr));
+
+        check_hsa_error(hsa_executable_freeze(executable, nullptr));
    }


@@ -30,7 +30,12 @@ THE SOFTWARE.
 __global__ void run_printf() { printf("Hello World\n"); }

 int main() {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
-    hipDeviceSynchronize();
+    int device_count = 0;
+    hipGetDeviceCount(&device_count);
+    for (int i = 0; i < device_count; ++i) {
+        hipSetDevice(i);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
+        hipDeviceSynchronize();
+    }
    passed();
 }
@@ -0,0 +1,212 @@
+/*
+Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Simple test for hipLaunchCooperativeKernelMultiDevice API.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <thread>
+#include <chrono>
+#include "hip/hip_cooperative_groups.h"
+#include "test_common.h"
+
+using namespace std::chrono;
+
+const static uint NumOfLoopIterrations = 16 * 1024;
+const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations;
+const static uint numQueues = 4;
+const static uint numIter = 100;
+constexpr uint NumKernelArgs = 4;
+constexpr uint MaxGPUs = 8;
+
+#include <stdio.h>
+/*
+namespace cg = cooperative_groups;
+using namespace cooperative_groups;
+*/
+
+__global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result)
+{
+    extern __shared__ long tmp[];
+    uint groups = gridDim.x;
+    uint group_id = blockIdx.x;
+    uint local_id = threadIdx.x;
+    uint chunk = gridDim.x * blockDim.x;
+
+    uint i = group_id * blockDim.x + local_id;
+    long sum = 0;
+    while (i < bufSize) {
+      sum += buf[i];
+      i += chunk;
+    }
+    tmp[local_id] = sum;
+    __syncthreads();
+    i = 0;
+    if (local_id == 0) {
+        sum = 0;
+        while (i < blockDim.x) {
+          sum += tmp[i];
+          i++;
+        }
+        tmpBuf[group_id] = sum;
+    }
+
+    // wait
+    cooperative_groups::this_grid().sync();
+
+    if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) {
+        for (uint i = 1; i < groups; ++i) {
+          sum += tmpBuf[i];
+       }
+       //*result = sum;
+       result[1 + cooperative_groups::this_multi_grid().grid_rank()] = sum;
+    }
+    cooperative_groups::this_multi_grid().sync();
+    if (cooperative_groups::this_multi_grid().grid_rank() == 0) {
+      sum = 0;
+      for (uint i = 1; i <= cooperative_groups::this_multi_grid().num_grids(); ++i) {
+        sum += result[i];
+      }
+      *result = sum;
+    }
+}
+
+int main() {
+  float *A, *B;
+  uint* dA[MaxGPUs];
+  long* dB[MaxGPUs];
+  long* dC;
+  hipModule_t Module;
+  hipStream_t stream[MaxGPUs];
+
+  uint32_t* init = new uint32_t[BufferSizeInDwords];
+  for (uint32_t i = 0; i < BufferSizeInDwords; ++i) {
+    init[i] = i;
+  }
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  size_t copySizeInDwords = BufferSizeInDwords / nGpu;
+  hipDeviceProp_t deviceProp[MaxGPUs];
+
+  for (int i = 0; i < nGpu; i++) {
+    HIPCHECK(hipSetDevice(i));
+
+    // Calculate the device occupancy to know how many blocks can be run concurrently
+    hipGetDeviceProperties(&deviceProp[i], 0);
+    if (!deviceProp[i].cooperativeMultiDeviceLaunch) {
+      printf("Device doesn't support cooperative launch!");
+      passed();
+      return 0;
+    }
+    size_t SIZE = copySizeInDwords * sizeof(uint);
+
+    HIPCHECK(hipMalloc((void**)&dA[i], SIZE));
+    if (i == 0) {
+      HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent));
+    }
+    HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  dim3 dimBlock;
+  dim3 dimGrid;
+  dimGrid.x = 1;
+  dimGrid.y = 1;
+  dimGrid.z = 1;
+  dimBlock.x = 64;
+  dimBlock.y = 1;
+  dimBlock.z = 1;
+
+  int numBlocks = 0;
+  uint workgroups[3] = {64, 128, 256};
+
+  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
+
+  system_clock::time_point start = system_clock::now();
+
+  for (uint set = 0; set < 3; ++set) {
+    void* args[MaxGPUs * NumKernelArgs];
+    std::cout << "---------- Test#" << set << "---------------\n";
+    for (int i = 0; i < nGpu; i++) {
+      HIPCHECK(hipSetDevice(i));
+      dimBlock.x = workgroups[set];
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+      (hipFunction_t)test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
+      
+      std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n";
+
+      dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32);
+      HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long)));
+
+      args[i * NumKernelArgs]     = (void*)&dA[i];
+      args[i * NumKernelArgs + 1] = (void*)&copySizeInDwords;
+      args[i * NumKernelArgs + 2] = (void*)&dB[i];
+      args[i * NumKernelArgs + 3] = (void*)&dC;
+
+      launchParamsList[i].func = reinterpret_cast<void*>(test_gws);
+      launchParamsList[i].gridDim = dimGrid;
+      launchParamsList[i].blockDim = dimBlock;
+      launchParamsList[i].sharedMem = dimBlock.x * sizeof(long);
+      launchParamsList[i].stream = stream[i];
+      launchParamsList[i].args = &args[i * NumKernelArgs];
+    }
+ 
+    hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0);
+
+    HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
+
+    if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) {
+      std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
+      std::cout << "Test failed! \n";
+    }
+    for (int i = 0; i < nGpu; i++) {
+      hipFree(dB[i]);
+    }
+  }
+  system_clock::time_point end = system_clock::now();
+
+  delete [] launchParamsList;
+
+  std::chrono::duration<double> elapsed_seconds = end - start;
+
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ 
+  std::cout << "finished computation at " << std::ctime(&end_time) <<
+    "elapsed time: " << elapsed_seconds.count() << "s\n";
+
+  hipSetDevice(0);
+  hipFree(dC);
+  for (int i = 0; i < nGpu; i++) {
+    hipFree(dA[i]);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  delete [] init;
+  passed();
+  return 0;
+}
@@ -116,7 +116,7 @@ int main() {

    dimBlock.x = workgroups[i];
    // Calculate the device occupancy to know how many blocks can be run concurrently
-    hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
        test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));

    dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
@@ -30,10 +30,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

-#define fileName "vcpy_kernel.code"
-#define kernel_name "hello_world"
-
-
 __global__ void f1(float *a) { *a = 1.0; }

 template <typename T>
@@ -49,11 +45,10 @@ int main(int argc, char* argv[]) {
    hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
    assert(gridSize != 0 && blockSize != 0);

-    uint32_t numBlock = 0;
-    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
+    int numBlock = 0;
+    hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0);
    assert(numBlock != 0);

-
    // test case for using kernel function pointer with template
    gridSize = 0;
    blockSize = 0;
@@ -61,17 +56,7 @@ int main(int argc, char* argv[]) {
    assert(gridSize != 0 && blockSize != 0);

    numBlock = 0;
-    hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
-    assert(numBlock != 0);
-
-
-    // test case for using kernel with hipFunction_t type
-    numBlock = 0;
-    hipModule_t Module;
-    hipFunction_t Function;
-    HIPCHECK(hipModuleLoad(&Module, fileName));
-    HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
+    hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, (int)blockSize, 0);
    assert(numBlock != 0);

    passed();
@@ -33,7 +33,6 @@ THE SOFTWARE.
 #define fileName "vcpy_kernel.code"
 #define kernel_name "hello_world"

-
 __global__ void f1(float *a) { *a = 1.0; }

 template <typename T>
@@ -76,8 +76,7 @@ void texture2Dtest()
    texDescr.readMode = hipReadModeElementType;

    hipTextureObject_t texObj;
-    hipResourceViewDesc resDesc;
-    HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, &resDesc));
+    HIPCHECK( hipCreateTextureObject(&texObj, &texRes, &texDescr, NULL));

    HIPCHECK(hipMalloc((void**)&devPtrB, SIZE_W*sizeof(TYPE_t)*SIZE_H)) ;