Re-sync with upstream.

2018-10-10 11:43:49 +01:00
commit 9ec697c620
@@ -288,12 +288,22 @@ if(HIP_PLATFORM STREQUAL "hcc")
    install(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
    include(CMakePackageConfigHelpers)

+    if(HIP_COMPILER STREQUAL "hcc")
    configure_package_config_file(
-        hip-config.cmake.in
+        hip-config-hcc.cmake.in
        ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
        INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
        PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
        )
+    elseif(HIP_COMPILER STREQUAL "clang")
+    configure_package_config_file(
+        hip-config-clang.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
+        INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
+        PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
+        )
+    endif()
+
    write_basic_package_version_file(
        ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
        VERSION "${HIP_VERSION}"
@@ -167,7 +167,8 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
    }

    // Cap the maximum amount of testing, in case of hangs
-    // Excluding hipPrintfKernel test from automation; variable fails on CI test machines
+    // Excluding hipVectorTypes test from automation; due to regression from HCC commit 2367133
+    // Excluding hipFloatMath test from automation; due to regression from ROCDL commit 2fc04e1
    timeout(time: 1, unit: 'HOURS')
    {
      stage("${platform} unit testing")
@@ -177,7 +178,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
            cd ${build_dir_rel}
            make install -j\$(nproc)
            make build_tests -i -j\$(nproc)
-            ctest -E hipVectorTypes
+            ctest -E "(hipVectorTypes.tst|hipVectorTypesDevice.tst|hipFloatMath.tst)"
          """
        // If unit tests output a junit or xunit file in the future, jenkins can parse that file
        // to display test results on the dashboard
@@ -385,4 +386,40 @@ rocm_head:
    docker_clean_images( job_name, hip_image_name )
    */
  }
+},
+cuda_9_x:
+{
+  node('hip-cuda')
+  {
+    ////////////////////////////////////////////////////////////////////////
+    // Block of string constants customizing behavior for cuda
+    String nvcc_ver = 'nvcc-9.x'
+    String from_image = 'ci_test_nodes/cuda-9.x/ubuntu-16.04:latest'
+    String inside_args = '--runtime=nvidia';
+
+    // Checkout source code, dependencies and version files
+    String source_hip_rel = checkout_and_version( nvcc_ver )
+
+    // Create/reuse a docker image that represents the hip build environment
+    def hip_build_image = docker_build_image( nvcc_ver, 'hip', '', source_hip_rel, from_image )
+
+    // Print system information for the log
+    hip_build_image.inside( inside_args )
+    {
+      sh  """#!/usr/bin/env bash
+          set -x
+          nvidia-smi
+          nvcc --version
+        """
+    }
+
+    // Conctruct a binary directory path based on build config
+    String build_hip_rel = build_directory_rel( build_config );
+
+    // Build hip inside of the build environment
+    docker_build_inside_image( hip_build_image, inside_args, nvcc_ver, "-DHIP_NVCC_FLAGS=--Wno-deprecated-gpu-targets", build_config, source_hip_rel, build_hip_rel )
+
+    // Clean docker image
+    docker_clean_images( 'hip', docker_build_image_name( ) )
+  }
 }
@@ -35,9 +35,9 @@
 | enum         |***`cublasAtomicsMode_t`***                                    |                                                            |
 |            0 |*`CUBLAS_ATOMICS_NOT_ALLOWED`*                                 |                                                            |
 |            1 |*`CUBLAS_ATOMICS_ALLOWED`*                                     |                                                            |
-| enum         |***`cublasAtomicsMode_t`***                                    |                                                            |
-|           -1 |*`CUBLAS_GEMM_DFALT`*                                          |                                                            |
-|           -1 |*`CUBLAS_GEMM_DEFAULT`*                                        |                                                            |
+| enum         |***`cublasGemmAlgo_t`***                                       |***`hipblasGemmAlgo_t`***                                   |
+|           -1 |*`CUBLAS_GEMM_DFALT`*                                          |*`HIPBLAS_GEMM_DEFAULT`*                                    | 160                       |
+|           -1 |*`CUBLAS_GEMM_DEFAULT`*                                        |*`HIPBLAS_GEMM_DEFAULT`*                                    | 160                       |
 |            0 |*`CUBLAS_GEMM_ALGO0`*                                          |                                                            |
 |            1 |*`CUBLAS_GEMM_ALGO1`*                                          |                                                            |
 |            2 |*`CUBLAS_GEMM_ALGO2`*                                          |                                                            |
@@ -391,7 +391,7 @@
 |`cublasZgemm3m`                                            |                                                 |
 |`cublasHgemm`                                              |`hipblasHgemm`                                   |
 |`cublasSgemmEx`                                            |                                                 |
-|`cublasGemmEx`                                             |                                                 |
+|`cublasGemmEx`                                             |`hipblasGemmEx`                                  |
 |`cublasCgemmEx`                                            |                                                 |
 |`cublasUint8gemmBias`                                      |                                                 |
 |`cublasSsyrk`                                              |                                                 |
@@ -93,10 +93,10 @@

 ## **7. Occupancy**

-|   **CUDA**                                                |   **HIP**                     |
-|-----------------------------------------------------------|-------------------------------|
+|   **CUDA**                                                |   **HIP**                                     |
+|-----------------------------------------------------------|-----------------------------------------------|
 | `cudaOccupancyMaxActiveBlocksPerMultiprocessor`           | `hipOccupancyMaxActiveBlocksPerMultiprocessor`|
-| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |                               |
+| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |                                               |

 ## **8. Execution Control [deprecated since 7.0]**

@@ -326,13 +326,13 @@

 ## **25. Texture Object Management**

-|   **CUDA**                                                |   **HIP**                     |
-|-----------------------------------------------------------|-------------------------------|
-| `cudaCreateTextureObject`                                 |`hipCreateTextureObject`       |
-| `cudaDestroyTextureObject`                                |`hipDestroyTextureObject`      |
-| `cudaGetTextureObjectResourceDesc`                        |`hipGetTextureObjectResourceDesc` |
+|   **CUDA**                                                |   **HIP**                            |
+|-----------------------------------------------------------|--------------------------------------|
+| `cudaCreateTextureObject`                                 |`hipCreateTextureObject`              |
+| `cudaDestroyTextureObject`                                |`hipDestroyTextureObject`             |
+| `cudaGetTextureObjectResourceDesc`                        |`hipGetTextureObjectResourceDesc`     |
 | `cudaGetTextureObjectResourceViewDesc`                    |`hipGetTextureObjectResourceViewDesc` |
-| `cudaGetTextureObjectTextureDesc`                         |`hipGetTextureObjectTextureDesc` |
+| `cudaGetTextureObjectTextureDesc`                         |`hipGetTextureObjectTextureDesc`      |

 ## **26. Surface Object Management**

@@ -352,36 +352,36 @@
 ## **28. C++ API Routines**
 *(7.0 contains, 7.5 doesn’t)*

-|   **CUDA**                                                |   **HIP**                     |
-|-----------------------------------------------------------|-------------------------------|
-| `cudaBindSurfaceToArray`                                  |                               |
-| `cudaBindTexture`                                         | `hipBindTexture`              |
-| `cudaBindTexture2D`                                       |                               |
-| `cudaBindTextureToArray`                                  |                               |
-| `cudaBindTextureToMipmappedArray`                         |                               |
-| `cudaCreateChannelDesc`                                   | `hipCreateChannelDesc`        |
-| `cudaFuncGetAttributes`                                   |                               |
-| `cudaFuncSetCacheConfig`                                  |                               |
-| `cudaGetSymbolAddress`                                    |                               |
-| `cudaGetSymbolSize`                                       |                               |
-| `cudaGetTextureAlignmentOffset`                           |                               |
-| `cudaLaunch`                                              |                               |
-| `cudaLaunchKernel`                                        |                               |
-| `cudaMallocHost`                                          |                               |
-| `cudaMallocManaged`                                       |                               |
-| `cudaMemcpyFromSymbol`                                    |                               |
-| `cudaMemcpyFromSymbolAsync`                               |                               |
-| `cudaMemcpyToSymbol`                                      |                               |
-| `cudaMemcpyToSymbolAsync`                                 |                               |
+|   **CUDA**                                                |   **HIP**                                      |
+|-----------------------------------------------------------|------------------------------------------------|
+| `cudaBindSurfaceToArray`                                  |                                                |
+| `cudaBindTexture`                                         | `hipBindTexture`                               |
+| `cudaBindTexture2D`                                       |                                                |
+| `cudaBindTextureToArray`                                  |                                                |
+| `cudaBindTextureToMipmappedArray`                         |                                                |
+| `cudaCreateChannelDesc`                                   | `hipCreateChannelDesc`                         |
+| `cudaFuncGetAttributes`                                   |                                                |
+| `cudaFuncSetCacheConfig`                                  |                                                |
+| `cudaGetSymbolAddress`                                    |                                                |
+| `cudaGetSymbolSize`                                       |                                                |
+| `cudaGetTextureAlignmentOffset`                           |                                                |
+| `cudaLaunch`                                              |                                                |
+| `cudaLaunchKernel`                                        |                                                |
+| `cudaMallocHost`                                          |                                                |
+| `cudaMallocManaged`                                       |                                                |
+| `cudaMemcpyFromSymbol`                                    |                                                |
+| `cudaMemcpyFromSymbolAsync`                               |                                                |
+| `cudaMemcpyToSymbol`                                      |                                                |
+| `cudaMemcpyToSymbolAsync`                                 |                                                |
 | `cudaOccupancyMaxActiveBlocksPerMultiprocessor`           | `hipOccupancyMaxActiveBlocksPerMultiprocessor` |
-| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |                               |
-| `cudaOccupancyMaxPotentialBlockSize`                      | `hipOccupancyMaxPotentialBlockSize` |
-| `cudaOccupancyMaxPotentialBlockSizeVariableSMem`          |                               |
-| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` |                               |
-| `cudaOccupancyMaxPotentialBlockSizeWithFlags`             |                               |
-| `cudaSetupArgument`                                       |                               |
-| `cudaStreamAttachMemAsync`                                |                               |
-| `cudaUnbindTexture`                                       | `hipUnbindTexture`            |
+| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`  |                                                |
+| `cudaOccupancyMaxPotentialBlockSize`                      | `hipOccupancyMaxPotentialBlockSize`            |
+| `cudaOccupancyMaxPotentialBlockSizeVariableSMem`          |                                                |
+| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` |                                                |
+| `cudaOccupancyMaxPotentialBlockSizeWithFlags`             |                                                |
+| `cudaSetupArgument`                                       |                                                |
+| `cudaStreamAttachMemAsync`                                |                                                |
+| `cudaUnbindTexture`                                       | `hipUnbindTexture`                             |

 ## **30. Profiler Control**

@@ -395,8 +395,8 @@

 ## **31. Data types**

-| **type**     |   **CUDA**                                    |   **HIP**                                            |
-|-------------:|-----------------------------------------------|------------------------------------------------------|
+| **type**     |   **CUDA**                                    |   **HIP**                                            |**HIP value** (if differs) |
+|-------------:|-----------------------------------------------|------------------------------------------------------|---------------------------|
 | struct       | `cudaChannelFormatDesc`                       | `hipChannelFormatDesc`                               |
 | struct       | `cudaDeviceProp`                              | `hipDeviceProp_t`                                    |
 | struct       | `cudaExtent`                                  | `hipExtent`                                          |
@@ -790,3 +790,19 @@
 | define       | `cudaTextureType1DLayered`                    | `hipTextureType1DLayered`                            |
 | define       | `cudaTextureType2DLayered`                    | `hipTextureType2DLayered`                            |
 | define       | `cudaTextureTypeCubemapLayered`               | `hipTextureTypeCubemapLayered`                       |
+| enum         |***`cudaDataType_t`***                         |***`hipblasDatatype_t`***                             |
+| enum         |***`cudaDataType`***                           |***`hipblasDatatype_t`***                             |
+|            2 |*`CUDA_R_16F`*                                 |*`HIPBLAS_R_16F`*                                     | 150                       |
+|            6 |*`CUDA_C_16F`*                                 |*`HIPBLAS_C_16F`*                                     | 153                       |
+|            0 |*`CUDA_R_32F`*                                 |*`HIPBLAS_R_32F`*                                     | 151                       |
+|            4 |*`CUDA_C_32F`*                                 |*`HIPBLAS_C_32F`*                                     | 154                       |
+|            1 |*`CUDA_R_64F`*                                 |*`HIPBLAS_R_64F`*                                     | 152                       |
+|            5 |*`CUDA_C_64F`*                                 |*`HIPBLAS_C_64F`*                                     | 155                       |
+|            3 |*`CUDA_R_8I`*                                  |                                                      |
+|            7 |*`CUDA_C_8I`*                                  |                                                      |
+|            8 |*`CUDA_R_8U`*                                  |                                                      |
+|            9 |*`CUDA_C_8U`*                                  |                                                      |
+|           10 |*`CUDA_R_32I`*                                 |                                                      |
+|           11 |*`CUDA_C_32I`*                                 |                                                      |
+|           12 |*`CUDA_R_32U`*                                 |                                                      |
+|           13 |*`CUDA_C_32U`*                                 |                                                      |
@@ -239,8 +239,8 @@
 |`cudnnSetTensor`                                           |`hipdnnSetTensor`                                |
 |`cudnnScaleTensor`                                         |`hipdnnScaleTensor`                              |
 |`cudnnCreateFilterDescriptor`                              |`hipdnnCreateFilterDescriptor`                   |
-|`cudnnSetFilter4dDescriptor`                               |                                                 |
-|`cudnnGetFilter4dDescriptor`                               |                                                 |
+|`cudnnSetFilter4dDescriptor`                               |`hipdnnSetFilter4dDescriptor`                    |
+|`cudnnGetFilter4dDescriptor`                               |`hipdnnGetFilter4dDescriptor`                    |
 |`cudnnSetFilterNdDescriptor`                               |`hipdnnSetFilterNdDescriptor`                    |
 |`cudnnGetFilterNdDescriptor`                               |`hipdnnGetFilterNdDescriptor`                    |
 |`cudnnDestroyFilterDescriptor`                             |`hipdnnDestroyFilterDescriptor`                  |
@@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m
 
 Following environment variables can be used to control the transfer thresholds:

-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
+-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
   
 -   HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.

 -   HCC_D2H_PININPLACE_THRESHOLD  - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
+
+## Device-Side Malloc
+
+hip-hcc and hip-clang supports device-side malloc and free. Users can allocate
+memory dynamically in a kernel. The allocated memory are in global address
+space, however, different threads get different memory allocations for the same
+call of malloc. The allocated memory can be accessed or freed by other threads
+or other kernels. It persists in the life time of the HIP program until it is
+freed.
+
+The memory are allocated in pages. Users can define macro
+`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro
+`__HIP_NUM_PAGES` for controlling the total number of pages that can be
+allocated.
@@ -48,9 +48,6 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
 set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
 set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")

-if(HIP_COMPILER STREQUAL "hcc")
-  find_dependency(hcc)
-endif()
 include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )

 set( hip_LIBRARIES hip::host hip::device)
@@ -0,0 +1,65 @@
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
+if (NOT _CMakeFindDependencyMacro_FOUND)
+  macro(find_dependency dep)
+    if (NOT ${dep}_FOUND)
+      set(cmake_fd_version)
+      if (${ARGC} GREATER 1)
+        set(cmake_fd_version ${ARGV1})
+      endif()
+      set(cmake_fd_exact_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
+        set(cmake_fd_exact_arg EXACT)
+      endif()
+      set(cmake_fd_quiet_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
+        set(cmake_fd_quiet_arg QUIET)
+      endif()
+      set(cmake_fd_required_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
+        set(cmake_fd_required_arg REQUIRED)
+      endif()
+      find_package(${dep} ${cmake_fd_version}
+          ${cmake_fd_exact_arg}
+          ${cmake_fd_quiet_arg}
+          ${cmake_fd_required_arg}
+      )
+      string(TOUPPER ${dep} cmake_dep_upper)
+      if (NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
+        set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
+        set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
+        return()
+      endif()
+      set(cmake_fd_version)
+      set(cmake_fd_required_arg)
+      set(cmake_fd_quiet_arg)
+      set(cmake_fd_exact_arg)
+    endif()
+  endmacro()
+endif()
+
+
+set_and_check( hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
+set_and_check( hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}" )
+set_and_check( hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
+set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
+
+set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
+set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
+
+find_dependency(hcc)
+include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
+
+set( hip_LIBRARIES hip::host hip::device)
+set( hip_LIBRARY ${hip_LIBRARIES})
+
+set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR})
+set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
+set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR})
+set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR})
+set(HIP_LIBRARIES ${hip_LIBRARIES})
+set(HIP_LIBRARY ${hip_LIBRARY})
+set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE})
+set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE})
+
@@ -89,7 +89,8 @@ if (HIPIFY_CLANG_TESTS)
    if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
        (CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
        (CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
-        (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0"))
+        (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
+         CUDA_VERSION VERSION_EQUAL "10.0")
        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
        if (CUDA_VERSION VERSION_LESS "7.0")
            message(STATUS "Please install CUDA 7.0 or higher.")
@@ -30,11 +30,16 @@
 ## <a name="dependencies"></a> Dependencies

 `hipify-clang` requires:
-1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended release is 6.0.1.
-2. CUDA at least version 7.5, latest supported release is 9.0.
+1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended releases:
+   
+   6.0.1 (linux and windows),

-| **LLVM release version** | **CUDA latest supported version** |
-|:------------------------:|:---------------------------------:|
+   7.0.0 (linux only).
+
+2. CUDA at least version 7.5, latest supported release is 9.2.
+
+| **LLVM release version** | **CUDA latest supported version** | **Comments** |
+|:------------------------:|:---------------------------------:|:------------:|
 | 3.8.0                    | 7.5                               |
 | 3.8.1                    | 7.5                               |
 | 3.9.0                    | 7.5                               |
@@ -46,6 +51,8 @@
 | 5.0.2                    | 8.0                               |
 | 6.0.0                    | 9.0                               |
 | 6.0.1                    | 9.0                               |
+| 7.0.0                    | 9.2                               | linux only |
+|                          | 10.0                              | not yet supported |

 In most cases, you can get a suitable version of LLVM+CLANG with your package manager.

@@ -1417,22 +1417,23 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{

 /////////////////////////////// CUDA RT API ///////////////////////////////
    // Data types
-    {"cudaDataType_t", {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"cudaDataType",   {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_16F",     {"hipR16F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_16F",     {"hipC16F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_32F",     {"hipR32F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_32F",     {"hipC32F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_64F",     {"hipR64F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_64F",     {"hipC64F",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_8I",      {"hipR8I",        CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_8I",      {"hipC8I",        CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_8U",      {"hipR8U",        CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_8U",      {"hipC8U",        CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_32I",     {"hipR32I",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_32I",     {"hipC32I",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_R_32U",     {"hipR32U",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
-    {"CUDA_C_32U",     {"hipC32U",       CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
+    // TODO: rename hipblasDatatype_t to hipDataType_t and move from hipBLAS to HIP
+    {"cudaDataType_t", {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}},
+    {"cudaDataType",   {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}},
+    {"CUDA_R_16F",     {"HIPBLAS_R_16F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  2  // 150
+    {"CUDA_C_16F",     {"HIPBLAS_C_16F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  6  // 153
+    {"CUDA_R_32F",     {"HIPBLAS_R_32F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  0  // 151
+    {"CUDA_C_32F",     {"HIPBLAS_C_32F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  4  // 154
+    {"CUDA_R_64F",     {"HIPBLAS_R_64F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  1  // 152
+    {"CUDA_C_64F",     {"HIPBLAS_C_64F",     CONV_NUMERIC_LITERAL, API_RUNTIME}},                  //  5  // 155
+    {"CUDA_R_8I",      {"HIPBLAS_R_8I",      CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, //  3  //
+    {"CUDA_C_8I",      {"HIPBLAS_C_8I",      CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, //  7  //
+    {"CUDA_R_8U",      {"HIPBLAS_R_8U",      CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, //  8  //
+    {"CUDA_C_8U",      {"HIPBLAS_C_8U",      CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, //  9  //
+    {"CUDA_R_32I",     {"HIPBLAS_R_32I",     CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 10  //
+    {"CUDA_C_32I",     {"HIPBLAS_C_32I",     CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 11  //
+    {"CUDA_R_32U",     {"HIPBLAS_R_32U",     CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 12  //
+    {"CUDA_C_32U",     {"HIPBLAS_C_32U",     CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 13  //

    // Library property types
    // IMPORTANT: no cuda prefix
@@ -2143,11 +2144,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
    {"CUBLAS_ATOMICS_ALLOWED",         {"HIPBLAS_ATOMICS_ALLOWED",         CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},

    // Blas Data Type
-    {"cublasDataType_t",               {"hipblasDataType_t",               CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
-    {"CUBLAS_DATA_FLOAT",              {"HIPBLAS_DATA_FLOAT",              CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
-    {"CUBLAS_DATA_DOUBLE",             {"HIPBLAS_DATA_DOUBLE",             CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
-    {"CUBLAS_DATA_HALF",               {"HIPBLAS_DATA_HALF",               CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
-    {"CUBLAS_DATA_INT8",               {"HIPBLAS_DATA_INT8",               CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
+    {"cublasDataType_t",               {"hipblasDatatype_t",               CONV_TYPE, API_BLAS}},

    // Blas Math mode/tensor operation
    {"cublasMath_t",                   {"hipblasMath_t",                   CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
@@ -2155,9 +2152,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
    {"CUBLAS_TENSOR_OP_MATH",          {"HIPBLAS_TENSOR_OP_MATH",          CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},

    // Blass different GEMM algorithms
-    {"cublasGemmAlgo_t",               {"hipblasGemmAlgo_t",               CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
-    {"CUBLAS_GEMM_DFALT",              {"HIPBLAS_GEMM_DFALT",              CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},  //  -1
-    {"CUBLAS_GEMM_DEFAULT",            {"HIPBLAS_GEMM_DEFAULT",            CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},  //  -1
+    {"cublasGemmAlgo_t",               {"hipblasGemmAlgo_t",               CONV_TYPE, API_BLAS}},
+    {"CUBLAS_GEMM_DFALT",              {"HIPBLAS_GEMM_DEFAULT",            CONV_NUMERIC_LITERAL, API_BLAS}},                   //  -1  // 160
+    {"CUBLAS_GEMM_DEFAULT",            {"HIPBLAS_GEMM_DEFAULT",            CONV_NUMERIC_LITERAL, API_BLAS}},                   //  -1  // 160
    {"CUBLAS_GEMM_ALGO0",              {"HIPBLAS_GEMM_ALGO0",              CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},  //   0
    {"CUBLAS_GEMM_ALGO1",              {"HIPBLAS_GEMM_ALGO1",              CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},  //   1
    {"CUBLAS_GEMM_ALGO2",              {"HIPBLAS_GEMM_ALGO2",              CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},  //   2
@@ -2715,7 +2712,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{

    //IO in FP16 / FP32, computation in float
    {"cublasSgemmEx",                  {"hipblasSgemmEx",                  CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
-    {"cublasGemmEx",                   {"hipblasGemmEx",                   CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
+    {"cublasGemmEx",                   {"hipblasGemmEx",                   CONV_MATH_FUNC, API_BLAS}},
    {"cublasGemmBatchedEx",            {"hipblasGemmBatchedEx",            CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
    {"cublasGemmStridedBatchedEx",     {"hipblasGemmStridedBatchedEx",     CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
    // IO in Int8 complex/cuComplex, computation in cuComplex
@@ -27,11 +27,12 @@ THE SOFTWARE.
 #include "math_fwd.h"

 #include <hip/hip_runtime_api.h>
+#include <stddef.h>
+
+
 #include <hip/hip_vector_types.h>
 #include <hip/hcc_detail/device_library_decls.h>
 #include <hip/hcc_detail/llvm_intrinsics.h>
-#include <stddef.h>
-
 /*
 Integer Intrinsics
 */
@@ -1036,4 +1037,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
 }
+
 #endif
@@ -29,7 +29,7 @@ THE SOFTWARE.
    #include <utility>
 #endif

-#if defined(__clang__) && (__clang_major__ > 3)
+#if defined(__clang__) && (__clang_major__ > 5)
    typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));

    struct __half_raw {
@@ -26,6 +26,8 @@ THE SOFTWARE.
 // Half Math Functions
 // */

+#include "host_defines.h"
+
 extern "C"
 {
    __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+
+// Implementation of malloc and free device functions.
+// HIP heap is implemented as a global array with fixed size. Users may define
+// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.
+
+// Size of page in bytes.
+#ifndef __HIP_SIZE_OF_PAGE
+#define __HIP_SIZE_OF_PAGE 64
+#endif
+
+// Total number of pages
+#ifndef __HIP_NUM_PAGES
+#define __HIP_NUM_PAGES (16 * 64 * 64)
+#endif
+
+#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
+
+#if __HCC__ || __HIP__
+
+__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];
+__attribute__((weak)) __device__
+    uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
+
+extern "C" inline __device__ void* __hip_malloc(size_t size) {
+    char* heap = (char*)__hip_device_heap;
+    if (size > __HIP_SIZE_OF_HEAP) {
+        return (void*)nullptr;
+    }
+    uint32_t totalThreads =
+        hipBlockDim_x * hipGridDim_x * hipBlockDim_y
+        * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
+        + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
+        + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
+        * hipBlockDim_y;
+
+    uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads;
+    uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads;
+
+    uint32_t stride = size / __HIP_SIZE_OF_PAGE;
+    uint32_t start = numHeapsPerWorkItem * currentWorkItem;
+
+    uint32_t k = 0;
+
+    while (__hip_device_page_flag[k] > 0) {
+        k++;
+    }
+
+    for (uint32_t i = 0; i < stride - 1; i++) {
+        __hip_device_page_flag[i + start + k] = 1;
+    }
+
+    __hip_device_page_flag[start + stride - 1 + k] = 2;
+
+    void* ptr = (void*)(heap
+        + heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE);
+
+    return ptr;
+}
+
+extern "C" inline __device__ void* __hip_free(void* ptr) {
+    if (ptr == nullptr) {
+        return nullptr;
+    }
+
+    uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap;
+    uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE;
+
+    while (__hip_device_page_flag[offsetPage] != 0) {
+        if (__hip_device_page_flag[offsetPage] == 2) {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+            break;
+        } else {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+        }
+    }
+
+    return nullptr;
+}
+
+#endif
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
@@ -128,7 +128,7 @@ extern int HIP_TRACE_API;


 // Feature tests:
-#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
+#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
 // Device compile and not host compile:

 // 32-bit Atomics:
@@ -245,11 +245,11 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;

 #endif // defined __HCC__
 #if __HCC_OR_HIP_CLANG__
-extern "C" __device__ void* __hip_hc_malloc(size_t);
-extern "C" __device__ void* __hip_hc_free(void* ptr);
+extern "C" __device__ void* __hip_malloc(size_t);
+extern "C" __device__ void* __hip_free(void* ptr);

-static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
-static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
+static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
+static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }

 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
 template <typename... All>
@@ -331,13 +331,13 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri
 typedef int hipLaunchParm;

 template <typename... Args, typename F = void (*)(Args...)>
-inline void hipLaunchKernelGGL(F kernelName, const dim3& numblocks, const dim3& numthreads,
+inline void hipLaunchKernelGGL(F&& kernelName, const dim3& numblocks, const dim3& numthreads,
                               unsigned memperblock, hipStream_t streamId, Args... args) {
  kernelName<<<numblocks, numthreads, memperblock, streamId>>>(args...);
 }

 template <typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
-inline void hipLaunchKernel(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+inline void hipLaunchKernel(F&& kernel, const dim3& numBlocks, const dim3& dimBlocks,
                            std::uint32_t groupMemBytes, hipStream_t stream, Args... args) {
    hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, groupMemBytes, stream, hipLaunchParm{},
                       std::move(args)...);
@@ -423,6 +423,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
 #define hipGridDim_y gridDim.y
 #define hipGridDim_z gridDim.z

+#include <hip/hcc_detail/math_functions.h>
+
 #if __HIP_HCC_COMPAT_MODE__
 // Define HCC work item functions in terms of HIP builtin variables.
 #pragma push_macro("__DEFINE_HCC_FUNC")
@@ -462,7 +464,6 @@ hc_get_workitem_absolute_id(int dim)
 #undef __CUDA__
 #pragma pop_macro("__CUDA__")

-#include <hip/hcc_detail/math_functions.h>

 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
@@ -474,4 +475,6 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,

 #endif // defined(__clang__) && defined(__HIP__)

+#include <hip/hcc_detail/hip_memory.h>
+
 #endif  // HIP_HCC_DETAIL_RUNTIME_H
@@ -22,6 +22,7 @@ THE SOFTWARE.

 #pragma once

+#include "hip_fp16_math_fwd.h"
 #include "math_fwd.h"

 #include <hip/hcc_detail/host_defines.h>
@@ -1159,6 +1160,16 @@ long long llabs(long long x)
 #endif
 // END INTEGER

+__DEVICE__
+inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
+    return __ocml_fma_f16(x, y, z);
+}
+
+__DEVICE__
+inline float fma(float x, float y, float z) {
+    return fmaf(x, y, z);
+}
+
 #pragma push_macro("__DEF_FLOAT_FUN")
 #pragma push_macro("__DEF_FLOAT_FUN2")
 #pragma push_macro("__DEF_FLOAT_FUN2I")
@@ -1352,10 +1363,10 @@ __DEVICE__ inline static unsigned long long max(long long arg1, unsigned long lo
  return max((unsigned long long) arg1, arg2);
 }*/
 #else
-__DEVICE__ inline static int min(int arg1, int arg2) {
+__DEVICE__ inline int min(int arg1, int arg2) {
  return (arg1 < arg2) ? arg1 : arg2;
 }
-__DEVICE__ inline static int max(int arg1, int arg2) {
+__DEVICE__ inline int max(int arg1, int arg2) {
  return (arg1 > arg2) ? arg1 : arg2;
 }

@@ -118,6 +118,9 @@ typedef int hipLaunchParm;
    }
 #endif

+#define __clock() clock()
+#define __clock64() clock64()
+
 #endif

 #endif
@@ -25,7 +25,7 @@ popd () {
 function setupENV()
 {
    sudo apt-get update
-    sudo apt-get install dpkg-dev rpm doxygen libelf-dev
+    sudo apt-get install dpkg-dev rpm doxygen libelf-dev rename
 }

 function buildHIP()
@@ -27,70 +27,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include <atomic>

-//=================================================================================================
-/*
- Implementation of malloc and free device functions.
-
- This is the best place to put them because the device
- global variables need to be initialized at the start.
-*/
-__device__ char gpuHeap[SIZE_OF_HEAP];
-__device__ uint32_t gpuFlags[NUM_PAGES];
-
-__device__ void* __hip_hc_malloc(size_t size) {
-    char* heap = (char*)gpuHeap;
-    if (size > SIZE_OF_HEAP) {
-        return (void*)nullptr;
-    }
-    uint32_t totalThreads =
-        blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
-    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
-
-    uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
-    uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
-
-    uint32_t stride = size / SIZE_OF_PAGE;
-    uint32_t start = numHeapsPerWorkItem * currentWorkItem;
-
-    uint32_t k = 0;
-
-    while (gpuFlags[k] > 0) {
-        k++;
-    }
-
-    for (uint32_t i = 0; i < stride - 1; i++) {
-        gpuFlags[i + start + k] = 1;
-    }
-
-    gpuFlags[start + stride - 1 + k] = 2;
-
-    void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE);
-
-    return ptr;
-}
-
-__device__ void* __hip_hc_free(void* ptr) {
-    if (ptr == nullptr) {
-        return nullptr;
-    }
-
-    uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap;
-    uint32_t offsetPage = offsetByte / SIZE_OF_PAGE;
-
-    while (gpuFlags[offsetPage] != 0) {
-        if (gpuFlags[offsetPage] == 2) {
-            gpuFlags[offsetPage] = 0;
-            offsetPage++;
-            break;
-        } else {
-            gpuFlags[offsetPage] = 0;
-            offsetPage++;
-        }
-    }
-
-    return nullptr;
-}
-
 // abort
 __device__ void abort() { return hc::abort(); }

@@ -29,14 +29,6 @@ THE SOFTWARE.
 Heap size computation for malloc and free device functions.
 */

-#define NUM_PAGES_PER_THREAD 16
-#define SIZE_OF_PAGE 64
-#define NUM_THREADS_PER_CU 64
-#define NUM_CUS_PER_GPU 64  // Specific for r9 Nano
-#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU
-#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE
-#define SIZE_OF_HEAP SIZE_MALLOC
-
 #define HIP_SQRT_2 1.41421356237
 #define HIP_SQRT_PI 1.77245385091

@@ -62,9 +54,6 @@ THE SOFTWARE.

 #define HIP_PI 3.14159265358979323846

-__device__ void* __hip_hc_malloc(size_t size);
-__device__ void* __hip_hc_free(void* ptr);
-
 __device__ float __hip_erfinvf(float x);
 __device__ double __hip_erfinv(double x);

@@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
        return ihipLogStatus(hipErrorInvalidValue);
    }
    if (limit == hipLimitMallocHeapSize) {
-        *pValue = (size_t)SIZE_OF_HEAP;
+        *pValue = (size_t)__HIP_SIZE_OF_HEAP;
        return ihipLogStatus(hipSuccess);
    } else {
        return ihipLogStatus(hipErrorUnsupportedLimit);
@@ -1543,6 +1543,8 @@ hipError_t ihipMemset(void* dst, int  value, size_t sizeBytes, hipStream_t strea
 {
    hipError_t e = hipSuccess;

+    if (sizeBytes == 0) return e;
+
    if (stream && (dst != NULL)) {
        if(copyDataType == ihipMemsetDataTypeChar){
            if ((sizeBytes & 0x3) == 0) {
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/* HIT_START
+ * BUILD: %t %s NVCC_OPTIONS -std=c++11
+ * RUN: %t EXCLUDE_HIP_PLATFORM nvcc
+ * HIT_END
+ */
+#include "test_common.h"
+#include <iostream>
+#include <complex>
+
+// Tolerance for error
+const double tolerance = 1e-6;
+const bool verbose = false;
+
+#define BLKDIM_X 64
+#define BLKDIM_Y 1
+#define BLKDIM_Z 1
+#define NUM_BLK_X 1
+#define NUM_BLK_Y 1
+#define NUM_BLK_Z 1
+
+#define LEN (BLKDIM_X * BLKDIM_Y * BLKDIM_Z * NUM_BLK_X * NUM_BLK_Y * NUM_BLK_Z)
+
+#define ALL_FUN \
+  OP(add) \
+  OP(sub) \
+  OP(mul) \
+  OP(div)
+
+#define OP(x) CK_##x,
+enum CalcKind {
+  ALL_FUN
+};
+#undef OP
+
+#define OP(x) case CK_##x: return #x;
+std::string getName(enum CalcKind CK) {
+  switch(CK){
+  ALL_FUN
+  }
+}
+#undef OP
+
+// Calculates function.
+// If the function has one argument, B is ignored.
+#define ONE_ARG(func)                                                                              \
+    case CK_##func:                                                                                \
+        return std::func(A);
+
+template <typename FloatT>
+__device__ __host__ FloatT calc(FloatT A, FloatT B, enum CalcKind CK) {
+    switch (CK) {
+        case CK_add:
+            return A + B;
+        case CK_sub:
+            return A - B;
+        case CK_mul:
+            return A * B;
+        case CK_div:
+            return A / B;
+    }
+}
+
+// Allocate memory in kernel and save the address to pA and pB.
+// Copy value from A, B to allocated memory.
+template <typename FloatT>
+__global__ void kernel_alloc(FloatT* A, FloatT* B, FloatT** pA, FloatT** pB) {
+    int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
+        + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
+        + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
+        * hipBlockDim_y;
+    if (tx == 0) {
+        *pA = (FloatT*)malloc(sizeof(FloatT) * LEN);
+        *pB = (FloatT*)malloc(sizeof(FloatT) * LEN);
+        for (int i = 0; i < LEN; i++) {
+            (*pA)[i] = A[i];
+            (*pB)[i] = B[i];
+      }
+    }
+}
+
+// Do calculation using values saved in allocated memmory. pA, pB are buffers
+// containing the address of the device-side allocated array.
+template <typename FloatT>
+__global__ void kernel_free(FloatT** pA, FloatT** pB, FloatT* C, enum CalcKind CK) {
+    int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
+        + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
+        + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
+        * hipBlockDim_y;
+    C[tx] = calc<FloatT>((*pA)[tx], (*pB)[tx], CK);
+    if (tx == 0) {
+        free(*pA);
+        free(*pB);
+    }
+}
+
+template<typename FloatT>
+void test() {
+    FloatT *A, *Ad, *B, *Bd, *C, *Cd, *D;
+    A = new FloatT[LEN];
+    B = new FloatT[LEN];
+    C = new FloatT[LEN];
+    D = new FloatT[LEN];
+    hipMalloc((void**)&Ad, sizeof(FloatT) * LEN);
+    hipMalloc((void**)&Bd, sizeof(FloatT) * LEN);
+    hipMalloc((void**)&Cd, sizeof(FloatT) * LEN);
+
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = (i + 1) * 1.0f;
+        B[i] = A[i];
+        C[i] = A[i];
+    }
+    hipMemcpy(Ad, A, sizeof(FloatT) * LEN, hipMemcpyHostToDevice);
+    hipMemcpy(Bd, B, sizeof(FloatT) * LEN, hipMemcpyHostToDevice);
+
+    // Run kernel for a calculation kind and verify by comparing with host
+    // calculation result. Returns false if fails.
+    auto test_fun = [&](enum CalcKind CK) {
+      // kernel_alloc allocates memory on device side and initialize it.
+      // kernel_free uses allocated memory from kernel_alloc and does the
+      // calculation then free the memory.
+      // pA and pB are buffers to pass the device-side allocated memory address
+      // from kernel_alloc to kernel_free.
+      FloatT **pA, **pB;
+      hipMalloc((FloatT***)&pA, sizeof(FloatT*));
+      hipMalloc((FloatT***)&pB, sizeof(FloatT*));
+      dim3 blkDim(BLKDIM_X, BLKDIM_Y, BLKDIM_Z);
+      dim3 numBlk(NUM_BLK_X, NUM_BLK_Y, NUM_BLK_Z);
+      hipLaunchKernelGGL(kernel_alloc<FloatT>, numBlk, blkDim, 0, 0,
+          Ad, Bd, pA, pB);
+      hipDeviceSynchronize();
+      hipLaunchKernelGGL(kernel_free<FloatT>, numBlk, blkDim, 0, 0,
+          pA, pB, Cd, CK);
+      hipMemcpy(C, Cd, sizeof(FloatT) * LEN, hipMemcpyDeviceToHost);
+      hipFree(pA);
+      hipFree(pB);
+      for (int i = 0; i < LEN; i++) {
+          FloatT Expected = calc(A[i], B[i], CK);
+          FloatT error = std::abs(C[i] - Expected);
+          if (std::abs(Expected) > tolerance) error /= std::abs(Expected);
+          bool pass = error < tolerance;
+          if (verbose || !pass) {
+              std::cout << "Function: " << getName(CK) << " Operands: " << A[i] << " " << B[i]
+                        << " Result: " << C[i] << " Expected: " << Expected << " Error: " << error
+                        << " Pass: " << pass << std::endl;
+        }
+        if (!pass)
+          return false;
+      }
+      return true;
+    };
+
+#define OP(x) assert(test_fun(CK_##x));
+    ALL_FUN
+#undef OP
+
+    hipFree(Ad);
+    hipFree(Bd);
+    hipFree(Cd);
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    delete[] D;
+}
+
+int main() {
+  test<float>();
+  test<double>();
+  passed();
+  return 0;
+}
@@ -40,10 +40,10 @@ __global__ void floatMath(hipLaunchParm lp, float* In, float* Out) {
    Out[tid] = __exp10f(Out[tid]);
    Out[tid] = __expf(Out[tid]);
    Out[tid] = __frsqrt_rn(Out[tid]);
-    Out[tid] = __fsqrt_rd(Out[tid]);
-    Out[tid] = __fsqrt_rn(Out[tid]);
-    Out[tid] = __fsqrt_ru(Out[tid]);
-    Out[tid] = __fsqrt_rz(Out[tid]);
+    //Out[tid] = __fsqrt_rd(Out[tid]);
+    //Out[tid] = __fsqrt_rn(Out[tid]);
+    //Out[tid] = __fsqrt_ru(Out[tid]);
+    //Out[tid] = __fsqrt_rz(Out[tid]);
    Out[tid] = __log10f(Out[tid]);
    Out[tid] = __log2f(Out[tid]);
    Out[tid] = __logf(Out[tid]);
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 --gpu-architecture=sm_60
 * RUN: %t
 * HIT_END
 */
@@ -334,4 +334,4 @@ int main(int argc, char** argv) {
    hipDeviceReset();
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
-}
+}
@@ -33,8 +33,6 @@ THE SOFTWARE.
 #define LEN 512
 #define SIZE 2048

-struct TestClock {
-     
  static __global__ void kernel1(int* Ad) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      Ad[tid] = clock() + clock64() + __clock() + __clock64();
@@ -61,9 +59,8 @@ struct TestClock {
        assert(0 != A[i]);
    }
  }
-};

 int main() {
-  TestClock().run();
+  run();
  passed();
 }
@@ -0,0 +1,140 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/math_functions.h>
+#include <iostream>
+
+#define HIP_ASSERT(status) assert(status == hipSuccess)
+
+#define LEN 50
+#define SIZE (LEN * sizeof(bool))
+
+  __global__ void kernelTestFMA(bool *Ad) {
+    float f = 1.0f / 3.0f;
+    double d = f;
+    int i = 0;
+    auto Check = [&](bool Cond) { Ad[i++] = Cond; };
+    // f * f + 3.0f will be different if promoted to double.
+    float floatResult = fma(f, f, 3.0f);
+    double doubleResult = fma(d, d, 3.0);
+    Check(floatResult != doubleResult);
+
+    // check promote to float.
+    Check(fma(f, f, 3) == floatResult);
+    Check(fma(f, f, (char)3) == floatResult);
+    Check(fma(f, f, (unsigned char)3) == floatResult);
+    Check(fma(f, f, (short)3) == floatResult);
+    Check(fma(f, f, (unsigned short)3) == floatResult);
+    Check(fma(f, f, (int)3) == floatResult);
+    Check(fma(f, f, (unsigned int)3) == floatResult);
+    Check(fma(f, f, (long)3) == floatResult);
+    Check(fma(f, f, (unsigned long)3) == floatResult);
+    Check(fma(f, f, true) == fma(f, f, 1.0f));
+
+    // check promote to double.
+    Check(fma(d, (double)f, 3) == doubleResult);
+    Check(fma(d, (double)f, (char)3) == doubleResult);
+    Check(fma(d, (double)f, (unsigned char)3) == doubleResult);
+    Check(fma(d, (double)f, (short)3) == doubleResult);
+    Check(fma(d, (double)f, (unsigned short)3) == doubleResult);
+    Check(fma(d, (double)f, (int)3) == doubleResult); 
+    Check(fma(d, (double)f, (unsigned int)3) == doubleResult);
+    Check(fma(d, (double)f, (long)3) == doubleResult);
+    Check(fma(d, (double)f, (unsigned long)3) == doubleResult);
+    Check(fma(d, (double)f, true) == fma((double)f, (double)f, 1.0));
+
+    while (i < LEN)
+      Check(true);
+  }
+
+  void runTestFMA() {
+    bool *Ad;
+    bool A[LEN];
+    for (unsigned i = 0; i < LEN; i++) {
+      A[i] = 0;
+    }
+
+    HIP_ASSERT(hipMalloc((void **)&Ad, SIZE));
+    hipLaunchKernelGGL(kernelTestFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad);
+    HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
+
+    for (unsigned i = 0; i < LEN; i++) {
+      assert(A[i]);
+    }
+  }
+
+    __global__ void kernelTestHalfFMA(bool *Ad) {
+      _Float16 h = (_Float16)(1.0f/3.0f);
+      float f = h;
+      double d = f;
+      int i = 0;
+      auto Check = [&](bool Cond) { Ad[i++] = Cond; };
+      // h * h + 3 will be different if promoted to float.
+      _Float16 halfResult = fma(h, h, (_Float16)3);
+      float floatResult = fma(f, f, 3.0f);
+      double doubleResult = fma(d, d, 3.0);
+      Check(halfResult != floatResult);
+      Check(halfResult != doubleResult);
+
+      // check promote to half.
+      Check(fma(h, h, 3) == halfResult);
+      Check(fma(h, h, (char)3) == halfResult);
+      Check(fma(h, h, (unsigned char)3) == halfResult);
+      Check(fma(h, h, (short)3) == halfResult);
+      Check(fma(h, h, (unsigned short)3) == halfResult);
+      Check(fma(h, h, (int)3) == halfResult);
+      Check(fma(h, h, (unsigned int)3) == halfResult);
+      Check(fma(h, h, (long)3) == halfResult);
+      Check(fma(h, h, (unsigned long)3) == halfResult);
+      Check(fma(h, h, true) == fma(h, h, (_Float16)1));
+
+      while (i < LEN)
+        Check(true);
+    }
+
+  void runTestHalfFMA() {
+    bool *Ad;
+    bool A[LEN];
+    for (unsigned i = 0; i < LEN; i++) {
+      A[i] = 0;
+    }
+
+    HIP_ASSERT(hipMalloc((void **)&Ad, SIZE));
+    hipLaunchKernelGGL(kernelTestHalfFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad);
+    HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
+
+    for (unsigned i = 0; i < LEN; i++) {
+      assert(A[i]);
+    }
+  }
+
+int main() {
+  runTestFMA();
+  runTestHalfFMA();
+  passed();
+}
@@ -30,6 +30,7 @@ THE SOFTWARE.
 // Incorrect implementation causes compilation failure due to conflict
 // declartions.

+#include <new>
 #include <hip/math_functions.h>

 // Test __HIP_DEVICE_COMPILE__ is defined after math_functions.h
@@ -45,14 +46,6 @@ __device__ __host__ inline void throw_std_bad_alloc()
  #endif
 }

-// Test __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ and __HIP_ARCH_HAS_DYNAMIC_PARALLEL__
-// is defined. Eigen HIP/hcc/Half.h __ldg depends on this.
-#if !defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) ||                            \
-    !defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__)
-#error                                                                         \
-    "__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ or __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ not defined"
-#endif
-
 #include <hip/hip_runtime.h>
 #include "test_common.h"

@@ -33,7 +33,6 @@ THE SOFTWARE.
 #define LEN 512
 #define SIZE 2048

-struct TestPlacementNew {
  class A {
  public:
      __device__ A() {
@@ -63,9 +62,8 @@ struct TestPlacementNew {
        assert(i == A[i]);
    }
  }
-};

 int main() {
-  TestPlacementNew().run();
+  run();
  passed();
 }
@@ -34,21 +34,26 @@ THE SOFTWARE.

 #include <iostream>

+#define HIP_ASSERT(x) (assert((x) == hipSuccess))
+
 #define LEN 512
 #define SIZE LEN << 2

+#define TEST_DEBUG (0)
+
 __global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d,
                            float* sin_pd, float* cos_pd) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    sin_d[tid] = __sinf(In[tid]);
-    cos_d[tid] = __cosf(In[tid]);
-    tan_d[tid] = __tanf(In[tid]);
-    __sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]);
+    sin_d[tid] = sinf(In[tid]);
+    cos_d[tid] = cosf(In[tid]);
+    tan_d[tid] = tanf(In[tid]);
+    sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]);
 }

 int main() {
    float *In, *sin_h, *cos_h, *tan_h, *sin_ph, *cos_ph;
    float *In_d, *sin_d, *cos_d, *tan_d, *sin_pd, *cos_pd;
+    int errors = 0;
    In = new float[LEN];
    sin_h = new float[LEN];
    cos_h = new float[LEN];
@@ -63,14 +68,16 @@ int main() {
        sin_ph[i] = 0.0f;
        cos_ph[i] = 0.0f;
    }
-    hipMalloc((void**)&In_d, SIZE);
-    hipMalloc((void**)&sin_d, SIZE);
-    hipMalloc((void**)&cos_d, SIZE);
-    hipMalloc((void**)&tan_d, SIZE);
-    hipMalloc((void**)&sin_pd, SIZE);
-    hipMalloc((void**)&cos_pd, SIZE);
+    HIP_ASSERT(hipMalloc((void**)&In_d, SIZE));
+    HIP_ASSERT(hipMalloc((void**)&sin_d, SIZE));
+    HIP_ASSERT(hipMalloc((void**)&cos_d, SIZE));
+    HIP_ASSERT(hipMalloc((void**)&tan_d, SIZE));
+    HIP_ASSERT(hipMalloc((void**)&sin_pd, SIZE));
+    HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE));
+
    hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, In_d, sin_d, cos_d, tan_d,
+    hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
+                    In_d, sin_d, cos_d, tan_d,
                    sin_pd, cos_pd);
    hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(cos_h, cos_d, SIZE, hipMemcpyDeviceToHost);
@@ -79,8 +86,28 @@ int main() {
    hipMemcpy(cos_ph, cos_pd, SIZE, hipMemcpyDeviceToHost);
    for (int i = 0; i < LEN; i++) {
        if (sin_h[i] != sin_ph[i] || cos_h[i] != cos_ph[i] || tan_h[i] * cos_h[i] != sin_h[i]) {
-            std::cout << "Failed!" << std::endl;
+            errors++;
+#if TEST_DEBUG
+            std::cout << "Check Failed!" << std::endl;
+            std::cout << " sin_h: " << sin_h[i] << " sin_ph: " << sin_ph[i] << "\n"
+                      << " cos_h: " << cos_h[i] << " cos_ph:" << cos_ph[i] << "\n"
+                      << " tan_h * cos_h: " << tan_h[i] * cos_h[i] << " sin_h[i]: " << sin_h[i] << "\n";
+#endif
        }
    }
-    passed();
+
+    HIP_ASSERT(hipFree(In_d));
+    HIP_ASSERT(hipFree(sin_d));
+    HIP_ASSERT(hipFree(cos_d));
+    HIP_ASSERT(hipFree(tan_d));
+    HIP_ASSERT(hipFree(sin_pd));
+    HIP_ASSERT(hipFree(cos_pd));
+
+    if (errors != 0) {
+        std::cout << "hip_trig FAILED!" << std::endl;
+        return -1;
+    } else {
+        std::cout << "hip_trig PASSED!" << std::endl;
+    }
+    return errors;
 }
@@ -38,7 +38,10 @@ int getDeviceNumber() {
    string str;
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    if (!(in = popen("./directed_tests/hipEnvVar -c", "r"))) {
-        return 1;
+        // Check at same level
+        if (!(in = popen("./hipEnvVar -c", "r"))) {
+            return 1;
+        }
    }
    while (fgets(buff, 512, in) != NULL) {
        cout << buff;
@@ -54,7 +57,11 @@ void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {
    str += std::to_string(deviceID);
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    if (!(in = popen(str.c_str(), "r"))) {
-        exit(1);
+        // Check at same level
+        if (!(in = popen("./hipEnvVar -d ", "r"))) {
+            exit(1);
+        }
+
    }
    while (fgets(pciBusID, 100, in) != NULL) {
        cout << pciBusID;
@@ -0,0 +1,91 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <iostream>
+#include "test_common.h"
+
+#define HIP_ASSERT(status) assert(status == hipSuccess)
+
+#define LEN 512
+#define SIZE 2048
+
+  __constant__ int ConstantGlobalVar = 123;
+
+  static __global__ void kernel(int* Ad) {
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = ConstantGlobalVar;
+  }
+
+  void runTestConstantGlobalVar() {
+    int *A, *Ad;
+    A = new int[LEN];
+    for (unsigned i = 0; i < LEN; i++) {
+        A[i] = 0;
+    }
+
+    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
+    hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+    HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
+
+    for (unsigned i = 0; i < LEN; i++) {
+        assert(123 == A[i]);
+    }
+  }
+
+  __device__ int GlobalArray[LEN];
+
+  static __global__ void kernelWrite() {
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      GlobalArray[tid] = tid;
+  }
+  static __global__ void kernelRead(int* Ad) {
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = GlobalArray[tid];
+  }
+
+  void runTestGlobalArray() {
+    int *A, *Ad;
+    A = new int[LEN];
+    for (unsigned i = 0; i < LEN; i++) {
+        A[i] = 0;
+    }
+
+    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
+    hipLaunchKernelGGL(kernelWrite, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0);
+    hipLaunchKernelGGL(kernelRead, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+    HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
+
+    for (unsigned i = 0; i < LEN; i++) {
+        assert(i == A[i]);
+    }
+  }
+
+int main() {
+  runTestConstantGlobalVar();
+  runTestGlobalArray();
+  passed();
+}
@@ -170,8 +170,8 @@ void runTests(int64_t numElements) {
    // for (int waitStart=0; waitStart<2; waitStart++) {
    for (int waitStart = 1; waitStart >= 0; waitStart--) {
        unsigned W = waitStart ? 0x1000 : 0;
-        test(W | 0x01, C_d, C_h, numElements, 0, waitStart, syncNone);
-        test(W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone);
+        test(W | 0x01, C_d, C_h, numElements, 0, 0, syncNone);
+        test(W | 0x02, C_d, C_h, numElements, stream, 0, syncNone);
        test(W | 0x04, C_d, C_h, numElements, 0, waitStart, syncStream);
        test(W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream);
        test(W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent);
@@ -56,7 +56,7 @@ int runTest() {
    int testResult = 1;
    float *texBuf;
    float val[N], output[N];
-    size_t size = 0;
+    size_t offset = 0;
    float *devBuf;
    for (int i = 0; i < N; i++) {
        val[i] = (float)i;
@@ -74,7 +74,8 @@ int runTest() {
    tex.filterMode = hipFilterModePoint;
    tex.normalized = 0;

-    HIPCHECK(hipBindTexture(&size, tex, (void *)texBuf, chanDesc, N * sizeof(float)));
+    HIPCHECK(hipBindTexture(&offset, tex, (void *)texBuf, chanDesc, N * sizeof(float)));
+    HIPCHECK(hipGetTextureAlignmentOffset(&offset,&tex));

    dim3 dimBlock(64, 1, 1);
    dim3 dimGrid(N / dimBlock.x, 1, 1);
@@ -1,78 +0,0 @@
-/*
-Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../test_common.cpp
- * RUN: %t
- * HIT_END
- */
-#include <hip/hip_runtime.h>
-#include "test_common.h"
-
-using namespace std;
-#define R 8 //rows, height
-#define C 8 //columns, width
-
-texture<int, hipTextureType2D,hipReadModeElementType> tex;
-
-bool runTest(void);
-
-int main(int argc, char** argv) {
-    bool testResult=runTest();
-
-    if (testResult) {
-        passed();
-    } else {
-        exit(EXIT_FAILURE);
-    }
-}
-
-bool runTest()
-{
-int val[R][C],i,j;
-size_t offset;
-
-for(i=0;i<R;i++)
-    for(j=0;j<C;j++)
-	{
-	 val[i][j]=(i+1)*(j+1);
-	}
-hipChannelFormatDesc chan_desc=hipCreateChannelDesc(32,0,0,0,hipChannelFormatKindSigned);
-hipArray *hipArray;
-HIPCHECK(hipMallocArray(&hipArray, &chan_desc,C,R,0));
-
-HIPCHECK(hipMemcpyToArray(hipArray,0,0, val, R*C*sizeof(int), hipMemcpyHostToDevice));
-
-tex.addressMode[0]=hipAddressModeWrap;
-tex.addressMode[1]=hipAddressModeWrap;
-tex.filterMode=hipFilterModePoint;
-tex.normalized=0;
-
-HIPCHECK(hipBindTextureToArray(&tex, hipArray, &chan_desc));
-HIPCHECK(hipGetTextureAlignmentOffset(&offset,&tex));
-HIPCHECK(hipUnbindTexture(&tex));
-HIPCHECK(hipFreeArray(hipArray));
-if(offset != 0)
-    return false;
-  else
-    return true;
-}