Merge 'hipamd/amd-staging' into amd-staging

Change-Id: I1bbfe6261643fce1c3ce407452ac545f0111e893 [ROCm/clr commit: 854fb9dc95]
2023-03-26 15:42:30 +00:00
@@ -0,0 +1,10 @@
+Language: Cpp
+BasedOnStyle: Google
+AlignEscapedNewlinesLeft: false
+AlignOperands: false
+ColumnLimit: 100
+AlwaysBreakTemplateDeclarations: false
+DerivePointerAlignment: false
+IndentFunctionDeclarationAfterType: false
+MaxEmptyLinesToKeep: 2
+SortIncludes: false
@@ -0,0 +1,20 @@
+# Set the default behavior, in case people don't have core.autocrlf set.
+* text=auto
+
+# Explicitly declare text files you want to always be normalized and converted
+# to have LF line endings on checkout.
+*.c text eol=lf
+*.cpp text eol=lf
+*.cc text eol=lf
+*.h text eol=lf
+*.hpp text eol=lf
+*.txt text eol=lf
+
+# Strip trailing whitespace from these file types through the "trimspace"
+# clean filter when they are staged.
+# The filter is not defined by the repository itself; run the command below
+# once per clone, before adding modified file(s) to the staging area:
+# git config filter.trimspace.clean 'sed -e "s/[[:space:]]*$//g"'
+# Every pattern must name that same filter ("trimspace"); any other filter
+# name is not configured by the command above and would do nothing.
+*.cpp filter=trimspace
+*.c filter=trimspace
+*.h filter=trimspace
+*.hpp filter=trimspace
+*.md filter=trimspace
@@ -0,0 +1,16 @@
+.*
+!.gitignore
+*.o
+*.exe
+*.swp
+lib
+packages
+build
+bin/hipInfo
+bin/hipBusBandwidth
+bin/hipDispatchLatency
+bin/hipify-clang
+tags
+samples/1_Utils/hipInfo/hipInfo
+samples/1_Utils/hipBusBandwidth/hipBusBandwidth
+samples/1_Utils/hipDispatchLatency/hipDispatchLatency
@@ -0,0 +1,480 @@
+# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+cmake_minimum_required(VERSION 3.16.8)
+project(hip)
+
+include(GNUInstallDirs)
+
+# sample command for hip-rocclr runtime, you'll need to have rocclr built
+# ROCM_PATH is the path where ROCM is installed
+#  For shared lib of hip-rocclr runtime
+#    For release version
+#      cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
+#    For debug version
+#      cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
+#  For static lib of hip-rocclr runtime
+#    For release version
+#      cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
+#    For debug version
+#      cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
+#  If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "<ROCM_PATH>/hip".
+#  By default, CMake will search for a folder named vdi or ROCclr relative to the current path. Specify -DROCCLR_PATH=$ROCCLR_DIR if rocclr source is in obscure location.
+#  By default, CMake will search for a folder named opencl or ROCm-OpenCL-Runtime relative to the current path. Specify -DAMD_OPENCL_PATH=$OPENCL_DIR if opencl source is in obscure location.
+list(APPEND CMAKE_MODULE_PATH ${HIP_COMMON_DIR}/cmake)
+
+# required to add the right link to libhsa-runtime in install/lib path
+# CMAKE_PREFIX_PATH is used as rpath to search for libs outside HIP
+set(CMAKE_INSTALL_RPATH "${CMAKE_PREFIX_PATH}/${CMAKE_INSTALL_LIBDIR}")
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+#############################
+# Options
+#############################
+option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF)
+option(__HIP_ENABLE_PCH "Enable/Disable pre-compiled hip headers" ON)
+option(HIP_OFFICIAL_BUILD "Enable/Disable for mainline/staging builds" OFF)
+option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON)
+set(HIPCC_BIN_DIR "" CACHE STRING "HIPCC and HIPCONFIG binary directories")
+
+if(__HIP_ENABLE_PCH)
+  set(_pchStatus 1)
+else()
+  set(_pchStatus 0)
+endif()
+
+message(STATUS "HIPCC_BIN_DIR found at ${HIPCC_BIN_DIR}")
+message(STATUS "HIP_COMMON_DIR found at ${HIP_COMMON_DIR}")
+set(HIP_COMMON_INCLUDE_DIR ${HIP_COMMON_DIR}/include)
+set(HIP_COMMON_BIN_DIR ${HIP_COMMON_DIR}/bin)
+set(__HIPCONFIG_EXECUTABLE__ ${HIP_COMMON_DIR}/bin/hipconfig)
+
+#############################
+# Setup config generation
+#############################
+string(TIMESTAMP _timestamp UTC)
+set(_versionInfo "# Auto-generated by cmake\n")
+set(_buildInfo "# Auto-generated by cmake on ${_timestamp} UTC\n")
+# Append one "NAME=value" line to a config-text accumulator.
+#   _configfile: name of the variable that accumulates the file contents
+#   _variable:   name of the variable to record; its current value is written
+# This is a macro, so the accumulator is modified directly in the caller's scope.
+macro(add_to_config _configfile _variable)
+    string(APPEND ${_configfile} "${_variable}=${${_variable}}\n")
+endmacro()
+
+#############################
+# Setup version information
+#############################
+find_package(Perl REQUIRED)
+
+# Determine HIP_BASE_VERSION
+set(ENV{HIP_PATH} "")
+file(STRINGS ${HIP_COMMON_DIR}/VERSION VERSION_LIST REGEX "^[0-9]+")
+list(GET VERSION_LIST 0 HIP_VERSION_MAJOR)
+list(GET VERSION_LIST 1 HIP_VERSION_MINOR)
+list(GET VERSION_LIST 2 HIP_VERSION_PATCH)
+set(HIP_VERSION_GITDATE 0)
+
+find_package(Git)
+
+# FIXME: Two different version strings used.
+# Below we use UNIX commands, not compatible with Windows.
+if(GIT_FOUND)
+  # use the commit date, instead of build date
+  execute_process(COMMAND ${GIT_EXECUTABLE} show -s --format=%ct
+    RESULT_VARIABLE git_result
+    OUTPUT_VARIABLE git_output
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(git_result EQUAL 0)
+    set(HIP_VERSION_UNIXDATE ${git_output})
+  endif()
+
+  # get date information based on UTC
+  # use the last two digits of year + week number + day in the week as HIP_VERSION_GITDATE
+  execute_process(COMMAND ${PERL_EXECUTABLE} "-MPOSIX=strftime" "-le" "print strftime \'%y%W%w\',gmtime(${HIP_VERSION_UNIXDATE})"
+    RESULT_VARIABLE git_result
+    OUTPUT_VARIABLE git_output
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(git_result EQUAL 0)
+    set(HIP_VERSION_GITDATE ${git_output})
+  endif()
+
+  # get commit short hash
+  execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    RESULT_VARIABLE git_result
+    OUTPUT_VARIABLE git_output
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(git_result EQUAL 0)
+    set(HIP_VERSION_GITHASH ${git_output})
+  endif()
+
+  set(HIP_VERSION_BUILD_ID 0)
+  set(HIP_VERSION_BUILD_NAME "")
+  if(NOT DEFINED ENV{HIP_OFFICIAL_BUILD} AND NOT HIP_OFFICIAL_BUILD)
+    set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE})
+  endif()
+
+  if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
+    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_PATCH}.$ENV{ROCM_LIBPATCH_VERSION})
+  else()
+    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_PATCH}-${HIP_VERSION_GITHASH})
+  endif()
+else()
+  set(HIP_VERSION_BUILD_ID 0)
+  set(HIP_VERSION_BUILD_NAME "")
+  # FIXME: Some parts depend on this being set.
+  set(HIP_PACKAGING_VERSION_PATCH "0")
+endif()
+
+## Debian package specific variables
+if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
+  set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
+else()
+  set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" )
+endif()
+message (STATUS "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )
+
+## RPM package specific variables
+if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} )
+  set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} )
+else()
+  set ( CPACK_RPM_PACKAGE_RELEASE "local" )
+endif()
+
+## 'dist' breaks manual builds on debian systems due to empty Provides
+execute_process( COMMAND rpm --eval %{?dist}
+                 RESULT_VARIABLE PROC_RESULT
+                 OUTPUT_VARIABLE EVAL_RESULT
+                 OUTPUT_STRIP_TRAILING_WHITESPACE )
+
+if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
+  string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
+endif()
+message(STATUS "CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
+
+add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
+add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
+add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
+
+add_to_config(_versionInfo HIP_VERSION_MAJOR)
+add_to_config(_versionInfo HIP_VERSION_MINOR)
+add_to_config(_versionInfo HIP_VERSION_PATCH)
+add_to_config(_versionInfo HIP_VERSION_GITHASH)
+
+# Library version components. The patch component is chosen, in order:
+#   1. ROCM_PATCH_VERSION, when it is set to a true value;
+#   2. HIP_VERSION_PATCH with the git short hash appended, when a hash was found;
+#   3. HIP_VERSION_PATCH alone.
+set (HIP_LIB_VERSION_MAJOR ${HIP_VERSION_MAJOR})
+set (HIP_LIB_VERSION_MINOR ${HIP_VERSION_MINOR})
+# Test the variable by name. Writing "${ROCM_PATCH_VERSION}" here would expand
+# the value first and then have if() re-interpret that value as a variable
+# name or constant.
+if (ROCM_PATCH_VERSION)
+   set (HIP_LIB_VERSION_PATCH ${ROCM_PATCH_VERSION})
+elseif (DEFINED HIP_VERSION_GITHASH)
+   set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH}-${HIP_VERSION_GITHASH})
+else ()
+   set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
+endif ()
+set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
+if (DEFINED ENV{ROCM_RPATH})
+    set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
+    set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+    set (CMAKE_SKIP_BUILD_RPATH TRUE)
+    set (CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
+endif ()
+
+# overwrite HIP_VERSION_PATCH for packaging
+set(HIP_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH})
+
+# Remove when CI is updated
+if(HIP_PLATFORM STREQUAL "rocclr")
+   set(HIP_PLATFORM "amd")
+endif()
+#############################
+# Configure variables
+#############################
+# Determine HIP_PLATFORM
+if(NOT DEFINED HIP_PLATFORM)
+    if(NOT DEFINED ENV{HIP_PLATFORM})
+        execute_process(COMMAND ${__HIPCONFIG_EXECUTABLE__} --platform
+            OUTPUT_VARIABLE HIP_PLATFORM
+            OUTPUT_STRIP_TRAILING_WHITESPACE)
+    else()
+        set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE STRING "HIP Platform")
+    endif()
+endif()
+message(STATUS "HIP Platform: " ${HIP_PLATFORM})
+
+if(HIP_PLATFORM STREQUAL "nvidia")
+    set(HIP_RUNTIME "cuda"  CACHE STRING "HIP Runtime")
+    set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler")
+elseif(HIP_PLATFORM STREQUAL "amd")
+    set(HIP_RUNTIME "rocclr" CACHE STRING "HIP Runtime")
+    set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler")
+else()
+    message(FATAL_ERROR "Unexpected HIP_PLATFORM: " ${HIP_PLATFORM})
+endif()
+
+message(STATUS "HIP Runtime: " ${HIP_RUNTIME})
+message(STATUS "HIP Compiler: " ${HIP_COMPILER})
+
+add_to_config(_buildInfo HIP_RUNTIME)
+add_to_config(_buildInfo HIP_COMPILER)
+
+# Set default build type
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+if (NOT DEFINED ROCM_PATH )
+     set ( ROCM_PATH "/opt/rocm"  CACHE STRING "Default ROCM installation directory." )
+endif ()
+message (STATUS "ROCM Installation path(ROCM_PATH): ${ROCM_PATH}")
+
+# Determine HIP install path
+if (UNIX)
+    set(HIP_DEFAULT_INSTALL_PREFIX "${ROCM_PATH}")
+endif()
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    set(CMAKE_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Installation path for HIP" FORCE)
+endif()
+
+if(DEV_LOG_ENABLE MATCHES "yes")
+  add_definitions(-DDEV_LOG_ENABLE)
+endif()
+
+# Set default install path as "${ROCM_PATH}", can override the path from cmake build.
+set(CPACK_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Package Installation path for HIP")
+
+if(IS_ABSOLUTE ${CMAKE_INSTALL_PREFIX})
+    message(STATUS "HIP will be installed in: " ${CMAKE_INSTALL_PREFIX})
+else()
+    message(FATAL_ERROR "Don't know where to install HIP. Please specify absolute path using -DCMAKE_INSTALL_PREFIX")
+endif()
+
+# Installer package location: CPACK_SET_DESTDIR defaults to ON; only when it is
+# switched off is CPACK_PACKAGING_INSTALL_PREFIX given a default of ROCM_PATH.
+set(CPACK_SET_DESTDIR ON CACHE BOOL "Installer package will install hip to CMAKE_INSTALL_PREFIX instead of CPACK_PACKAGING_INSTALL_PREFIX")
+if(NOT CPACK_SET_DESTDIR)
+  set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Default installation path of hcc installer package")
+endif()
+
+#############################
+# Build steps
+#############################
+set(BIN_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
+set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR})
+set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
+set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip)
+set(CONFIG_LANG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip-lang)
+set(CONFIG_RTC_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hiprtc)
+
+# Build clang hipify if enabled
+if (BUILD_HIPIFY_CLANG)
+    add_subdirectory(hipify-clang)
+endif()
+
+# Generate hip_version.h
+set(_versionInfoHeader
+"// Auto-generated by cmake\n
+#ifndef HIP_VERSION_H
+#define HIP_VERSION_H\n
+#define HIP_VERSION_MAJOR ${HIP_VERSION_MAJOR}
+#define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
+#define HIP_VERSION_PATCH ${HIP_VERSION_PATCH}
+#define HIP_VERSION_GITHASH \"${HIP_VERSION_GITHASH}\"
+#define HIP_VERSION_BUILD_ID ${HIP_VERSION_BUILD_ID}
+#define HIP_VERSION_BUILD_NAME \"${HIP_VERSION_BUILD_NAME}\"
+#define HIP_VERSION    (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)\n
+#define __HIP_HAS_GET_PCH ${_pchStatus}\n
+#endif\n
+")
+file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
+
+if(HIP_RUNTIME STREQUAL "rocclr")
+	add_subdirectory(src)
+endif()
+
+# Generate .hipInfo
+file(WRITE "${PROJECT_BINARY_DIR}/.hipInfo" ${_buildInfo})
+
+# Generate .hipVersion
+file(WRITE "${PROJECT_BINARY_DIR}/.hipVersion" ${_versionInfo})
+
+# Build doxygen documentation
+find_program(DOXYGEN_EXE doxygen)
+if(DOXYGEN_EXE)
+    add_custom_target(doc COMMAND HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR} ${DOXYGEN_EXE} ${CMAKE_CURRENT_SOURCE_DIR}/docs/doxygen-input/doxy.cfg
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/docs)
+endif()
+
+#############################
+# Install steps
+#############################
+
+# Install .hipInfo
+install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+# Install .hipVersion
+install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+# Install src, bin, include & cmake if necessary
+execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
+    RESULT_VARIABLE INSTALL_SOURCE)
+if(NOT ${INSTALL_SOURCE} EQUAL 0)
+    if(WIN32)
+      install(DIRECTORY ${HIP_COMMON_BIN_DIR} DESTINATION . USE_SOURCE_PERMISSIONS)
+      if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/src/" DESTINATION ${CMAKE_INSTALL_BINDIR}
+                FILES_MATCHING PATTERN "*.pdb"
+                               PATTERN "*.ilk"
+                               PATTERN "CMakeFiles" EXCLUDE
+                               PATTERN "hip_rtc_gen" EXCLUDE
+                               PATTERN "libelf" EXCLUDE
+                               PATTERN "loader" EXCLUDE
+                               PATTERN "pal" EXCLUDE
+                               PATTERN "libamdhsacode" EXCLUDE)
+      endif()
+    else()
+      # Exclude .bat files on Linux.
+      #Hip bin files moved to /opt/rocm/bin and the file permission need to set properly
+      install(DIRECTORY ${HIP_COMMON_BIN_DIR} DESTINATION . USE_SOURCE_PERMISSIONS
+              DIRECTORY_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
+              PATTERN *.bat EXCLUDE)
+    endif()
+
+    if(WIN32) #not required for flat folder structure
+      # The following two lines will be removed after upstream updation
+      install(CODE "MESSAGE(\"Removing ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}\")")
+      install(CODE "file(REMOVE_RECURSE ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})")
+    endif()
+
+    install(DIRECTORY include DESTINATION .)
+    install(DIRECTORY ${HIP_COMMON_INCLUDE_DIR}/hip/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/)
+    if(WIN32)
+      install(DIRECTORY ${HIP_COMMON_DIR}/cmake DESTINATION .)
+    else()
+      install(DIRECTORY ${HIP_COMMON_DIR}/cmake/ DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR})
+    endif()
+endif()
+
+# Install generated headers
+# FIXME: Associate with individual targets.
+if(HIP_PLATFORM STREQUAL "amd")
+install(FILES ${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/amd_detail)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin DESTINATION . USE_SOURCE_PERMISSIONS)
+endif()
+install(FILES ${PROJECT_BINARY_DIR}/include/hip/hip_version.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip)
+
+# Install the hipcc.bin / hipconfig.bin executables located in HIPCC_BIN_DIR,
+# but only when that directory was provided (the cache default is "").
+# The expansion is quoted so the comparison stays well-formed when the variable
+# is empty; unquoted, the argument disappears and the condition degenerates to
+# "NOT STREQUAL \"\"".
+if (NOT "${HIPCC_BIN_DIR}" STREQUAL "")
+  # Normalize to CMake-style forward slashes before composing the file paths.
+  file(TO_CMAKE_PATH "${HIPCC_BIN_DIR}" HIPCC_BIN_DIR)
+  set(hipcc_bin ${HIPCC_BIN_DIR}/hipcc.bin)
+  set(hipconfig_bin ${HIPCC_BIN_DIR}/hipconfig.bin)
+  if(WIN32)
+    # On Windows the executables carry an additional .exe suffix.
+    set(hipcc_bin ${hipcc_bin}.exe)
+    set(hipconfig_bin ${hipconfig_bin}.exe)
+  endif()
+  install(PROGRAMS ${hipcc_bin} DESTINATION bin)
+  install(PROGRAMS ${hipconfig_bin} DESTINATION bin)
+endif()
+
+#############################
+# hip-config
+#############################
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(
+    hip-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
+    INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
+    PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
+    )
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
+    VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
+    COMPATIBILITY SameMajorVersion
+    )
+install(
+    FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
+    DESTINATION
+    ${CONFIG_PACKAGE_INSTALL_DIR}
+    )
+# Packaging invokes UNIX commands, which are not available on Windows.
+
+if(NOT WIN32)
+    add_subdirectory(packaging)
+endif()
+
+#############################
+# Code formatting
+#############################
+# Target: clangformat
+find_program(CLANGFORMAT_EXE clang-format PATHS ${HCC_HOME}/bin)
+if(CLANGFORMAT_EXE)
+    file(GLOB_RECURSE FORMAT_SOURCE_FILE_LIST *.cpp *.hpp *.h)
+    add_custom_target(clangformat COMMAND ${CLANGFORMAT_EXE} -style=file -i ${FORMAT_SOURCE_FILE_LIST}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
+
+#############################
+# Testing steps
+#############################
+# HIT is not compatible with Windows
+if(NOT WIN32)
+set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
+if(HIP_PLATFORM STREQUAL "nvidia")
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${HIP_ROOT_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
+endif()
+execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_INCLUDE_DIR}/hip/" "${HIP_ROOT_DIR}/include/hip/" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
+execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_DIR}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
+if(${RUN_HIT} EQUAL 0)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
+endif()
+if(HIP_CATCH_TEST EQUAL "1")
+	message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
+else()
+    if(${RUN_HIT} EQUAL 0)
+        set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
+        include(${HIP_COMMON_DIR}/tests/hit/HIT.cmake)
+        include(${HIP_COMMON_DIR}/tests/Tests.cmake)
+    else()
+        message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
+    endif()
+endif()
+endif()
+
+#############################
+# Code analysis
+#############################
+# Target: clang
+if(HIP_HIPCC_EXECUTABLE)
+    add_custom_target(analyze
+        COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext  -isystem ${ROCM_PATH}/${CMAKE_INSTALL_INCLUDEDIR} -Wno-unused-command-line-argument -I${ROCM_PATH}/${CMAKE_INSTALL_INCLUDEDIR} -c  src/*.cpp -Iinclude/ -I./
+    WORKING_DIRECTORY ${HIP_SRC_PATH})
+    if(CPPCHECK_EXE)
+        add_dependencies(analyze cppcheck)
+    endif()
+endif()
+
+#File reorg Backward compatibility function
+if(NOT WIN32)
+    if(FILE_REORG_BACKWARD_COMPATIBILITY)
+        include(hip-backward-compat.cmake)
+    endif()
+endif()
@@ -0,0 +1,62 @@
+## Prerequisites
+
+-   Install mesa-common-dev
+-   Either build or install [COMGR](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport), [CLANG](https://github.com/RadeonOpenCompute/llvm-project) and [Device Library](https://github.com/RadeonOpenCompute/ROCm-Device-Libs)
+
+## Branch of repository
+
+Before getting the HIP source code, set the variable HIP_BRANCH to the desired branch of the repository.
+For example, for ROCm5.0 release branch, set
+```
+export HIP_BRANCH=rocm-5.0.x
+```
+
+ROCm5.1 release branch, set
+```
+export HIP_BRANCH=rocm-5.1.x
+```
+Future branches follow a similar format.
+
+## Getting the source code
+
+```bash
+git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hipamd.git
+git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hip.git
+git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/ROCclr.git
+git clone -b $HIP_BRANCH https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git
+```
+
+## Set the environment variables
+
+```bash
+export HIPAMD_DIR="$(readlink -f hipamd)"
+export HIP_DIR="$(readlink -f hip)"
+export ROCCLR_DIR="$(readlink -f ROCclr)"
+export OPENCL_DIR="$(readlink -f ROCm-OpenCL-Runtime)"
+```
+
+## Build HIPAMD
+
+The commands to build hipamd are as follows:
+```bash
+cd "$HIPAMD_DIR"
+mkdir -p build; cd build
+cmake -DHIP_COMMON_DIR=$HIP_DIR -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" ..
+make -j$(nproc)
+sudo make install
+```
+
+Please note that HIP_COMMON_DIR must point to the HIP common ([HIP](https://github.com/ROCm-Developer-Tools/HIP/)) source code.
+By default, the release version of hipamd is built, and hip is installed to the default path <ROCM_PATH>/hip.
+
+Developers can use the cmake option CMAKE_INSTALL_PREFIX to define the path where hip is to be installed. The commands to build are as follows:
+```bash
+cd "$HIPAMD_DIR"
+mkdir -p build; cd build
+cmake -DHIP_COMMON_DIR=$HIP_DIR -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=$PWD/install ..
+make -j$(nproc)
+sudo make install
+```
+
+After installation, make sure HIP_PATH points to the path where hip is installed.
+
@@ -0,0 +1,20 @@
+Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
@@ -0,0 +1,24 @@
+## What is this repository for? ###
+
+This repository provides [HIP](https://github.com/ROCm-Developer-Tools/HIP) implementation specifically for AMD platform.
+
+## DISCLAIMER
+
+The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes. THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
+
+© 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+
+## Repository branches:
+
+The hipamd repository maintains several branches. The branches that are of importance are:
+
+* Main branch: This is the stable branch. It is up to date with the latest release branch, for example, if the latest HIP release is rocm-4.4, main branch will be the repository based on this release.
+* Develop branch: This is the default branch, on which new features are still under development and visible. While this may be of interest to many, it should be noted that this branch and the features under development might not be stable.
+* Release branches: These are branches corresponding to each ROCM release, listed with release tags, such as rocm-4.4, etc.
+
+## Release tagging:
+
+hipamd releases typically follow a naming convention for each ROCM release to help differentiate them.
+
+* rocm x.yy: These are the stable releases based on the ROCM release.
+  This type of release is typically made once a month.
@@ -0,0 +1,265 @@
+#!/bin/bash
+
+#| Usage: roc-obj [-h] [-t REGEXP] [-o OUTDIR] [-I REPLACE-STRING|-i] [-d]
+#|                EXECUTABLE... [: [SUFFIX COMMAND [ARGS...] ;]...]
+#|
+#| Wrapper for roc-obj-ls and roc-obj-extract which extracts code objects
+#| embedded in each EXECUTABLE and optionally applies COMMANDs to them.
+#|
+#| If the POSIX extended regular expression REGEXP is specified, only embedded
+#| code objects whose Target ID matches REGEXP are extracted; otherwise all
+#| code objects are extracted.
+#|
+#| If the directory path OUTDIR is specified, it is created if it does not
+#| already exist, and the code objects are extracted into it; otherwise they
+#| are extracted into the current working directory.
+#|
+#| The extracted files are named by appending a ":" followed by the Target ID
+#| of the extracted code object to the input filename EXECUTABLE they were
+#| extracted from.
+#|
+#| If the list of EXECUTABLE arguments is terminated with ":" then after all
+#| selected files are successfully extracted, zero or more additional embedded
+#| command-lines, separated by ";", are read from the command-line starting
+#| after the ":". These must specify a SUFFIX used to name the output of the
+#| corresponding COMMAND, along with the COMMAND name and any ARGS to it.
+#|
+#| Then each COMMAND is executed, as if by a POSIX "execvp" function, once for
+#| each embedded code object that was created in OUTDIR. (Note: Typically this
+#| means the user must ensure the commands are present in at least one
+#| directory of the "PATH" environment variable.) For each execution of
+#| COMMAND:
+#|
+#| If REPLACE-STRING is specified, all instances of REPLACE-STRING in ARGS are
+#| replaced with the file path of the extracted code object before executing
+#| COMMAND.
+#|
+#| The standard input is redirected from the extracted code object.
+#|
+#| If SUFFIX is "-" the standard output is not redirected. If SUFFIX is "!" the
+#| standard output is redirected to /dev/null. Otherwise, the standard output
+#| is redirected to files named by the file path of the extracted code object
+#| with SUFFIX appended.
+#|
+#| Note: The executables roc-obj-ls, roc-obj-extract, and llvm-objdump (in the
+#| case of disassembly requested using the -d flag) are searched for in a
+#| unique way. A series of directories are searched, some conditionally, until
+#| a suitable executable is found. If all directories are searched without
+#| finding the executable, an error occurs. The first directory searched is the
+#| one containing the hard-link to the roc-obj being executed, known as the
+#| "base directory". Next, if the environment variable HIP_CLANG_PATH is set,
+#| it is searched; otherwise, the base directory path is appended with
+#| "../../llvm/bin" and it is searched. Finally, the PATH is searched as if by
+#| a POSIX "execvp" function.
+#|
+#| Option Descriptions:
+#|   -h, --help              print this help text and exit
+#|   -t, --target-id         only extract code objects from EXECUTABLE whose Target ID
+#|                           matches the POSIX extended regular expression REGEXP
+#|   -o, --outdir            set the output directory, which is created if it
+#|                           does not exist
+#|   -I, --replace-string    replace all occurrences of the literal string
+#|                           REPLACE-STRING in ARGS with the input filename
+#|   -i, --replace           equivalent to -I{}
+#|   -d, --disassemble       disassemble extracted code objects; equivalent to
+#|                           : .s llvm-objdump -d - ;
+#|
+#| Example Usage:
+#|
+#| Extract all code objects embedded in a.so:
+#| $ roc-obj a.so
+#|
+#| Extract all code objects embedded in a.so, b.so, and c.so:
+#| $ roc-obj a.so b.so c.so
+#|
+#| Extract all code objects embedded in a.so with "gfx9" in their Target ID:
+#| $ roc-obj -t gfx9 a.so
+#|
+#| Extract all code objects embedded in a.so into output/ (creating it if needed):
+#| $ roc-obj -o output/ a.so
+#|
+#| Extract all code objects embedded in a.so with "gfx9" in their Target ID
+#| into output/ (creating it if needed):
+#| $ roc-obj -t gfx9 -o output/ a.so
+#|
+#| Extract all code objects embedded in a.so, and then disassemble each of them
+#| to files ending with .s:
+#| $ roc-obj -d a.so
+#|
+#| Extract all code objects embedded in a.so, and count the number of bytes in
+#| each, writing the results to files ending with .count:
+#| $ roc-obj a.so : .count wc -c
+#|
+#| Extract all code objects embedded in a.so, and inspect their ELF headers
+#| using llvm-readelf (which will not read from standard input), writing to
+#| files ending with .hdr:
+#| $ roc-obj -I'{}' a.so : .hdr llvm-readelf -h '{}'
+#|
+#| Extract all code objects embedded in a.so, and then extract each of their
+#| .text sections using llvm-objcopy (which won't read from standard input
+#| or write to standard output):
+#| $ roc-obj -I'{}' a.so : ! llvm-objcopy -O binary --only-section=.text '{}' '{}.text'
+#|
+#| Extract all code objects embedded in a.so, b.so, and c.so with target
+#| feature xnack disabled into directory out/. Then, for each:
+#| Write the size in bytes into a file ending with .count, and
+#| Write a textual description of the ELF headers to a file ending with .hdr, and
+#| Extract the .text section to a file ending with .text
+#| $ roc-obj -I'{}' -t xnack- -o out/ a.so b.so c.so : \
+#|     .count wc -c \; \
+#|     .hdr llvm-readelf -h '{}' \; \
+#|     ! llvm-objcopy -O binary --only-section=.text '{}' '{}.text'
+
+set -euo pipefail
+
+usage() {
+  sed -n 's/^#| \?\(.*\)$/\1/p' "$0"
+}
+
+usage_then_exit() {
+  local -r status="$1"; shift
+  usage >&$(( status ? 2 : 1 ))
+  exit "$status"
+}
+
+# Report all arguments, joined into one message, on standard error with an
+# "error: " prefix, then exit with status 1.
+fail() {
+  local -r message="$*"
+  printf "error: %s\n" "$message" >&2
+  exit 1
+}
+
+# ROCm tools are not necessarily on the PATH, and there is no single unified
+# ROCm "bin/" directory, so look in a few known places before PATH.
+#
+# Search order for the command named by the first argument:
+#   1. BASE_DIR
+#   2. HIP_CLANG_PATH if set and non-empty, otherwise BASE_DIR/../../llvm/bin
+#   3. PATH (via "hash")
+# Prints the path (or the bare command name when found through PATH) without
+# a trailing newline, or calls fail listing the directories that were tried.
+#
+# This is only used for roc-obj-ls, roc-obj-extract, and "shortcut" options
+# like -d; the user can still run any copy of llvm-* by invoking it with a
+# full path, e.g. : /path/to/llvm-* ... ;
+find_rocm_executable_or_fail() {
+  local -r command="$1"; shift
+  local candidate
+  local searched=()
+  local -r llvm_dir="${HIP_CLANG_PATH:-"$BASE_DIR/../../llvm/bin"}"
+  for dir in "$BASE_DIR" "$llvm_dir"; do
+    candidate="$dir/$command"
+    if [[ ! -x $candidate ]]; then
+      searched+=("$dir")
+      continue
+    fi
+    printf "%s" "$candidate"
+    return
+  done
+  if ! hash "$command" 2>/dev/null; then
+    fail could not find "$command" in "${searched[*]}" or PATH
+  fi
+  printf "%s" "$command"
+}
+
+# Extract the embedded code objects of the executable file given as the first
+# argument into OPT_OUTDIR, filtering them via OPT_TARGET_ID.
+#
+# roc-obj-ls is expected to print three whitespace-separated fields per code
+# object (bundle number, Target ID, URI); the awk filter keeps the lines whose
+# second field matches OPT_TARGET_ID. Each object is written to
+# "<basename of executable>:<bundle number>.<Target ID>".
+#
+# Deletes any resulting files which are empty, and prints the paths of the
+# remaining files, one per line.
+extract() {
+  local -r executable="$1"; shift
+  local prefix output uri
+  prefix="$(basename -- "$executable")"
+  # We want the shell to split the result of roc-obj-ls on whitespace, as
+  # neither the Target ID nor the URI can have embedded spaces.
+  # shellcheck disable=SC2046
+  set -- $("$ROC_OBJ_LS" -- "$executable" | awk "\$2~/$OPT_TARGET_ID/")
+  # Validate the shape of the listing up front: the loop below consumes the
+  # fields three at a time, so a stray field would otherwise surface as an
+  # "unbound variable" error in the middle of the extraction.
+  if (( $# % 3 )); then
+    fail "expected 3 fields per code object from roc-obj-ls, got $# fields in total"
+  fi
+  while (( $# )); do
+    output="$prefix:$1.$2"
+    uri="$3"
+    shift 3
+    if [[ -n $OPT_OUTDIR ]]; then
+      output="$OPT_OUTDIR/$output"
+    fi
+    "$ROC_OBJ_EXTRACT" -o - -- "$uri" >"$output"
+    if [[ -s $output ]]; then
+      printf '%s\n' "$output"
+    else
+      rm "$output"
+    fi
+  done
+}
+
+# Run a command over a list of inputs, naming output files with the supplied
+# suffix and applying OPT_REPLACE_STRING if needed.
+#
+# Arguments are of the form:
+# $suffix $command $args... ; $inputs
+run_command() {
+  local -r suffix="$1"; shift
+  local -r command="$1"; shift
+  local args=()
+  while (( $# )); do
+    local arg="$1"; shift
+    [[ $arg == ';' ]] && break
+    args+=("$arg")
+  done
+  local inputs=("$@")
+  for input in "${inputs[@]}"; do
+    case "$suffix" in
+      '-') output=/dev/stdout;;
+      '!') output=/dev/null;;
+      *) output="$input$suffix";;
+    esac
+    "$command" "${args[@]//$OPT_REPLACE_STRING/$input}" <"$input" >"$output"
+  done
+}
+
+main() {
+  [[ -n $OPT_OUTDIR ]] && mkdir -p "$OPT_OUTDIR"
+  local inputs=()
+  while (( $# )); do
+    local executable="$1"; shift
+    [[ $executable == : ]] && break
+    # Append the file paths extracted from $executable to $inputs
+    readarray -t -O "${#inputs[@]}" inputs < <(extract "$executable")
+  done
+  (( ${#inputs[@]} )) || fail no executables specified
+  while (( $# )); do
+    local suffix="$1"; shift
+    local command="$1"; shift
+    local args=()
+    while (( $# )); do
+      local arg="$1"; shift
+      [[ $arg == \; ]] && break
+      args+=("$arg")
+    done
+    run_command "$suffix" "$command" "${args[@]}" \; "${inputs[@]}"
+  done
+  (( OPT_DISASSEMBLE )) && run_command .s "$OBJDUMP" -d - \; "${inputs[@]}"
+}
+
+OPT_TARGET_ID=''
+OPT_OUTDIR=''
+OPT_REPLACE_STRING=''
+OPT_DISASSEMBLE=0
+! getopt -T || fail util-linux enhanced getopt required
+getopt="$(getopt -o +ht:o:I:id \
+          --long help,target-id:,outdir:,replace:,replace-default,disassemble \
+          -n roc-obj -- "$@")"
+eval set -- "$getopt"
+unset getopt
+while true; do
+  case "$1" in
+    -h | --help) usage_then_exit 0;;
+    -t | --target-id) OPT_TARGET_ID="${2//\//\\\/}"; shift 2;;
+    -o | --outdir) OPT_OUTDIR="$2"; shift 2;;
+    -I | --replace-string) OPT_REPLACE_STRING="$2"; shift 2;;
+    -i | --replace) OPT_REPLACE_STRING='{}'; shift;;
+    -d | --disassemble) OPT_DISASSEMBLE=1; shift;;
+    --) shift; break;;
+    *) usage_then_exit 1;;
+  esac
+done
+readonly -- OPT_TARGET_ID OPT_OUTDIR OPT_REPLACE_STRING OPT_DISASSEMBLE
+
+# BASE_DIR is the directory holding the real file behind this script (symlinks
+# resolved). The expected install location is ROCM_PATH/hip/bin/roc-obj, which
+# makes BASE_DIR equal to ROCM_PATH/hip/bin.
+BASE_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
+readonly -- BASE_DIR
+
+# llvm-objdump is only required when disassembly was requested.
+if (( OPT_DISASSEMBLE )); then
+  OBJDUMP="$(find_rocm_executable_or_fail llvm-objdump)"
+fi
+readonly -- OBJDUMP
+
+ROC_OBJ_LS="$(find_rocm_executable_or_fail roc-obj-ls)"
+readonly -- ROC_OBJ_LS
+ROC_OBJ_EXTRACT="$(find_rocm_executable_or_fail roc-obj-extract)"
+readonly -- ROC_OBJ_EXTRACT
+
+main "$@"
@@ -0,0 +1,244 @@
+#!/usr/bin/perl
+# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+use strict;
+use File::Copy;
+use File::Spec;
+use File::Basename;
+use File::Which;
+use Cwd 'realpath';
+use Getopt::Std;
+use List::Util qw(max);
+use URI::Encode;
+
+my $extract_range_specifier;
+my $extract_pid;
+my $extract_file;
+my $output_file;
+my $output_path;
+my $extract_offset;
+my $extract_size;
+my $pid_running;
+my $verbose=0;
+my $error=0;
+my $output_to_stdout=0;
+
+# Print the help text to STDOUT and exit with the current value of $error
+# (0 when help was requested, non-zero after an argument error).
+sub usage {
+  my @help_text = (
+    "Usage: $0 [-o|v|h] URI... \n",
+    "  URIs can be read from STDIN, one per line.\n",
+    "  From the URIs specified, extracts code objects into files named: ",
+    "<executable_name>-[pid<number>]-offset<number>-size<number>.co\n\n",
+    "Options:\n",
+    "  -o <path> \tPath for output. If \"-\" specified, code object is printed to STDOUT.\n",
+    "  -v        \tVerbose output to STDOUT.\n",
+    "  -h        \tShow this help message.\n",
+    "\nURI syntax:\n",
+    "\tcode_object_uri ::== file_uri | memory_uri\n",
+    "\tfile_uri        ::== \"file://\" extract_file [ range_specifier ]\n",
+    "\tmemory_uri      ::== \"memory://\" process_id range_specifier\n",
+    "\trange_specifier ::== range_delimiter range_attribute [\"&\" range_attribute]\n",
+    "\trange_delimiter ::== \"#\" | \"?\"\n",
+    "\trange_attribute ::== [\"offset=\" number | \"size=\" number ]\n",
+    "\textract_file    ::== URI_ENCODED_OS_FILE_PATH\n",
+    "\tprocess_id      ::== DECIMAL_NUMBER\n",
+    "\tnumber          ::== HEX_NUMBER \| DECIMAL_NUMBER \| OCTAL_NUMBER\n\n",
+    "\tExample: file://dir1/dir2/hello_world#offset=133&size=14472 \n",
+    "\t         memory://1234#offset=0x20000&size=3000\n\n",
+    "  NOTES:\n\n",
+    "\tWhen specifying a URI in a shell command you will need to escape the \'&\' character in the range_specifier.\n",
+    "\tIf \"size=\" is not specified, the default is the remainder of the file from the given offset.\n\n",
+  );
+
+  print($_) foreach (@help_text);
+
+  exit($error);
+}
+
+# Parse the command-line switches: -v (verbose), -h (help), -o <path>.
+my %options=();
+getopts('vho:', \%options);
+
+usage() if (defined $options{h});
+
+$verbose = 1 if (defined $options{v});
+
+# "-o -" selects STDOUT; any other value must name an existing directory.
+if (defined $options{o}) {
+  $output_path = $options{o};
+  if ($output_path ne "-") {
+    (-d $output_path) || die("Error: Path \'$output_path\' cannot be found.\n");
+  } else {
+    $output_to_stdout=1;
+  }
+}
+
+# Fall back to reading URIs from STDIN only when none were given as
+# arguments (and STDIN is not a terminal) -- otherwise the caller's stdin
+# would be consumed by accident.
+unless (@ARGV) {
+  push @ARGV, <STDIN> unless -t STDIN;
+}
+
+# At this point at least one URI is required.
+unless (@ARGV) {
+  print(STDERR "Error: No arguments.\n"); $error++;
+  usage();
+}
+
+# The extraction itself is delegated to dd, so it must be available.
+my $dd_cmd = which("dd");
+die("Error: Can't find dd command\n") unless (-f $dd_cmd);
+
+# Convert a "0x"-prefixed hexadecimal range value to decimal. The usage text
+# documents hexadecimal numbers, but Perl numifies such a string to 0 and dd
+# does not read it as hexadecimal either. Any other value is returned
+# unchanged so that it reaches dd exactly as the caller wrote it.
+sub hex_to_decimal {
+  my ($text) = @_;
+  return hex($text) if (defined $text && $text =~ /^0[xX][0-9a-fA-F]+$/);
+  return $text;
+}
+
+# Process every URI: validate it, work out offset/size, and let dd copy that
+# byte range either to a ".co" file or to STDOUT. Errors are counted in
+# $error, which becomes the exit status.
+foreach my $uri_str(@ARGV) {
+  chomp $uri_str;
+
+  # The extraction state lives in file-scope variables; reset it so nothing
+  # leaks from the previous URI (e.g. a memory URI followed by a file URI
+  # used to be rejected because $pid_running was still set).
+  $extract_range_specifier = "";
+  $extract_offset = 0;
+  $extract_size = 0;
+  $pid_running = 0;
+
+  # Split only on the first "://" so the rest of the URI stays intact.
+  my ($uri_protocol, $specs) = split(/:\/\//,$uri_str,2);
+  my $obj_uri_encode = URI::Encode->new();
+  my $decoded_extract_file;
+
+  if (lc($uri_protocol) eq "file") {
+    # expect file path; the range delimiter is "#" or "?" (not ",").
+    ($extract_file, $extract_range_specifier) = split(/[#?]/,$specs);
+
+    # decode the file name. URIs may have file/path names with non-alphanumeric characters, which will be encoded with %.  We need to decode these.
+    $decoded_extract_file = $obj_uri_encode->decode($extract_file);
+
+    # verify file exists:
+    if (! -e $decoded_extract_file) {
+      print(STDERR "Error: can't find file: $decoded_extract_file\n"); $error++;
+      next;
+    }
+
+    # use the output_path if specified, otherwise use current working dir.
+    if (defined $output_path && $output_path ne "") {
+      $output_file = File::Spec->catfile($output_path, basename($decoded_extract_file));
+    } else {
+      $output_file = basename($decoded_extract_file);
+    }
+
+  } elsif ( lc($uri_protocol) eq "memory") {
+    # expect memory specifier
+    ($extract_pid, $extract_range_specifier) = split(/[#?]/,$specs);
+
+    # verify pid is currently running
+    $pid_running = kill 0, $extract_pid;
+    if (! $pid_running) {
+      print(STDERR "Error: PID: $extract_pid is NOT running\n"); $error++;
+      next;
+    }
+
+    # get pid filename:
+    $extract_file = "/proc/$extract_pid/mem";
+
+    # verify file exists:
+    if (! -e $extract_file) {
+      print(STDERR "Error: can't find file: $extract_file\n"); $error++;
+      next;
+    }
+
+    # for extracting from a pid, make the output file in the current dir/path with the pid value as a name.
+    $output_file = "pid${extract_pid}";
+
+    # need to set $decoded_extract_file, because later we use this for other checks.
+    $decoded_extract_file = $extract_file;
+  } else {
+    # error, unrecognized Code Object URI
+    print(STDERR "Error: \'$uri_protocol\' is not recognized as a supported code object URI.\n"); $error++;
+    next;
+  }
+
+  # split() leaves the specifier undefined when the URI has no delimiter.
+  $extract_range_specifier = "" unless (defined $extract_range_specifier);
+
+  # it is valid to not give a range specifier in a URI, in which case the entire code object will be extracted.
+  if ($extract_range_specifier ne "") {
+    my $size_specified = 0;
+
+    foreach my $attribute (split(/[&]/,$extract_range_specifier)) {
+      my ($str,$value) = split(/=/,$attribute);
+      next unless (defined $str);
+      if ($str eq "size") {
+        $extract_size = defined($value) ? $value : 0;
+        $size_specified = 1;
+      } elsif ($str eq "offset") {
+        $extract_offset = $value if (defined $value);
+      }
+    }
+
+    if ($size_specified != 1) {
+      # "size" not specified.  default to rest of file (total size - offset)
+      $extract_size = -s $decoded_extract_file;
+      $extract_size -= hex_to_decimal($extract_offset);
+    }
+
+  } else {
+    # Error if URI is a memory request, and we have no range_specifier.
+    if ($pid_running) {
+      print(STDERR "Error: must specify a Range Specifier (offset and size) for a memory URI: $uri_str\n"); $error++;
+      next;
+    }
+
+    $extract_offset = 0;
+    $extract_size = -s $decoded_extract_file;
+  }
+
+  # Values handed to dd and used in arithmetic; the original spellings are
+  # kept in $extract_offset/$extract_size for messages and output file names.
+  my $skip = hex_to_decimal($extract_offset);
+  my $count = hex_to_decimal($extract_size);
+
+  # We should have at least a valid size to extract; ignore cases with size=0.
+  if ($count != 0) {
+    print("Reading input file \"$extract_file\" ...\n") if ($verbose);
+
+    # only if this is a File URI.
+    if (lc($uri_protocol) eq "file") {
+      # verify that offset+size does not exceed file size:
+      my $file_size = -s $decoded_extract_file;
+      my $size = int($skip) + int($count);
+      if ( $size > $file_size ) {
+        print(STDERR "Error: requested offset($extract_offset) + size($extract_size) exceeds file size($file_size) for file \"$decoded_extract_file\".\n"); $error++;
+        next;
+      }
+    }
+
+    # Readability check only (dd does the actual reading); close the handle
+    # again right away so it is not leaked.
+    open(my $input_fp, "<", $decoded_extract_file) || die $!;
+    close($input_fp);
+
+    # extract the code object. The arguments are passed to dd as a list, so
+    # no shell is involved and file names need no quoting or escaping.
+    my @dd_args = ("if=$decoded_extract_file");
+    if (!$output_to_stdout) {
+      push(@dd_args, "of=${output_file}-offset${extract_offset}-size${extract_size}.co");
+    }
+    push(@dd_args, "skip=$skip", "count=$count", "bs=1", "status=none");
+
+    my $dd_cmd_str = join(" ", $dd_cmd, @dd_args);
+
+    print("DD Command: $dd_cmd_str\n") if ($verbose);
+
+    my $dd_ret = system($dd_cmd, @dd_args);
+    if ($dd_ret != 0) {
+       print(STDERR "Error: DD command ($dd_cmd_str)  failed with RC: $dd_ret\n"); $error++;
+    }
+
+    print("Extract request:  file: $extract_file offset: $extract_offset size: $extract_size\n") if ($verbose);
+  } else {
+    print("Warning: trying to extract from $extract_file at offset=$extract_offset with size=0.  Nothing to extract.\n") if ($verbose);
+  }
+
+} # end of for each (URI) argument
+
+exit($error);
@@ -0,0 +1,192 @@
+#!/usr/bin/perl
+# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+use strict;
+use File::Copy;
+use File::Spec;
+use File::Basename;
+use File::Which;
+use Cwd 'realpath';
+use Getopt::Std;
+use List::Util qw(max);
+use URI::Encode;
+
+# Print the help text to STDOUT and exit with status 0.
+sub usage {
+  print("Usage: $0 [-v|h] executable...\n");
+  print("List the URIs of the code objects embedded in the specified host executables.\n");
+  print("-v \tVerbose output (includes Entry ID)\n");
+  print("-h \tShow this help message\n");
+  exit;
+}
+
+# Read one little-endian unsigned 64-bit value ("qword").
+#  first arg is the file handle to read from
+#  second arg is a reference to the scalar that receives the value
+# Dies when fewer than 8 bytes could be read.
+sub readq {
+  my ($fh, $dest_ref) = @_;
+  my $got = read($fh, my $raw, 8);
+  die("Error: Failed to read 8 bytes\n") unless ($got == 8);
+  $$dest_ref = unpack("Q<", $raw);
+}
+
+# Round an address up to the next multiple of an alignment (an address that
+# is already a multiple is returned unchanged).
+#  first arg is the address to round
+#  second arg is the alignment requirement/boundary
+sub align_up {
+  my ($address, $alignment) = @_;
+  my $blocks = int(($address + ($alignment - 1)) / $alignment);
+  return $blocks * $alignment;
+}
+
+# Parse the command-line switches: -v (verbose), -h (help), -d (debug).
+my %options=();
+getopts('vhd', \%options);
+
+usage() if (defined $options{h});
+
+my $verbose = $options{v};
+my $debug = $options{d};
+
+# Running bundle counter (used in the listing) and the boundary each bundle
+# offset is rounded up to.
+my $num_bundles = 1;
+my $bundle_alignment = 4096;
+
+# objdump is needed to locate the fat binary section.
+my $objdump = which("objdump");
+die("Error: Can't find objdump command\n") unless (-f $objdump);
+
+# for each argument (which should be an executable): locate its .hip_fatbin
+# section, walk the offload bundles stored in it, and print one line per
+# bundle entry (code object) with the bundle number, the entry's triple and a
+# file:// URI carrying the entry's absolute offset and size.
+foreach my $executable_file(@ARGV) {
+
+  # debug message
+  print("Reading input file \"$executable_file\" ...\n") if ($debug);
+
+  # verify/open file specified.
+  open (my $input_fp, "<", $executable_file) || die("Error: failed to open file: $executable_file\n");
+  binmode $input_fp;
+
+  # kernel section information: one objdump run yields both the size (column
+  # 3) and the file offset (column 6) of the section, as hexadecimal text.
+  my $escaped_name=quotemeta($executable_file);
+  my $bundle_section_name = ".hip_fatbin";
+  my ($size_text, $offset_text) = split(' ', `$objdump -h $escaped_name | grep $bundle_section_name | awk '{print \$3, \$6}'`);
+  my $bundle_section_size = hex(defined($size_text) ? $size_text : "0");
+  my $bundle_section_offset = hex(defined($offset_text) ? $offset_text : "0");
+
+  $bundle_section_size or die("Error: No kernel section found\n");
+
+  my $bundle_section_end = $bundle_section_offset + $bundle_section_size;
+
+  if ($debug) {
+    printf("Code Objects Bundle section size: %x\n",$bundle_section_size);
+    printf("Code Objects Bundle section offset: %x\n",$bundle_section_offset);
+    printf("Code Objects Bundle section end: %x\n\n",$bundle_section_end);
+  }
+
+  my $current_bundle_offset = $bundle_section_offset;
+  printf("Current Bundle offset: 0x%X\n",$current_bundle_offset) if ($debug);
+
+  # move fp to current_bundle_offset.
+  seek($input_fp, $current_bundle_offset, 0);
+
+  while ($current_bundle_offset < $bundle_section_end) {
+
+    # skip OFFLOAD_BUNDLER_MAGIC_STR
+    my $magic_str;
+    my $read_bytes = read($input_fp, $magic_str, 24);
+    if (($read_bytes != 24) || ($magic_str ne "__CLANG_OFFLOAD_BUNDLE__")) {
+      print(STDERR "Error: Offload bundle magic string not detected\n") if ($debug);
+      last;
+    }
+
+    # read number of bundle entries, which are code objects.
+    my $num_codeobjects;
+    readq($input_fp,\$num_codeobjects);
+
+    my $end_of_current_bundle = $current_bundle_offset;
+
+    # Column Header.
+    printf("%-8s%-40s%35s\n","Bundle#","Entry ID:","URI:") if ($verbose);
+
+    # for each Bundle entry (code object)  ....
+    for (my $iter = 0; $iter < $num_codeobjects; $iter++) {
+
+      print("\nEntry #$iter\n") if $debug;
+
+      # read bundle entry (code object) offset
+      my $entry_offset;
+      my $abs_offset;
+      readq($input_fp,\$entry_offset);
+      printf("entry_offset: 0x%X\n",$entry_offset) if $debug;
+
+      # read bundle entry (code object) size
+      my $entry_size;
+      readq($input_fp,\$entry_size);
+      printf("entry_size: 0x%X\n",$entry_size) if $debug;
+
+      # read triple size
+      my $triple_size;
+      readq($input_fp,\$triple_size);
+      printf("triple_size: 0x%X\n",$triple_size) if $debug;
+
+      # read triple string
+      my $triple;
+      my $triple_bytes = read($input_fp, $triple, $triple_size);
+      $triple_bytes == $triple_size or die("Error: Fail to parse triple\n");
+      print("triple: $triple\n") if $debug;
+
+      # the entry's offset is relative to the start of the bundle it belongs to.
+      $abs_offset = int($current_bundle_offset + $entry_offset);
+
+      # and we need to keep track of where we are in the current bundle.
+      $end_of_current_bundle = int($abs_offset + $entry_size);
+
+      printf("abs_offset: 0x%X\n",$abs_offset) if $debug;
+
+      my $obj_uri_encode = URI::Encode->new();
+      my $encoded_executable_file = $obj_uri_encode->encode($executable_file);
+
+      # One conversion per argument; the URI pieces follow the padded
+      # "file://" column without separators.
+      printf("%-8s%-40s%35s%s%s%s%s%s\n",$num_bundles,$triple,"file:\/\/",$encoded_executable_file,"\#offset=",$abs_offset, "\&size=",$entry_size);
+
+      printf("end_of_current_bundle: 0x%X\n",$end_of_current_bundle) if $debug;
+      # The encoded file name may contain "%" escapes, so it is passed as an
+      # argument instead of being interpolated into the format string.
+      printf("Hex values: file:\/\/%s#offset=0x%X\&size=0x%X\n", $encoded_executable_file, $abs_offset, $entry_size) if $debug;
+
+    } # End of for each Bundle entry (code object) ...
+
+    printf("\n") if ($verbose);
+
+    # we've finished listing this current bundle ...
+    printf("current_bundle_offset: %x \n",$current_bundle_offset) if ($debug);
+    printf("bundle_section_end: %x \n", $bundle_section_end) if ($debug);
+
+    # move current_bundle_offset to next alignment boundary.
+    my $next_bundle_offset = align_up($end_of_current_bundle,$bundle_alignment);
+
+    # A bundle without entries at an aligned offset would leave the offset
+    # unchanged and the same bundle would be read forever; stop instead.
+    if ($next_bundle_offset <= $current_bundle_offset) {
+      print(STDERR "Error: bundle at offset $current_bundle_offset makes no progress\n") if ($debug);
+      last;
+    }
+
+    $current_bundle_offset = $next_bundle_offset;
+    printf("Adjusting for alignment of next bundle: current_bundle_offset: %x \n\n\n", $current_bundle_offset) if ($debug);
+
+    # seek to the end of the current bundle:
+    seek($input_fp, $current_bundle_offset, 0);
+
+    # increment the number of bundles listed.
+    $num_bundles = $num_bundles+1;
+
+  }  # End of while loop
+
+  close($input_fp);
+
+} # End of for each command line argument
+
+exit(0);
@@ -0,0 +1,39 @@
+/*
+    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+#ifndef @include_guard@
+#define @include_guard@
+
+/* Deprecation notice: compilers defining __GNUC__ get it as a #warning, all
+ * other compilers get the same text through #pragma message. */
+#if defined(__GNUC__)
+#warning "This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path"
+#else
+#pragma message("This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path")
+#endif
+
+/* The lines below are placeholders (names wrapped in at-signs) that are
+ * substituted when this template is instantiated.
+ * NOTE(review): presumably the first one expands to the include(s) of the
+ * header's new location, and the remaining three wrap a disabled copy of the
+ * original file contents -- confirm against the generating CMake code. */
+@include_statements@
+
+@hashzero_check@
+
+@file_contents@
+
+@hash_endif@
+
+#endif /* include guard */
@@ -0,0 +1,261 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+cmake_minimum_required(VERSION 3.16.8)
+
+# All wrapper headers and symlinks are staged below wrapper_dir in the build
+# tree and installed from there by the install() rules at the end of this file.
+set(HIP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(HIP_WRAPPER_DIR ${HIP_BUILD_DIR}/wrapper_dir)
+# Staging locations for wrapper headers, binaries, libraries and CMake files
+set(HIP_WRAPPER_INC_DIR ${HIP_WRAPPER_DIR}/include/hip)
+set(HIP_WRAPPER_BIN_DIR ${HIP_WRAPPER_DIR}/bin)
+set(HIP_WRAPPER_LIB_DIR ${HIP_WRAPPER_DIR}/lib)
+set(HIP_WRAPPER_CMAKE_DIR ${HIP_WRAPPER_DIR}/cmake)
+set(HIP_WRAPPER_FINDHIP_DIR ${HIP_WRAPPER_DIR}/FindHIP)
+# Source-tree locations that are scanned for headers and binaries.
+# NOTE(review): HIP_SRC_PATH is not set in this file; it is expected to be
+# provided by the including project - confirm.
+set(HIP_SRC_INC_DIR ${HIP_SRC_PATH}/include/hip)
+set(HIP_SRC_BIN_DIR ${HIP_SRC_PATH}/bin)
+# Name of the info file that gets a symlink in the wrapper lib directory
+set(HIP_INFO_FILE ".hipInfo")
+
+# Generate the wrapper for `input_file` with a verbatim copy of the original
+# header embedded inside an "#if 0 ... #endif" region.
+# Some components grep the header for values instead of preprocessing it, so
+# the wrapper carries the original text in addition to the forwarding #include.
+#
+# Arguments:
+#   input_file - path of the header whose contents are embedded.
+# The template also consumes `include_guard` and `include_statements`; both
+# are expected to be set by the caller before this function is invoked.
+function(set_file_contents input_file)
+    set(hashzero_check "#if 0
+/* The following is a copy of the original file for the benefit of build systems which grep for values
+ * in this file rather than preprocess it. This is just for backward compatibility */")
+
+    # Paths are quoted so that a location containing ';' stays a single argument.
+    file(READ "${input_file}" file_contents)
+    set(hash_endif "#endif")
+    get_filename_component(file_name "${input_file}" NAME)
+    # @ONLY: substitute only @VAR@ placeholders; literal ${...} text in the
+    # template would otherwise be expanded as well.
+    configure_file("${HIP_SRC_PATH}/header_template.hpp.in" "${HIP_WRAPPER_INC_DIR}/${file_name}" @ONLY)
+endfunction()
+
+# Generate one wrapper header per public HIP header from header_template.hpp.in.
+# Every wrapper gets its own include guard and a relative #include that points
+# at the header of the same name below ${CMAKE_INSTALL_INCLUDEDIR}/hip.
+# hip_version.h additionally embeds a copy of the original file (see
+# set_file_contents).
+function(generate_wrapper_header)
+  # Create the respective folders of the wrapper include tree
+  file(MAKE_DIRECTORY ${HIP_WRAPPER_INC_DIR}/amd_detail)
+  file(MAKE_DIRECTORY ${HIP_WRAPPER_INC_DIR}/nvidia_detail)
+
+  # find all header files from include/hip (build tree)
+  file(GLOB include_files ${HIP_BUILD_DIR}/include/hip/*.h)
+  # Convert the list of files into #includes
+  foreach(header_file ${include_files})
+    # set include guard
+    get_filename_component(inc_guard_name "${header_file}" NAME_WE)
+    string(TOUPPER "${inc_guard_name}" inc_guard_name)
+    set(include_guard "HIP_WRAPPER_INCLUDE_HIP_${inc_guard_name}_H")
+    # set #include statement
+    get_filename_component(file_name "${header_file}" NAME)
+    set(include_statements "#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/hip/${file_name}\"\n")
+    # Quoted: compare the name as a string; unquoted it could be re-evaluated
+    # as a variable name or vanish when empty.
+    if("${file_name}" STREQUAL "hip_version.h")
+      set_file_contents("${header_file}")
+    else()
+      configure_file("${HIP_SRC_PATH}/header_template.hpp.in" "${HIP_WRAPPER_INC_DIR}/${file_name}" @ONLY)
+    endif()
+  endforeach()
+
+  # find all header files from include/hip/amd_detail
+  # LIST_DIRECTORIES false: only files can be wrapped; a sub-directory entry
+  # would otherwise produce a wrapper that #includes a directory.
+  file(GLOB include_files LIST_DIRECTORIES false ${HIP_SRC_INC_DIR}/amd_detail/*)
+  foreach(header_file ${include_files})
+    # set include guard
+    get_filename_component(inc_guard_name "${header_file}" NAME_WE)
+    string(TOUPPER "${inc_guard_name}" inc_guard_name)
+    set(include_guard "HIP_WRAPPER_INCLUDE_HIP_AMD_DETAIL_${inc_guard_name}_H")
+    # set #include statement (one directory level deeper than the top-level headers)
+    get_filename_component(file_name "${header_file}" NAME)
+    set(include_statements "#include \"../../../../${CMAKE_INSTALL_INCLUDEDIR}/hip/amd_detail/${file_name}\"\n")
+
+    configure_file("${HIP_SRC_PATH}/header_template.hpp.in" "${HIP_WRAPPER_INC_DIR}/amd_detail/${file_name}" @ONLY)
+  endforeach()
+
+  # find all header files from include/hip/nvidia_detail
+  file(GLOB include_files LIST_DIRECTORIES false ${HIP_SRC_INC_DIR}/nvidia_detail/*)
+  foreach(header_file ${include_files})
+    # set include guard
+    get_filename_component(inc_guard_name "${header_file}" NAME_WE)
+    string(TOUPPER "${inc_guard_name}" inc_guard_name)
+    set(include_guard "HIP_WRAPPER_INCLUDE_HIP_NVIDIA_DETAIL_${inc_guard_name}_H")
+    # set #include statement
+    get_filename_component(file_name "${header_file}" NAME)
+    set(include_statements "#include \"../../../../${CMAKE_INSTALL_INCLUDEDIR}/hip/nvidia_detail/${file_name}\"\n")
+
+    configure_file("${HIP_SRC_PATH}/header_template.hpp.in" "${HIP_WRAPPER_INC_DIR}/nvidia_detail/${file_name}" @ONLY)
+  endforeach()
+
+endfunction()
+
+# Create symlinks to the HIP binaries inside the wrapper bin directory.
+# One `link_<name>` target (part of ALL) is added for every file found in
+# ${HIP_SRC_BIN_DIR} and in ${HIP_BUILD_DIR}/bin, plus one for .hipVersion.
+# Each link points at ../../${CMAKE_INSTALL_BINDIR}/<name>.
+function(create_binary_symlink)
+  file(MAKE_DIRECTORY ${HIP_WRAPPER_BIN_DIR})
+  # get all binaries
+  file(GLOB binary_files ${HIP_SRC_BIN_DIR}/*)
+  # Add .hipVersion to binary list
+  list(APPEND binary_files ".hipVersion")
+  foreach(binary_file ${binary_files})
+    get_filename_component(file_name "${binary_file}" NAME)
+    add_custom_target(link_${file_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../${CMAKE_INSTALL_BINDIR}/${file_name} ${HIP_WRAPPER_BIN_DIR}/${file_name})
+  endforeach()
+
+  unset(binary_files)
+  file(GLOB binary_files ${HIP_BUILD_DIR}/bin/*)
+  foreach(binary_file ${binary_files})
+    get_filename_component(file_name "${binary_file}" NAME)
+    # Batch files are linked on Windows only. The dot is escaped so that only a
+    # real ".bat" extension is filtered out (an unescaped "." matches any
+    # character), and the operand is quoted so it is compared as a string.
+    if(WIN32 OR NOT "${file_name}" MATCHES "\\.bat$")
+      add_custom_target(link_${file_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../${CMAKE_INSTALL_BINDIR}/${file_name} ${HIP_WRAPPER_BIN_DIR}/${file_name})
+    endif()#end of OS / bat file check
+  endforeach()
+endfunction()
+
+# Create symlinks to the HIP libraries inside the wrapper lib directory.
+# Shared builds link the unversioned, major-versioned and fully versioned
+# names of libamdhip64, libhiprtc-builtins and libhiprtc; static builds link
+# libamdhip64.a. A link for the .hipInfo file is always added last.
+function(create_library_symlink)
+  file(MAKE_DIRECTORY ${HIP_WRAPPER_LIB_DIR})
+
+  set(link_names "")
+  if(BUILD_SHARED_LIBS)
+    foreach(so_name libamdhip64.so libhiprtc-builtins.so libhiprtc.so)
+      list(APPEND link_names
+           "${so_name}"
+           "${so_name}.${HIP_LIB_VERSION_MAJOR}"
+           "${so_name}.${HIP_LIB_VERSION_STRING}")
+    endforeach()
+  else()
+    list(APPEND link_names "libamdhip64.a")
+  endif()
+  # Add symlink for .hipInfo
+  list(APPEND link_names "${HIP_INFO_FILE}")
+
+  # Every link is relative: ../../<libdir>/<name> as seen from the wrapper lib directory
+  foreach(link_name ${link_names})
+    add_custom_target(link_${link_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../${CMAKE_INSTALL_LIBDIR}/${link_name} ${HIP_WRAPPER_LIB_DIR}/${link_name})
+  endforeach()
+endfunction()
+
+# Create symlinks to the CMake package files inside the wrapper tree.
+# Adds one `link_<name>` target (part of ALL) per file for:
+#   - the hip, hip-lang and hiprtc config files (under HIP_WRAPPER_CMAKE_DIR/<pkg>)
+#   - the FindHIP helper modules and the top-level cmake/*.cmake files
+#     (under HIP_WRAPPER_FINDHIP_DIR)
+function(create_cmake_symlink)
+  # Package config files
+  foreach(pkg hip hip-lang hiprtc)
+    # The hip-lang config files are globbed from src/ in the build tree,
+    # the others from the build tree root.
+    if(pkg STREQUAL "hip-lang")
+      set(pkg_build_dir ${HIP_BUILD_DIR}/src)
+    else()
+      set(pkg_build_dir ${HIP_BUILD_DIR})
+    endif()
+
+    file(MAKE_DIRECTORY ${HIP_WRAPPER_CMAKE_DIR}/${pkg})
+    file(GLOB pkg_configs ${pkg_build_dir}/${pkg}-config*)
+    foreach(cfg_path ${pkg_configs})
+      get_filename_component(cfg_name ${cfg_path} NAME)
+      add_custom_target(link_${cfg_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../../../${CMAKE_INSTALL_LIBDIR}/cmake/${pkg}/${cfg_name} ${HIP_WRAPPER_CMAKE_DIR}/${pkg}/${cfg_name})
+    endforeach()
+    unset(pkg_configs)
+  endforeach()
+
+  # FindHIP helper modules
+  file(MAKE_DIRECTORY ${HIP_WRAPPER_FINDHIP_DIR}/FindHIP)
+  file(GLOB findhip_modules ${HIP_BUILD_DIR}/cmake/FindHIP/*.cmake)
+  foreach(module_path ${findhip_modules})
+    get_filename_component(module_name ${module_path} NAME)
+    add_custom_target(link_${module_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../../${CMAKE_INSTALL_LIBDIR}/cmake/hip/FindHIP/${module_name} ${HIP_WRAPPER_FINDHIP_DIR}/FindHIP/${module_name})
+  endforeach()
+  unset(findhip_modules)
+
+  # Top-level cmake modules
+  file(GLOB cmake_modules ${HIP_BUILD_DIR}/cmake/*.cmake)
+  foreach(module_path ${cmake_modules})
+    get_filename_component(module_name ${module_path} NAME)
+    add_custom_target(link_${module_name} ALL
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                  COMMAND ${CMAKE_COMMAND} -E create_symlink
+                  ../../${CMAKE_INSTALL_LIBDIR}/cmake/hip/${module_name} ${HIP_WRAPPER_FINDHIP_DIR}/${module_name})
+  endforeach()
+  unset(cmake_modules)
+
+endfunction()
+
+# Use the template header file to generate the wrapper headers, then install them
+generate_wrapper_header()
+install(DIRECTORY ${HIP_WRAPPER_INC_DIR}
+        DESTINATION hip/include
+        COMPONENT dev)
+
+# Create symlinks to binaries and install the directory holding them
+create_binary_symlink()
+install(DIRECTORY ${HIP_WRAPPER_BIN_DIR}
+        DESTINATION hip
+        COMPONENT dev)
+
+option(BUILD_SHARED_LIBS "Build the shared library" ON)
+# Create symlinks to library files
+create_library_symlink()
+# The library links are installed for the amd platform only
+if(HIP_PLATFORM STREQUAL "amd")
+  if(BUILD_SHARED_LIBS)
+    install(FILES
+              ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so
+              ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so.${HIP_LIB_VERSION_MAJOR}
+              ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so.${HIP_LIB_VERSION_STRING}
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so.${HIP_LIB_VERSION_MAJOR}
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so.${HIP_LIB_VERSION_STRING}
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so.${HIP_LIB_VERSION_MAJOR}
+              ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so.${HIP_LIB_VERSION_STRING}
+            DESTINATION hip/lib
+            COMPONENT binary)
+  else()
+    install(FILES ${HIP_WRAPPER_LIB_DIR}/libamdhip64.a
+            DESTINATION hip/lib
+            COMPONENT binary)
+  endif()#End BUILD_SHARED_LIBS
+endif()#End HIP_PLATFORM AMD
+
+# install hipInfo
+install(FILES ${HIP_WRAPPER_LIB_DIR}/${HIP_INFO_FILE}
+        DESTINATION hip/lib
+        COMPONENT binary)
+
+# create symlinks to cmake files and install them
+create_cmake_symlink()
+install(DIRECTORY ${HIP_WRAPPER_CMAKE_DIR}
+        DESTINATION hip/lib
+        COMPONENT binary)
+install(DIRECTORY ${HIP_WRAPPER_FINDHIP_DIR}/
+        DESTINATION hip/cmake
+        COMPONENT dev)
@@ -0,0 +1,266 @@
+# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+cmake_minimum_required(VERSION 3.3)
+
+@PACKAGE_INIT@
+include(CheckCXXCompilerFlag)
+# Try to load the stock find_dependency() macro. The include is OPTIONAL, so a
+# missing module is not an error; the result variable records the outcome.
+include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
+if (NOT _CMakeFindDependencyMacro_FOUND)
+  # Fallback definition, used only when the module above could not be included.
+  # find_dependency(<dep> [<version>]) calls find_package() for <dep> and
+  # forwards the EXACT / QUIET / REQUIRED settings of the package currently
+  # being searched for (CMAKE_FIND_PACKAGE_NAME).
+  macro(find_dependency dep)
+    # Nothing to do when the dependency has already been found.
+    if (NOT ${dep}_FOUND)
+      # Optional second argument: the version to request
+      set(cmake_fd_version)
+      if (${ARGC} GREATER 1)
+        set(cmake_fd_version ${ARGV1})
+      endif()
+      set(cmake_fd_exact_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
+        set(cmake_fd_exact_arg EXACT)
+      endif()
+      set(cmake_fd_quiet_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
+        set(cmake_fd_quiet_arg QUIET)
+      endif()
+      set(cmake_fd_required_arg)
+      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
+        set(cmake_fd_required_arg REQUIRED)
+      endif()
+      find_package(${dep} ${cmake_fd_version}
+          ${cmake_fd_exact_arg}
+          ${cmake_fd_quiet_arg}
+          ${cmake_fd_required_arg}
+      )
+      # Accept either <dep>_FOUND or the upper-case <DEP>_FOUND as success.
+      string(TOUPPER ${dep} cmake_dep_upper)
+      if (NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
+        set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
+        set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
+        # This is a macro, so return() leaves the file that invoked it, i.e. it
+        # stops processing of the calling package config file.
+        return()
+      endif()
+      # Clear the helper variables; a macro has no scope of its own.
+      set(cmake_fd_version)
+      set(cmake_fd_required_arg)
+      set(cmake_fd_quiet_arg)
+      set(cmake_fd_exact_arg)
+    endif()
+  endmacro()
+endif()
+
+# Prefix placed in front of the interface compile options added by
+# hip_add_interface_compile_flags(); it is cleared for CMake older than 3.12.
+set(_HIP_SHELL "SHELL:")
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  set(_HIP_SHELL "")
+endif()
+
+# Append compile flags to INTERFACE_COMPILE_OPTIONS of TARGET, restricted to
+# CXX compilation through a generator expression.
+#
+# Arguments:
+#   TARGET - target whose interface options are extended.
+#   ARGN   - the flags; flags that belong together (e.g. "-mllvm <arg>") are
+#            passed as consecutive arguments.
+function(hip_add_interface_compile_flags TARGET)
+  if(NOT "${_HIP_SHELL}" STREQUAL "")
+    # A "SHELL:" entry must be ONE string with space-separated flags. With a
+    # ';'-joined list only the first flag carries the prefix and the remaining
+    # ones are ordinary options subject to de-duplication, which can drop a
+    # repeated "-mllvm" and detach it from its argument.
+    string(REPLACE ";" " " hip_flags "${ARGN}")
+  else()
+    # No SHELL: support (CMake < 3.12): keep the flags as a regular list.
+    set(hip_flags "${ARGN}")
+  endif()
+  set_property(TARGET ${TARGET} APPEND PROPERTY
+    INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:${_HIP_SHELL}${hip_flags}>"
+  )
+endfunction()
+
+# Append link flags to INTERFACE_LINK_LIBRARIES of TARGET.
+# From CMake 3.20 on the flags are wrapped in a LINK_LANGUAGE generator
+# expression so they apply to CXX links only; older versions get them as-is.
+function(hip_add_interface_link_flags TARGET)
+  set(link_entry "${ARGN}")
+  if(NOT CMAKE_VERSION VERSION_LESS 3.20)
+    set(link_entry "$<$<LINK_LANGUAGE:CXX>:${ARGN}>")
+  endif()
+
+  set_property(TARGET ${TARGET} APPEND PROPERTY
+    INTERFACE_LINK_LIBRARIES "${link_entry}"
+  )
+endfunction()
+
+# The number of parallel jobs defaults to 1 unless the user already defined it
+if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
+  set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
+endif()
+# Both values are filled in when this file is configured
+set(HIP_COMPILER "@HIP_COMPILER@")
+set(HIP_RUNTIME "@HIP_RUNTIME@")
+
+# NOTE: If hip-config is invoked from /opt/rocm-ver/hip/lib/cmake/hip/ (the
+# backward-compatibility location), PACKAGE_PREFIX_DIR resolves to /opt/rocm-ver/hip.
+# The following ensures that PACKAGE_PREFIX_DIR resolves to /opt/rocm-ver instead:
+# first find the real path of the hip-config file with symlinks resolved,
+#   Real path          : /opt/rocm-ver/lib/cmake/hip/hip-config.cmake
+# then go up 4 levels to get PACKAGE_PREFIX_DIR,
+#   PACKAGE_PREFIX_DIR : /opt/rocm-ver
+# TODO: once file reorg backward compatibility is turned off this can be removed.
+if(IS_SYMLINK ${CMAKE_CURRENT_LIST_FILE})
+  get_filename_component(CONFIG_FILE_PATH "${CMAKE_CURRENT_LIST_FILE}" REALPATH)
+  get_filename_component(PACKAGE_PREFIX_DIR "${CONFIG_FILE_PATH}/../../../../" ABSOLUTE)
+endif()
+# end of TODO
+# Keep a copy of the (possibly corrected) prefix for later use in this file
+set(HIP_PACKAGE_PREFIX_DIR  ${PACKAGE_PREFIX_DIR})
+
+# Install locations of the package; the PACKAGE_* placeholders are filled in
+# when this file is configured.
+# NOTE(review): set_and_check is presumably provided by the package init code
+# at the top of this file - confirm.
+set_and_check( hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
+set_and_check( hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}" )
+set_and_check( hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
+set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
+# hipcc / hipconfig launchers: the .bat variants are used on Windows
+if(WIN32)
+  set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc.bat")
+  set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig.bat")
+else()
+  set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
+  set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
+endif()
+# Windows Specific Definition here:
+if(WIN32)
+  # HIP_PATH from the environment wins; otherwise fall back to this package's prefix.
+  # NOTE(review): the HIP_DIR branch sets HIP_DIR and leaves HIP_PATH untouched,
+  # although HIP_PATH is what the clang lookup further down reads - confirm
+  # whether HIP_PATH was intended here.
+  if(DEFINED ENV{HIP_PATH})
+    file(TO_CMAKE_PATH "$ENV{HIP_PATH}" HIP_PATH)
+  elseif(DEFINED ENV{HIP_DIR})
+    file(TO_CMAKE_PATH "$ENV{HIP_DIR}" HIP_DIR)
+  else()
+    # using the HIP found
+    set(HIP_PATH ${PACKAGE_PREFIX_DIR})
+  endif()
+else()
+  # Linux
+  # If HIP is not installed under ROCm, need this to find HSA assuming HSA is under ROCm
+  if(DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH "$ENV{ROCM_PATH}")
+  endif()
+
+  # set a default path for ROCM_PATH
+  if(NOT DEFINED ROCM_PATH)
+    set(ROCM_PATH ${PACKAGE_PREFIX_DIR})
+  endif()
+
+endif()
+
+# clang-specific setup: locate the clang root, pick the C++ compiler used for
+# later queries and declare the default GPU target lists.
+if(HIP_COMPILER STREQUAL "clang")
+  if(WIN32)
+    # Using SDK folder
+    file(TO_CMAKE_PATH "${HIP_PATH}" HIP_CLANG_ROOT)
+    if (NOT EXISTS "${HIP_CLANG_ROOT}/bin/clang.exe")
+      # if using install folder
+      file(TO_CMAKE_PATH "${HIP_PATH}/../lc" HIP_CLANG_ROOT)
+    endif()
+  else()
+    set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
+  endif()
+  # Default to the project's C++ compiler unless the user chose one explicitly
+  if(NOT HIP_CXX_COMPILER)
+    set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
+  endif()
+
+  if(NOT WIN32)
+    find_dependency(AMDDeviceLibs)
+  endif()
+  # Cache entries: these defaults only apply when the entries are not set yet
+  set(AMDGPU_TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "AMD GPU targets to compile for")
+  set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU targets to compile for")
+endif() # HIP_COMPILER check
+
+if(NOT WIN32)
+  find_dependency(amd_comgr)
+endif()
+
+# Load the exported target definitions that live next to this file
+include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
+
+# Use find_dependency to locate the dependencies of the packages.
+# This lets the CMake-generated xxxx-targets file supply the linker libraries
+# without having to worry about other transitive dependencies.
+if(NOT WIN32)
+  find_dependency(hsa-runtime64)
+  find_dependency(Threads)
+endif()
+
+# Root used for the include directories attached to the imported targets below
+set(_IMPORT_PREFIX ${HIP_PACKAGE_PREFIX_DIR})
+# Right now this is only supported for amd platforms
+set_target_properties(hip::host PROPERTIES
+  INTERFACE_COMPILE_DEFINITIONS "__HIP_PLATFORM_HCC__=1;__HIP_PLATFORM_AMD__=1"
+)
+
+if(HIP_RUNTIME MATCHES "rocclr")
+  # The same directory is listed as include and as system include directory
+  set_target_properties(hip::amdhip64 PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+  )
+
+  # Informational only: report the target type of hip::amdhip64
+  get_target_property(amdhip64_type hip::amdhip64 TYPE)
+  message(STATUS "hip::amdhip64 is ${amdhip64_type}")
+
+  # hip::device receives the include directories on non-Windows hosts only
+  if(NOT WIN32)
+    set_target_properties(hip::device PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+    )
+  endif()
+endif()
+
+# clang-specific interface flags for hip::device (and compiler-rt for hip::host).
+# Compile options are only added when hip::device does not carry any
+# INTERFACE_COMPILE_OPTIONS yet; link flags are always added.
+if(HIP_COMPILER STREQUAL "clang")
+  get_property(compilePropIsSet TARGET hip::device PROPERTY INTERFACE_COMPILE_OPTIONS SET)
+
+  if (NOT compilePropIsSet AND HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
+    hip_add_interface_compile_flags(hip::device -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false)
+  endif()
+
+  if (NOT compilePropIsSet)
+    hip_add_interface_compile_flags(hip::device -x hip)
+  endif()
+
+  hip_add_interface_link_flags(hip::device --hip-link)
+
+  # One --offload-arch flag per requested GPU target, for compiling and linking
+  foreach(GPU_TARGET ${GPU_TARGETS})
+      if (NOT compilePropIsSet)
+        hip_add_interface_compile_flags(hip::device --offload-arch=${GPU_TARGET})
+      endif()
+      hip_add_interface_link_flags(hip::device --offload-arch=${GPU_TARGET})
+  endforeach()
+  #Add support for parallel build and link
+  # Quoted: CMAKE_CXX_COMPILER_ID is empty when the consuming project has not
+  # enabled CXX, and an unquoted expansion would make this if() a syntax error.
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
+  endif()
+  if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
+    # Test the variable by name; it is undefined when the check above was skipped.
+    if(HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
+      if (NOT compilePropIsSet)
+        hip_add_interface_compile_flags(hip::device -parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral)
+      endif()
+      hip_add_interface_link_flags(hip::device -parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS})
+    else()
+      message("clang compiler doesn't support parallel jobs")
+    endif()
+  endif()
+
+
+  # Use HIP_CXX option -print-libgcc-file-name --rtlib=compiler-rt
+  # To fetch the compiler rt library file name.
+  execute_process(
+    COMMAND ${HIP_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt
+    OUTPUT_VARIABLE CLANGRT_BUILTINS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE CLANGRT_BUILTINS_FETCH_EXIT_CODE)
+
+  # Add support for __fp16 and _Float16, explicitly link with compiler-rt
+  if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" )
+    # CLANG_RT Builtins found Successfully Set interface link libraries property
+    set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
+    set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
+  else()
+    message(STATUS "clangrt builtins lib not found: ${CLANGRT_BUILTINS_FETCH_EXIT_CODE}")
+  endif() # CLANGRT_BUILTINS_FETCH_EXIT_CODE Check
+endif() # HIP_COMPILER Check
+
+# Result variables for consumers: the imported targets to link against
+set( hip_LIBRARIES hip::host hip::device)
+set( hip_LIBRARY ${hip_LIBRARIES})
+
+# Upper-case aliases of the lower-case hip_* variables set above
+set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR})
+set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
+set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR})
+set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR})
+set(HIP_LIBRARIES ${hip_LIBRARIES})
+set(HIP_LIBRARY ${hip_LIBRARY})
+set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE})
+set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE})
+
@@ -0,0 +1,348 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#include <hip/hip_common.h>
+#include <hip/driver_types.h>
+#include <hip/amd_detail/amd_hip_vector_types.h>
+
+#ifdef __cplusplus
+
+/**
+ * Creates a channel format descriptor from four per-component values and a
+ * format kind. The helpers in this header pass component widths in bits,
+ * with 0 for components that are not present.
+ */
+extern "C" HIP_PUBLIC_API
+hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
+
+/** Float-kind descriptor with one component as wide as unsigned short. */
+static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+/** Same descriptor as hipCreateChannelDescHalf(): one component, float kind. */
+static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+/**
+ * Float-kind descriptor with two components, each as wide as unsigned short.
+ */
+static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+    int e = (int)sizeof(unsigned short) * 8;
+    // Two populated components (x and y), like every other two-component
+    // descriptor in this header; a single component would make this
+    // indistinguishable from hipCreateChannelDescHalf1().
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+}
+
+/**
+ * Primary template: used for every type T without an explicit specialization
+ * below. Returns a descriptor with all four components set to 0 and kind
+ * hipChannelFormatKindNone.
+ */
+template <typename T>
+static inline hipChannelFormatDesc hipCreateChannelDesc() {
+    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
+}
+
+// ---- 8-bit (char) element types ------------------------------------------------
+// Each specialization sets as many components as the vector type has elements;
+// the kind is Signed or Unsigned according to the element type.
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+    const int bits = static_cast<int>(sizeof(char)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+    const int bits = static_cast<int>(sizeof(signed char)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+    const int bits = static_cast<int>(sizeof(signed char)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+    const int bits = static_cast<int>(sizeof(signed char)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__  // vector3 is the same as vector4
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+    const int bits = static_cast<int>(sizeof(signed char)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+    const int bits = static_cast<int>(sizeof(signed char)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
+}
+
+// ---- short element types ---------------------------------------------------------
+// Component width is the size of (un)signed short in bits.
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+    const int bits = static_cast<int>(sizeof(signed short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+    const int bits = static_cast<int>(sizeof(signed short)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+    const int bits = static_cast<int>(sizeof(signed short)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+    const int bits = static_cast<int>(sizeof(signed short)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+    const int bits = static_cast<int>(sizeof(signed short)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
+}
+
+// ---- int element types -----------------------------------------------------------
+// Component width is the size of (un)signed int in bits.
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+    const int bits = static_cast<int>(sizeof(signed int)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+    const int bits = static_cast<int>(sizeof(signed int)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+    const int bits = static_cast<int>(sizeof(signed int)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+    const int bits = static_cast<int>(sizeof(signed int)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+    const int bits = static_cast<int>(sizeof(signed int)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
+}
+
+// ---- float element types ---------------------------------------------------------
+// Component width is the size of float in bits; the kind is always Float.
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+    const int bits = static_cast<int>(sizeof(float)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+    const int bits = static_cast<int>(sizeof(float)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+    const int bits = static_cast<int>(sizeof(float)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindFloat);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+    const int bits = static_cast<int>(sizeof(float)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindFloat);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+    const int bits = static_cast<int>(sizeof(float)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindFloat);
+}
+
+// ---- long element types ----------------------------------------------------------
+// Component width is the size of (un)signed long in bits on the compiling platform.
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+    const int bits = static_cast<int>(sizeof(signed long)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+    const int bits = static_cast<int>(sizeof(signed long)) * 8;
+    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+    const int bits = static_cast<int>(sizeof(signed long)) * 8;
+    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+    const int bits = static_cast<int>(sizeof(signed long)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+    const int bits = static_cast<int>(sizeof(signed long)) * 8;
+    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
+}
+
+#else
+
+/* C declaration of hipCreateChannelDesc, used when this header is not compiled
+ * as C++; the typed helper overloads above are available in C++ only. */
+struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                 enum hipChannelFormatKind f);
+
+#endif
+
+#endif
@@ -0,0 +1,293 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*!\file
+ * \brief hip_bfloat16.h defines the hip_bfloat16 type.
+ */
+
+#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
+#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
+
+#include "host_defines.h"
+// __HOST_DEVICE__ is the function qualifier used throughout this header: __device__ only
+// when __HIPCC_RTC__ is defined, __host__ __device__ otherwise.
+#if defined(__HIPCC_RTC__)
+    #define __HOST_DEVICE__ __device__
+#else
+    #define __HOST_DEVICE__ __host__ __device__
+#endif
+
+#if __cplusplus < 201103L || !defined(__HIPCC__)
+
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
+// include a minimal definition of hip_bfloat16
+
+#include <stdint.h>
+/*! \brief Struct to represent a 16 bit brain floating point number.
+ *
+ * Minimal definition used for C, pre-C++11 and non-HIPCC compilers: it only holds the
+ * raw 16-bit pattern in \c data and provides no conversions or operators.
+ */
+typedef struct
+{
+    uint16_t data;
+} hip_bfloat16;
+
+#else // __cplusplus < 201103L || !defined(__HIPCC__)
+
+#include <hip/hip_runtime.h>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshadow"
+/*! \brief 16-bit brain floating point value with conversions to and from float.
+ *
+ * The 16 bits stored in \c data are treated as the upper half of a 32-bit float bit
+ * pattern: conversion to float shifts them left by 16, and conversion from float keeps
+ * the upper 16 bits after rounding (default) or truncation (truncate_t overloads).
+ */
+struct hip_bfloat16
+{
+    // Raw 16-bit storage for the value.
+    __hip_uint16_t data;
+
+    // Tag type: pass hip_bfloat16::truncate to select truncation instead of rounding.
+    enum truncate_t
+    {
+        truncate
+    };
+
+    __HOST_DEVICE__ hip_bfloat16() = default;
+
+    // round upper 16 bits of IEEE float to convert to bfloat16
+    explicit __HOST_DEVICE__ hip_bfloat16(float f)
+        : data(float_to_bfloat16(f))
+    {
+    }
+
+    // truncate the IEEE float to its upper 16 bits (see truncate_float_to_bfloat16)
+    explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
+        : data(truncate_float_to_bfloat16(f))
+    {
+    }
+
+    // place the bfloat16 bits in the upper 16 bits of an IEEE float and zero-fill the
+    // lower 16 bits; the union is initialized through int32 and read back through fp32
+    __HOST_DEVICE__ operator float() const
+    {
+        union
+        {
+            uint32_t int32;
+            float    fp32;
+        } u = {uint32_t(data) << 16};
+        return u.fp32;
+    }
+
+    // assign from float using the rounding conversion
+    __HOST_DEVICE__ hip_bfloat16 &operator=(const float& f)
+    {
+       data = float_to_bfloat16(f);
+       return *this;
+    }
+
+    // build a hip_bfloat16 from a float using the rounding conversion
+    static  __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f)
+    {
+        hip_bfloat16 output;
+        output.data = float_to_bfloat16(f);
+        return output;
+    }
+
+    // build a hip_bfloat16 from a float using the truncating conversion
+    static  __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
+    {
+        hip_bfloat16 output;
+        output.data = truncate_float_to_bfloat16(f);
+        return output;
+    }
+
+private:
+    // Convert a float to bfloat16 bits with round-to-nearest-even; Inf and NaN inputs
+    // are not rounded and a NaN input always yields a NaN bit pattern.
+    static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+            // least significant bits of the float mantissa are greater than 0x8000,
+            // or if they are equal to 0x8000 and the least significant bit of the
+            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+            // has the value 0x7f, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
+        }
+        else if(u.int32 & 0xffff)
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 16 bits of the mantissa are 1, we set the least significant bit
+            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+            // the bfloat16's mantissa bits are all 0.
+            u.int32 |= 0x10000; // Preserve signaling NaN
+        }
+        return __hip_uint16_t(u.int32 >> 16);
+    }
+
+    // Truncate instead of rounding, preserving SNaN
+    // The OR-ed term is 1 only when the exponent bits are all 1s (Inf/NaN) and at least
+    // one of the discarded lower 16 bits is set, so a NaN whose payload lives only in the
+    // discarded bits does not collapse into Inf.
+    static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+    }
+};
+#pragma clang diagnostic pop
+
+// Plain struct with the same single 16-bit member as hip_bfloat16. The static_assert
+// further below compares its size and member offset against hip_bfloat16.
+typedef struct
+{
+    __hip_uint16_t data;
+} hip_bfloat16_public;
+
+// Compile-time check: hip_bfloat16 must be a standard-layout type.
+static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
+              "hip_bfloat16 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+// Compile-time check: hip_bfloat16 must be a trivial type.
+static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
+              "hip_bfloat16 is not a trivial type, and thus is "
+              "incompatible with C.");
+#if !defined(__HIPCC_RTC__)
+// Compile-time check: hip_bfloat16 and hip_bfloat16_public must have the same size and
+// place `data` at the same offset.
+static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
+                  && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
+              "internal hip_bfloat16 does not match public hip_bfloat16");
+
+// Streams a hip_bfloat16 by converting it to float and inserting that float into `os`.
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
+{
+  const float widened = float(bf16);
+  os << widened;
+  return os;
+}
+#endif
+
+// Unary plus: returns the operand unchanged.
+inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
+{
+    const hip_bfloat16 same = a;
+    return same;
+}
+// Unary minus: toggles bit 15 (the sign bit) of the stored pattern.
+inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a)
+{
+    hip_bfloat16 negated = a;
+    negated.data = negated.data ^ 0x8000;
+    return negated;
+}
+// Binary arithmetic: each operator converts both operands to float, performs the
+// operation in float, and converts the result back with the rounding constructor.
+inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float sum = float(a) + float(b);
+    return hip_bfloat16(sum);
+}
+inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float difference = float(a) - float(b);
+    return hip_bfloat16(difference);
+}
+inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float product = float(a) * float(b);
+    return hip_bfloat16(product);
+}
+inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float quotient = float(a) / float(b);
+    return hip_bfloat16(quotient);
+}
+// Less-than: compares the float values of the operands.
+inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float left = float(a);
+    const float right = float(b);
+    return left < right;
+}
+// Equality: compares the float values of the operands.
+inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const float left = float(a);
+    const float right = float(b);
+    return left == right;
+}
+// Greater-than: expressed as less-than with the operands swapped.
+inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return operator<(b, a);
+}
+// Less-than-or-equal: compares the float values of the operands directly.
+// The comparison is done with the float `<=` operator rather than by negating `>`,
+// because negating an ordered comparison yields true when either operand is NaN,
+// whereas every ordered comparison involving NaN must be false.
+inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return float(a) <= float(b);
+}
+// Inequality: the negation of operator==.
+inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    const bool equal = (a == b);
+    return !equal;
+}
+// Greater-than-or-equal: compares the float values of the operands directly.
+// The comparison is done with the float `>=` operator rather than by negating `<`,
+// because negating an ordered comparison yields true when either operand is NaN,
+// whereas every ordered comparison involving NaN must be false.
+inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
+{
+    return float(a) >= float(b);
+}
+// Compound assignment: each operator stores the result of the matching binary operator
+// into `a` and returns a reference to `a`.
+inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    a = a + b;
+    return a;
+}
+inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    a = a - b;
+    return a;
+}
+inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    a = a * b;
+    return a;
+}
+inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
+{
+    a = a / b;
+    return a;
+}
+// Pre-increment: adds hip_bfloat16(1.0f) to `a` and returns `a`.
+inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a)
+{
+    a += hip_bfloat16(1.0f);
+    return a;
+}
+// Pre-decrement: subtracts hip_bfloat16(1.0f) from `a` and returns `a`.
+inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a)
+{
+    a -= hip_bfloat16(1.0f);
+    return a;
+}
+// Post-increment: increments `a` and returns the value it held beforehand.
+inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int)
+{
+    const hip_bfloat16 previous = a;
+    ++a;
+    return previous;
+}
+// Post-decrement: decrements `a` and returns the value it held beforehand.
+inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int)
+{
+    const hip_bfloat16 previous = a;
+    --a;
+    return previous;
+}
+
+// Bit-pattern classification helpers for hip_bfloat16.
+// Mask 0x7f80 selects the eight exponent bits, 0x7f the seven mantissa bits, and
+// 0x7fff everything except the sign bit.
+namespace std
+{
+    // Infinity: exponent bits all set and mantissa bits all clear.
+    constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a)
+    {
+        return (a.data & 0x7f80) == 0x7f80 && (a.data & 0x7f) == 0;
+    }
+    // NaN: exponent bits all set and at least one mantissa bit set.
+    constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a)
+    {
+        return (a.data & 0x7f80) == 0x7f80 && (a.data & 0x7f) != 0;
+    }
+    // Zero (of either sign): every bit other than the sign bit is clear.
+    constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a)
+    {
+        return (a.data & 0x7fff) == 0;
+    }
+}
+
+#endif // __cplusplus < 201103L || !defined(__HIPCC__)
+
+#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
@@ -0,0 +1,32 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
+
+// __HIP_CLANG_ONLY__ is 1 when both __clang__ and __HIP__ are defined, and 0 otherwise.
+#if !defined(__clang__) || !defined(__HIP__)
+#define __HIP_CLANG_ONLY__ 0
+#else
+#define __HIP_CLANG_ONLY__ 1
+#endif
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
@@ -0,0 +1,314 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
+
+#include "hip/amd_detail/amd_hip_vector_types.h"
+
+#if defined(__HIPCC_RTC__)
+#define __HOST_DEVICE__ __device__
+#else
+#define __HOST_DEVICE__ __host__ __device__
+// TODO: Clang has a bug which allows device functions to call std functions
+// when std functions are introduced into default namespace by using statement.
+// math.h may be included after this bug is fixed.
+#if __cplusplus
+#include <cmath>
+#else
+#include "math.h"
+#endif
+#endif // !defined(__HIPCC_RTC__)
+
+#if __cplusplus
+// Generates unary minus for `type`: the result holds the negated x and y of `op`.
+#define COMPLEX_NEG_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator-(const type& op) { \
+        type negated; \
+        negated.x = -op.x; \
+        negated.y = -op.y; \
+        return negated; \
+    }
+
+// Generates operator== for `type`: true when x compares equal and y compares equal.
+#define COMPLEX_EQ_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \
+        if (!(lhs.x == rhs.x)) return false; \
+        return lhs.y == rhs.y; \
+    }
+
+// Generates operator!= for `type` as the negation of operator==.
+#define COMPLEX_NE_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \
+        const bool equal = (lhs == rhs); \
+        return !equal; \
+    }
+
+// Generates operator+ for `type`: component-wise sum of x and y.
+#define COMPLEX_ADD_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \
+        type sum; \
+        sum.x = lhs.x + rhs.x; \
+        sum.y = lhs.y + rhs.y; \
+        return sum; \
+    }
+
+// Generates binary operator- for `type`: component-wise difference of x and y.
+#define COMPLEX_SUB_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \
+        type difference; \
+        difference.x = lhs.x - rhs.x; \
+        difference.y = lhs.y - rhs.y; \
+        return difference; \
+    }
+
+// Generates operator* for `type` using the complex product
+// (a + bi)(c + di) = (ac - bd) + (ad + bc)i, with x as real and y as imaginary part.
+#define COMPLEX_MUL_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \
+        type product; \
+        product.y = lhs.x * rhs.y + lhs.y * rhs.x; \
+        product.x = lhs.x * rhs.x - lhs.y * rhs.y; \
+        return product; \
+    }
+
+// Generates operator/ for `type`: multiplies lhs by the conjugate of rhs and divides
+// each component by the squared magnitude of rhs (rhs.x * rhs.x + rhs.y * rhs.y).
+#define COMPLEX_DIV_OP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \
+        type quotient; \
+        quotient.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
+        quotient.x = quotient.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
+        quotient.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
+        quotient.y = quotient.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
+        return quotient; \
+    }
+
+// Generates operator+= for `type`: adds rhs to lhs component-wise and returns lhs.
+#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \
+        lhs.y += rhs.y; \
+        lhs.x += rhs.x; \
+        return lhs; \
+    }
+
+// Generates operator-= for `type`: subtracts rhs from lhs component-wise and returns lhs.
+#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \
+        lhs.y -= rhs.y; \
+        lhs.x -= rhs.x; \
+        return lhs; \
+    }
+
+// Generates operator*= for `type`: replaces lhs with the complex product lhs * rhs and
+// returns lhs. Both components are computed into a temporary before lhs is written, so
+// the result is also correct when rhs refers to the same object as lhs (e.g. `a *= a`);
+// writing lhs.x first would otherwise change rhs.x before the imaginary part is computed.
+#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
+        type temp; \
+        temp.x = rhs.x * lhs.x - rhs.y * lhs.y; \
+        temp.y = rhs.y * lhs.x + rhs.x * lhs.y; \
+        lhs = temp; \
+        return lhs; \
+    }
+
+// Generates operator/= for `type`: replaces lhs with the complex quotient lhs / rhs and
+// returns lhs. The quotient is built in a temporary and assigned to lhs at the end.
+#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
+        type quotient; \
+        quotient.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
+        quotient.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
+        lhs = quotient; \
+        return lhs; \
+    }
+
+// Generates operator* for `type` times a scalar of `type1` (complex on the left, scalar
+// on the right): both x and y are multiplied by the scalar.
+#define COMPLEX_SCALAR_PRODUCT(type, type1) \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \
+        type scaled; \
+        scaled.x = lhs.x * rhs; \
+        scaled.y = lhs.y * rhs; \
+        return scaled; \
+    }
+
+#endif
+
+// Single-precision complex number stored in a float2: x is used as the real part and
+// y as the imaginary part by the functions below.
+typedef float2 hipFloatComplex;
+
+// Returns the real part (x) of z.
+__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) {
+    return z.x;
+}
+
+// Returns the imaginary part (y) of z.
+__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) {
+    return z.y;
+}
+
+// Builds a complex value with real part a and imaginary part b.
+__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+    hipFloatComplex result;
+    result.y = b;
+    result.x = a;
+    return result;
+}
+
+// Returns the complex conjugate of z: same real part, negated imaginary part.
+__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+    return make_hipFloatComplex(z.x, -z.y);
+}
+
+// Returns the squared magnitude of z, x*x + y*y.
+__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
+    const float squared_magnitude = z.x * z.x + z.y * z.y;
+    return squared_magnitude;
+}
+
+// Returns p + q, added component-wise.
+__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+    hipFloatComplex sum;
+    sum.x = p.x + q.x;
+    sum.y = p.y + q.y;
+    return sum;
+}
+
+// Returns p - q, subtracted component-wise.
+__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+    hipFloatComplex difference;
+    difference.x = p.x - q.x;
+    difference.y = p.y - q.y;
+    return difference;
+}
+
+// Returns the complex product p * q.
+__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+    hipFloatComplex product;
+    product.x = p.x * q.x - p.y * q.y;
+    product.y = p.y * q.x + p.x * q.y;
+    return product;
+}
+
+// Returns the complex quotient p / q: p times the conjugate of q, with each component
+// divided by the squared magnitude of q.
+__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+    float denominator = hipCsqabsf(q);
+    return make_hipFloatComplex((p.x * q.x + p.y * q.y) / denominator,
+                                (p.y * q.x - p.x * q.y) / denominator);
+}
+
+// Returns the magnitude |z| of a single-precision complex number.
+// hypotf is used instead of sqrtf(x*x + y*y) so that the intermediate squares cannot
+// overflow to infinity or underflow to zero when the magnitude itself is representable.
+__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) {
+    return hypotf((float)z.x, (float)z.y);
+}
+
+
+// Double-precision complex number stored in a double2: x is used as the real part and
+// y as the imaginary part by the functions below.
+typedef double2 hipDoubleComplex;
+
+// Returns the real part (x) of z.
+__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) {
+    return z.x;
+}
+
+// Returns the imaginary part (y) of z.
+__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) {
+    return z.y;
+}
+
+// Builds a complex value with real part a and imaginary part b.
+__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+    hipDoubleComplex result;
+    result.y = b;
+    result.x = a;
+    return result;
+}
+
+// Returns the complex conjugate of z: same real part, negated imaginary part.
+__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+    return make_hipDoubleComplex(z.x, -z.y);
+}
+
+// Returns the squared magnitude of z, x*x + y*y.
+__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
+    const double squared_magnitude = z.x * z.x + z.y * z.y;
+    return squared_magnitude;
+}
+
+// Returns p + q, added component-wise.
+__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+    hipDoubleComplex sum;
+    sum.x = p.x + q.x;
+    sum.y = p.y + q.y;
+    return sum;
+}
+
+// Returns p - q, subtracted component-wise.
+__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+    hipDoubleComplex difference;
+    difference.x = p.x - q.x;
+    difference.y = p.y - q.y;
+    return difference;
+}
+
+// Returns the complex product p * q.
+__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+    hipDoubleComplex product;
+    product.x = p.x * q.x - p.y * q.y;
+    product.y = p.y * q.x + p.x * q.y;
+    return product;
+}
+
+// Returns the complex quotient p / q: p times the conjugate of q, with each component
+// divided by the squared magnitude of q.
+__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+    double denominator = hipCsqabs(q);
+    return make_hipDoubleComplex((p.x * q.x + p.y * q.y) / denominator,
+                                 (p.y * q.x - p.x * q.y) / denominator);
+}
+
+// Returns the magnitude |z| of a double-precision complex number.
+// hypot is used instead of sqrt(x*x + y*y) so that the intermediate squares cannot
+// overflow to infinity or underflow to zero when the magnitude itself is representable.
+__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) {
+    return hypot((double)z.x, (double)z.y);
+}
+
+
+#if __cplusplus
+
+// Operator overloads for hipFloatComplex, generated from the COMPLEX_* macros above:
+// negation, equality/inequality, the four arithmetic operators and their compound
+// assignment forms.
+COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
+// hipFloatComplex * scalar, one overload per listed scalar type (scalar on the right).
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)
+
+// Operator overloads for hipDoubleComplex, generated from the COMPLEX_* macros above:
+// negation, equality/inequality, the four arithmetic operators and their compound
+// assignment forms.
+COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
+// hipDoubleComplex * scalar, one overload per listed scalar type (scalar on the right).
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
+
+#endif
+
+
+// hipComplex is an alias for the single-precision complex type.
+typedef hipFloatComplex hipComplex;
+
+// Builds a hipComplex with real part x and imaginary part y.
+__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
+    hipComplex z = make_hipFloatComplex(x, y);
+    return z;
+}
+
+// Converts a double-precision complex value to single precision by casting each part.
+__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+    const float re = (float)z.x;
+    const float im = (float)z.y;
+    return make_hipFloatComplex(re, im);
+}
+
+// Converts a single-precision complex value to double precision by casting each part.
+__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+    const double re = (double)z.x;
+    const double im = (double)z.y;
+    return make_hipDoubleComplex(re, im);
+}
+
+// Returns p * q + r for single-precision complex values. Each component is accumulated
+// in two steps, starting from the matching component of r.
+__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+    // Real part: r.x + p.x*q.x - p.y*q.y
+    float re = (p.x * q.x) + r.x;
+    re = -(p.y * q.y) + re;
+
+    // Imaginary part: r.y + q.x*p.y + p.x*q.y
+    float im = (q.x * p.y) + r.y;
+    im = (p.x * q.y) + im;
+
+    return make_hipComplex(re, im);
+}
+
+// Returns p * q + r for double-precision complex values. Each component is accumulated
+// in two steps, starting from the matching component of r.
+__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
+    // Real part: r.x + p.x*q.x - p.y*q.y
+    double re = (p.x * q.x) + r.x;
+    re = -(p.y * q.y) + re;
+
+    // Imaginary part: r.y + q.x*p.y + p.x*q.y
+    double im = (q.x * p.y) + r.y;
+    im = (p.x * q.y) + im;
+
+    return make_hipDoubleComplex(re, im);
+}
+
+#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
@@ -0,0 +1,708 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/hip_cooperative_groups.h
+ *
+ *  @brief Device side implementation of `Cooperative Group` feature.
+ *
+ *  Defines new types and device API wrappers related to `Cooperative Group`
+ *  feature, which the programmer can directly use in his kernel(s) in order to
+ *  make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wpadded"
+
+#if __cplusplus
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/hip_cooperative_groups_helper.h>
+#endif
+
+// Executes the `trap` instruction. Expands to a brace block so it can be used
+// as a statement.
+#define __hip_abort()                                                                              \
+  { asm("trap;"); }
+#if defined(NDEBUG)
+// With NDEBUG defined the assertion expands to nothing and COND is not evaluated.
+#define __hip_assert(COND)
+#else
+// Calls __hip_abort() when COND is false. COND is parenthesised before it is
+// negated so that compound conditions such as `a == b` or `ok && "message"`
+// are negated as a whole instead of only their first operand.
+// The expansion is a brace block (not do/while) because existing call sites
+// invoke the macro without a trailing semicolon.
+#define __hip_assert(COND)                                                                         \
+  {                                                                                                \
+    if (!(COND)) {                                                                                 \
+      __hip_abort();                                                                               \
+    }                                                                                              \
+  }
+#endif
+
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group types
+ *           object, like the group type, its size, etc
+ */
+class thread_group {
+ protected:
+  uint32_t _type;  // thread_group type
+  uint32_t _size;  // total number of threads in the thread_group
+  uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
+                   // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set the thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling thread
+  // (through the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object.
+  //
+  // `size` and `mask` both default to zero; each default is cast to the type
+  // of the parameter it initialises. Note that `coalesced_info` is left
+  // untouched here and is filled in by the derived types that use it.
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint32_t>(0),
+                                uint64_t mask = static_cast<uint64_t>(0)) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+  // Tiling state of a group: whether it is a tile and how many threads it has
+  struct _tiled_info {
+    bool is_tiled;
+    unsigned int size;
+  };
+
+  // Lane membership of a coalesced group, plus its tiling state
+  struct _coalesced_info {
+    lane_mask member_mask;
+    unsigned int size;
+    struct _tiled_info tiled_info;
+  } coalesced_info;
+
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend class thread_block;
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const { return _size; }
+  // Group type tag that was stored at construction
+  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+  // this_multi_grid() is the only function granted access to the protected
+  // constructor below
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Builds a multi-grid group holding `size` threads (used by this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) {}
+
+ public:
+  // How many grids (in other words, GPUs) take part in this multi-grid group
+  __CG_QUALIFIER__ uint32_t num_grids() {
+    return internal::multi_grid::num_grids();
+  }
+
+  // Index, in [0, num_grids()), of the GPU this kernel invocation runs on
+  __CG_QUALIFIER__ uint32_t grid_rank() {
+    return internal::multi_grid::grid_rank();
+  }
+
+  // Rank of the calling thread, as reported by internal::multi_grid
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::multi_grid::thread_rank();
+  }
+
+  // Validity of the group, as reported by internal::multi_grid
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::multi_grid::is_valid();
+  }
+
+  // Synchronizes the group through internal::multi_grid::sync()
+  __CG_QUALIFIER__ void sync() const {
+    internal::multi_grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
+  // The group size is supplied by the internal multi-grid helper
+  multi_grid_group group(internal::multi_grid::size());
+  return group;
+}
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+  // this_grid() is the only function granted access to the protected
+  // constructor below
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Builds a grid group holding `size` threads (used by this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
+      : thread_group(internal::cg_grid, size) {}
+
+ public:
+  // Rank of the calling thread, as reported by internal::grid
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::grid::thread_rank();
+  }
+
+  // Validity of the group, as reported by internal::grid
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::grid::is_valid();
+  }
+
+  // Synchronizes the group through internal::grid::sync()
+  __CG_QUALIFIER__ void sync() const {
+    internal::grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group this_grid() {
+  // The group size is supplied by the internal grid helper
+  grid_group group(internal::grid::size());
+  return group;
+}
+
+/** \brief   The workgroup (thread-block in CUDA terminology) cooperative group
+ *           type
+ *
+ *  \details Represents an intra-workgroup cooperative group type where the
+ *           participating threads within the group are exactly the same threads
+ *           which are participated in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+  // Only the functions befriended here may construct an object of this
+  // class and reach its protected members
+  friend __CG_QUALIFIER__ thread_block this_thread_block();
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
+                                                       unsigned int tile_size);
+
+ protected:
+  // Builds a workgroup group holding `size` threads (used by this_thread_block())
+  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+      : thread_group(internal::cg_workgroup, size) {}
+
+  // Creates a generic thread_group tagged as a tiled group of `tile_size`
+  // threads. A tile size that is zero, larger than the wavefront size, or not
+  // a power of two triggers __hip_assert.
+  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
+    const bool isPowerOfTwo = (tile_size & (tile_size - 1)) == 0;
+    const bool isBadSize =
+        (tile_size == 0) || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !isPowerOfTwo;
+    if (isBadSize) {
+      __hip_assert(false && "invalid tile size")
+    }
+
+    thread_group tile(internal::cg_tiled_group, tile_size);
+    tile.coalesced_info.tiled_info.is_tiled = true;
+    tile.coalesced_info.tiled_info.size = tile_size;
+    return tile;
+  }
+
+ public:
+  // 3-dimensional index of this block within the grid
+  __CG_QUALIFIER__ dim3 group_index() {
+    return internal::workgroup::group_index();
+  }
+
+  // 3-dimensional index of the calling thread within the block
+  __CG_QUALIFIER__ dim3 thread_index() {
+    return internal::workgroup::thread_index();
+  }
+
+  // Rank of the calling thread, as reported by internal::workgroup
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::workgroup::thread_rank();
+  }
+
+  // Validity of the group, as reported by internal::workgroup
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::workgroup::is_valid();
+  }
+
+  // Synchronizes the group through internal::workgroup::sync()
+  __CG_QUALIFIER__ void sync() const {
+    internal::workgroup::sync();
+  }
+};
+
+/** \brief   User exposed API interface to construct workgroup cooperative
+ *           group type object - `thread_block`.
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `thread_block`. Instead, he should construct it through this API
+ *           function.
+ */
+__CG_QUALIFIER__ thread_block this_thread_block() {
+  // The group size is supplied by the internal workgroup helper
+  thread_block block(internal::workgroup::size());
+  return block;
+}
+
+/** \brief   The tiled_group cooperative group type
+ *
+ *  \details Represents one tiled thread group in a wavefront.
+ *           This group type also supports sub-wave level intrinsics.
+ */
+
+class tiled_group : public thread_group {
+ private:
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
+                                                      unsigned int tile_size);
+
+  // Partitions this tile into tiles of `tile_size` threads. An invalid tile
+  // size triggers __hip_assert; a request that is not smaller than the current
+  // tile returns a copy of this group unchanged.
+  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
+    const bool isPowerOfTwo = (tile_size & (tile_size - 1)) == 0;
+    const bool isBadSize =
+        (tile_size == 0) || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !isPowerOfTwo;
+    if (isBadSize) {
+      __hip_assert(false && "invalid tile size")
+    }
+
+    if (tile_size >= size()) {
+      return *this;
+    }
+
+    // The constructor already records the tile size and marks the group as tiled
+    tiled_group tile(tile_size);
+    return tile;
+  }
+
+ protected:
+  // Builds a tiled group of `tileSize` threads and records the tiling state
+  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
+      : thread_group(internal::cg_tiled_group, tileSize) {
+    coalesced_info.tiled_info.is_tiled = true;
+    coalesced_info.tiled_info.size = tileSize;
+  }
+
+ public:
+  // Number of threads in this tile
+  __CG_QUALIFIER__ unsigned int size() const {
+    return coalesced_info.tiled_info.size;
+  }
+
+  // Workgroup rank of the calling thread masked down to the tile size
+  __CG_QUALIFIER__ unsigned int thread_rank() const {
+    const unsigned int tileMask = coalesced_info.tiled_info.size - 1;
+    return internal::workgroup::thread_rank() & tileMask;
+  }
+
+  // Synchronizes the group through internal::tiled_group::sync()
+  __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
+};
+
+/** \brief   The coalesced_group cooperative group type
+ *
+ *  \details Represents an active thread group in a wavefront.
+ *           This group type also supports sub-wave level intrinsics.
+ */
+class coalesced_group : public thread_group {
+ private:
+  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
+  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
+
+  // Builds the coalesced sub-group of at most `tile_size` lanes that contains
+  // the calling thread. Returns a group with an empty member mask when
+  // `tile_size` is zero, not a power of two, or larger than this group.
+  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
+    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
+
+    if (!tile_size || (tile_size > size()) || !pow2) {
+      return coalesced_group(0);
+    }
+
+    // If a tiled group is passed to be partitioned further into a coalesced_group,
+    // prepare a mask for further partitioning it so that it stays coalesced.
+    if (coalesced_info.tiled_info.is_tiled) {
+      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
+      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+
+      member_mask <<= (__lane_id() & ~(tile_size - 1));
+      coalesced_group coalesced_tile = coalesced_group(member_mask);
+      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
+      return coalesced_tile;
+    }
+
+    // Here the parent coalesced_group is not partitioned: walk the active lanes
+    // in order, skip those belonging to earlier tiles and collect the next
+    // `tile_size` active lanes into the new mask.
+    lane_mask member_mask = 0;
+    unsigned int tile_rank = 0;
+    int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
+
+    for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
+      // The shifted one must have the width of lane_mask: a plain `1 << i` is
+      // an int shift, which cannot address the upper lanes of a 64-wide wavefront.
+      lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
+      // Make sure the lane is active
+      if (active) {
+        if (lanes_to_skip <= 0 && tile_rank < tile_size) {
+          // Prepare a member_mask that is appropriate for a tile
+          member_mask |= active;
+          tile_rank++;
+        }
+        lanes_to_skip--;
+      }
+    }
+    return coalesced_group(member_mask);
+  }
+
+ protected:
+  // Constructor: records the active lanes and derives the group size from them
+  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
+      : thread_group(internal::cg_coalesced_group) {
+    coalesced_info.member_mask = member_mask; // Which threads are active
+    coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
+    coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
+  }
+
+ public:
+  // Number of active threads in the group
+  __CG_QUALIFIER__ unsigned int size() const {
+    return coalesced_info.size;
+  }
+
+  // Rank of the calling thread, computed from the member mask by
+  // internal::coalesced_group::masked_bit_count()
+  __CG_QUALIFIER__ unsigned int thread_rank() const {
+    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
+  }
+
+  // Synchronizes the group through internal::coalesced_group::sync()
+  __CG_QUALIFIER__ void sync() const {
+    internal::coalesced_group::sync();
+  }
+
+  // Returns `var` as held by the member whose rank is `srcRank` (taken modulo
+  // the group size).
+  template <class T>
+  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+
+    srcRank = srcRank % static_cast<int>(size());
+
+    // A full wavefront maps rank to lane directly; otherwise look up the
+    // (srcRank + 1)-th set bit of the member mask.
+    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
+             : (__AMDGCN_WAVEFRONT_SIZE == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
+                                          : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+
+  // Returns `var` as held by the member `lane_delta` positions after the
+  // caller; when no such member exists the caller's own value is returned.
+  template <class T>
+  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+
+    // Note: The cuda implementation appears to use the remainder of lane_delta
+    // and WARP_SIZE as the shift value rather than lane_delta itself.
+    // This is not described in the documentation and is not done here.
+
+    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
+      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    }
+
+    int lane;
+    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
+    }
+    else {
+      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
+    }
+
+    if (lane == -1) {
+      lane = __lane_id();
+    }
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+
+  // Returns `var` as held by the member `lane_delta` positions before the
+  // caller; when no such member exists the caller's own value is returned.
+  template <class T>
+  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+
+    // Note: The cuda implementation appears to use the remainder of lane_delta
+    // and WARP_SIZE as the shift value rather than lane_delta itself.
+    // This is not described in the documentation and is not done here.
+
+    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
+      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
+    }
+
+    // Convert to a signed value before negating so the backwards search offset
+    // is an explicit negative int rather than a wrapped unsigned value.
+    const int offset = -static_cast<int>(lane_delta + 1);
+
+    // Plain `else` (matching shfl_down) guarantees `lane` is always assigned.
+    int lane;
+    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
+      lane = __fns64(coalesced_info.member_mask, __lane_id(), offset);
+    }
+    else {
+      lane = __fns32(coalesced_info.member_mask, __lane_id(), offset);
+    }
+
+    if (lane == -1) {
+      lane = __lane_id();
+    }
+
+    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
+  }
+};
+
+/** \brief   User exposed API to create coalesced groups.
+ *
+ *  \details A collective operation that groups  all active lanes into a new thread group.
+ */
+
+__CG_QUALIFIER__ coalesced_group coalesced_threads() {
+  // The member mask is the value returned by __builtin_amdgcn_read_exec()
+  coalesced_group activeLanes(__builtin_amdgcn_read_exec());
+  return activeLanes;
+}
+
+/**
+ *  Implementation of all publicly exposed base class APIs
+ */
+// Dispatches on the stored group type and forwards to the thread_rank() of
+// the matching derived class. An unknown type triggers __hip_assert and
+// yields the all-ones value.
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid:
+      return static_cast<const multi_grid_group*>(this)->thread_rank();
+    case internal::cg_grid:
+      return static_cast<const grid_group*>(this)->thread_rank();
+    case internal::cg_workgroup:
+      return static_cast<const thread_block*>(this)->thread_rank();
+    case internal::cg_tiled_group:
+      return static_cast<const tiled_group*>(this)->thread_rank();
+    case internal::cg_coalesced_group:
+      return static_cast<const coalesced_group*>(this)->thread_rank();
+    default:
+      break;
+  }
+  __hip_assert(false && "invalid cooperative group type")
+  return static_cast<uint32_t>(-1);
+}
+
+// Dispatches on the stored group type and reports whether the group is valid.
+// Grid, multi-grid and workgroup groups forward to the is_valid() of their own
+// class. An unknown type triggers __hip_assert and reports false.
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->is_valid());
+    }
+    case internal::cg_tiled_group:
+    case internal::cg_coalesced_group: {
+      // tiled_group and coalesced_group do not declare an is_valid() of their
+      // own, so calling it through a down-cast pointer would resolve back to
+      // this very function and recurse without end. These group types carry
+      // no separate validity check, so they are reported as valid.
+      return true;
+    }
+    default: {
+      __hip_assert(false && "invalid cooperative group type")
+      return false;
+    }
+  }
+}
+
+// Dispatches on the stored group type and forwards to the sync() of the
+// matching derived class. An unknown type triggers __hip_assert.
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid:
+      static_cast<const multi_grid_group*>(this)->sync();
+      return;
+    case internal::cg_grid:
+      static_cast<const grid_group*>(this)->sync();
+      return;
+    case internal::cg_workgroup:
+      static_cast<const thread_block*>(this)->sync();
+      return;
+    case internal::cg_tiled_group:
+      static_cast<const tiled_group*>(this)->sync();
+      return;
+    case internal::cg_coalesced_group:
+      static_cast<const coalesced_group*>(this)->sync();
+      return;
+    default:
+      break;
+  }
+  __hip_assert(false && "invalid cooperative group type")
+}
+
+/**
+ *  Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
+ *  group type APIs
+ */
+// Free-function form of g.size()
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t group_size(CGTy const& g) {
+  return g.size();
+}
+
+// Free-function form of g.thread_rank()
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
+  return g.thread_rank();
+}
+
+// Free-function form of g.is_valid()
+template <class CGTy>
+__CG_QUALIFIER__ bool is_valid(CGTy const& g) {
+  return g.is_valid();
+}
+
+// Free-function form of g.sync()
+template <class CGTy>
+__CG_QUALIFIER__ void sync(CGTy const& g) {
+  g.sync();
+}
+
+// Base of the statically sized tile types: stores the tile size given as a
+// template argument and derives rank and size from it.
+template <unsigned int tileSize> class tile_base {
+ protected:
+  // Tile size, taken from the template argument
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+ public:
+  // Rank of the thread within this tile: the workgroup rank masked with
+  // (numThreads - 1)
+  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
+    return (internal::workgroup::thread_rank() & (numThreads - 1));
+  }
+
+  // Number of threads within this tile
+  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
+};
+
+// Adds synchronization and shuffle operations to tile_base. Each shuffle
+// passes the tile size as the width argument of the underlying __shfl* call.
+template <unsigned int size>
+class thread_block_tile_base : public tile_base<size> {
+  static_assert(is_valid_tile_size<size>::value,
+                "Tile size is either not a power of 2 or greater than the wavefront size");
+  using tile_base<size>::numThreads;
+
+ public:
+  // Synchronizes through internal::tiled_group::sync()
+  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
+
+  // Forwards to __shfl with the tile size as width
+  template <class T>
+  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+    return __shfl(var, srcRank, numThreads);
+  }
+
+  // Forwards to __shfl_down with the tile size as width
+  template <class T>
+  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+    return __shfl_down(var, lane_delta, numThreads);
+  }
+
+  // Forwards to __shfl_up with the tile size as width
+  template <class T>
+  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+    return __shfl_up(var, lane_delta, numThreads);
+  }
+
+  // Forwards to __shfl_xor with the tile size as width
+  template <class T>
+  __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
+    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
+    return __shfl_xor(var, laneMask, numThreads);
+  }
+};
+
+/** \brief   Group type - thread_block_tile
+ *
+ *  \details  Represents one tile of thread group.
+ */
+
+// Combines the compile-time tile interface (thread_block_tile_base) with the
+// run-time tiled_group representation for a tile of `tileSize` threads.
+template <unsigned int tileSize, class ParentCGTy = void>
+class thread_block_tile_type : public thread_block_tile_base<tileSize>, public tiled_group {
+  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
+
+  friend class thread_block_tile_type<tileSize, ParentCGTy>;
+
+  typedef thread_block_tile_base<numThreads> tbtBase;
+
+ protected:
+  // Constructs the tiled_group base with the tile size, then stores the same
+  // tiling state again (the tiled_group constructor already sets both fields).
+  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
+    coalesced_info.tiled_info.size = numThreads;
+    coalesced_info.tiled_info.is_tiled = true;
+  }
+
+ public:
+  // Both bases declare size/sync/thread_rank; select the compile-time versions
+  // from thread_block_tile_base.
+  using tbtBase::size;
+  using tbtBase::sync;
+  using tbtBase::thread_rank;
+};
+
+
+/** \brief   User exposed API to partition groups.
+ *
+ *  \details A collective operation that partitions the parent group into a one-dimensional,
+ *           row-major, tiling of subgroups.
+ */
+
+// Generic overload: inspects the parent's group type and forwards to the
+// new_tiled_group() of the matching derived class. Any type other than tiled
+// or coalesced is handled as a thread_block.
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
+  const unsigned int parentKind = parent.cg_type();
+
+  if (parentKind == internal::cg_tiled_group) {
+    return static_cast<const tiled_group*>(&parent)->new_tiled_group(tile_size);
+  }
+
+  if (parentKind == internal::cg_coalesced_group) {
+    return static_cast<const coalesced_group*>(&parent)->new_tiled_group(tile_size);
+  }
+
+  return static_cast<const thread_block*>(&parent)->new_tiled_group(tile_size);
+}
+
+// Thread block type overload
+__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
+  // Forwards to thread_block::new_tiled_group()
+  thread_group tile = parent.new_tiled_group(tile_size);
+  return tile;
+}
+
+// Tiled group overload: the result remains a tiled_group
+__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
+  tiled_group tile = parent.new_tiled_group(tile_size);
+  return tile;
+}
+
+// If a coalesced group is passed to be partitioned, it should remain coalesced
+__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
+  // Forwards to coalesced_group::new_tiled_group()
+  coalesced_group tile = parent.new_tiled_group(tile_size);
+  return tile;
+}
+
+template <unsigned int size, class ParentCGTy> class thread_block_tile;
+
+namespace impl {
+template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
+
+// Intermediate layer between thread_block_tile_type and thread_block_tile.
+// Both constructors ignore their argument and default-construct the
+// thread_block_tile_type base, so the tile is described by `size` alone.
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
+ protected:
+  // Construct from another tile (of any size or parent type); `g` is unused
+  template <unsigned int tbtSize, class tbtParentT>
+  __CG_QUALIFIER__ thread_block_tile_internal(
+      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
+      : thread_block_tile_type<size, ParentCGTy>() {}
+
+  // Construct from a thread_block; `g` is unused
+  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
+      : thread_block_tile_type<size, ParentCGTy>() {}
+};
+}  // namespace impl
+
+// Statically sized tile that remembers its parent group type in ParentCGTy.
+template <unsigned int size, class ParentCGTy>
+class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
+ protected:
+  // Construct from the parent group; forwards to thread_block_tile_internal
+  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
+      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
+
+ public:
+  // Conversion to the parent-agnostic form thread_block_tile<size, void>
+  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
+    return thread_block_tile<size, void>(*this);
+  }
+};
+
+
+// Specialization for ParentCGTy = void: a tile whose parent type is not
+// recorded. It can be constructed from a thread_block_tile of the same size
+// with any parent type.
+template <unsigned int size>
+class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
+  // Every thread_block_tile instantiation is a friend of this specialization
+  template <unsigned int, class ParentCGTy> friend class thread_block_tile;
+
+ protected:
+ public:
+  // Converting constructor from a same-sized tile with an arbitrary parent type
+  template <class ParentCGTy>
+  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
+      : impl::thread_block_tile_internal<size, void>(g) {}
+};
+
+template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
+
+namespace impl {
+// Helper used by the templated tiled_partition(); the primary template is only
+// declared here, and the one specialization defined in this section is for a
+// thread_block parent.
+template <unsigned int size, class ParentCGTy = void> struct tiled_partition_internal;
+
+// thread_block parent: a thread_block_tile<size, thread_block> built from `g`
+template <unsigned int size>
+struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
+  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
+      : thread_block_tile<size, thread_block>(g) {}
+};
+
+}  // namespace impl
+
+/** \brief   User exposed API to partition groups.
+ *
+ *  \details  This constructs a templated class derived from thread_group.
+ *            The template defines tile size of the new thread group at compile time.
+ */
+// Compile-time sized partition: `size` is validated with is_valid_tile_size
+// at compile time, then the tile is built through impl::tiled_partition_internal
+// and returned as its thread_block_tile base.
+template <unsigned int size, class ParentCGTy>
+__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
+  static_assert(is_valid_tile_size<size>::value,
+                "Tiled partition with size > wavefront size. Currently not supported ");
+  return impl::tiled_partition_internal<size, ParentCGTy>(g);
+}
+}  // namespace cooperative_groups
+#pragma clang diagnostic pop
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef AMD_HIP_MATH_CONSTANTS_H
+#define AMD_HIP_MATH_CONSTANTS_H
+// Single-precision math constants. The first group is built from raw bit
+// patterns through __int_as_float; the remainder are float literals.
+#define HIP_INF_F            __int_as_float(0x7f800000U)
+#define HIP_NAN_F            __int_as_float(0x7fffffffU)
+#define HIP_MIN_DENORM_F     __int_as_float(0x00000001U)
+#define HIP_MAX_NORMAL_F     __int_as_float(0x7f7fffffU)
+#define HIP_NEG_ZERO_F       __int_as_float(0x80000000U)
+#define HIP_ZERO_F           0.0F
+#define HIP_ONE_F            1.0F
+#define HIP_SQRT_HALF_F      0.707106781F
+#define HIP_SQRT_HALF_HI_F   0.707106781F
+#define HIP_SQRT_HALF_LO_F   1.210161749e-08F
+#define HIP_SQRT_TWO_F       1.414213562F
+#define HIP_THIRD_F          0.333333333F
+#define HIP_PIO4_F           0.785398163F
+#define HIP_PIO2_F           1.570796327F
+#define HIP_3PIO4_F          2.356194490F
+#define HIP_2_OVER_PI_F      0.636619772F
+#define HIP_SQRT_2_OVER_PI_F 0.797884561F
+#define HIP_PI_F             3.141592654F
+#define HIP_L2E_F            1.442695041F
+#define HIP_L2T_F            3.321928094F
+#define HIP_LG2_F            0.301029996F
+#define HIP_LGE_F            0.434294482F
+#define HIP_LN2_F            0.693147181F
+#define HIP_LNT_F            2.302585093F
+#define HIP_LNPI_F           1.144729886F
+#define HIP_TWO_TO_M126_F    1.175494351e-38F
+#define HIP_TWO_TO_126_F     8.507059173e37F
+#define HIP_NORM_HUGE_F      3.402823466e38F
+#define HIP_TWO_TO_23_F      8388608.0F
+#define HIP_TWO_TO_24_F      16777216.0F
+#define HIP_TWO_TO_31_F      2147483648.0F
+#define HIP_TWO_TO_32_F      4294967296.0F
+#define HIP_REMQUO_BITS_F    3U
+// Mask with the low HIP_REMQUO_BITS_F bits set. It must use the HIP_-prefixed
+// bit count defined on the previous line; no HIPRT_-prefixed macro is defined here.
+#define HIP_REMQUO_MASK_F    (~((~0U)<<HIP_REMQUO_BITS_F))
+#define HIP_TRIG_PLOSS_F     105615.0F
+#endif
@@ -0,0 +1,435 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/hip_runtime.h
+ *  @brief Contains definitions of APIs for HIP runtime.
+ */
+
+//#pragma once
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
+
+#include <hip/amd_detail/amd_hip_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Query the installed library build name.
+ *
+ * This function can be used even when the library is not initialized.
+ * Declared with an explicit (void) parameter list so that C translation units
+ * see a real prototype rather than an unprototyped declaration.
+ *
+ * @returns Returns a string describing the build version of the library.  The
+ * string is owned by the library.
+ */
+const char* amd_dbgapi_get_build_name(void);
+
+/**
+ * @brief Query the installed library git hash.
+ *
+ * This function can be used even when the library is not initialized.
+ * Declared with an explicit (void) parameter list so that C translation units
+ * see a real prototype rather than an unprototyped declaration.
+ *
+ * @returns Returns git hash of the library.
+ */
+const char* amd_dbgapi_get_git_hash(void);
+
+/**
+ * @brief Query the installed library build ID.
+ *
+ * This function can be used even when the library is not initialized.
+ * Declared with an explicit (void) parameter list so that C translation units
+ * see a real prototype rather than an unprototyped declaration.
+ *
+ * @returns Returns build ID of the library.
+ */
+size_t amd_dbgapi_get_build_id(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+//---
+// Top part of file can be compiled with any compiler
+
+#if !defined(__HIPCC_RTC__)
+//#include <cstring>
+#if __cplusplus
+#include <cmath>
+#include <cstdint>
+#else
+#include <math.h>
+#include <string.h>
+#include <stddef.h>
+#endif // __cplusplus
+#else
+// hipRTC branch: the standard headers in the other branch are not included here,
+// so the fixed-width integer names are declared by hand.
+// NOTE(review): assumes int is 32 bits and long long is 64 bits on every
+// hipRTC target - confirm.
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef signed int int32_t;
+typedef signed long long int64_t;
+// Re-export the global typedefs so std::uint32_t etc. used later in this header resolve.
+namespace std {
+using ::uint32_t;
+using ::uint64_t;
+using ::int32_t;
+using ::int64_t;
+}
+#endif // !defined(__HIPCC_RTC__)
+
+#if __HIP_CLANG_ONLY__
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#define CUDA_SUCCESS hipSuccess
+
+#if !defined(__HIPCC_RTC__)
+#include <hip/hip_runtime_api.h>
+extern int HIP_TRACE_API;
+#endif // !defined(__HIPCC_RTC__)
+
+#ifdef __cplusplus
+#include <hip/amd_detail/hip_ldg.h>
+#endif
+#include <hip/amd_detail/amd_hip_atomic.h>
+#include <hip/amd_detail/host_defines.h>
+#include <hip/amd_detail/amd_device_functions.h>
+#include <hip/amd_detail/amd_surface_functions.h>
+#include <hip/amd_detail/texture_fetch_functions.h>
+#include <hip/amd_detail/texture_indirect_functions.h>
+
+// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
+#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
+#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
+#endif
+
+// Feature tests:
+#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
+// Device compile and not host compile:
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (1)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (1)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+
+#endif /* Device feature flags */
+
+
+// __launch_bounds__(maxThreads[, second]) dispatches on its argument count.
+// select_impl_ yields its third argument, so:
+//   one argument  -> launch_bounds_impl0: amdgpu_flat_work_group_size(1, maxThreads)
+//   two arguments -> launch_bounds_impl1: the same attribute plus
+//                    amdgpu_waves_per_eu(second)
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
+                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+// Picks the implementation macro name that lands in the third position.
+#define select_impl_(_1, _2, impl_, ...) impl_
+// The trailing empty argument keeps the variadic part non-empty in the two-argument case.
+#define __launch_bounds__(...)                                                                     \
+  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
+
+#if !defined(__HIPCC_RTC__)
+// Host-side stub: unconditionally yields a null pointer.
+__host__ inline void* __get_dynamicgroupbaseptr() {
+  return nullptr;
+}
+#endif // !defined(__HIPCC_RTC__)
+
+// End doxygen API:
+/**
+ *   @}
+ */
+
+//
+// hip-clang functions
+//
+#if !defined(__HIPCC_RTC__)
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+#define HIP_SYMBOL(X) X
+
+typedef int hipLaunchParm;
+
+// Recursion terminator: selected by enable_if once n equals the tuple size.
+// Every slot has already been written by then, so the body is empty.
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>&, void*) {}
+
+// Writes the address of tuple element n into _vargs[n], then recurses with n + 1
+// until the terminator overload above is selected.
+// Compile-time checks: element n must not be a reference type and, when
+// HIP_STRICT is defined, must be trivially copyable.
+// The stored pointers refer into `formals`, so `formals` must stay alive for as
+// long as _vargs is used.
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
+    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;
+
+    static_assert(!std::is_reference<T>{},
+                  "A __global__ function cannot have a reference as one of its "
+                  "arguments.");
+#if defined(HIP_STRICT)
+    static_assert(std::is_trivially_copyable<T>{},
+                  "Only TriviallyCopyable types can be arguments to a __global__ "
+                  "function");
+#endif
+    // Constness is cast away because the argument array is typed void*[].
+    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
+    // On the final step this resolves to the terminator, whose second parameter
+    // is void*; the void** argument converts to it implicitly.
+    return pArgs<n + 1>(formals, _vargs);
+}
+
+// Converts the caller-supplied argument tuple into a tuple of the kernel's formal
+// parameter types. The kernel pointer is only used to deduce Formals.
+// Compilation fails if the number of actual arguments differs from the number
+// of kernel parameters.
+template <typename... Formals, typename... Actuals>
+std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
+    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
+    return std::tuple<Formals...>{std::move(actuals)};
+}
+
+// hipLaunchKernelGGL has two implementations, selected by HIP_TEMPLATE_KERNEL_LAUNCH:
+//   defined     -> function template that packs the arguments and calls hipLaunchKernel
+//   not defined -> macro that expands to the <<< >>> launch syntax
+#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
+// Copies args into a tuple, converts it to the kernel's formal parameter types,
+// collects one pointer per parameter and forwards everything to hipLaunchKernel.
+// NOTE(review): the hipLaunchKernel result is discarded, and _Args has zero
+// length when the kernel takes no arguments - confirm both are intended.
+template <typename... Args, typename F = void (*)(Args...)>
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
+    constexpr size_t count = sizeof...(Args);
+    auto tup_ = std::tuple<Args...>{args...};
+    auto tup = validateArgsCountType(kernel, tup_);
+    void* _Args[count];
+    // Fill _Args with the addresses of the converted arguments held in tup.
+    pArgs<0>(tup, _Args);
+
+    auto k = reinterpret_cast<void*>(kernel);
+    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
+}
+#else
+// Wrapped in do { } while (0) so the expansion behaves as a single statement.
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
+    do {                                                                                           \
+        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);         \
+    } while (0)
+
+// Parenthesizes the kernel name and forwards the remaining arguments unchanged.
+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+#endif
+
+#include <hip/hip_runtime_api.h>
+#endif // !defined(__HIPCC_RTC__)
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+// Functor forwarding to __ockl_get_group_id; the size_t result is narrowed to 32 bits.
+struct __HIP_BlockIdx {
+  __device__ std::uint32_t operator()(std::uint32_t dim) const noexcept {
+    return static_cast<std::uint32_t>(__ockl_get_group_id(dim));
+  }
+};
+// Functor forwarding to __ockl_get_local_size; the size_t result is narrowed to 32 bits.
+struct __HIP_BlockDim {
+  __device__ std::uint32_t operator()(std::uint32_t dim) const noexcept {
+    return static_cast<std::uint32_t>(__ockl_get_local_size(dim));
+  }
+};
+// Functor forwarding to __ockl_get_num_groups; the size_t result is narrowed to 32 bits.
+struct __HIP_GridDim {
+  __device__ std::uint32_t operator()(std::uint32_t dim) const noexcept {
+    return static_cast<std::uint32_t>(__ockl_get_num_groups(dim));
+  }
+};
+// Functor forwarding to __ockl_get_local_id; the size_t result is narrowed to 32 bits.
+struct __HIP_ThreadIdx {
+  __device__ std::uint32_t operator()(std::uint32_t dim) const noexcept {
+    return static_cast<std::uint32_t>(__ockl_get_local_id(dim));
+  }
+};
+
+// dim3 is defined here only for hipRTC builds (this branch is compiled when
+// __HIPCC_RTC__ is defined).
+#if defined(__HIPCC_RTC__)
+typedef struct dim3 {
+    uint32_t x;  ///< x
+    uint32_t y;  ///< y
+    uint32_t z;  ///< z
+#ifdef __cplusplus
+    // Device-side constexpr constructor; each omitted component defaults to 1.
+    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+#endif // defined(__HIPCC_RTC__)
+
+// Presents a coordinate functor F as three members x, y and z.
+// The members are stateless proxies: each read default-constructs F and calls
+// it with 0, 1 or 2 respectively.
+template <typename F>
+struct __HIP_Coordinates {
+  // Value type produced by F's call operator.
+  using R = decltype(F{}(0));
+
+  // Proxy for component 0.
+  struct __X {
+    __device__ operator R() const noexcept { return F{}(0); }
+    // Returns component + rhs; nothing is stored because the proxy holds no state.
+    __device__ R operator+=(const R& rhs) { return F{}(0) + rhs; }
+  };
+  // Proxy for component 1.
+  struct __Y {
+    __device__ operator R() const noexcept { return F{}(1); }
+    // Returns component + rhs; nothing is stored because the proxy holds no state.
+    __device__ R operator+=(const R& rhs) { return F{}(1) + rhs; }
+  };
+  // Proxy for component 2.
+  struct __Z {
+    __device__ operator R() const noexcept { return F{}(2); }
+    // Returns component + rhs; nothing is stored because the proxy holds no state.
+    __device__ R operator+=(const R& rhs) { return F{}(2) + rhs; }
+  };
+
+  static constexpr __X x{};
+  static constexpr __Y y{};
+  static constexpr __Z z{};
+#ifdef __cplusplus
+  // Builds a dim3 from the three components; x, y and z convert through operator R.
+  __device__ operator dim3() const { return dim3(x, y, z); }
+#endif
+
+};
+// Out-of-class definitions of the static constexpr members x, y and z of
+// __HIP_Coordinates<F>. Each one is marked weak except when _MSC_VER is defined.
+// NOTE(review): presumably needed for odr-use before C++17 inline variables - confirm.
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
+
+// Products of matching grid-dimension and block-dimension proxies, in either
+// operand order, are answered with a single __ockl_get_global_size(axis) call
+// (narrowed to 32 bits) instead of two separate queries and a multiply.
+
+// Axis 0: gridDim.x * blockDim.x
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
+                                          __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(0));
+}
+// Axis 0: blockDim.x * gridDim.x
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
+                                          __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(0));
+}
+// Axis 1: gridDim.y * blockDim.y
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
+                                          __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(1));
+}
+// Axis 1: blockDim.y * gridDim.y
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
+                                          __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(1));
+}
+// Axis 2: gridDim.z * blockDim.z
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
+                                          __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(2));
+}
+// Axis 2: blockDim.z * gridDim.z
+inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
+                                          __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
+  return static_cast<std::uint32_t>(__ockl_get_global_size(2));
+}
+
+// Builtin coordinate objects: each is a __HIP_Coordinates instance whose
+// .x/.y/.z members call the corresponding functor with 0, 1 or 2.
+static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
+static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
+static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
+static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
+
+// Macro spellings that call the __ockl_* query functions directly with the
+// axis index 0, 1 or 2. Unlike the objects above, these expand to size_t values.
+// The extern declarations repeat earlier declarations of the same functions.
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+#define hipThreadIdx_x (__ockl_get_local_id(0))
+#define hipThreadIdx_y (__ockl_get_local_id(1))
+#define hipThreadIdx_z (__ockl_get_local_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+#define hipBlockIdx_x (__ockl_get_group_id(0))
+#define hipBlockIdx_y (__ockl_get_group_id(1))
+#define hipBlockIdx_z (__ockl_get_group_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+#define hipBlockDim_x (__ockl_get_local_size(0))
+#define hipBlockDim_y (__ockl_get_local_size(1))
+#define hipBlockDim_z (__ockl_get_local_size(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+#define hipGridDim_x (__ockl_get_num_groups(0))
+#define hipGridDim_y (__ockl_get_num_groups(1))
+#define hipGridDim_z (__ockl_get_num_groups(2))
+
+#include <hip/amd_detail/amd_math_functions.h>
+
+#if __HIP_HCC_COMPAT_MODE__
+// Define HCC work item functions in terms of HIP builtin variables.
+// __DEFINE_HCC_FUNC(hc_fun, hip_var) generates hc_get_<hc_fun>(uint i), which
+// returns hip_var.x for i == 0, hip_var.y for i == 1 and hip_var.z for any
+// other value of i. push_macro/pop_macro restore any previous definition of
+// the helper macro once the four functions have been generated.
+#pragma push_macro("__DEFINE_HCC_FUNC")
+#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
+inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
+  if (i==0) \
+    return hip_var.x; \
+  else if(i==1) \
+    return hip_var.y; \
+  else \
+    return hip_var.z; \
+}
+
+// Generates hc_get_workitem_id, hc_get_group_id, hc_get_group_size and hc_get_num_groups.
+__DEFINE_HCC_FUNC(workitem_id, threadIdx)
+__DEFINE_HCC_FUNC(group_id, blockIdx)
+__DEFINE_HCC_FUNC(group_size, blockDim)
+__DEFINE_HCC_FUNC(num_groups, gridDim)
+#pragma pop_macro("__DEFINE_HCC_FUNC")
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
+// Returns __ockl_get_global_id(dim), narrowed from size_t to uint.
+inline __device__ __attribute__((always_inline)) uint hc_get_workitem_absolute_id(int dim) {
+  const size_t global_id = __ockl_get_global_id(dim);
+  return static_cast<uint>(global_id);
+}
+
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if !defined(__HIPCC_RTC__)
+// Support std::complex.
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#pragma push_macro("__CUDA__")
+#define __CUDA__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+// Workaround for using libc++ with HIP-Clang.
+// The following headers requires clang include path before standard C++ include path.
+// However libc++ include path requires to be before clang include path.
+// To workaround this, we pass -isystem with the parent directory of clang include
+// path instead of the clang include path itself.
+#include <include/cuda_wrappers/algorithm>
+#include <include/cuda_wrappers/complex>
+#include <include/cuda_wrappers/new>
+#undef __CUDA__
+#pragma pop_macro("__CUDA__")
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // !defined(__HIPCC_RTC__)
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif // __HIP_CLANG_ONLY__
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2022 - Present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
+
+#if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
+
+/// hipStreamPerThread implementation
+// When HIP_API_PER_THREAD_DEFAULT_STREAM is defined, __HIP_STREAM_PER_THREAD is
+// defined and __HIP_API_SPT(api) token-pastes the "_spt" suffix onto api.
+// Otherwise __HIP_API_SPT(api) expands to api unchanged.
+#if defined(HIP_API_PER_THREAD_DEFAULT_STREAM)
+    #define __HIP_STREAM_PER_THREAD
+    #define __HIP_API_SPT(api) api ## _spt
+#else
+    #define __HIP_API_SPT(api) api
+#endif
+
+// With __HIP_STREAM_PER_THREAD defined, every API name listed here becomes a
+// macro that resolves to its *_spt counterpart declared further down in this
+// header. Because the argument of __HIP_API_SPT is an operand of ##, it is
+// pasted as written and not re-expanded.
+#if defined(__HIP_STREAM_PER_THREAD)
+    // Memory APIs
+    #define hipMemcpy                     __HIP_API_SPT(hipMemcpy)
+    #define hipMemcpyToSymbol             __HIP_API_SPT(hipMemcpyToSymbol)
+    #define hipMemcpyFromSymbol           __HIP_API_SPT(hipMemcpyFromSymbol)
+    #define hipMemcpy2D                   __HIP_API_SPT(hipMemcpy2D)
+    #define hipMemcpy2DFromArray          __HIP_API_SPT(hipMemcpy2DFromArray)
+    #define hipMemcpy3D                   __HIP_API_SPT(hipMemcpy3D)
+    #define hipMemset                     __HIP_API_SPT(hipMemset)
+    #define hipMemset2D                   __HIP_API_SPT(hipMemset2D)
+    #define hipMemset3D                   __HIP_API_SPT(hipMemset3D)
+    #define hipMemcpyAsync                __HIP_API_SPT(hipMemcpyAsync)
+    #define hipMemset3DAsync              __HIP_API_SPT(hipMemset3DAsync)
+    #define hipMemset2DAsync              __HIP_API_SPT(hipMemset2DAsync)
+    #define hipMemsetAsync                __HIP_API_SPT(hipMemsetAsync)
+    #define hipMemcpy3DAsync              __HIP_API_SPT(hipMemcpy3DAsync)
+    #define hipMemcpy2DAsync              __HIP_API_SPT(hipMemcpy2DAsync)
+    #define hipMemcpyFromSymbolAsync      __HIP_API_SPT(hipMemcpyFromSymbolAsync)
+    #define hipMemcpyToSymbolAsync        __HIP_API_SPT(hipMemcpyToSymbolAsync)
+    #define hipMemcpyFromArray            __HIP_API_SPT(hipMemcpyFromArray)
+    #define hipMemcpy2DToArray            __HIP_API_SPT(hipMemcpy2DToArray)
+    #define hipMemcpy2DFromArrayAsync     __HIP_API_SPT(hipMemcpy2DFromArrayAsync)
+    #define hipMemcpy2DToArrayAsync       __HIP_API_SPT(hipMemcpy2DToArrayAsync)
+
+    // Stream APIs
+    #define hipStreamSynchronize          __HIP_API_SPT(hipStreamSynchronize)
+    #define hipStreamQuery                __HIP_API_SPT(hipStreamQuery)
+    #define hipStreamGetFlags             __HIP_API_SPT(hipStreamGetFlags)
+    #define hipStreamGetPriority          __HIP_API_SPT(hipStreamGetPriority)
+    #define hipStreamWaitEvent            __HIP_API_SPT(hipStreamWaitEvent)
+    #define hipStreamAddCallback          __HIP_API_SPT(hipStreamAddCallback)
+    #define hipLaunchHostFunc             __HIP_API_SPT(hipLaunchHostFunc)
+
+    // Event APIs
+    #define hipEventRecord               __HIP_API_SPT(hipEventRecord)
+
+    // Launch APIs
+    #define hipLaunchKernel               __HIP_API_SPT(hipLaunchKernel)
+    #define hipLaunchCooperativeKernel    __HIP_API_SPT(hipLaunchCooperativeKernel)
+
+    // Graph APIs
+    #define hipGraphLaunch                __HIP_API_SPT(hipGraphLaunch)
+    #define hipStreamBeginCapture         __HIP_API_SPT(hipStreamBeginCapture)
+    #define hipStreamEndCapture           __HIP_API_SPT(hipStreamEndCapture)
+    #define hipStreamIsCapturing          __HIP_API_SPT(hipStreamIsCapturing)
+    #define hipStreamGetCaptureInfo       __HIP_API_SPT(hipStreamGetCaptureInfo)
+    #define hipStreamGetCaptureInfo_v2    __HIP_API_SPT(hipStreamGetCaptureInfo_v2)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// "_spt" memory entry points. When HIP_API_PER_THREAD_DEFAULT_STREAM is defined,
+// the unsuffixed API names are macro-redirected to these declarations.
+
+// Linear copies, including copies to and from device symbols.
+hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+
+hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t sizeBytes,
+                             size_t offset, hipMemcpyKind kind);
+
+hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol,size_t sizeBytes,
+                               size_t offset, hipMemcpyKind kind);
+
+// 2D and 3D copies.
+hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+                        size_t height, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DFromArray_spt( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
+                        size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
+
+hipError_t hipMemcpy3D_spt(const struct hipMemcpy3DParms* p);
+
+// Memset variants; the Async forms take an explicit stream argument.
+hipError_t hipMemset_spt(void* dst, int value, size_t sizeBytes);
+
+hipError_t hipMemsetAsync_spt(void* dst, int value, size_t sizeBytes, hipStream_t stream);
+
+hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height);
+
+hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value,
+                            size_t width, size_t height, hipStream_t stream);
+
+hipError_t hipMemset3DAsync_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream);
+
+hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent );
+
+// Asynchronous linear copy on the given stream.
+hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+                          hipStream_t stream);
+
+// "_spt" counterpart of hipMemcpy3DAsync. The parameter is spelled with the
+// struct keyword, matching hipMemcpy3D_spt, so the declaration does not rely on
+// a typedef being visible when this header is compiled as C.
+hipError_t hipMemcpy3DAsync_spt(const struct hipMemcpy3DParms* p, hipStream_t stream);
+
+// "_spt" counterparts of the remaining memory-copy APIs: 2D asynchronous,
+// symbol asynchronous, and array copies. The Async forms take an explicit
+// stream argument.
+hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+                            size_t height, hipMemcpyKind kind, hipStream_t stream);
+
+hipError_t hipMemcpyFromSymbolAsync_spt(void* dst, const void* symbol, size_t sizeBytes,
+                                    size_t offset, hipMemcpyKind kind, hipStream_t stream);
+
+hipError_t hipMemcpyToSymbolAsync_spt(const void* symbol, const void* src, size_t sizeBytes,
+                                  size_t offset, hipMemcpyKind kind, hipStream_t stream);
+
+hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset,
+                                  size_t count, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DToArray_spt(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+                                  size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
+
+hipError_t hipMemcpy2DFromArrayAsync_spt(void* dst, size_t dpitch, hipArray_const_t src,
+                                  size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height,
+                                  hipMemcpyKind kind, hipStream_t stream);
+
+hipError_t hipMemcpy2DToArrayAsync_spt(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+                                  size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
+                                  hipStream_t stream);
+
+// "_spt" counterparts of the stream query, synchronization and callback APIs.
+hipError_t hipStreamQuery_spt(hipStream_t stream);
+
+hipError_t hipStreamSynchronize_spt(hipStream_t stream);
+
+hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority);
+
+hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags);
+
+hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags);
+
+hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback, void* userData,
+                                unsigned int flags);
+// Event record: the C++ declaration defaults the stream argument to NULL;
+// C has no default arguments, so the C declaration requires it explicitly.
+#ifdef __cplusplus
+hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream = NULL);
+#else
+hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream);
+#endif
+
+// "_spt" counterparts of the kernel launch APIs.
+// Note the differing shared-memory size types: uint32_t for the cooperative
+// launch, size_t for the regular launch.
+hipError_t hipLaunchCooperativeKernel_spt(const void* f,
+                                      dim3 gridDim, dim3 blockDim,
+                                      void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream);
+
+hipError_t hipLaunchKernel_spt(const void* function_address,
+                           dim3 numBlocks,
+                           dim3 dimBlocks,
+                           void** args,
+                           size_t sharedMemBytes, hipStream_t stream);
+
+// "_spt" counterparts of the graph launch, stream capture and host-function APIs.
+hipError_t hipGraphLaunch_spt(hipGraphExec_t graphExec, hipStream_t stream);
+hipError_t hipStreamBeginCapture_spt(hipStream_t stream, hipStreamCaptureMode mode);
+hipError_t hipStreamEndCapture_spt(hipStream_t stream, hipGraph_t* pGraph);
+hipError_t hipStreamIsCapturing_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
+hipError_t hipStreamGetCaptureInfo_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
+                                   unsigned long long* pId);
+hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
+                                      unsigned long long* id_out, hipGraph_t* graph_out,
+                                      const hipGraphNode_t** dependencies_out,
+                                      size_t* numDependencies_out);
+hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userData);
+
+
+#ifdef __cplusplus
+}
+#endif // extern "C"
+
+#endif //(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
+#endif //HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
@@ -0,0 +1,570 @@
+/*
+Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+
+/**
+ * @brief Unsafe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation currently only behaves differently on the gfx90a and
+ * gfx940 targets. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 4 bytes aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and is not supported.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+#if defined(__gfx940__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f32)
+  // gfx940: hand the add straight to the flat FP32 atomic-add builtin.
+  return __builtin_amdgcn_flat_atomic_fadd_f32(addr, value);
+#elif defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_is_shared) &&      \
+    __has_builtin(__builtin_amdgcn_is_private) &&                              \
+    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) &&                      \
+    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
+  // gfx90a: pick the instruction from the address space addr resolves to.
+  if (__builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void*)addr)) {
+    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
+  }
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    // Private address: performed as a plain load / add / store.
+    float previous = *addr;
+    *addr = previous + value;
+    return previous;
+  }
+  return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  // Other targets: relaxed fetch-add at agent scope.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  // Last resort: generic atomic builtin (no HIP memory-scope argument).
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Unsafe floating point rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write floating point atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if greater.
+ *
+ * @note This operation is currently identical to that performed by
+ * atomicMax and is included for completeness.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicMax(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not below val.
+  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed < val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 32-bit pattern with the generic __atomic builtins.
+  unsigned int* bits_addr = (unsigned int*)addr;
+  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__uint_as_float(observed_bits) < val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __uint_as_float(observed_bits);
+#endif
+}
+
+/**
+ * @brief Unsafe floating point rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write floating point atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if lesser.
+ *
+ * @note This operation is currently identical to that performed by
+ * atomicMin and is included for completeness.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float unsafeAtomicMin(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not above val.
+  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed > val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 32-bit pattern with the generic __atomic builtins.
+  unsigned int* bits_addr = (unsigned int*)addr;
+  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__uint_as_float(observed_bits) > val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __uint_as_float(observed_bits);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation currently only behaves differently on the gfx90a and
+ * gfx940 targets. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and are not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if (defined(__gfx90a__) || defined(__gfx940__)) &&                              \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
+  // gfx90a / gfx940: hand the add to the flat FP64 atomic-add builtin.
+  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  // Builtins are not preprocessor macros, so their availability must be probed with
+  // __has_builtin; a defined() test is always false and would skip this branch.
+  // Relaxed fetch-add at agent scope, matching the float overload.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  // Last resort: generic atomic builtin (no HIP memory-scope argument).
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if greater.
+ *
+ * @note This operation currently only behaves differently on the gfx90a and
+ * gfx940 targets. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and are not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMax(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx940__)) &&                              \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
+  // gfx90a / gfx940: hand the max to the flat FP64 atomic-max builtin.
+  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
+#elif __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not below val.
+  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed < val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 64-bit pattern with the generic __atomic builtins.
+  unsigned long long* bits_addr = (unsigned long long*)addr;
+  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__longlong_as_double(observed_bits) < val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __longlong_as_double(observed_bits);
+#endif
+}
+
+/**
+ * @brief Unsafe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if lesser.
+ *
+ * @note This operation currently only behaves differently on the gfx90a and
+ * gfx940 targets. Other devices continue to use safe atomics.
+ *
+ * It can be used to generate code that uses fast hardware floating point atomic
+ * operations which may handle rounding and subnormal values differently than
+ * non-atomic floating point operations.
+ *
+ * The operation is not always safe and can have undefined behavior unless
+ * the following conditions are met:
+ *
+ * - \p addr is at least 8 byte aligned
+ * - If \p addr is a global segment address, it is in a coarse grain allocation.
+ * Passing in global segment addresses in fine grain allocations will result in
+ * undefined behavior and are not supported.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double unsafeAtomicMin(double* addr, double val) {
+#if (defined(__gfx90a__) || defined(__gfx940__)) &&                              \
+    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
+  // gfx90a / gfx940: hand the min to the flat FP64 atomic-min builtin.
+  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
+#elif __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not above val.
+  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed > val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 64-bit pattern with the generic __atomic builtins.
+  unsigned long long* bits_addr = (unsigned long long*)addr;
+  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__longlong_as_double(observed_bits) > val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __longlong_as_double(observed_bits);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write floating point atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+#if defined(__gfx908__) ||                                                    \
+    (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add))
+  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
+  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
+  float old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_load)
+  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
+#endif // __has_builtin(__hip_atomic_load)
+  float expected, temp;
+  do {
+    // temp remembers the value the exchange was attempted against.
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    // The generic __atomic_*_n builtins are specified for integer and pointer operands,
+    // so the exchange is done on the 32-bit pattern, mirroring the __atomic_load_n
+    // fallback above.
+    unsigned int expected_bits = __float_as_uint(expected);
+    __atomic_compare_exchange_n(reinterpret_cast<unsigned int*>(addr), &expected_bits,
+                                __float_as_uint(old_val + value), false,
+                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    expected = __uint_as_float(expected_bits);
+#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
+    // On failure the exchange stored the current memory contents in expected.
+    old_val = expected;
+    // Bitwise comparison: the loop ends only when the exchange saw exactly temp.
+  } while (__float_as_uint(temp) != __float_as_uint(old_val));
+  return old_val;
+#elif defined(__gfx90a__)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. This logic is only applicable for gfx90a, and should
+  // not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write floating point atomic max with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicMax(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not below val.
+  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed < val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 32-bit pattern with the generic __atomic builtins.
+  unsigned int* bits_addr = (unsigned int*)addr;
+  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__uint_as_float(observed_bits) < val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __uint_as_float(observed_bits);
+#endif
+}
+
+/**
+ * @brief Safe floating point rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write floating point atomic min with
+ * device memory scope. The original value at \p addr is returned and
+ * the value at \p addr is replaced by \p val if lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated
+ * @param [in] val Value used to update the value at \p addr.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline float safeAtomicMin(float* addr, float val) {
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not above val.
+  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed > val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 32-bit pattern with the generic __atomic builtins.
+  unsigned int* bits_addr = (unsigned int*)addr;
+  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__uint_as_float(observed_bits) > val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __uint_as_float(observed_bits);
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic add.
+ *
+ * Performs a relaxed read-modify-write double precision atomic add with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated to have the original value plus \p value
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be incremented by \p value.
+ * @param [in] value Value by which \p addr is to be incremented.
+ * @return Original value contained in \p addr.
+ */
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if (defined(__gfx90a__) || defined(__gfx940__)) &&                                                    \
+    __has_builtin(__hip_atomic_fetch_add)
+  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
+  // atomics will produce safe CAS loops, but are otherwise not different than
+  // agent-scope atomics. The condition above applies the same treatment to gfx940.
+  // This logic should not be assumed on other architectures.
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#elif defined(__gfx90a__)
+  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
+  // force a CAS loop here.
+  double old_val;
+#if __has_builtin(__hip_atomic_load)
+  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_load)
+  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
+#endif // __has_builtin(__hip_atomic_load)
+  double expected, temp;
+  do {
+    // temp remembers the value the exchange was attempted against.
+    temp = expected = old_val;
+#if __has_builtin(__hip_atomic_compare_exchange_strong)
+    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
+    // The generic __atomic_*_n builtins are specified for integer and pointer operands,
+    // so the exchange is done on the 64-bit pattern, mirroring the __atomic_load_n
+    // fallback above.
+    unsigned long long expected_bits =
+        static_cast<unsigned long long>(__double_as_longlong(expected));
+    __atomic_compare_exchange_n(reinterpret_cast<unsigned long long*>(addr), &expected_bits,
+                                static_cast<unsigned long long>(__double_as_longlong(old_val + value)),
+                                false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    expected = __longlong_as_double(static_cast<long long>(expected_bits));
+#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
+    // On failure the exchange stored the current memory contents in expected.
+    old_val = expected;
+    // Bitwise comparison: the loop ends only when the exchange saw exactly temp.
+  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
+  return old_val;
+#else // neither of the branches above was selected
+#if __has_builtin(__hip_atomic_fetch_add)
+  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+#else  // !__has_builtin(__hip_atomic_fetch_add)
+  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
+#endif // __has_builtin(__hip_atomic_fetch_add)
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic max.
+ *
+ * Performs a relaxed read-modify-write double precision atomic max with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if greater.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMax(double* addr, double val) {
+#if __has_builtin(__builtin_amdgcn_is_private)
+  // Private address: performed as a plain load / fmax / store, then return early.
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    double previous = *addr;
+    *addr = __builtin_fmax(previous, val);
+    return previous;
+  }
+#endif
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not below val.
+  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed < val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 64-bit pattern with the generic __atomic builtins.
+  unsigned long long* bits_addr = (unsigned long long*)addr;
+  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__longlong_as_double(observed_bits) < val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __longlong_as_double(observed_bits);
+#endif
+}
+
+/**
+ * @brief Safe double precision rmw atomic min.
+ *
+ * Performs a relaxed read-modify-write double precision atomic min with
+ * device memory scope. Original value at \p addr is returned and
+ * the value of \p addr is updated with \p val if lesser.
+ *
+ * @note This operation ensures that, on all targets, we produce safe atomics.
+ * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
+ *
+ * @param [in,out] addr Pointer to value to be updated.
+ * @param [in] val Value used to update the contents at \p addr
+ * @return Original value contained at \p addr.
+ */
+__device__ inline double safeAtomicMin(double* addr, double val) {
+#if __has_builtin(__builtin_amdgcn_is_private)
+  // Private address: performed as a plain load / fmin / store, then return early.
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
+    double previous = *addr;
+    *addr = __builtin_fmin(previous, val);
+    return previous;
+  }
+#endif
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
+  // Compare-and-swap loop at agent scope; stops once the stored value is not above val.
+  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  while (observed > val) {
+    // A failed exchange refreshes `observed`, so the comparison is re-evaluated.
+    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
+                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
+      break;
+    }
+  }
+  return observed;
+#else
+  // Fallback: the same loop on the raw 64-bit pattern with the generic __atomic builtins.
+  unsigned long long* bits_addr = (unsigned long long*)addr;
+  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
+  while (__longlong_as_double(observed_bits) > val) {
+    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
+                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+  return __longlong_as_double(observed_bits);
+#endif
+}
+
+#pragma clang diagnostic pop
+#endif
@@ -0,0 +1,362 @@
+/*
+Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
+
+#if defined(__cplusplus)
+
+#include <hip/surface_types.h>
+#include <hip/hip_vector_types.h>
+#include <hip/amd_detail/ockl_image.h>
+
+// Declares the local `i` by casting the enclosing function's `surfObj` to a pointer to
+// unsigned int in ADDRESS_SPACE_CONSTANT. The expansion already ends with a semicolon.
+#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
+
+// Trait: `value` is true when T is one of char, unsigned char, short, unsigned short,
+// int, unsigned int or float.
+template<typename T>
+struct __hip_is_isurf_channel_type
+{
+    static constexpr bool value =
+        (std::is_same<T, char>::value || std::is_same<T, unsigned char>::value) ||
+        (std::is_same<T, short>::value || std::is_same<T, unsigned short>::value) ||
+        (std::is_same<T, int>::value || std::is_same<T, unsigned int>::value) ||
+        std::is_same<T, float>::value;
+};
+
+// Specialization for HIP_vector_type<T, rank>: true when the element type T passes the
+// scalar trait and rank is 1, 2, 3 or 4.
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_is_isurf_channel_type<HIP_vector_type<T, rank>>
+{
+    static constexpr bool value =
+        __hip_is_isurf_channel_type<T>::value && (rank >= 1) && (rank <= 4);
+};
+
+// CUDA addresses surfaces by byte offset; HIP needs a pixel index, so x is converted here
+// using the image's channel type (`format`) and channel order (`order`).
+static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
+    // Adjustment per hsa_ext_image_channel_type_t value. An entry of 3 means
+    // "divide x by 3"; every other entry is a right-shift count.
+    static const int FormatLUT[] = {
+        0,  //  0: HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8
+        1,  //  1: HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16
+        0,  //  2: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8
+        1,  //  3: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16
+        3,  //  4: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24
+        1,  //  5: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+        1,  //  6: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+        1,  //  7: HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010
+        0,  //  8: HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+        1,  //  9: HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+        2,  // 10: HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+        0,  // 11: HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+        1,  // 12: HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+        2,  // 13: HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+        1,  // 14: HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+        2   // 15: HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT
+    };
+    const int formatAdjust = FormatLUT[format];
+    if (formatAdjust == 3) {
+        x = x / formatAdjust;
+    } else {
+        x = x >> formatAdjust;
+    }
+
+    // Adjustment per hsa_ext_image_channel_order_t value, with the same encoding as above.
+    static const int OrderLUT[] = {
+        0,  //  0: HSA_EXT_IMAGE_CHANNEL_ORDER_A
+        0,  //  1: HSA_EXT_IMAGE_CHANNEL_ORDER_R
+        1,  //  2: HSA_EXT_IMAGE_CHANNEL_ORDER_RX
+        1,  //  3: HSA_EXT_IMAGE_CHANNEL_ORDER_RG
+        3,  //  4: HSA_EXT_IMAGE_CHANNEL_ORDER_RGX
+        1,  //  5: HSA_EXT_IMAGE_CHANNEL_ORDER_RA
+        3,  //  6: HSA_EXT_IMAGE_CHANNEL_ORDER_RGB
+        2,  //  7: HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX
+        2,  //  8: HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA
+        2,  //  9: HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA
+        2,  // 10: HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB
+        2,  // 11: HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR
+        3,  // 12: HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB
+        2,  // 13: HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX
+        2,  // 14: HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA
+        2,  // 15: HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA
+        0,  // 16: HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY
+        0,  // 17: HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE
+        0,  // 18: HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH
+        0   // 19: HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL
+    };
+    const int orderAdjust = OrderLUT[order];
+    if (orderAdjust == 3) {
+        return x / orderAdjust;
+    }
+    return x >> orderAdjust;
+}
+
+// Scalar overload: converts t to float and stores it in lane x; the remaining lanes are
+// not assigned.
+template <
+    typename T,
+    typename std::enable_if<std::is_scalar<T>::value>::type* = nullptr>
+static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
+    float4::Native_vec_ native;
+    native.x = static_cast<float>(t);
+    return native;
+}
+
+// One-component vector overload: copies t.x into lane x; the remaining lanes are not
+// assigned.
+template <
+    typename T,
+    typename std::enable_if<!std::is_scalar<T>::value && sizeof(T) / sizeof(typename T::value_type) == 1>::type* = nullptr>
+static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
+    float4::Native_vec_ native;
+    native.x = static_cast<float>(t.x);
+    return native;
+}
+
+// Two-component vector overload: copies t.x and t.y; lanes z and w are not assigned.
+template <
+    typename T,
+    typename std::enable_if<!std::is_scalar<T>::value && sizeof(T) / sizeof(typename T::value_type) == 2>::type* = nullptr>
+static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
+    float4::Native_vec_ native;
+    native.x = static_cast<float>(t.x);
+    native.y = static_cast<float>(t.y);
+    return native;
+}
+
+// Three-component vector overload: copies t.x, t.y and t.z; lane w is not assigned.
+template <
+    typename T,
+    typename std::enable_if<!std::is_scalar<T>::value && sizeof(T) / sizeof(typename T::value_type) == 3>::type* = nullptr>
+static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
+    float4::Native_vec_ native;
+    native.x = static_cast<float>(t.x);
+    native.y = static_cast<float>(t.y);
+    native.z = static_cast<float>(t.z);
+    return native;
+}
+
+// Four-component vector overload: copies all of t.x, t.y, t.z and t.w as floats.
+template <
+    typename T,
+    typename std::enable_if<!std::is_scalar<T>::value && sizeof(T) / sizeof(typename T::value_type) == 4>::type* = nullptr>
+static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
+    float4::Native_vec_ native;
+    native.x = static_cast<float>(t.x);
+    native.y = static_cast<float>(t.y);
+    native.z = static_cast<float>(t.z);
+    native.w = static_cast<float>(t.w);
+    return native;
+}
+
+// Scalar overload: returns lane x of u converted to T.
+template<typename T>
+static __HOST_DEVICE__ __forceinline__
+typename std::enable_if<std::is_scalar<T>::value, const T>::type
+__hipMapFromNativeFloat4(const float4::Native_vec_& u) {
+    T converted;
+    converted = static_cast<T>(u.x);
+    return converted;
+}
+
+// One-component vector overload: fills x from lane x of u, converted to T::value_type.
+template<typename T>
+static __HOST_DEVICE__ __forceinline__
+typename std::enable_if<!std::is_scalar<T>::value && sizeof(T) / sizeof(typename T::value_type) == 1, const T>::type
+__hipMapFromNativeFloat4(const float4::Native_vec_& u) {
+    T converted;
+    converted.x = static_cast<typename T::value_type>(u.x);
+    return converted;
+}
+
+// Narrows a native float4 to a two-component vector type T (x and y are assigned).
+template <typename T>
+static __HOST_DEVICE__ __forceinline__
+typename std::enable_if<!std::is_scalar<T>::value &&
+                        sizeof(T) / sizeof(typename T::value_type) == 2, const T>::type
+__hipMapFromNativeFloat4(const float4::Native_vec_& u) {
+    using Channel = typename T::value_type;
+    T packed;
+    packed.x = static_cast<Channel>(u.x);
+    packed.y = static_cast<Channel>(u.y);
+    return packed;
+}
+
+// Narrows a native float4 to a three-component vector type T (x, y and z are assigned).
+template <typename T>
+static __HOST_DEVICE__ __forceinline__
+typename std::enable_if<!std::is_scalar<T>::value &&
+                        sizeof(T) / sizeof(typename T::value_type) == 3, const T>::type
+__hipMapFromNativeFloat4(const float4::Native_vec_& u) {
+    using Channel = typename T::value_type;
+    T packed;
+    packed.x = static_cast<Channel>(u.x);
+    packed.y = static_cast<Channel>(u.y);
+    packed.z = static_cast<Channel>(u.z);
+    return packed;
+}
+
+// Narrows a native float4 to a four-component vector type T, converting every lane.
+template <typename T>
+static __HOST_DEVICE__ __forceinline__
+typename std::enable_if<!std::is_scalar<T>::value &&
+                        sizeof(T) / sizeof(typename T::value_type) == 4, const T>::type
+__hipMapFromNativeFloat4(const float4::Native_vec_& u) {
+    using Channel = typename T::value_type;
+    T packed;
+    packed.x = static_cast<Channel>(u.x);
+    packed.y = static_cast<Channel>(u.y);
+    packed.z = static_cast<Channel>(u.z);
+    packed.w = static_cast<Channel>(u.w);
+    return packed;
+}
+
+// Loads one element from a 1D surface and stores it, converted to T, through `data`.
+// `x` is first translated by __hipGetPixelAddr using the image's channel data type and order.
+// `boundaryMode` is accepted for API compatibility and is not referenced directly in this body.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int boundaryMode = hipBoundaryModeZero) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
+                          __ockl_image_channel_order_1D(i));
+    auto texel = __ockl_image_load_1D(i, x);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a 1D surface at `x`
+// (after `x` has been translated by __hipGetPixelAddr).
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
+                          __ockl_image_channel_order_1D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_1D(i, x, texel);
+}
+
+// Loads one element from a 2D surface at (x, y) and stores it, converted to T, through `data`.
+// Only `x` is translated by __hipGetPixelAddr; `y` is used as given.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int y) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __ockl_image_load_2D(i, int2(x, y).data);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a 2D surface at (x, y).
+// Only `x` is translated by __hipGetPixelAddr; `y` is used as given.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                   int y) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_2D(i, int2(x, y).data, texel);
+}
+
+// Loads one element from a 3D surface at (x, y, z) and stores it, converted to T, through `data`.
+// The coordinate is passed to the loader as an int4 whose fourth component is 0.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int y, int z) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i),
+                          __ockl_image_channel_order_3D(i));
+    auto texel = __ockl_image_load_3D(i, int4(x, y, z, 0).data);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a 3D surface at (x, y, z).
+// The coordinate is passed to the store as an int4 whose fourth component is 0.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                   int y, int z) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i),
+                          __ockl_image_channel_order_3D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_3D(i, int4(x, y, z, 0).data, texel);
+}
+
+// Loads one element from a layered 1D surface and stores it, converted to T, through `data`.
+// `layer` is forwarded as the last argument of __ockl_image_load_lod_1D.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj,
+                                                         int x, int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
+                          __ockl_image_channel_order_1D(i));
+    auto texel = __ockl_image_load_lod_1D(i, x, layer);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a layered 1D surface.
+// `layer` is forwarded to __ockl_image_store_lod_1D after the x coordinate.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj,
+                                                          int x, int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
+                          __ockl_image_channel_order_1D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_lod_1D(i, x, layer, texel);
+}
+
+// Loads one element from a layered 2D surface at (x, y) and stores it, converted to T,
+// through `data`. `layer` is forwarded as the last argument of __ockl_image_load_lod_2D.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj,
+                                                         int x, int y, int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __ockl_image_load_lod_2D(i, int2(x, y).data, layer);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a layered 2D surface at (x, y).
+// `layer` is forwarded to __ockl_image_store_lod_2D after the coordinate.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj,
+                                                          int x, int y, int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_lod_2D(i, int2(x, y).data, layer, texel);
+}
+
+// Loads one element from a cubemap surface at (x, y) on `face` and stores it, converted to T,
+// through `data`.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj,
+                                                       int x, int y, int face) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __ockl_image_load_CM(i, int2(x, y).data, face);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts `data` to the native float4 form and stores it into a cubemap surface at (x, y)
+// on `face`.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj,
+                                                        int x, int y, int face) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __hipMapToNativeFloat4(data);
+    __ockl_image_store_CM(i, int2(x, y).data, face, texel);
+}
+
+// Loads one element from a layered cubemap surface at (x, y) on `face` / `layer` and stores it,
+// converted to T, through `data`.
+template <typename T,
+          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data,
+                                                              hipSurfaceObject_t surfObj, int x,
+                                                              int y, int face, int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
+                          __ockl_image_channel_order_2D(i));
+    auto texel = __ockl_image_load_lod_CM(i, int2(x, y).data, face, layer);
+    *data = __hipMapFromNativeFloat4<T>(texel);
+}
+
+// Converts the element pointed to by `data` to the native float4 form and stores it into a
+// layered cubemap surface at (x, y) on `face` / `layer`.
+//
+// Unlike the other surf*write functions this one takes the element by pointer. The pointee is
+// what must be converted: passing the pointer itself to __hipMapToNativeFloat4 selects the
+// scalar overload (pointers satisfy std::is_scalar) and then fails on static_cast<float>.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
+        int layer) {
+    __HIP_SURFACE_OBJECT_PARAMETERS_INIT
+    x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
+    auto tmp = __hipMapToNativeFloat4(*data);
+    __ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp);
+}
+
+#endif
+#endif
@@ -0,0 +1,503 @@
+/*
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
+
+// Save the current clang diagnostic state and silence the pedantic warnings listed below for
+// the rest of this header; the `#pragma clang diagnostic pop` at the end restores the state.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+
+// Unsigned wrapper around __builtin_amdgcn_ds_bpermute: the bits of `src` are reinterpreted
+// as int for the builtin and the result is reinterpreted back to unsigned.
+__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = src;
+    const int permuted = __builtin_amdgcn_ds_bpermute(index, bits.i);
+    bits.i = permuted;
+    return bits.u;
+}
+
+// Float wrapper around __builtin_amdgcn_ds_bpermute: the bits of `src` are reinterpreted
+// as int for the builtin and the result is reinterpreted back to float.
+__device__ static inline float __hip_ds_bpermutef(int index, float src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = src;
+    const int permuted = __builtin_amdgcn_ds_bpermute(index, bits.i);
+    bits.i = permuted;
+    return bits.f;
+}
+
+// Unsigned wrapper around __builtin_amdgcn_ds_permute: the bits of `src` are reinterpreted
+// as int for the builtin and the result is reinterpreted back to unsigned.
+__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = src;
+    const int permuted = __builtin_amdgcn_ds_permute(index, bits.i);
+    bits.i = permuted;
+    return bits.u;
+}
+
+// Float wrapper around __builtin_amdgcn_ds_permute: the bits of `src` are reinterpreted
+// as int for the builtin and the result is reinterpreted back to float.
+__device__ static inline float __hip_ds_permutef(int index, float src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = src;
+    const int permuted = __builtin_amdgcn_ds_permute(index, bits.i);
+    bits.i = permuted;
+    return bits.f;
+}
+
+// Function-like front ends for the swizzle templates below: `pattern` is moved into a template
+// argument, so it must be a compile-time constant expression.
+#define __hip_ds_swizzle(src, pattern)  __hip_ds_swizzle_N<(pattern)>((src))
+#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
+
+// Unsigned wrapper around __builtin_amdgcn_ds_swizzle with the swizzle pattern supplied as a
+// template argument; `src` is bit-reinterpreted to int and the result back to unsigned.
+template <int pattern>
+__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = src;
+    const int swizzled = __builtin_amdgcn_ds_swizzle(bits.i, pattern);
+    bits.i = swizzled;
+    return bits.u;
+}
+
+// Float wrapper around __builtin_amdgcn_ds_swizzle with the swizzle pattern supplied as a
+// template argument; `src` is bit-reinterpreted to int and the result back to float.
+template <int pattern>
+__device__ static inline float __hip_ds_swizzlef_N(float src) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = src;
+    const int swizzled = __builtin_amdgcn_ds_swizzle(bits.i, pattern);
+    bits.i = swizzled;
+    return bits.f;
+}
+
+// Function-like front end for __hip_move_dpp_N: every argument except `src` becomes a template
+// argument and therefore must be a compile-time constant expression.
+#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
+  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
+
+// Forwards `src` and the four compile-time control values to __builtin_amdgcn_mov_dpp and
+// returns the builtin's result unchanged.
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ static inline int __hip_move_dpp_N(int src) {
+    const int moved = __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+    return moved;
+}
+
+// Wavefront width, taken from the compiler-provided __AMDGCN_WAVEFRONT_SIZE macro. Used as the
+// default `width` argument of the __shfl* functions in this header.
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
+// Returns `var` as held by lane `src_lane` of the calling lane's `width`-wide segment.
+//
+// The wavefront is treated as consecutive segments of `width` lanes; the masks below assume
+// `width` is a power of two. `src_lane` is interpreted relative to the start of the caller's
+// segment and is reduced modulo `width`, so an out-of-range `src_lane` wraps around inside the
+// segment instead of selecting a lane that belongs to a different segment.
+// The lane index is scaled by 4 before being handed to __builtin_amdgcn_ds_bpermute.
+__device__
+inline
+int __shfl(int var, int src_lane, int width = warpSize) {
+    int self = __lane_id();
+    // (src_lane & (width-1)) == src_lane modulo width for power-of-two widths.
+    int index = (src_lane & (width-1)) + (self & ~(width-1));
+    return __builtin_amdgcn_ds_bpermute(index<<2, var);
+}
+// unsigned int overload of __shfl: shuffles the bit pattern through the int overload.
+__device__ inline unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = var;
+    const int shuffled = __shfl(bits.i, src_lane, width);
+    bits.i = shuffled;
+    return bits.u;
+}
+// float overload of __shfl: shuffles the bit pattern through the int overload.
+__device__ inline float __shfl(float var, int src_lane, int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = var;
+    const int shuffled = __shfl(bits.i, src_lane, width);
+    bits.i = shuffled;
+    return bits.f;
+}
+// double overload of __shfl: the value is split into two 32-bit words, each word is shuffled
+// with the int overload, and the words are reassembled (word 1 is the upper half).
+__device__ inline double __shfl(double var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl(words[0], src_lane, width);
+    words[1] = __shfl(words[1], src_lane, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    double result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// long overload of __shfl. Without _MSC_VER, long is asserted to be 64 bits wide and is
+// shuffled as two 32-bit words; with _MSC_VER it is asserted to be int-sized and forwarded
+// to the int overload.
+__device__ inline long __shfl(long var, int src_lane, int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl(words[0], src_lane, width);
+    words[1] = __shfl(words[1], src_lane, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(long) == sizeof(int), "");
+    const int narrow = static_cast<int>(var);
+    return static_cast<long>(__shfl(narrow, src_lane, width));
+#endif
+}
+// unsigned long overload of __shfl. Without _MSC_VER the value is asserted to be 64 bits wide
+// and is shuffled as two 32-bit words; with _MSC_VER it is asserted to be unsigned-int-sized
+// and forwarded to the unsigned int overload.
+__device__ inline unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl(words[0], src_lane, width);
+    words[1] = __shfl(words[1], src_lane, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    const unsigned int narrow = static_cast<unsigned int>(var);
+    return static_cast<unsigned long>(__shfl(narrow, src_lane, width));
+#endif
+}
+// long long overload of __shfl: shuffled as two 32-bit words through the int overload and
+// reassembled (word 1 is the upper half).
+__device__ inline long long __shfl(long long var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl(words[0], src_lane, width);
+    words[1] = __shfl(words[1], src_lane, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// unsigned long long overload of __shfl: shuffled as two 32-bit words through the unsigned int
+// overload and reassembled (word 1 is the upper half).
+__device__ inline unsigned long long __shfl(unsigned long long var, int src_lane,
+                                            int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl(words[0], src_lane, width);
+    words[1] = __shfl(words[1], src_lane, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+
+// Returns `var` as held by the lane `lane_delta` positions below the caller. If that lane would
+// fall before the start of the caller's `width`-wide segment, the caller's own lane is used
+// instead. The segment mask assumes `width` is a power of two.
+__device__ inline int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
+    const int lane = __lane_id();
+    const int segment_base = lane & ~(width - 1);
+    int source = lane - lane_delta;
+    if (source < segment_base) {
+        source = lane;
+    }
+    // The lane index is scaled by 4 before being handed to the builtin.
+    return __builtin_amdgcn_ds_bpermute(source << 2, var);
+}
+// unsigned int overload of __shfl_up: shuffles the bit pattern through the int overload.
+__device__ inline unsigned int __shfl_up(unsigned int var, unsigned int lane_delta,
+                                         int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = var;
+    const int shuffled = __shfl_up(bits.i, lane_delta, width);
+    bits.i = shuffled;
+    return bits.u;
+}
+// float overload of __shfl_up: shuffles the bit pattern through the int overload.
+__device__ inline float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = var;
+    const int shuffled = __shfl_up(bits.i, lane_delta, width);
+    bits.i = shuffled;
+    return bits.f;
+}
+// double overload of __shfl_up: split into two 32-bit words, each shuffled with the int
+// overload, then reassembled (word 1 is the upper half).
+__device__ inline double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    double result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// long overload of __shfl_up. Without _MSC_VER, long is asserted to be 64 bits wide and is
+// shuffled as two 32-bit words; with _MSC_VER it is asserted to be int-sized and forwarded
+// to the int overload.
+__device__ inline long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(long) == sizeof(int), "");
+    const int narrow = static_cast<int>(var);
+    return static_cast<long>(__shfl_up(narrow, lane_delta, width));
+#endif
+}
+
+// unsigned long overload of __shfl_up. Without _MSC_VER the value is asserted to be 64 bits
+// wide and is shuffled as two 32-bit words; with _MSC_VER it is asserted to be
+// unsigned-int-sized and forwarded to the unsigned int overload.
+__device__ inline unsigned long __shfl_up(unsigned long var, unsigned int lane_delta,
+                                          int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    const unsigned int narrow = static_cast<unsigned int>(var);
+    return static_cast<unsigned long>(__shfl_up(narrow, lane_delta, width));
+#endif
+}
+
+// long long overload of __shfl_up: shuffled as two 32-bit words through the int overload and
+// reassembled (word 1 is the upper half).
+__device__ inline long long __shfl_up(long long var, unsigned int lane_delta,
+                                      int width = warpSize) {
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+
+// unsigned long long overload of __shfl_up: shuffled as two 32-bit words through the unsigned
+// int overload and reassembled (word 1 is the upper half).
+__device__ inline unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta,
+                                               int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+
+// Returns `var` as held by the lane `lane_delta` positions above the caller. If the caller's
+// offset inside its `width`-wide segment plus `lane_delta` reaches `width`, the caller's own
+// lane is used instead. The segment mask assumes `width` is a power of two.
+__device__ inline int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
+    const int lane = __lane_id();
+    const int offset_after_shift = static_cast<int>((lane & (width - 1)) + lane_delta);
+    int source = lane + lane_delta;
+    if (offset_after_shift >= width) {
+        source = lane;
+    }
+    // The lane index is scaled by 4 before being handed to the builtin.
+    return __builtin_amdgcn_ds_bpermute(source << 2, var);
+}
+// unsigned int overload of __shfl_down: shuffles the bit pattern through the int overload.
+__device__ inline unsigned int __shfl_down(unsigned int var, unsigned int lane_delta,
+                                           int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = var;
+    const int shuffled = __shfl_down(bits.i, lane_delta, width);
+    bits.i = shuffled;
+    return bits.u;
+}
+// float overload of __shfl_down: shuffles the bit pattern through the int overload.
+__device__ inline float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = var;
+    const int shuffled = __shfl_down(bits.i, lane_delta, width);
+    bits.i = shuffled;
+    return bits.f;
+}
+// double overload of __shfl_down: split into two 32-bit words, each shuffled with the int
+// overload, then reassembled (word 1 is the upper half).
+__device__ inline double __shfl_down(double var, unsigned int lane_delta,
+                                     int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    double result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// long overload of __shfl_down. Without _MSC_VER, long is asserted to be 64 bits wide and is
+// shuffled as two 32-bit words; with _MSC_VER it is asserted to be int-sized and forwarded
+// to the int overload.
+__device__ inline long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(long) == sizeof(int), "");
+    const int narrow = static_cast<int>(var);
+    return static_cast<long>(__shfl_down(narrow, lane_delta, width));
+#endif
+}
+// unsigned long overload of __shfl_down. Without _MSC_VER the value is asserted to be 64 bits
+// wide and is shuffled as two 32-bit words; with _MSC_VER it is asserted to be
+// unsigned-int-sized and forwarded to the unsigned int overload.
+__device__ inline unsigned long __shfl_down(unsigned long var, unsigned int lane_delta,
+                                            int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    const unsigned int narrow = static_cast<unsigned int>(var);
+    return static_cast<unsigned long>(__shfl_down(narrow, lane_delta, width));
+#endif
+}
+// long long overload of __shfl_down: shuffled as two 32-bit words through the int overload and
+// reassembled (word 1 is the upper half).
+__device__ inline long long __shfl_down(long long var, unsigned int lane_delta,
+                                        int width = warpSize) {
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// unsigned long long overload of __shfl_down: shuffled as two 32-bit words through the unsigned
+// int overload and reassembled (word 1 is the upper half).
+__device__ inline unsigned long long __shfl_down(unsigned long long var,
+                                                 unsigned int lane_delta,
+                                                 int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+
+// Returns `var` as held by the lane whose index is the caller's lane XOR `lane_mask`. If that
+// lane lies at or beyond the end of the caller's `width`-wide segment, the caller's own lane
+// is used instead. The segment mask assumes `width` is a power of two.
+__device__ inline int __shfl_xor(int var, int lane_mask, int width = warpSize) {
+    const int lane = __lane_id();
+    const int segment_end = (lane + width) & ~(width - 1);
+    const int partner = lane ^ lane_mask;
+    const int source = (partner >= segment_end) ? lane : partner;
+    // The lane index is scaled by 4 before being handed to the builtin.
+    return __builtin_amdgcn_ds_bpermute(source << 2, var);
+}
+// unsigned int overload of __shfl_xor: shuffles the bit pattern through the int overload.
+__device__ inline unsigned int __shfl_xor(unsigned int var, int lane_mask,
+                                          int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.u = var;
+    const int shuffled = __shfl_xor(bits.i, lane_mask, width);
+    bits.i = shuffled;
+    return bits.u;
+}
+// float overload of __shfl_xor: shuffles the bit pattern through the int overload.
+__device__ inline float __shfl_xor(float var, int lane_mask, int width = warpSize) {
+    union {
+        int i;
+        unsigned u;
+        float f;
+    } bits;
+    bits.f = var;
+    const int shuffled = __shfl_xor(bits.i, lane_mask, width);
+    bits.i = shuffled;
+    return bits.f;
+}
+// double overload of __shfl_xor: split into two 32-bit words, each shuffled with the int
+// overload, then reassembled (word 1 is the upper half).
+__device__ inline double __shfl_xor(double var, int lane_mask, int width = warpSize) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    double result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// long overload of __shfl_xor. Without _MSC_VER, long is asserted to be 64 bits wide and is
+// shuffled as two 32-bit words; with _MSC_VER it is asserted to be int-sized and forwarded
+// to the int overload.
+__device__ inline long __shfl_xor(long var, int lane_mask, int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(long) == sizeof(int), "");
+    const int narrow = static_cast<int>(var);
+    return static_cast<long>(__shfl_xor(narrow, lane_mask, width));
+#endif
+}
+// unsigned long overload of __shfl_xor. Without _MSC_VER the value is asserted to be 64 bits
+// wide and is shuffled as two 32-bit words; with _MSC_VER it is asserted to be
+// unsigned-int-sized and forwarded to the unsigned int overload.
+__device__ inline unsigned long __shfl_xor(unsigned long var, int lane_mask,
+                                           int width = warpSize) {
+#ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+#else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    const unsigned int narrow = static_cast<unsigned int>(var);
+    return static_cast<unsigned long>(__shfl_xor(narrow, lane_mask, width));
+#endif
+}
+// long long overload of __shfl_xor: shuffled as two 32-bit words through the int overload and
+// reassembled (word 1 is the upper half).
+__device__ inline long long __shfl_xor(long long var, int lane_mask, int width = warpSize) {
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+// unsigned long long overload of __shfl_xor: shuffled as two 32-bit words through the unsigned
+// int overload and reassembled (word 1 is the upper half).
+__device__ inline unsigned long long __shfl_xor(unsigned long long var, int lane_mask,
+                                                int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    const uint64_t upper = static_cast<uint64_t>(words[1]) << 32ull;
+    const uint64_t lower = static_cast<uint32_t>(words[0]);
+    const uint64_t joined = upper | lower;
+    unsigned long long result;
+    __builtin_memcpy(&result, &joined, sizeof(joined));
+    return result;
+}
+
+#pragma clang diagnostic pop
+#endif
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// Annotation macros used purely as in-source documentation of template constraints.
+// Preprocessor macros are not scoped by namespaces, so both definitions below are visible to
+// every translation unit that includes this header, not only inside hip_impl.
+namespace hip_impl  // Documentation only.
+{
+// Expands to nothing: a `requires(...)` annotation is discarded by the preprocessor.
+// NOTE(review): `requires` is a keyword from C++20 onwards; confirm this macro is acceptable
+// for every language standard this header is compiled with.
+#define requires(...)
+
+// Lets a template parameter be introduced as `FunctionalProcedure F`; it is plain `typename`.
+#define FunctionalProcedure typename
+}  // namespace hip_impl
@@ -0,0 +1,131 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/device_library_decls.h
+ *  @brief Contains declarations for types and functions in device library.
+ *         Uses int64_t and uint64_t instead of long, long long, unsigned
+ *         long and unsigned long long types for device library API
+ *         declarations.
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
+
+#include "hip/amd_detail/host_defines.h"
+
+// Short aliases for the unsigned integer types used in the declarations below.
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef unsigned long long ullong;
+
+// Device-library (__ockl_* / __ocml_*) entry points. Only declarations live
+// here; the group descriptions below are read off the symbol names and
+// signatures -- TODO confirm exact semantics against the device-library docs.
+
+// Wavefront any/all predicates and an active-lane query (judging by the names).
+extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
+extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
+extern "C" __device__ uint __ockl_activelane_u32(void);
+
+// 32-bit integer multiply helpers (mul24, mul_hi) and sadd.
+extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
+
+// clz for 8/16/32/64-bit unsigned operands; each returns its operand type.
+extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
+extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
+extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
+extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
+
+// Single-precision floor / rint / ceil / trunc.
+extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
+
+// Single-precision fmin / fmax.
+extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+
+// Conversions named cvt<mode>_<dst>_<src>; the rtn/rtp/rtz suffixes presumably
+// select the rounding mode (toward -inf / +inf / zero) -- TODO confirm.
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
+
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
+
+// GWS init/barrier entry points; declared convergent.
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
+
+// Lane id plus grid / multi-grid queries and synchronisation. The *_sync
+// functions are declared convergent, the queries const.
+extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
+extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
+extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+
+// Float atomic add that returns nothing.
+extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
+
+// wgred add / and / or on int (presumably work-group reductions -- TODO confirm).
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
+
+// Device-side fprintf(stderr) message construction: begin, then append
+// arguments / strings. Each call takes and/or returns a 64-bit message
+// descriptor, and the append calls carry an is_last flag.
+extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
+extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
+                                                          uint64_t value0, uint64_t value1,
+                                                          uint64_t value2, uint64_t value3,
+                                                          uint64_t value4, uint64_t value5,
+                                                          uint64_t value6, uint32_t is_last);
+extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
+                                                              uint64_t length, uint32_t is_last);
+
+// Introduce local address space
+#define __local __attribute__((address_space(3)))
+
+#ifdef __HIP_DEVICE_COMPILE__
+// Reinterprets the integer x as a pointer into address space 3 (see __local).
+// Only compiled when __HIP_DEVICE_COMPILE__ is defined.
+__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
+#endif //__HIP_DEVICE_COMPILE__
+
+// Using hip.amdgcn.bc - sync threads
+// Fence flag value and the flag type; the names suggest an OpenCL-style
+// memory-fence argument -- TODO confirm against the consumers.
+#define __CLK_LOCAL_MEM_FENCE    0x01
+typedef unsigned __cl_mem_fence_flags;
+
+#endif
@@ -0,0 +1,218 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+#include "program_state.hpp"
+#include "hip_runtime_api.h"
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+// Launch entry points that take the program state explicitly as their last
+// parameter. The inline hipLaunchCooperativeKernel and
+// hipLaunchCooperativeKernelMultiDevice wrappers further down in this header
+// fetch the state via hip_impl::get_program_state() and forward to the
+// matching overloads declared here.
+hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                               unsigned int flags, hip_impl::program_state& ps);
+
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
+                                    dim3 blockDim, void** args,
+                                    size_t sharedMem, hipStream_t stream,
+                                    hip_impl::program_state& ps);
+
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices,
+                                                 unsigned int flags,
+                                                 hip_impl::program_state& ps);
+
+#pragma GCC visibility push(hidden)
+
+namespace hip_impl {
+// Rounds x up to the nearest multiple of y (x itself when it already is one).
+// Enabled for integral T only. As the name says, it is meant for non-negative
+// operands; y is used as a divisor.
+template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+inline T round_up_to_next_multiple_nonnegative(T x, T y) {
+    const T padded = x + y - 1;
+    const T overshoot = padded % y;
+    return padded - overshoot;
+}
+
+// Recursion terminator: selected when n equals the number of tuple elements,
+// i.e. every argument has been appended. Returns the buffer unchanged.
+template <
+    std::size_t n,
+    typename... Ts,
+    typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>&,
+    const kernargs_size_align&,
+    hip_impl::kernarg kernarg) {
+    return kernarg;
+}
+
+// Recursive step: appends element n of `formals` to the kernarg buffer and
+// recurses with n + 1; the n == sizeof...(Ts) overload ends the recursion.
+// Sizes and alignments come from `size_align`, not from sizeof/alignof of T.
+template <
+    std::size_t n,
+    typename... Ts,
+    typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>& formals,
+    const kernargs_size_align& size_align,
+    hip_impl::kernarg kernarg) {
+    using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
+
+    static_assert(
+        !std::is_reference<T>{},
+        "A __global__ function cannot have a reference as one of its "
+            "arguments.");
+    // The trivially-copyable requirement is only enforced under HIP_STRICT.
+    #if defined(HIP_STRICT)
+        static_assert(
+            std::is_trivially_copyable<T>{},
+            "Only TriviallyCopyable types can be arguments to a __global__ "
+                "function");
+    #endif
+
+    // Grow the buffer: pad the current size up to this argument's alignment,
+    // then add room for the argument itself.
+    kernarg.resize(round_up_to_next_multiple_nonnegative(
+        kernarg.size(), size_align.alignment(n)) + size_align.size(n));
+
+    // The argument occupies the last size(n) bytes of the resized buffer.
+    std::memcpy(
+        kernarg.data() + kernarg.size() - size_align.size(n),
+        &std::get<n>(formals),
+        size_align.size(n));
+    return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
+}
+
+// Entry point: builds the kernarg buffer for `kernel` from the actual
+// arguments. The actuals are first converted to the kernel's formal parameter
+// types, then packed one by one using the size/alignment table that the
+// program state holds for this kernel's address.
+// Returns an empty kernarg for a kernel that takes no parameters.
+template <typename... Formals, typename... Actuals>
+inline hip_impl::kernarg make_kernarg(
+    void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
+    static_assert(sizeof...(Formals) == sizeof...(Actuals),
+        "The count of formal arguments must match the count of actuals.");
+
+    if (sizeof...(Formals) == 0) return {};
+
+    // Converting tuple construction: each actual is converted to its formal type.
+    std::tuple<Formals...> to_formals{std::move(actuals)};
+    hip_impl::kernarg kernarg;
+    // Capacity hint only; the packing step resizes the buffer as it goes.
+    kernarg.reserve(sizeof(to_formals));
+
+    auto& ps = hip_impl::get_program_state();
+    return make_kernarg<0>(to_formals, 
+                           ps.get_kernargs_size_align(
+                               reinterpret_cast<std::uintptr_t>(kernel)),
+                           std::move(kernarg));
+}
+
+
+// Returns the HSA agent associated with `stream` (defined elsewhere).
+HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
+
+// Looks up the kernel descriptor for `function_address` on the agent of
+// `stream` and launches it through hipModuleLaunchKernel. `kernarg` is passed
+// in the "extra" slot of hipModuleLaunchKernel; the kernelParams slot is
+// nullptr.
+// NOTE(review): the hipError_t returned by hipModuleLaunchKernel is discarded,
+// so a failed launch is not reported to the caller from here.
+inline
+__attribute__((visibility("hidden")))
+void hipLaunchKernelGGLImpl(
+    std::uintptr_t function_address,
+    const dim3& numBlocks,
+    const dim3& dimBlocks,
+    std::uint32_t sharedMemBytes,
+    hipStream_t stream,
+    void** kernarg) {
+
+    const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address, 
+                                                               target_agent(stream));
+
+    hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
+                          dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
+                          stream, nullptr, kernarg);
+}
+} // Namespace hip_impl.
+
+
+// Kernel-pointer front end for hipModuleOccupancyMaxPotentialBlockSize: the
+// kernel's address is resolved to a kernel descriptor on the agent of stream 0
+// and the query is forwarded unchanged.
+template <class T>
+inline
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
+    using namespace hip_impl;
+
+    hip_impl::hip_init();
+
+    auto& program = get_program_state();
+    const auto address = reinterpret_cast<std::uintptr_t>(kernel);
+    auto descriptor = program.kernel_descriptor(address, target_agent(0));
+
+    return hipModuleOccupancyMaxPotentialBlockSize(
+        gridSize, blockSize, descriptor, dynSharedMemPerBlk, blockSizeLimit);
+}
+
+// Same query as hipOccupancyMaxPotentialBlockSize, with a flags argument.
+// Only hipOccupancyDefault is accepted; any other value yields
+// hipErrorNotSupported (after the runtime has been initialised).
+template <class T>
+inline
+hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int  flags = 0 ) {
+    using namespace hip_impl;
+
+    hip_impl::hip_init();
+
+    if (flags != hipOccupancyDefault) {
+        return hipErrorNotSupported;
+    }
+
+    auto& program = get_program_state();
+    const auto address = reinterpret_cast<std::uintptr_t>(kernel);
+    auto descriptor = program.kernel_descriptor(address, target_agent(0));
+
+    return hipModuleOccupancyMaxPotentialBlockSize(
+        gridSize, blockSize, descriptor, dynSharedMemPerBlk, blockSizeLimit);
+}
+
+// Launches `kernel` with the given grid/block geometry, dynamic shared memory
+// size and stream. The arguments are packed into a kernarg buffer and handed
+// to the launch through a HIP_LAUNCH_PARAM_* configuration array.
+template <typename... Args, typename F = void (*)(Args...)>
+inline
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                        std::uint32_t sharedMemBytes, hipStream_t stream,
+                        Args... args) {
+    hip_impl::hip_init();
+    auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
+    // Kept in a local because the config array stores a pointer to the size.
+    std::size_t kernarg_size = kernarg.size();
+
+    // Key/value list: buffer pointer, pointer to buffer size, end marker.
+    void* config[]{
+        HIP_LAUNCH_PARAM_BUFFER_POINTER,
+        kernarg.data(),
+        HIP_LAUNCH_PARAM_BUFFER_SIZE,
+        &kernarg_size,
+        HIP_LAUNCH_PARAM_END};
+
+    hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
+                                     numBlocks, dimBlocks, sharedMemBytes,
+                                     stream, &config[0]);
+}
+
+// Typed-kernel convenience overload: initialises the runtime, fetches the
+// program state and forwards to the overload that takes the kernel as an
+// untyped pointer plus an explicit program_state.
+template <typename F>
+inline
+__attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
+                                      void** args, size_t sharedMem,
+                                      hipStream_t stream) {
+    hip_impl::hip_init();
+
+    auto& state = hip_impl::get_program_state();
+    void* const entry = reinterpret_cast<void*>(f);
+
+    return hipLaunchCooperativeKernel(entry, gridDim, blockDim, args, sharedMem, stream, state);
+}
+
+// Convenience overload without an explicit program_state: initialises the
+// runtime and forwards to the four-argument overload with the current state.
+inline
+__attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices,
+                                                 unsigned int  flags) {
+    hip_impl::hip_init();
+
+    return hipLaunchCooperativeKernelMultiDevice(
+        launchParamsList, numDevices, flags, hip_impl::get_program_state());
+}
+
+#pragma GCC visibility pop
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <hc_defines.h>
+
+#define GRID_LAUNCH_VERSION 20
+
+// Extern definitions
+namespace hc{
+class completion_future;
+class accelerator_view;
+}
+
+
+// 3 dim structure for groups and grids.
+typedef struct gl_dim3
+{
+  int x,y,z;
+  gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
+} gl_dim3;
+
+// Barrier-bit setting for a launch; stored in grid_launch_parm::barrier_bit.
+// Enumerators take the implicit values 0, 1 and 2.
+typedef enum gl_barrier_bit {
+    barrier_bit_queue_default,
+    barrier_bit_none,
+    barrier_bit_wait,
+} gl_barrier_bit;
+
+
+// grid_launch_parm contains information used to launch the kernel.
+// Launch descriptor. The defaulted constructor default-constructs both
+// gl_dim3 members (1,1,1) and leaves the scalar and pointer members
+// uninitialised when the object is default-initialised.
+typedef struct grid_launch_parm
+{
+  //! Grid dimensions
+  gl_dim3      grid_dim;
+
+  //! Group dimensions
+  gl_dim3      group_dim;
+
+  //! Amount of dynamic group memory to use with the kernel launch.
+  //! This memory is in addition to the amount used statically in the kernel.
+  unsigned int  dynamic_group_mem_bytes;
+
+  //! Control setting of barrier bit on per-packet basis:
+  //! See gl_barrier_bit description.
+  //! Placeholder, is not used to control packet dispatch yet
+  enum gl_barrier_bit barrier_bit;
+
+  //! Value of packet fences to apply to launch.
+  //! These correspond to the values of bits 9:14 in the AQL packet,
+  //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
+  unsigned int  launch_fence;
+
+  //! Pointer to the accelerator_view where the kernel should execute.
+  //! If NULL, the default view on the default accelerator is used.
+  hc::accelerator_view  *av;
+
+  //! Pointer to the completion_future used to track the status of the command.
+  //! If NULL, the command does not write status.  In this case,
+  //! synchronization can be enforced with queue-level waits or
+  //! waiting on younger commands.
+  hc::completion_future *cf;
+
+  grid_launch_parm() = default;
+} grid_launch_parm;
+
+
+// Declared here only; no definition is visible in this header.
+// NOTE(review): the C++ helper header defines `grid_launch_init`, with the
+// words in a different order -- confirm whether both names are intended.
+extern void init_grid_launch(grid_launch_parm *gl);
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "grid_launch.h"
+#include "hc.hpp"
+
+// grid_launch_parm with custom (de)serialisation hooks: only the six
+// grid/group dimension ints are serialised, in the order
+// grid x, y, z and then group x, y, z.
+class grid_launch_parm_cxx : public grid_launch_parm
+{
+public:
+  grid_launch_parm_cxx() = default;
+
+  // customized serialization: don't need av and cf in kernel
+  __attribute__((annotate("serialize")))
+  void __cxxamp_serialize(Kalmar::Serialize& s) const {
+    s.Append(sizeof(int), &grid_dim.x);
+    s.Append(sizeof(int), &grid_dim.y);
+    s.Append(sizeof(int), &grid_dim.z);
+    s.Append(sizeof(int), &group_dim.x);
+    s.Append(sizeof(int), &group_dim.y);
+    s.Append(sizeof(int), &group_dim.z);
+  }
+
+  // Deserialising constructor; its parameters mirror the Append order above.
+  // Only grid_dim and group_dim are assigned here; the remaining inherited
+  // members (dynamic_group_mem_bytes, barrier_bit, launch_fence, av, cf) are
+  // not set by this constructor.
+  __attribute__((annotate("user_deserialize")))
+  grid_launch_parm_cxx(int grid_dim_x,  int grid_dim_y,  int grid_dim_z,
+                   int group_dim_x, int group_dim_y, int group_dim_z) {
+    grid_dim.x  = grid_dim_x;
+    grid_dim.y  = grid_dim_y;
+    grid_dim.z  = grid_dim_z;
+    group_dim.x = group_dim_x;
+    group_dim.y = group_dim_y;
+    group_dim.z = group_dim_z;
+  }
+};
+
+
+// Resets *lp to launch defaults: 1x1x1 grid and group, no dynamic group
+// memory, the queue-default barrier bit, launch_fence set from -1 (wraps to
+// the largest unsigned value), the default accelerator's default view, and no
+// completion_future.
+extern inline void grid_launch_init(grid_launch_parm *lp) {
+  lp->grid_dim.z = 1;
+  lp->grid_dim.y = 1;
+  lp->grid_dim.x = 1;
+
+  lp->group_dim.z = 1;
+  lp->group_dim.y = 1;
+  lp->group_dim.x = 1;
+
+  lp->dynamic_group_mem_bytes = 0;
+
+  lp->barrier_bit = barrier_bit_queue_default;
+  lp->launch_fence = -1;
+
+  // TODO - set to NULL?
+  // One view object shared by every call; created the first time this runs.
+  static hc::accelerator_view default_view = hc::accelerator().get_default_view();
+  lp->av = &default_view;
+  lp->cf = nullptr;
+}
+
@@ -0,0 +1,26 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#if GENERIC_GRID_LAUNCH == 1
+#include "macro_based_grid_launch.hpp"
+#endif  // GENERIC_GRID_LAUNCH
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "concepts.hpp"
+
+#include <functional>   // For std::reference_wrapper.
+#include <type_traits>  // For std::conditional, std::decay, std::enable_if,
+                        // std::false_type, std::result_of and std::true_type.
+#include <utility>      // For std::declval.
+
+// Feature detection. When <version> is available, HIP_HAS_INVOCABLE and
+// HIP_HAS_RESULT_OF_SFINAE are taken from the corresponding __cpp_lib_*
+// macros unless the user already defined them; otherwise both fall back to 0.
+#ifdef __has_include                      // Check if __has_include is present
+#  if __has_include(<version>)            // Check for version header
+#    include <version>
+#    if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
+#       define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
+#    endif
+#    if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
+#       define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
+#    endif
+#  endif
+#endif
+
+#ifndef HIP_HAS_INVOCABLE
+#define HIP_HAS_INVOCABLE 0
+#endif
+
+#ifndef HIP_HAS_RESULT_OF_SFINAE
+#define HIP_HAS_RESULT_OF_SFINAE 0
+#endif
+
+// Backports of the C++14 `_t` alias templates for builds that report an older
+// __cplusplus. The inner check (< 201402L) is the one that decides; the outer
+// check (< 201406L) is implied by it.
+// NOTE(review): adding declarations to namespace std is not permitted by the
+// standard, which is presumably why the TODO below asks for their removal.
+namespace std {  // TODO: these should be removed as soon as possible.
+#if (__cplusplus < 201406L)
+#if (__cplusplus < 201402L)
+template <bool cond, typename T = void>
+using enable_if_t = typename enable_if<cond, T>::type;
+template <bool cond, typename T, typename U>
+using conditional_t = typename conditional<cond, T, U>::type;
+template <typename T>
+using decay_t = typename decay<T>::type;
+template <FunctionalProcedure F, typename... Ts>
+using result_of_t = typename result_of<F(Ts...)>::type;
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+#endif
+#endif
+}  // namespace std
+
+namespace hip_impl {
+// Maps any list of types to void; used below to detect whether an expression
+// is well-formed (detection idiom).
+template <typename...>
+using void_t_ = void;
+
+// is_callable_impl<F(Ts...)> answers "can an F be invoked with Ts...?".
+// Three implementations, chosen by the feature macros:
+//   1. std::is_invocable when the library provides it;
+//   2. SFINAE on std::result_of when that is SFINAE-friendly;
+//   3. otherwise a hand-written overload set (simple_invoke) whose trailing
+//      return types are well-formed only for valid invocations.
+#if HIP_HAS_INVOCABLE
+template <typename, typename = void>
+struct is_callable_impl;
+
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
+#elif HIP_HAS_RESULT_OF_SFINAE
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type > > : std::true_type {};
+#else
+// simple_invoke is declared but never defined: it is only used inside
+// decltype. Overloads, in order: pointer-to-member-data applied to an object,
+// to a pointer-like and to a std::reference_wrapper; pointer-to-member-function
+// applied to the same three; and finally a plain call f(xs...).
+template <class Base, class T, class Derived>
+auto simple_invoke(T Base::*pmd, Derived&& ref)
+-> decltype(static_cast<Derived&&>(ref).*pmd);
+ 
+template <class PMD, class Pointer>
+auto simple_invoke(PMD&& pmd, Pointer&& ptr)
+-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
+
+template <class Base, class T, class Derived>
+auto simple_invoke(T Base::*pmd, const std::reference_wrapper<Derived>& ref)
+-> decltype(ref.get().*pmd);
+ 
+template <class Base, class T, class Derived, class... Args>
+auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)
+-> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));
+ 
+template <class PMF, class Pointer, class... Args>
+auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
+-> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
+
+template <class Base, class T, class Derived, class... Args>
+auto simple_invoke(T Base::*pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
+-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
+
+template<class F, class... Ts>
+auto simple_invoke(F&& f, Ts&&... xs) 
+-> decltype(f(static_cast<Ts&&>(xs)...));
+
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+
+// Chosen only when some simple_invoke overload accepts (F, Ts...).
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
+    : std::true_type {};
+
+#endif
+
+// Public trait: is_callable<F(Ts...)>::value.
+template <typename Call>
+struct is_callable : is_callable_impl<Call> {};
+
+// Macro "overloading" by argument count.
+// count_macro_args_hip_(...) expands to the number of arguments it was given
+// (0 to 31): the arguments shift a descending literal list so that the right
+// number lands in the _n slot. The leading empty argument together with
+// `, ##__VA_ARGS__` (a GNU extension) makes the zero-argument case yield 0.
+#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,     \
+                                   _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25,     \
+                                   _26, _27, _28, _29, _30, _31, _n, ...)                          \
+    _n
+#define count_macro_args_hip_(...)                                                                 \
+    count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20,    \
+                               19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,  \
+                               0)
+
+// overload_macro_hip_(macro, args...) pastes the argument count onto `macro`
+// and invokes the result, e.g. three arguments call macro3(args...). The extra
+// _impl_ level lets the count be expanded before token pasting.
+#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
+#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
+#define overload_macro_hip_(macro, ...)                                                            \
+    overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
+}  // namespace hip_impl
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/hip_cooperative_groups_helper.h
+ *
+ *  @brief Device side implementation of cooperative group feature.
+ *
+ *  Defines helper constructs and APIs which aid the types and device API
+ *  wrappers defined within `amd_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/amd_device_functions.h>
+#endif
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(_CG_STATIC_CONST_DECL_)
+#define _CG_STATIC_CONST_DECL_ static constexpr
+#endif
+
+// Integer type used for lane masks: 32 bits wide when the wavefront size is
+// 32, otherwise unsigned long long.
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+using lane_mask = unsigned int;
+#else
+using lane_mask = unsigned long long int;
+#endif
+
+namespace cooperative_groups {
+
+/* Global scope */
+// True when `size` is a power of two. Zero is rejected explicitly: the bit
+// test (size & (size - 1)) == 0 also holds for 0, which is not a power of two.
+template <unsigned int size>
+using is_power_of_2 = std::integral_constant<bool, (size != 0) && ((size & (size - 1)) == 0)>;
+
+// True when `size` does not exceed the compile-time wavefront size.
+template <unsigned int size>
+using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
+
+// A tile size is valid when it satisfies both is_power_of_2 and
+// is_valid_wavefront.
+template <unsigned int size>
+using is_valid_tile_size =
+    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
+
+// True for integral and floating-point types.
+template <typename T>
+using is_valid_type =
+    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ *
+ *  Enumerators take the implicit values 0 (cg_invalid) through
+ *  5 (cg_coalesced_group).
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid,
+  cg_workgroup,
+  cg_tiled_group,
+  cg_coalesced_group
+} group_type;
+
+/**
+ *  Functionalities related to multi-grid cooperative group type
+ */
+// Thin wrappers over the __ockl_multi_grid_* device-library calls; each one
+// only converts the result to the type used by the cooperative-groups API.
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
+
+// The device library returns an int; it is narrowed to bool here.
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
+
+}  // namespace multi_grid
+
+/**
+ *  Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+// Total thread count of the grid: the product over x, y and z of
+// blockDim * gridDim, converted to uint32_t.
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
+                    (blockDim.x * gridDim.x));
+}
+
+// Linear rank of the calling thread within the grid: the threads of all
+// workgroups that precede this one, plus the thread's rank inside its own
+// workgroup. x varies fastest, then y, then z.
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute global id of the workgroup to which the current thread belongs to
+  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
+                               (blockIdx.y * gridDim.x) + (blockIdx.x));
+
+  // Compute total number of threads being passed to reach current workgroup
+  // within grid
+  uint32_t num_threads_till_current_workgroup =
+      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
+
+  // Compute thread local rank within current workgroup
+  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+                                          (threadIdx.y * blockDim.x) + (threadIdx.x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+// Wraps __ockl_grid_is_valid(); the int result is narrowed to bool.
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
+
+// Wraps __ockl_grid_sync().
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
+
+}  // namespace grid
+
+/**
+ *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
+ *  cooperative group type
+ */
+namespace workgroup {
+
+// blockIdx as a dim3.
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
+               static_cast<uint32_t>(blockIdx.z)));
+}
+
+// threadIdx as a dim3.
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
+               static_cast<uint32_t>(threadIdx.z)));
+}
+
+// Number of threads in the workgroup: blockDim.x * blockDim.y * blockDim.z.
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
+}
+
+// Linear rank of the calling thread inside its workgroup; x varies fastest,
+// then y, then z.
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+                     (threadIdx.y * blockDim.x) + (threadIdx.x)));
+}
+
+// Always reports the workgroup as valid.
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  // TODO(mahesha) any functionality need to be added here? I believe not
+  return true;
+}
+
+// Workgroup barrier via __syncthreads().
+__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
+
+}  // namespace workgroup
+
+namespace tiled_group {
+
+// Enforce ordering for memory instructions: issues an acquire-release fence
+// with the "agent" scope string. No barrier call is made here.
+__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
+
+}  // namespace tiled_group
+
+namespace coalesced_group {
+
+// Enforce ordering for memory instructions: issues an acquire-release fence
+// with the "agent" scope string.
+__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
+
+// Masked bit count
+//
+// For each thread, this function returns the number of active threads which
+// have i-th bit of x set and come before the current thread.
+// `add` is forwarded as the second operand of the first mbcnt builtin, so the
+// returned count starts from it.
+__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
+  unsigned int counter = 0;
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+  counter = __builtin_amdgcn_mbcnt_lo(x, add);
+#else
+  // lane_mask is 64 bits wide here. The low word goes to mbcnt_lo and the
+  // high word to mbcnt_hi, so both narrowings are spelled out explicitly.
+  counter = __builtin_amdgcn_mbcnt_lo(static_cast<unsigned int>(x), add);
+  counter = __builtin_amdgcn_mbcnt_hi(static_cast<unsigned int>(x >> 32), counter);
+#endif
+
+  return counter;
+}
+
+}  // namespace coalesced_group
+
+
+}  // namespace internal
+
+}  // namespace cooperative_groups
+
+#pragma clang diagnostic pop
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
@@ -0,0 +1,254 @@
+#pragma once
+
+#if defined(__cplusplus)
+    #include <cstring>
+#endif
+
+// Plain aggregate holding the raw bit pattern of one half value. The
+// conversion routines in this header fill and read `x` directly.
+struct __half_raw {
+    unsigned short x;  // raw bits
+};
+
+// Plain aggregate holding the raw bit patterns of a pair of half values.
+struct __half2_raw {
+    unsigned short x;  // raw bits of the first element
+    unsigned short y;  // raw bits of the second element
+};
+
+#if defined(__cplusplus)
+    struct __half;
+
+    // Declared ahead of struct __half because its converting members call
+    // these functions; the inline definitions appear later in this header.
+    __half __float2half(float);
+    float __half2float(__half);
+
+    // BEGIN STRUCT __HALF
+    // Half value stored as a raw bit pattern in an unsigned short. Conversions
+    // from float/double go through __float2half and are compiled out when
+    // __HIP_NO_HALF_CONVERSIONS__ is defined; conversion to float goes through
+    // __half2float.
+    struct __half {
+    protected:
+        unsigned short __x;  // raw bits
+    public:
+        // CREATORS
+        __half() = default;
+        __half(const __half_raw& x) : __x{x.x} {}
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+            __half(float x) : __x{__float2half(x).__x} {}
+            // Narrow to float explicitly, matching operator=(double) below,
+            // instead of relying on an implicit double -> float conversion.
+            __half(double x) : __x{__float2half(static_cast<float>(x)).__x} {}
+        #endif
+        __half(const __half&) = default;
+        __half(__half&&) = default;
+        ~__half() = default;
+
+        // MANIPULATORS
+        __half& operator=(const __half&) = default;
+        __half& operator=(__half&&) = default;
+        __half& operator=(const __half_raw& x) { __x = x.x; return *this; }
+        #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+            __half& operator=(float x)
+            {
+                __x = __float2half(x).__x;
+                return *this;
+            }
+            // Delegates to operator=(float) after narrowing.
+            __half& operator=(double x)
+            {
+                return *this = static_cast<float>(x);
+            }
+        #endif
+
+        // ACCESSORS
+        operator float() const { return __half2float(*this); }
+        operator __half_raw() const { return __half_raw{__x}; }
+    };
+    // END STRUCT __HALF
+
+    // BEGIN STRUCT __HALF2
+    // Pair of __half values. Conversions to and from __half2_raw move the raw
+    // bits through __half_raw values rather than reinterpreting the storage of
+    // one type as another, which avoids type punning through unrelated types.
+    struct __half2 {
+    public:
+        __half x;
+        __half y;
+
+        // CREATORS
+        __half2() = default;
+        // Builds each element from the corresponding raw bits.
+        __half2(const __half2_raw& ix)
+            :
+            x(__half_raw{ix.x}),
+            y(__half_raw{ix.y})
+        {}
+        __half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
+        __half2(const __half2&) = default;
+        __half2(__half2&&) = default;
+        ~__half2() = default;
+
+        // MANIPULATORS
+        __half2& operator=(const __half2&) = default;
+        __half2& operator=(__half2&&) = default;
+        // Assigns each element through __half::operator=(const __half_raw&).
+        __half2& operator=(const __half2_raw& ix)
+        {
+            x = __half_raw{ix.x};
+            y = __half_raw{ix.y};
+            return *this;
+        }
+
+        // ACCESSORS
+        // Extracts the raw bits of both elements via __half's conversion to
+        // __half_raw.
+        operator __half2_raw() const
+        {
+            return __half2_raw{
+                static_cast<__half_raw>(x).x,
+                static_cast<__half_raw>(y).x};
+        }
+    };
+    // END STRUCT __HALF2
+
+    // Converts the bits of `flt` to a half bit pattern without rounding.
+    //
+    // Outputs:
+    //   sgn - the sign of `flt`, already positioned at bit 15 (0 or 0x8000).
+    //   rem - what was dropped by the conversion, for the caller to round on;
+    //         the overflow case reports 0x80000000 and NaN/Inf report 0.
+    // Returns the truncated half bits, including the sign.
+    inline
+    unsigned short __internal_float2half(
+        float flt, unsigned int& sgn, unsigned int& rem)
+    {
+        unsigned int bits{};
+        std::memcpy(&bits, &flt, sizeof(flt));
+
+        const unsigned int magnitude = (bits & 0x7fffffffU);
+        sgn = ((bits >> 16) & 0x8000U);
+
+        unsigned int packed = 0;
+        if (magnitude >= 0x7f800000U) {
+            // NaN/+Inf/-Inf: infinities keep their sign, every NaN maps to 0x7fff.
+            rem = 0;
+            packed = (magnitude == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU;
+        } else if (magnitude > 0x477fefffU) {
+            // Overflows: clamp to the largest finite pattern.
+            rem = 0x80000000U;
+            packed = sgn | 0x7bffU;
+        } else if (magnitude >= 0x38800000U) {
+            // Normal numbers: the 13 dropped mantissa bits go to the top of rem.
+            rem = magnitude << 19;
+            packed = sgn | ((magnitude - 0x38000000U) >> 13);
+        } else if (magnitude < 0x33000001U) {
+            // +0/-0, and magnitudes that truncate to zero.
+            rem = magnitude;
+            packed = sgn;
+        } else {
+            // Denormal numbers: restore the implicit leading one, then shift
+            // the mantissa into place according to the exponent.
+            const unsigned int shift = 0x7eU - (magnitude >> 23);
+            const unsigned int mantissa = (magnitude & 0x7fffffU) | 0x800000U;
+            rem = mantissa << (32 - shift);
+            packed = sgn | (mantissa >> shift);
+        }
+        return static_cast<unsigned short>(packed);
+    }
+
+    // float -> half, rounding to nearest with ties going to the even pattern:
+    // the truncated result is bumped by one when the remainder is above the
+    // halfway mark, or exactly halfway with the low result bit set.
+    inline
+    __half __float2half(float x)
+    {
+        unsigned int sign_bits{};
+        unsigned int remainder{};
+        __half_raw raw;
+        raw.x = __internal_float2half(x, sign_bits, remainder);
+
+        const bool above_half = remainder > 0x80000000U;
+        const bool tie_on_odd = (remainder == 0x80000000U) && (raw.x & 0x1);
+        if (above_half || tie_on_odd) {
+            ++raw.x;
+        }
+
+        return raw;
+    }
+
+    // Alias of __float2half; simply forwards its argument.
+    inline
+    __half __float2half_rn(float x) { return __float2half(x); }
+
+    // float -> half keeping the truncated result of __internal_float2half
+    // as is; the remainder is ignored.
+    inline
+    __half __float2half_rz(float x)
+    {
+        unsigned int sign_bits{};
+        unsigned int remainder{};
+        const __half_raw raw{__internal_float2half(x, sign_bits, remainder)};
+
+        return raw;
+    }
+
+    // float -> half, bumping the truncated result by one only when something
+    // was dropped and the input is negative.
+    inline
+    __half __float2half_rd(float x)
+    {
+        unsigned int sign_bits{};
+        unsigned int remainder{};
+        __half_raw raw{__internal_float2half(x, sign_bits, remainder)};
+        if (remainder != 0U && sign_bits != 0U) {
+            ++raw.x;
+        }
+
+        return raw;
+    }
+
+    // float -> half, bumping the truncated result by one only when something
+    // was dropped and the input is not negative.
+    inline
+    __half __float2half_ru(float x)
+    {
+        unsigned int sign_bits{};
+        unsigned int remainder{};
+        __half_raw raw{__internal_float2half(x, sign_bits, remainder)};
+        if (remainder != 0U && sign_bits == 0U) {
+            ++raw.x;
+        }
+
+        return raw;
+    }
+
+    // Converts x once with __float2half_rn and places the result in both
+    // elements of the returned pair.
+    inline
+    __half2 __float2half2_rn(float x)
+    {
+        const __half converted = __float2half_rn(x);
+        return __half2{converted, converted};
+    }
+
+    // Converts x and y independently with __float2half_rn; x becomes the
+    // first element and y the second.
+    inline
+    __half2 __floats2half2_rn(float x, float y)
+    {
+        const __half first = __float2half_rn(x);
+        const __half second = __float2half_rn(y);
+        return __half2{first, second};
+    }
+
+    // Expands a half bit pattern into the float it represents.
+    //
+    // The sign, 5-bit exponent and 10-bit mantissa are unpacked, re-biased and
+    // reassembled into a 32-bit pattern, which is then copied into a float.
+    // Every NaN input yields the same positive NaN pattern.
+    inline
+    float __internal_half2float(unsigned short x)
+    {
+        unsigned int sign = ((x >> 15) & 1);
+        unsigned int exponent = ((x >> 10) & 0x1f);
+        unsigned int mantissa = ((x & 0x3ff) << 13);
+
+        if (exponent == 0x1fU) { /* NaN or Inf */
+            // A non-zero mantissa means NaN: clear the sign and saturate the
+            // mantissa; infinity keeps its sign and a zero mantissa.
+            mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
+            exponent = 0xffU;
+        } else if (!exponent) { /* Denorm or Zero */
+            if (mantissa) {
+                unsigned int msb;
+                exponent = 0x71U;
+                // Shift left until the leading one has moved out of the
+                // 23-bit mantissa field, lowering the exponent per shift.
+                do {
+                    msb = (mantissa & 0x400000U);
+                    mantissa <<= 1; /* normalize */
+                    --exponent;
+                } while (!msb);
+                mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+            }
+        } else {
+            // Normal number: adjust the exponent bias (127 - 15 = 0x70).
+            exponent += 0x70U;
+        }
+        unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
+        float f;
+        // <cstring> only guarantees the std-qualified name, so qualify the
+        // call as the float -> half direction already does.
+        std::memcpy(&f, &u, sizeof(f));
+
+        return f;
+    }
+
+    // half -> float: extracts the raw bits and expands them.
+    inline
+    float __half2float(__half x)
+    {
+        const __half_raw raw = static_cast<__half_raw>(x);
+        return __internal_half2float(raw.x);
+    }
+
+    // Expands the `x` element of the pair to float.
+    inline
+    float __low2float(__half2 x)
+    {
+        const __half2_raw raw = static_cast<__half2_raw>(x);
+        return __internal_half2float(raw.x);
+    }
+
+    // Expands the `y` element of the pair to float.
+    inline
+    float __high2float(__half2 x)
+    {
+        const __half2_raw raw = static_cast<__half2_raw>(x);
+        return __internal_half2float(raw.y);
+    }
+
+    // Unprefixed aliases for the two types; define HIP_NO_HALF to keep the
+    // names `half` and `half2` out of scope.
+    #if !defined(HIP_NO_HALF)
+        using half = __half;
+        using half2 = __half2;
+    #endif
+#endif // defined(__cplusplus)
@@ -0,0 +1,96 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// /*
+// Half Math Functions
+// */
+#if !defined(__HIPCC_RTC__)
+#include "host_defines.h"
+#endif
+// Forward declarations of the half-precision device math entry points. The
+// whole list is skipped when __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ is set.
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+extern "C"
+{
+    // Scalar _Float16 functions.
+    __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+    __device__ _Float16 __ocml_cos_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+    __device__ __attribute__((const))
+    _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+    __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+    __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
+    __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+    __device__ _Float16 __ocml_sin_f16(_Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
+
+    // Two-element vector types used by the packed variants below.
+    typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+    typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+    #if defined(__clang__) && defined(__HIP__)
+    __device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
+    #endif
+
+    // Packed (two-element) counterparts of the scalar functions.
+    __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+    __device__ __2f16 __ocml_cos_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+    __device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+    __device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+    __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+    __device__ __2f16 __ocml_sin_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+    __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+
+    // float -> _Float16 conversions; presumably the rtn/rtp/rtz suffixes select
+    // the rounding direction (toward -inf / +inf / zero) - confirm against OCML.
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+
+}
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// TODO: remove these after they get into the clang header
+// __clang_hip_libdevice_declares.h.
+// These five functions are declared unconditionally, i.e. also when the
+// guarded list above is skipped; when it is not skipped they are harmless
+// redeclarations.
+extern "C" {
+    __device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+    __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
+}
@@ -0,0 +1,100 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
+
+#if __HIP_CLANG_ONLY__
+#include "amd_hip_vector_types.h"
+#include "host_defines.h"
+
+// __ldg overload set. Every overload is a plain read through the pointer and
+// returns the pointed-to value by value; there is one overload per supported
+// scalar or vector element type.
+
+// char family
+__device__ static inline char __ldg(const char* ptr) { return *ptr; }
+
+__device__ static inline char2 __ldg(const char2* ptr) { return *ptr; }
+
+__device__ static inline char4 __ldg(const char4* ptr) { return *ptr; }
+
+__device__ static inline signed char __ldg(const signed char* ptr) { return *ptr; }
+
+__device__ static inline unsigned char __ldg(const unsigned char* ptr) { return *ptr; }
+
+
+// short family
+__device__ static inline short __ldg(const short* ptr) { return *ptr; }
+
+__device__ static inline short2 __ldg(const short2* ptr) { return *ptr; }
+
+__device__ static inline short4 __ldg(const short4* ptr) { return *ptr; }
+
+__device__ static inline unsigned short __ldg(const unsigned short* ptr) { return *ptr; }
+
+
+// int family
+__device__ static inline int __ldg(const int* ptr) { return *ptr; }
+
+__device__ static inline int2 __ldg(const int2* ptr) { return *ptr; }
+
+__device__ static inline int4 __ldg(const int4* ptr) { return *ptr; }
+
+__device__ static inline unsigned int __ldg(const unsigned int* ptr) { return *ptr; }
+
+
+// long family
+__device__ static inline long __ldg(const long* ptr) { return *ptr; }
+
+__device__ static inline unsigned long __ldg(const unsigned long* ptr) { return *ptr; }
+
+
+// long long family
+__device__ static inline long long __ldg(const long long* ptr) { return *ptr; }
+
+__device__ static inline longlong2 __ldg(const longlong2* ptr) { return *ptr; }
+
+__device__ static inline unsigned long long __ldg(const unsigned long long* ptr) { return *ptr; }
+
+
+// unsigned vector types
+__device__ static inline uchar2 __ldg(const uchar2* ptr) { return *ptr; }
+
+__device__ static inline uchar4 __ldg(const uchar4* ptr) { return *ptr; }
+
+
+__device__ static inline ushort2 __ldg(const ushort2* ptr) { return *ptr; }
+
+
+__device__ static inline uint2 __ldg(const uint2* ptr) { return *ptr; }
+
+__device__ static inline uint4 __ldg(const uint4* ptr) { return *ptr; }
+
+
+__device__ static inline ulonglong2 __ldg(const ulonglong2* ptr) { return *ptr; }
+
+
+// floating-point types
+__device__ static inline float __ldg(const float* ptr) { return *ptr; }
+
+__device__ static inline float2 __ldg(const float2* ptr) { return *ptr; }
+
+__device__ static inline float4 __ldg(const float4* ptr) { return *ptr; }
+
+
+__device__ static inline double __ldg(const double* ptr) { return *ptr; }
+
+__device__ static inline double2 __ldg(const double2* ptr) { return *ptr; }
+
+#endif  // __HIP_CLANG_ONLY__
+
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
+
+// HIP ROCclr Op IDs enumeration.
+// These are the values accepted as `op` by hipEnableActivityCallback().
+enum HipVdiOpId {
+  kHipVdiOpIdDispatch = 0,
+  kHipVdiOpIdCopy     = 1,
+  kHipVdiOpIdBarrier  = 2,
+  kHipVdiOpIdNumber   = 3  // presumably the number of op IDs, not an op itself - confirm
+};
+
+// Types of ROCclr commands.
+// These are the ids accepted by hipGetCmdName().
+//
+// Two enumerators were originally published with misspelled names
+// (kHipHipVdi..., kHipVid...). They are kept unchanged for existing callers,
+// and correctly spelled aliases with the same values are provided next to them.
+enum HipVdiCommandKind {
+  kHipVdiCommandKernel            = 0x11F0,
+  kHipVdiMemcpyDeviceToHost       = 0x11F3,
+  kHipHipVdiMemcpyHostToDevice    = 0x11F4,  // original (misspelled) name
+  kHipVdiMemcpyHostToDevice       = kHipHipVdiMemcpyHostToDevice,
+  kHipVdiMemcpyDeviceToDevice     = 0x11F5,
+  kHipVidMemcpyDeviceToHostRect   = 0x1201,  // original (misspelled) name
+  kHipVdiMemcpyDeviceToHostRect   = kHipVidMemcpyDeviceToHostRect,
+  kHipVdiMemcpyHostToDeviceRect   = 0x1202,
+  kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
+  kHipVdiFillMemory               = 0x1207,
+};
+
+/**
+ * @brief Initializes activity callback
+ *
+ * @param[in] id_callback Event ID callback function
+ * @param[in] op_callback Event operation callback function
+ * @param[in] arg         Arguments passed into callback
+ *
+ * @returns None
+ */
+void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
+
+/**
+ * @brief Enables activity callback
+ *
+ * @param[in] op      Operation which will trigger a callback (@see HipVdiOpId)
+ * @param[in] enable  Enable state for the callback
+ *
+ * @returns True if successful
+ */
+bool hipEnableActivityCallback(uint32_t op, bool enable);
+
+/**
+ * @brief Returns the description string for the operation kind
+ *
+ * @param[in] id      Command kind id (@see HipVdiCommandKind)
+ *
+ * @returns A pointer to a const string with the command description
+ */
+const char* hipGetCmdName(uint32_t id);
+
+#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
+
@@ -0,0 +1,184 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  amd_detail/host_defines.h
+ *  @brief TODO-doc
+ */
+
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
+
+// The following macro should be removed once upstream has been updated.
+// It is defined here as a workaround for a rocThrust build failure.
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
+
+// Add guard to Generic Grid Launch method
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#if defined(__clang__) && defined(__HIP__)
+
+// Minimal stand-ins for a handful of <cstdint>, <type_traits> and <iosfwd>
+// facilities, defined locally so this header does not have to include the
+// standard headers. The traits match on the exact type given: cv-qualified
+// types fall through to the primary (false) templates.
+namespace __hip_internal {
+// Fixed-width integer aliases.
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+typedef signed long long int64_t;
+
+// Compile-time constant wrapper: carries `value` and converts to it.
+template <class _Tp, _Tp __v> struct integral_constant {
+  static constexpr const _Tp value = __v;
+  typedef _Tp value_type;
+  typedef integral_constant type;
+  constexpr operator value_type() const { return value; }
+  constexpr value_type operator()() const { return value; }
+};
+// Out-of-class definition of the static data member.
+template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
+
+// true_type / false_type are declared once; bool_constant<true/false> names
+// the same two types.
+template <bool B> using bool_constant = integral_constant<bool, B>;
+typedef bool_constant<true> true_type;
+typedef bool_constant<false> false_type;
+
+// `type` exists only when the condition is true.
+template <bool __B, class __T = void> struct enable_if {};
+template <class __T> struct enable_if<true, __T> { typedef __T type; };
+
+// Maps a bool to true_type / false_type through inheritance.
+template<bool _B> struct true_or_false_type : public false_type {};
+template<> struct true_or_false_type<true> : public true_type {};
+
+// True for the listed integer types (including bool, char and wchar_t).
+template <class _Tp> struct is_integral : public false_type {};
+template <> struct is_integral<bool> : public true_type {};
+template <> struct is_integral<char> : public true_type {};
+template <> struct is_integral<signed char> : public true_type {};
+template <> struct is_integral<unsigned char> : public true_type {};
+template <> struct is_integral<wchar_t> : public true_type {};
+template <> struct is_integral<short> : public true_type {};
+template <> struct is_integral<unsigned short> : public true_type {};
+template <> struct is_integral<int> : public true_type {};
+template <> struct is_integral<unsigned int> : public true_type {};
+template <> struct is_integral<long> : public true_type {};
+template <> struct is_integral<unsigned long> : public true_type {};
+template <> struct is_integral<long long> : public true_type {};
+template <> struct is_integral<unsigned long long> : public true_type {};
+
+// True for every type accepted by is_integral or is_floating_point.
+template <class _Tp> struct is_arithmetic : public false_type {};
+template <> struct is_arithmetic<bool> : public true_type {};
+template <> struct is_arithmetic<char> : public true_type {};
+template <> struct is_arithmetic<signed char> : public true_type {};
+template <> struct is_arithmetic<unsigned char> : public true_type {};
+template <> struct is_arithmetic<wchar_t> : public true_type {};
+template <> struct is_arithmetic<short> : public true_type {};
+template <> struct is_arithmetic<unsigned short> : public true_type {};
+template <> struct is_arithmetic<int> : public true_type {};
+template <> struct is_arithmetic<unsigned int> : public true_type {};
+template <> struct is_arithmetic<long> : public true_type {};
+template <> struct is_arithmetic<unsigned long> : public true_type {};
+template <> struct is_arithmetic<long long> : public true_type {};
+template <> struct is_arithmetic<unsigned long long> : public true_type {};
+template <> struct is_arithmetic<float> : public true_type {};
+template <> struct is_arithmetic<double> : public true_type {};
+// long double is a floating-point type (see is_floating_point below), so it
+// must count as arithmetic too; otherwise is_signed<long double> is false.
+template <> struct is_arithmetic<long double> : public true_type {};
+
+template<typename _Tp> struct is_floating_point : public false_type {};
+template<> struct is_floating_point<float> : public true_type {};
+template<> struct is_floating_point<double> : public true_type {};
+template<> struct is_floating_point<long double> : public true_type {};
+
+template <typename __T, typename __U> struct is_same : public false_type {};
+template <typename __T> struct is_same<__T, __T> : public true_type {};
+
+// False for non-arithmetic types; for arithmetic types, true exactly when
+// _Tp(-1) compares less than _Tp(0).
+template<typename _Tp, bool = is_arithmetic<_Tp>::value>
+  struct is_signed : public false_type {};
+template<typename _Tp>
+  struct is_signed<_Tp, true> : public true_or_false_type<(_Tp(-1) < _Tp(0))> {};
+
+// Forward declarations of stream templates and the char-based aliases; no
+// definitions are provided here.
+template<typename _CharT> struct char_traits;
+template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
+template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
+typedef basic_istream<char> istream;
+typedef basic_ostream<char> ostream;
+
+// Thin wrappers over the compiler's type-trait builtins.
+template<typename _Tp>
+    struct is_standard_layout
+    : public integral_constant<bool, __is_standard_layout(_Tp)>
+    { };
+
+template<typename _Tp>
+    struct is_trivial
+    : public integral_constant<bool, __is_trivial(_Tp)>
+    { };
+}
+// Global-scope, __hip_-prefixed names for the fixed-width integer aliases
+// defined in __hip_internal.
+typedef __hip_internal::uint8_t __hip_uint8_t;
+typedef __hip_internal::uint16_t __hip_uint16_t;
+typedef __hip_internal::uint32_t __hip_uint32_t;
+typedef __hip_internal::uint64_t __hip_uint64_t;
+typedef __hip_internal::int8_t __hip_int8_t;
+typedef __hip_internal::int16_t __hip_int16_t;
+typedef __hip_internal::int32_t __hip_int32_t;
+typedef __hip_internal::int64_t __hip_int64_t;
+
+// Function/variable markers mapped onto clang attributes. They are defined
+// here only when __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ is not set.
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+// Provide __noinline__ as a macro unless the compiler reports the
+// cuda_noinline_keyword feature.
+#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
+#define __noinline__ __attribute__((noinline))
+#endif
+
+#define __forceinline__ inline __attribute__((always_inline))
+
+// With __HIP_NO_IMAGE_SUPPORT set, anything tagged __hip_img_chk__ is marked
+// unavailable, so using it produces the diagnostic below; otherwise the marker
+// expands to nothing.
+#if __HIP_NO_IMAGE_SUPPORT
+#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
+#else
+#define __hip_img_chk__
+#endif
+
+#else
+
+// Not compiling as HIP with clang (the condition at the top of this section
+// is false): every marker below expands to nothing, except __forceinline__,
+// which degrades to plain `inline`.
+/**
+ * Function and kernel markers
+ */
+#define __host__
+#define __device__
+
+#define __global__
+
+#define __noinline__
+#define __forceinline__ inline
+
+#define __shared__
+#define __constant__
+
+#define __hip_img_chk__
+#endif
+
+#endif
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <hsa/hsa.h>
+
+#include <cstdint>
+#include <functional>
+#include <string>
+
+// Thin accessors over hsa_executable_symbol_get_info / hsa_agent_iterate_isas.
+// Each helper queries one attribute and returns it by value. HSA status codes
+// are not checked: when a query fails, the value-initialized default declared
+// at the top of the helper is what gets returned.
+namespace hip_impl {
+// Address of the variable that the symbol refers to.
+inline void* address(hsa_executable_symbol_t x) {
+    void* r = nullptr;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
+
+    return r;
+}
+
+// Agent associated with the symbol.
+inline hsa_agent_t agent(hsa_executable_symbol_t x) {
+    hsa_agent_t r = {};
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
+
+    return r;
+}
+
+// Group segment size reported for a kernel symbol.
+inline std::uint32_t group_size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
+
+    return r;
+}
+
+// First ISA reported for the agent; the iteration is stopped after one entry.
+inline hsa_isa_t isa(hsa_agent_t x) {
+    hsa_isa_t r = {};
+    hsa_agent_iterate_isas(x,
+                           [](hsa_isa_t i, void* o) {
+                               *static_cast<hsa_isa_t*>(o) = i;  // Pick the first.
+
+                               return HSA_STATUS_INFO_BREAK;
+                           },
+                           &r);
+
+    return r;
+}
+
+// Kernel object handle reported for a kernel symbol.
+inline std::uint64_t kernel_object(hsa_executable_symbol_t x) {
+    std::uint64_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
+
+    return r;
+}
+
+// Symbol name. The length is queried first, then the characters are written
+// directly into a string of exactly that size.
+inline std::string name(hsa_executable_symbol_t x) {
+    std::uint32_t sz = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
+
+    // A zero length (including a failed length query) would leave an empty
+    // string, and front() on an empty string is undefined behaviour.
+    if (sz == 0u) return std::string{};
+
+    std::string r(sz, '\0');
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
+
+    return r;
+}
+
+// Private segment size reported for a kernel symbol.
+inline std::uint32_t private_size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
+
+    return r;
+}
+
+// Size reported for a variable symbol.
+inline std::uint32_t size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
+
+    return r;
+}
+
+// Kind of the symbol, as reported by HSA_EXECUTABLE_SYMBOL_INFO_TYPE.
+inline hsa_symbol_kind_t type(hsa_executable_symbol_t x) {
+    hsa_symbol_kind_t r = {};
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
+
+    return r;
+}
+}  // namespace hip_impl
@@ -0,0 +1,798 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+
+#include "hc.hpp"
+#include "hip/hip_ext.h"
+#include "hip_runtime.h"
+
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace hip_impl {
+namespace {
+struct New_grid_launch_tag {};  // Dispatch tag: picked by is_new_grid_launch_t when F(Ts...) is callable.
+struct Old_grid_launch_tag {};  // Dispatch tag: picked by is_new_grid_launch_t otherwise.
+
+// Scope guard: the two-argument constructor invokes `ctor` immediately and stores
+// `dtor`, which is invoked when the guard is destroyed.
+//
+// Moving a guard transfers responsibility for running `dtor` to the destination, so
+// a moved-from guard (e.g. the temporary of a non-elided return from a factory) no
+// longer runs it a second time. Copying keeps the historical behaviour: both the
+// copy and the original run `dtor`.
+//
+// C is the type of the callable run on construction; D is the type of the stored
+// callable run on destruction. If `ctor` throws, the guard is never fully
+// constructed and `dtor` is not run.
+template <typename C, typename D>
+class RAII_guard {
+    // Tracks whether this guard is still responsible for running dtor_. A move
+    // hands the responsibility over and disarms the source; a copy duplicates it.
+    struct Ownership {
+        bool armed = true;
+
+        Ownership() = default;
+        Ownership(const Ownership&) = default;
+        Ownership(Ownership&& other) noexcept : armed{other.armed} { other.armed = false; }
+
+        Ownership& operator=(const Ownership&) = default;
+        Ownership& operator=(Ownership&& other) noexcept {
+            if (this != &other) {
+                armed = other.armed;
+                other.armed = false;
+            }
+            return *this;
+        }
+    };
+
+    D dtor_;
+    Ownership owner_;
+
+   public:
+    RAII_guard() = default;
+
+    RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)} { ctor(); }
+
+    // The special members stay defaulted so that they remain deleted whenever D does
+    // not support the operation; the disarm-on-move logic lives in Ownership.
+    RAII_guard(const RAII_guard&) = default;
+    RAII_guard(RAII_guard&&) = default;
+
+    RAII_guard& operator=(const RAII_guard&) = default;
+    RAII_guard& operator=(RAII_guard&&) = default;
+
+    ~RAII_guard() {
+        if (owner_.armed) dtor_();
+    }
+};
+
+// Factory that deduces both callable types and builds the matching RAII_guard;
+// `ctor` is invoked by the guard's constructor, `dtor` is moved into the guard.
+template <typename C, typename D>
+RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
+    using Guard = RAII_guard<C, D>;
+    return Guard{ctor, std::move(dtor)};
+}
+
+// Maps a kernel type F and argument pack Ts to a dispatch tag: New_grid_launch_tag
+// when is_callable<F(Ts...)> is true, Old_grid_launch_tag otherwise.
+template <FunctionalProcedure F, typename... Ts>
+using is_new_grid_launch_t =
+    std::conditional_t<is_callable<F(Ts...)>{}, New_grid_launch_tag, Old_grid_launch_tag>;
+}  // namespace
+
+// TODO: derive the dispatch rank from the domain dimensions passed in instead of
+//       always assuming it to be 3.
+
+// Launches k on acc_v through hc::parallel_for_each over a 3-D tiled extent.
+// The global extent is num_blocks * dim_blocks per dimension, the tile is dim_blocks,
+// and group_mem_bytes is forwarded as the last tile_with_dynamic argument; all
+// dimensions are passed in z, y, x order. A std::exception raised by the launch is
+// logged to std::cerr and then handed to hip_throw.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> ==
+         {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks,
+                                                    dim3 dim_blocks, int group_mem_bytes,
+                                                    const hc::accelerator_view& acc_v, K k) {
+    auto global_extent = hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
+                                       num_blocks.x * dim_blocks.x};
+    const auto tiled_domain = global_extent.tile_with_dynamic(dim_blocks.z, dim_blocks.y,
+                                                              dim_blocks.x, group_mem_bytes);
+
+    try {
+        hc::parallel_for_each(acc_v, tiled_domain, k);
+    } catch (std::exception& ex) {
+        std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+        hip_throw(ex);
+    }
+}
+
+// TODO: these are workarounds and should be removed.
+
+// Only declared here. Parameter names mirror the arguments passed at the call sites
+// in this header.
+hc::accelerator_view lock_stream_hip_(hipStream_t& stream, void*& locked_stream);
+void print_prelaunch_trace_(const char* kernel_name, dim3 num_blocks, dim3 dim_blocks,
+                            int group_mem_bytes, hipStream_t stream);
+void unlock_stream_hip_(hipStream_t stream, void* locked_stream, const char* kernel_name,
+                        hc::accelerator_view* acc_v);
+
+// Stream-based launch: obtains an accelerator_view from lock_stream_hip_, emits the
+// pre-launch trace, forwards to the accelerator_view overload, and calls
+// unlock_stream_hip_ when the guard goes out of scope (including on exceptions).
+// A std::exception raised by the launch is logged to std::cerr and handed to hip_throw.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag,
+                                                                 dim3 num_blocks, dim3 dim_blocks,
+                                                                 int group_mem_bytes,
+                                                                 hipStream_t stream,
+                                                                 const char* kernel_name, K k) {
+    void* locked_stream = nullptr;
+    auto acc_v = lock_stream_hip_(stream, locked_stream);
+
+    // Both callables snapshot their arguments by value here, after lock_stream_hip_
+    // has had the chance to update `stream` and `locked_stream`.
+    const auto trace_launch = [kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream] {
+        print_prelaunch_trace_(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream);
+    };
+    const auto release_stream = [stream, locked_stream, kernel_name, &acc_v] {
+        unlock_stream_hip_(stream, locked_stream, kernel_name, &acc_v);
+    };
+    auto stream_guard = make_RAII_guard(trace_launch, release_stream);
+
+    try {
+        grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                              group_mem_bytes, acc_v, std::move(k));
+    } catch (std::exception& ex) {
+        std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+        hip_throw(ex);
+    }
+}
+
+// Old_grid_launch_tag entry point without a kernel name: re-dispatches the same
+// arguments, unchanged, under New_grid_launch_tag.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> ==
+         {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(Old_grid_launch_tag,
+                                                                   dim3 num_blocks, dim3 dim_blocks,
+                                                                   int group_mem_bytes,
+                                                                   hipStream_t stream, K k) {
+    using Forward_tag = New_grid_launch_tag;
+
+    grid_launch_hip_impl_(Forward_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), std::move(k));
+}
+
+// Old_grid_launch_tag entry point with a kernel name: re-dispatches the same
+// arguments, unchanged, under New_grid_launch_tag.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(
+    Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
+    const char* kernel_name, K k) {
+    using Forward_tag = New_grid_launch_tag;
+
+    grid_launch_hip_impl_(Forward_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), kernel_name, std::move(k));
+}
+
+// Launch front end for non-function kernel types, with a kernel name. Selects the
+// dispatch tag through is_new_grid_launch_t<K, Ts...> and forwards every argument
+// to the matching grid_launch_hip_impl_ overload.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+                                                  int group_mem_bytes, hipStream_t stream,
+                                                  const char* kernel_name, K k) {
+    using Launch_tag = is_new_grid_launch_t<K, Ts...>;
+
+    grid_launch_hip_impl_(Launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), kernel_name, std::move(k));
+}
+
+// Launch front end for non-function kernel types, without a kernel name. Selects
+// the dispatch tag through is_new_grid_launch_t<K, Ts...> and forwards every
+// argument to the matching grid_launch_hip_impl_ overload.
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+                                                  int group_mem_bytes, hipStream_t stream, K k) {
+    using Launch_tag = is_new_grid_launch_t<K, Ts...>;
+
+    grid_launch_hip_impl_(Launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+                          group_mem_bytes, std::move(stream), std::move(k));
+}
+
+// TODO: these are temporary and purposefully noisy and disruptive.
+#define make_kernel_name_hip(k, n) /* token-pastes k and n into a single identifier */             \
+    HIP_kernel_functor_name_begin##_##k##_##HIP_kernel_functor_name_end##_##n
+
+#define make_kernel_functor_hip_30(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22, p23, p24, p25, p26, p27)                                   \
+    struct make_kernel_name_hip(function_name, 28) { /* stores 28 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        std::decay_t<decltype(p23)> _p23_;                                                         \
+        std::decay_t<decltype(p24)> _p24_;                                                         \
+        std::decay_t<decltype(p25)> _p25_;                                                         \
+        std::decay_t<decltype(p26)> _p26_;                                                         \
+        std::decay_t<decltype(p27)> _p27_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_, _p23_, _p24_, _p25_, _p26_, _p27_);                                 \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_29(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22, p23, p24, p25, p26)                                        \
+    struct make_kernel_name_hip(function_name, 27) { /* stores 27 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        std::decay_t<decltype(p23)> _p23_;                                                         \
+        std::decay_t<decltype(p24)> _p24_;                                                         \
+        std::decay_t<decltype(p25)> _p25_;                                                         \
+        std::decay_t<decltype(p26)> _p26_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_, _p23_, _p24_, _p25_, _p26_);                                        \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_28(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22, p23, p24, p25)                                             \
+    struct make_kernel_name_hip(function_name, 26) { /* stores 26 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        std::decay_t<decltype(p23)> _p23_;                                                         \
+        std::decay_t<decltype(p24)> _p24_;                                                         \
+        std::decay_t<decltype(p25)> _p25_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_, _p23_, _p24_, _p25_);                                               \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_27(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22, p23, p24)                                                  \
+    struct make_kernel_name_hip(function_name, 25) { /* stores 25 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        std::decay_t<decltype(p23)> _p23_;                                                         \
+        std::decay_t<decltype(p24)> _p24_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_, _p23_, _p24_);                                                      \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_26(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22, p23)                                                       \
+    struct make_kernel_name_hip(function_name, 24) { /* stores 24 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        std::decay_t<decltype(p23)> _p23_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_, _p23_);                                                             \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_25(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+                                   p22)                                                            \
+    struct make_kernel_name_hip(function_name, 23) { /* stores 23 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        std::decay_t<decltype(p22)> _p22_;                                                         \
+        __attribute__((used, flatten)) void operator()(const hc::tiled_index<3>&) const [[hc]] {   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
+                        _p22_);                                                                    \
+        } /* NOTE(review): visible siblings lack used/flatten - confirm */                         \
+    }
+#define make_kernel_functor_hip_24(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21) \
+    struct make_kernel_name_hip(function_name, 22) { /* stores 22 kernel arguments by value */     \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        std::decay_t<decltype(p21)> _p21_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument unused */      \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_);     \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_23(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)      \
+    struct make_kernel_name_hip(function_name, 21) { /* Functor holding 21 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 23 = function_name + kernel_name + 21 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        std::decay_t<decltype(p20)> _p20_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_);            \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_22(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)           \
+    struct make_kernel_name_hip(function_name, 20) { /* Functor holding 20 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 22 = function_name + kernel_name + 20 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        std::decay_t<decltype(p19)> _p19_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_);                   \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_21(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17, p18)                \
+    struct make_kernel_name_hip(function_name, 19) { /* Functor holding 19 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 21 = function_name + kernel_name + 19 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        std::decay_t<decltype(p18)> _p18_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_);                          \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_20(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16, p17)                     \
+    struct make_kernel_name_hip(function_name, 18) { /* Functor holding 18 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 20 = function_name + kernel_name + 18 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        std::decay_t<decltype(p17)> _p17_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);                                 \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_19(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15, p16)                          \
+    struct make_kernel_name_hip(function_name, 17) { /* Functor holding 17 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 19 = function_name + kernel_name + 17 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        std::decay_t<decltype(p16)> _p16_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_, _p16_);                                        \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_18(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14, p15)                               \
+    struct make_kernel_name_hip(function_name, 16) { /* Functor holding 16 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 18 = function_name + kernel_name + 16 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        std::decay_t<decltype(p15)> _p15_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_, _p15_);                                               \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_17(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13, p14)                                    \
+    struct make_kernel_name_hip(function_name, 15) { /* Functor holding 15 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 17 = function_name + kernel_name + 15 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        std::decay_t<decltype(p14)> _p14_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_, _p14_);                                                      \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_16(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12, p13)                                         \
+    struct make_kernel_name_hip(function_name, 14) { /* Functor holding 14 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 16 = function_name + kernel_name + 14 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        std::decay_t<decltype(p13)> _p13_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_, _p13_);                                                             \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_15(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11, p12)                                              \
+    struct make_kernel_name_hip(function_name, 13) { /* Functor holding 13 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 15 = function_name + kernel_name + 13 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        std::decay_t<decltype(p12)> _p12_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
+                        _p12_);                                                                    \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_14(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10, p11)                                                   \
+    struct make_kernel_name_hip(function_name, 12) { /* Functor holding 12 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 14 = function_name + kernel_name + 12 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        std::decay_t<decltype(p11)> _p11_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_); \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_13(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9, p10)                                                        \
+    struct make_kernel_name_hip(function_name, 11) { /* Functor holding 11 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 13 = function_name + kernel_name + 11 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        std::decay_t<decltype(p10)> _p10_;                                                         \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { /* index argument is unused */   \
+            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_);        \
+        }                                                                                          \
+    }
+#define make_kernel_functor_hip_12(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+                                   p9)                                                             \
+    struct make_kernel_name_hip(function_name, 10) { /* Functor holding 10 kernel arguments. */    \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 12 = function_name + kernel_name + 10 pN. */    \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        std::decay_t<decltype(p9)> _p9_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_); }    \
+    }
+#define make_kernel_functor_hip_11(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8) \
+    struct make_kernel_name_hip(function_name, 9) { /* Functor holding 9 kernel arguments. */      \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 11 = function_name + kernel_name + 9 pN. */     \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        std::decay_t<decltype(p8)> _p8_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_); }          \
+    }
+#define make_kernel_functor_hip_10(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)     \
+    struct make_kernel_name_hip(function_name, 8) { /* Functor holding 8 kernel arguments. */      \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 10 = function_name + kernel_name + 8 pN. */     \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        std::decay_t<decltype(p7)> _p7_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_); }                \
+    }
+#define make_kernel_functor_hip_9(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)          \
+    struct make_kernel_name_hip(function_name, 7) { /* Functor holding 7 kernel arguments. */      \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 9 = function_name + kernel_name + 7 pN. */      \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        std::decay_t<decltype(p6)> _p6_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_); }                      \
+    }
+#define make_kernel_functor_hip_8(function_name, kernel_name, p0, p1, p2, p3, p4, p5)              \
+    struct make_kernel_name_hip(function_name, 6) { /* Functor holding 6 kernel arguments. */      \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 8 = function_name + kernel_name + 6 pN. */      \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        std::decay_t<decltype(p5)> _p5_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_); }                            \
+    }
+#define make_kernel_functor_hip_7(function_name, kernel_name, p0, p1, p2, p3, p4)                  \
+    struct make_kernel_name_hip(function_name, 5) { /* Functor holding 5 kernel arguments. */      \
+        std::decay_t<decltype(p0)> _p0_; /* Suffix 7 = function_name + kernel_name + 5 pN. */      \
+        std::decay_t<decltype(p1)> _p1_; /* Each _pN_ has the decayed type of macro arg pN. */     \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        std::decay_t<decltype(p4)> _p4_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const /* index argument is unused */            \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_); }                                  \
+    }
+#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)                      \
+    struct make_kernel_name_hip(function_name, 4) {                                                \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        std::decay_t<decltype(p3)> _p3_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const                                           \
+            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_); }                                        \
+    }
+#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)                          \
+    struct make_kernel_name_hip(function_name, 3) {                                                \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        std::decay_t<decltype(p2)> _p2_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_, _p2_); } \
+    }
+#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)                              \
+    struct make_kernel_name_hip(function_name, 2) {                                                \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        std::decay_t<decltype(p1)> _p1_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_); }       \
+    }
+#define fofo(f, n) kernel_prefix_hip##f##kernel_suffix_hip##n
+#define make_kernel_functor_hip_3(function_name, kernel_name, p0)                                  \
+    struct make_kernel_name_hip(function_name, 1) {                                                \
+        std::decay_t<decltype(p0)> _p0_;                                                           \
+        void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_); }             \
+    }
+#define make_kernel_functor_hip_2(function_name, kernel_name)                                      \
+    struct make_kernel_name_hip(function_name, 0) {                                                \
+        void operator()(const hc::tiled_index<3>&)[[hc]] { return kernel_name(hipLaunchParm{}); }  \
+    }
+#define make_kernel_functor_hip_1(...)
+#define make_kernel_functor_hip_0(...)
+#define make_kernel_functor_hip_(...) overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
+
+
+#define hipLaunchNamedKernelGGL(function_name, kernel_name, num_blocks, dim_blocks,                \
+                                group_mem_bytes, stream, ...)                                      \
+    do {                                                                                           \
+        make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)                          \
+            hip_kernel_functor_impl_{__VA_ARGS__};                                                 \
+        hip_impl::grid_launch_hip_(num_blocks, dim_blocks, group_mem_bytes, stream, #kernel_name,  \
+                                   hip_kernel_functor_impl_);                                      \
+    } while (0)  // Declares the functor type, brace-initialises one from the variadic arguments, and passes it plus the stringified kernel name to grid_launch_hip_.
+
+// Forwards to hipLaunchNamedKernelGGL using the placeholder function name `unnamed`; the launch
+// configuration and any kernel arguments are passed through unchanged. The callee already
+// expands to a single do { ... } while (0) statement, so no extra wrapper is needed here.
+#define hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)      \
+    hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, \
+                            ##__VA_ARGS__)
+
+#define hipLaunchKernel(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)         \
+    do {                                                                                           \
+        hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream,           \
+                           hipLaunchParm{}, ##__VA_ARGS__);                                        \
+    } while (0)
+}  // namespace hip_impl
@@ -0,0 +1,694 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "host_defines.h"
+#if defined(__cplusplus)
+    extern "C" {
+#endif
+
+// DOT FUNCTIONS
+#if __HIP_CLANG_ONLY__
+__device__
+__attribute__((const))
+int __ockl_sdot2(
+    HIP_vector_base<short, 2>::Native_vec_,
+    HIP_vector_base<short, 2>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot2(
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot4(
+    HIP_vector_base<char, 4>::Native_vec_,
+    HIP_vector_base<char, 4>::Native_vec_,
+    int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot4(
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    HIP_vector_base<unsigned char, 4>::Native_vec_,
+    unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot8(int, int, int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__device__
+__attribute__((const))
+float __ocml_acos_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_acosh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_asin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_asinh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_atan2_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_atan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_atanh_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_cbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ceil_f32(float);
+// float copysign entry point; only the prototype lives in this header.
+__device__
+__attribute__((const))
+float __ocml_copysign_f32(float, float);
+__device__
+float __ocml_cos_f32(float);
+__device__
+float __ocml_native_cos_f32(float);
+// float cosh entry point; only the prototype lives in this header.
+__device__
+__attribute__((pure))
+float __ocml_cosh_f32(float);
+__device__
+float __ocml_cospi_f32(float);
+__device__
+float __ocml_i0_f32(float);
+__device__
+float __ocml_i1_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfc_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcx_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_expm1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fabs_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fdim_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_floor_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fmax_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_fmin_f32(float, float);
+// float fmod entry point; only the prototype lives in this header.
+__device__
+__attribute__((const))
+float __ocml_fmod_f32(float, float);
+__device__
+float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+float __ocml_hypot_f32(float, float);
+__device__
+__attribute__((const))
+int __ocml_ilogb_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isfinite_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isinf_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isnan_f32(float);
+__device__
+float __ocml_j0_f32(float);
+__device__
+float __ocml_j1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ldexp_f32(float, int);
+__device__
+float __ocml_lgamma_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log1p_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log2_f32(float);
+__device__
+__attribute__((const))
+float __ocml_logb_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log_f32(float);
+__device__
+float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
+__device__
+__attribute__((const))
+float __ocml_nearbyint_f32(float);
+__device__
+__attribute__((const))
+float __ocml_nextafter_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_len3_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_len4_f32(float, float, float, float);
+__device__
+__attribute__((pure))
+float __ocml_ncdf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_ncdfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_pow_f32(float, float);
+__device__
+__attribute__((pure))
+float __ocml_pown_f32(float, int);
+__device__
+__attribute__((pure))
+float __ocml_rcbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_remainder_f32(float, float);
+__device__
+float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+float __ocml_rhypot_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_rint_f32(float);
+__device__
+__attribute__((const))
+float __ocml_rlen3_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_rlen4_f32(float, float, float, float);
+__device__
+__attribute__((const))
+float __ocml_round_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_rsqrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_scalb_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_scalbn_f32(float, int);
+__device__
+__attribute__((const))
+int __ocml_signbit_f32(float);
+__device__
+float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
+__device__
+float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
+__device__
+float __ocml_sin_f32(float);
+__device__
+float __ocml_native_sin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_sinh_f32(float);
+__device__
+float __ocml_sinpi_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_native_sqrt_f32(float);
+__device__
+float __ocml_tan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_tanh_f32(float);
+__device__
+float __ocml_tgamma_f32(float);
+__device__
+__attribute__((const))
+float __ocml_trunc_f32(float);
+__device__
+float __ocml_y0_f32(float);
+__device__
+float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__device__
+__attribute__((const))
+float __ocml_add_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rte_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtn_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtp_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtz_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_rte_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtn_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtp_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtz_f32(float, float, float);
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__
+__attribute__((const))
+double __ocml_acos_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_acosh_f64(double);
+__device__
+__attribute__((const))
+double __ocml_asin_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_asinh_f64(double);
+__device__
+__attribute__((const))
+double __ocml_atan2_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_atan_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_atanh_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_cbrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_ceil_f64(double);
+__device__
+__attribute__((const))
+double __ocml_copysign_f64(double, double);
+__device__
+double __ocml_cos_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_cosh_f64(double);
+__device__
+double __ocml_cospi_f64(double);
+__device__
+double __ocml_i0_f64(double);
+__device__
+double __ocml_i1_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfc_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfcinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfcx_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erf_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp10_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp2_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_expm1_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fabs_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fdim_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_floor_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fma_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fmax_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_fmin_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_fmod_f64(double, double);
+__device__
+double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+double __ocml_hypot_f64(double, double);
+__device__
+__attribute__((const))
+int __ocml_ilogb_f64(double);
+__device__
+__attribute__((const))
+int __ocml_isfinite_f64(double);
+__device__
+__attribute__((const))
+int __ocml_isinf_f64(double);
+__device__
+__attribute__((const))
+int __ocml_isnan_f64(double);
+__device__
+double __ocml_j0_f64(double);
+__device__
+double __ocml_j1_f64(double);
+__device__
+__attribute__((const))
+double __ocml_ldexp_f64(double, int);
+__device__
+double __ocml_lgamma_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log10_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log1p_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log2_f64(double);
+__device__
+__attribute__((const))
+double __ocml_logb_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log_f64(double);
+__device__
+double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
+__device__
+__attribute__((const))
+double __ocml_nearbyint_f64(double);
+__device__
+__attribute__((const))
+double __ocml_nextafter_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_len3_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_len4_f64(double, double, double, double);
+__device__
+__attribute__((pure))
+double __ocml_ncdf_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_ncdfinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_pow_f64(double, double);
+__device__
+__attribute__((pure))
+double __ocml_pown_f64(double, int);
+__device__
+__attribute__((pure))
+double __ocml_rcbrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_remainder_f64(double, double);
+__device__
+double __ocml_remquo_f64(
+    double, double, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+double __ocml_rhypot_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_rint_f64(double);
+__device__
+__attribute__((const))
+double __ocml_rlen3_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_rlen4_f64(double, double, double, double);
+__device__
+__attribute__((const))
+double __ocml_round_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_rsqrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_scalb_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_scalbn_f64(double, int);
+__device__
+__attribute__((const))
+int __ocml_signbit_f64(double);
+__device__
+double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
+__device__
+double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
+__device__
+double __ocml_sin_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_sinh_f64(double);
+__device__
+double __ocml_sinpi_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_f64(double);
+__device__
+double __ocml_tan_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_tanh_f64(double);
+__device__
+double __ocml_tgamma_f64(double);
+__device__
+__attribute__((const))
+double __ocml_trunc_f64(double);
+__device__
+double __ocml_y0_f64(double);
+__device__
+double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__device__
+__attribute__((const))
+double __ocml_add_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rte_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtn_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtp_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtz_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fma_rte_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtn_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtp_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtz_f64(double, double, double);
+// END INTRINSICS
+// END DOUBLE
+
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#if defined(__cplusplus)
+    } // extern "C"
+#endif
@@ -0,0 +1,175 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_vector_types.h>
+
+extern "C" {
+
+#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))  // Address-space qualifier applied to the `unsigned int*` image and sampler parameters declared below.
+
+__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
+
+__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
+
+__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
+
+} // extern "C"
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hsa/amd_hsa_kernel_code.h>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+#include <hsa/hsa_ven_amd_loader.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include <hip/hip_common.h>
+
+struct ihipModuleSymbol_t;
+using hipFunction_t = ihipModuleSymbol_t*;
+
+namespace hip_impl {
+
+// This section contains internal APIs that
+// need to be exported.
+#ifdef __GNUC__
+#pragma GCC visibility push (default)
+#endif
+
+// Opaque implementation type; it is only forward-declared in this header.
+struct kernarg_impl;
+// Wrapper around a kernarg_impl pointer exposing a byte pointer (data) and a
+// size, plus reserve/resize. A move constructor is user-declared, so the
+// implicit copy constructor and copy assignment are deleted.
+// NOTE(review): presumably the kernel-argument buffer used at launch - confirm.
+class kernarg {
+    // Raw pointer to the hidden implementation.
+    // NOTE(review): presumably released by ~kernarg() - confirm in the .cpp.
+    kernarg_impl* impl;
+public:
+    kernarg();
+    kernarg(kernarg&&);
+    ~kernarg();
+    std::uint8_t* data();
+    std::size_t size();
+    void reserve(std::size_t);
+    void resize(std::size_t);
+};
+
+class kernargs_size_align;
+class program_state_impl;
+// Facade over program_state_impl, reached through a raw pointer. It exposes
+// kernel lookup (kernel_descriptor), kernarg layout queries
+// (get_kernargs_size_align), code-object loading (load_executable*) and global
+// symbol lookup (global_addr_by_name).
+//
+// The type is non-copyable. Both copy operations are deleted so that the raw
+// `impl` pointer can never be shallow-copied into a second object.
+class program_state {
+public:
+    program_state();
+    ~program_state();
+    program_state(const program_state&) = delete;
+    program_state& operator=(const program_state&) = delete;
+
+    // Returns the hipFunction_t for the given (uintptr_t, agent) pair.
+    // NOTE(review): the uintptr_t is presumably the host-side kernel address - confirm.
+    hipFunction_t kernel_descriptor(std::uintptr_t,
+                                    hsa_agent_t);
+
+    // Returns the kernargs_size_align handle associated with the uintptr_t key.
+    kernargs_size_align get_kernargs_size_align(std::uintptr_t);
+    // Load a code blob (pointer + size) into the given executable for the agent.
+    // NOTE(review): the "_no_copy" variant presumably uses the caller's buffer
+    // in place - confirm against the implementation.
+    hsa_executable_t load_executable(const char*, const size_t,
+                                     hsa_executable_t,
+                                     hsa_agent_t);
+    hsa_executable_t load_executable_no_copy(const char*, const size_t,
+                                             hsa_executable_t,
+                                             hsa_agent_t);
+
+    // Looks up a global by its symbol name and returns its address.
+    void* global_addr_by_name(const char* name);
+
+private:
+    friend class agent_globals_impl;
+    program_state_impl* impl;
+};
+
+// Read-only view over an opaque handle. Only
+// program_state::get_kernargs_size_align is befriended, so only it can set
+// `handle` directly.
+class kernargs_size_align {
+    friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
+    const void* handle;
+public:
+    // NOTE(review): n is presumably the index of a kernel argument - confirm.
+    std::size_t size(std::size_t n) const;
+    std::size_t alignment(std::size_t n) const;
+    // Exposes the raw handle without transferring ownership.
+    const void* getHandle() const { return handle; }
+};
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+// Returns the program_state singleton, a function-local static constructed on
+// first call. The function has hidden visibility, so it is not exported from
+// the shared object that includes this header.
+inline __attribute__((visibility("hidden"))) program_state& get_program_state() {
+    static program_state state;
+    return state;
+}
+}  // Namespace hip_impl.
@@ -0,0 +1,388 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include <hip/hip_vector_types.h>
+#include <hip/hip_texture_types.h>
+#include <hip/amd_detail/ockl_image.h>
+
+#if !defined(__HIPCC_RTC__)
+#include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
+
+// Declares the two locals used by the texture-reference fetch bodies below.
+// Requires a parameter named `t` in scope:
+//   i - t.textureObject cast to a pointer to constant-address-space dwords;
+//   s - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD dwords.
+// Both are passed as the leading arguments to the __ockl_image_* calls.
+#define TEXTURE_PARAMETERS_INIT                                                                     \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+// Trait: true when T is one of the scalar channel types accepted by the
+// texture-reference fetch functions (char/short/int in signed and unsigned
+// form, or float).
+template <typename T>
+struct __hip_is_tex_channel_type {
+    static constexpr bool value =
+        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+        std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+        std::is_same<T, float>::value;
+};
+
+// Vector form: accepted when the element type is accepted and the vector has
+// 1, 2 or 4 components (3-component vectors are rejected).
+template <typename T, unsigned int rank>
+struct __hip_is_tex_channel_type<HIP_vector_type<T, rank>> {
+    static constexpr bool value =
+        __hip_is_tex_channel_type<T>::value &&
+        ((rank == 1) || (rank == 2) || (rank == 4));
+};
+
+// Trait: true for the scalar channel types that may be read in
+// hipReadModeNormalizedFloat, i.e. 8- and 16-bit integers only.
+template <typename T>
+struct __hip_is_tex_normalized_channel_type {
+    static constexpr bool value =
+        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value;
+};
+
+// Vector form: element type must qualify and the vector must have 1, 2 or 4
+// components.
+template <typename T, unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {
+    static constexpr bool value =
+        __hip_is_tex_normalized_channel_type<T>::value &&
+        ((rank == 1) || (rank == 2) || (rank == 4));
+};
+
+// Maps (channel type T, read mode) to the type returned by the
+// texture-reference fetch functions.
+//
+// Primary template: Enable defaults to void, but __hip_tex_ret_t below always
+// passes `bool`. The primary is therefore only instantiated when no
+// specialization matches, and its static_assert then fails with
+// "Invalid channel type!".
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex_ret
+{
+    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
+};
+
+// Convenience alias; passes Enable = bool so that the enable_if<..., bool>
+// specializations below are selected.
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
+
+// Element-type read of a supported channel type: returned unchanged.
+template <typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type>
+{
+    using type = T;
+};
+
+// Element-type read of a vector: a vector of the same rank whose element type
+// is the scalar mapping of T.
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
+};
+
+// Normalized-float read of an 8/16-bit integer channel: returns float.
+template<typename T>
+struct __hip_tex_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float;
+};
+
+// Normalized-float read of a vector: a float vector of the same rank.
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
+};
+
+// Unsampled 1D load at integer index x via __ockl_image_load_1Db (the sampler
+// is not used); the raw result is converted with mapFrom.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_load_1Db(i, x);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 1D fetch at coordinate x via __ockl_image_sample_1D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1D(texture<T, hipTextureType1D, readMode> t, float x)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_1D(i, s, x);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 2D fetch; (x, y) is packed into a float2 for __ockl_image_sample_2D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled fetch from a 1D layered texture; the layer index travels as the
+// second component of the float2 coordinate given to __ockl_image_sample_1Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled fetch from a 2D layered texture; coordinates are packed as
+// (x, y, layer, 0) for __ockl_image_sample_2Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 3D fetch; coordinates are packed as (x, y, z, 0) for
+// __ockl_image_sample_3D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled cubemap fetch; coordinates are packed as (x, y, z, 0) for
+// __ockl_image_sample_CM.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 1D fetch with an explicit `level` passed to __ockl_image_sample_lod_1D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_1D(i, s, x, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 2D fetch with an explicit `level` passed to __ockl_image_sample_lod_2D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 1D layered fetch with an explicit `level`; coordinates are packed
+// as (x, layer) for __ockl_image_sample_lod_1Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 2D layered fetch with an explicit `level`; coordinates are packed
+// as (x, y, layer, 0) for __ockl_image_sample_lod_2Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 3D fetch with an explicit `level` passed to __ockl_image_sample_lod_3D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled cubemap fetch with an explicit `level` passed to
+// __ockl_image_sample_lod_CM.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled fetch from a layered cubemap; the layer index is the fourth
+// component of the coordinate given to __ockl_image_sample_CMa.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled layered-cubemap fetch with an explicit `level`; coordinates are
+// packed as (x, y, z, layer) for __ockl_image_sample_lod_CMa.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return mapFrom<result_t>(texel);
+}
+
+// Placeholder: no texture read is performed; the function always returns a
+// value-initialized result and ignores its coordinate and gradient arguments.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO: __ockl_image_sample_grad_CM is missing in device libs. Intended body:
+    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return mapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+    return {};
+}
+
+// Placeholder: no texture read is performed; the function always returns a
+// value-initialized result and ignores its coordinate, layer and gradient
+// arguments.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // TODO: __ockl_image_sample_grad_CMa is missing in device libs. Intended body:
+    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return mapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
+    return {};
+}
+
+// Sampled 1D fetch with explicit gradients dPdx / dPdy forwarded to
+// __ockl_image_sample_grad_1D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 2D fetch with explicit float2 gradients forwarded to
+// __ockl_image_sample_grad_2D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+                                             float2(dPdx.x, dPdx.y).data,
+                                             float2(dPdy.x, dPdy.y).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 1D layered fetch with explicit scalar gradients; coordinates are
+// packed as (x, layer) for __ockl_image_sample_grad_1Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 2D layered fetch with explicit float2 gradients; coordinates are
+// packed as (x, y, layer, 0) for __ockl_image_sample_grad_2Da.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,
+                                              float2(dPdx.x, dPdx.y).data,
+                                              float2(dPdy.x, dPdy.y).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Sampled 3D fetch with explicit gradients. Only the x/y/z components of
+// dPdx and dPdy are used; the fourth component is forced to 0.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    using result_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto texel = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,
+                                             float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data,
+                                             float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return mapFrom<result_t>(texel);
+}
+
+// Maps (channel type T, read mode) to the 4-component type returned by
+// tex2Dgather.
+//
+// Primary template: only instantiated when no specialization below matches,
+// because __hip_tex2dgather_ret_t always passes Enable = bool. Its
+// static_assert then fails with "Invalid channel type!".
+template <
+    typename T,
+    hipTextureReadMode readMode,
+    typename Enable = void>
+struct __hip_tex2dgather_ret
+{
+    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
+};
+
+// Convenience alias; passes Enable = bool to select the enable_if<..., bool>
+// specializations.
+template <
+    typename T,
+    hipTextureReadMode readMode>
+using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
+
+// Element-type read of a supported channel type T: a 4-vector of T.
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+// Element-type read of a vector texel: a 4-vector of the vector's element
+// type, regardless of the texel's own rank.
+template<
+    typename T,
+    unsigned int rank>
+struct __hip_tex2dgather_ret<
+    HIP_vector_type<T, rank>,
+    hipReadModeElementType,
+    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
+{
+    using type = HIP_vector_type<T, 4>;
+};
+
+// Normalized-float read: always float4.
+template <typename T>
+struct __hip_tex2dgather_ret<
+    T,
+    hipReadModeNormalizedFloat,
+    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
+{
+    using type = float4;
+};
+
+// Gathers one channel from the texels around (x, y) through a texture
+// reference. comp selects the channel: 1 -> g, 2 -> b, 3 -> a, and any other
+// value (including the default 0) -> r.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode>
+tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
+{
+    using result_t = __hip_tex2dgather_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    switch (comp) {
+    case 1: {
+        auto green = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return mapFrom<result_t>(green);
+    }
+    case 2: {
+        auto blue = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return mapFrom<result_t>(blue);
+    }
+    case 3: {
+        auto alpha = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return mapFrom<result_t>(alpha);
+    }
+    default: {
+        auto red = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return mapFrom<result_t>(red);
+    }
+    }
+    // Not reached: every switch arm returns.
+    return {};
+}
+
+#endif
@@ -0,0 +1,503 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include <hip/hip_vector_types.h>
+#include <hip/hip_texture_types.h>
+#include <hip/amd_detail/ockl_image.h>
+
+#if !defined(__HIPCC_RTC__)
+#include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
+
+// Declares the two locals used by the texture-object fetch bodies below.
+// Requires a parameter named `textureObject` in scope:
+//   i - textureObject cast to a pointer to constant-address-space dwords;
+//   s - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD dwords.
+// The expansion already ends in ';', so call sites use it without one.
+#define TEXTURE_OBJECT_PARAMETERS_INIT                                                            \
+    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
+    unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+// Trait gating the texture-object fetch overloads: true for char/short/int in
+// signed and unsigned form, and for float.
+template <typename T>
+struct __hip_is_itex_channel_type {
+    static constexpr bool value =
+        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
+        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+        std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+        std::is_same<T, float>::value;
+};
+
+// Vector form: element type must qualify and the vector must have 1, 2 or 4
+// components.
+template <typename T, unsigned int rank>
+struct __hip_is_itex_channel_type<HIP_vector_type<T, rank>> {
+    static constexpr bool value =
+        __hip_is_itex_channel_type<T>::value &&
+        ((rank == 1) || (rank == 2) || (rank == 4));
+};
+
+// Unsampled 1D load at integer index x from a texture object, via
+// __ockl_image_load_1Db; the raw result is converted to T with mapFrom.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_load_1Db(i, x);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex1Dfetch<T>(textureObject, x) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x)
+{
+    *ptr = tex1Dfetch<T>(textureObject, x);
+}
+
+// Sampled 1D fetch at coordinate x from a texture object, via
+// __ockl_image_sample_1D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_1D(i, s, x);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex1D<T>(textureObject, x) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x)
+{
+    *ptr = tex1D<T>(textureObject, x);
+}
+
+// Sampled 2D fetch from a texture object; (x, y) is packed into a float2 for
+// __ockl_image_sample_2D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex2D<T>(textureObject, x, y) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x, float y)
+{
+    *ptr = tex2D<T>(textureObject, x, y);
+}
+
+// Sampled 3D fetch from a texture object; coordinates are packed as
+// (x, y, z, 0) for __ockl_image_sample_3D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex3D<T>(textureObject, x, y, z) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = tex3D<T>(textureObject, x, y, z);
+}
+
+// Sampled fetch from a 1D layered texture object; coordinates are packed as
+// (x, layer) for __ockl_image_sample_1Da.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex1DLayered<T>(textureObject, x, layer)
+// into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject, float x, int layer)
+{
+    *ptr = tex1DLayered<T>(textureObject, x, layer);
+}
+
+// Sampled fetch from a 2D layered texture object; coordinates are packed as
+// (x, y, layer, 0) for __ockl_image_sample_2Da.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex2DLayered<T>(textureObject, x, y, layer)
+// into *ptr.
+// Fix: this previously forwarded to tex1DLayered<T> with four arguments, which
+// matches no tex1DLayered overload and so failed on instantiation.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
+{
+    *ptr = tex2DLayered<T>(textureObject, x, y, layer);
+}
+
+// Sampled cubemap fetch from a texture object; coordinates are packed as
+// (x, y, z, 0) for __ockl_image_sample_CM.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores texCubemap<T>(textureObject, x, y, z) into
+// *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+    *ptr = texCubemap<T>(textureObject, x, y, z);
+}
+
+// Sampled fetch from a layered cubemap texture object; coordinates are packed
+// as (x, y, z, layer) for __ockl_image_sample_CMa.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores
+// texCubemapLayered<T>(textureObject, x, y, z, layer) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+    *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
+}
+
+// Gathers one channel from the texels around (x, y) through a texture object.
+// comp selects the channel: 1 -> g, 2 -> b, 3 -> a, and any other value
+// (including the default 0) -> r. This matches the texture-reference
+// tex2Dgather overload.
+// Fix: the mapping was previously shifted by one (1 -> r, 2 -> g, 3 -> b,
+// default -> a), so the default comp = 0 returned alpha instead of red.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return mapFrom<T>(tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return mapFrom<T>(tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return mapFrom<T>(tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return mapFrom<T>(tmp);
+    }
+    }
+    // Not reached: every switch arm returns.
+    return {};
+}
+
+// Pointer-output overload: stores tex2Dgather<T>(textureObject, x, y, comp)
+// into *ptr.
+// Fix: this previously forwarded to texCubemapLayered<T>(textureObject, x, y,
+// comp), which matches no texCubemapLayered overload and is the wrong
+// operation for a gather.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{
+    *ptr = tex2Dgather<T>(textureObject, x, y, comp);
+}
+
+// Sampled 1D fetch from a texture object with an explicit `level` passed to
+// __ockl_image_sample_lod_1D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_lod_1D(i, s, x, level);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex1DLod<T>(textureObject, x, level) into
+// *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x, float level)
+{
+    *ptr = tex1DLod<T>(textureObject, x, level);
+}
+
+// Sampled 2D fetch from a texture object with an explicit `level` passed to
+// __ockl_image_sample_lod_2D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex2DLod<T>(textureObject, x, y, level)
+// into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float level)
+{
+    *ptr = tex2DLod<T>(textureObject, x, y, level);
+}
+
+// Sampled 3D fetch from a texture object with an explicit `level` passed to
+// __ockl_image_sample_lod_3D.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores tex3DLod<T>(textureObject, x, y, z, level)
+// into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = tex3DLod<T>(textureObject, x, y, z, level);
+}
+
+// Sampled fetch from a 1D layered texture object with an explicit `level`;
+// coordinates are packed as (x, layer) for __ockl_image_sample_lod_1Da.
+// Fix: this previously called the non-LOD __ockl_image_sample_1Da and
+// silently ignored `level`. It now matches the texture-reference
+// tex1DLayeredLod overload.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return mapFrom<T>(tmp);
+}
+
+// Pointer-output overload: stores
+// tex1DLayeredLod<T>(textureObject, x, layer, level) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject, float x, int layer, float level)
+{
+    *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
+}
+
+// Sampled fetch from a 2D layered texture object with an explicit `level`;
+// coordinates are packed as (x, y, layer, 0) for __ockl_image_sample_lod_2Da.
+// Fix: this previously called the non-LOD __ockl_image_sample_2Da and
+// silently ignored `level`. It now matches the texture-reference
+// tex2DLayeredLod overload.
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__  T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return mapFrom<T>(tmp);
+}
+
+// Pointer-output overload: stores
+// tex2DLayeredLod<T>(textureObject, x, y, layer, level) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+    *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
+}
+
+// Sampled cubemap fetch from a texture object with an explicit `level` passed
+// to __ockl_image_sample_lod_CM.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return mapFrom<T>(texel);
+}
+
+// Pointer-output overload: stores
+// texCubemapLod<T>(textureObject, x, y, z, level) into *ptr.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+    *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
+}
+
+// Placeholder: no texture read is performed; the function always returns a
+// value-initialized T and ignores its coordinate and gradient arguments.
+template <typename T,
+          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    // TODO: __ockl_image_sample_grad_CM is missing in device libs. Intended body:
+    // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return mapFrom<T>(tmp);
+    return {};
+}
+
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_lod_CMa at (x, y, z),
+ * with `layer` as the fourth coordinate component and level of detail
+ * `level`, and converts the result to T via mapFrom<T>.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayeredLod(
+    hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of texCubemapLayeredLod: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayeredLod(
+    T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+    *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_grad_1D at coordinate x
+ * with gradients dPdx / dPdy, and converts the result to T via mapFrom<T>.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DGrad(
+    hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of tex1DGrad: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+    *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_grad_2D at (x, y) with
+ * two-component gradients dPdx / dPdy, and converts the result to T via
+ * mapFrom<T>.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DGrad(
+    hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_grad_2D(i, s,
+                                             float2(x, y).data,
+                                             float2(dPdx.x, dPdx.y).data,
+                                             float2(dPdy.x, dPdy.y).data);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of tex2DGrad: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+    *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_grad_3D at (x, y, z).
+ * Only the x/y/z components of dPdx and dPdy are forwarded; the fourth
+ * component of each vector passed to the sampler is set to 0.0f.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex3DGrad(
+    hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_grad_3D(i, s,
+                                             float4(x, y, z, 0.0f).data,
+                                             float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data,
+                                             float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of tex3DGrad: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex3DGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+    *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_grad_1Da at coordinate x,
+ * with `layer` as the second coordinate component and gradients dPdx / dPdy,
+ * and converts the result to T via mapFrom<T>.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex1DLayeredGrad(
+    hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of tex1DLayeredGrad: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex1DLayeredGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+    *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
+}
+
+/**
+ * Samples textureObject through __ockl_image_sample_grad_2Da at (x, y), with
+ * `layer` as the third coordinate component and two-component gradients
+ * dPdx / dPdy, and converts the result to T via mapFrom<T>.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T tex2DLayeredGrad(
+    hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    // Macro defined elsewhere; presumably introduces the `i` and `s` handles used below.
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    auto texel = __ockl_image_sample_grad_2Da(i, s,
+                                              float4(x, y, layer, 0.0f).data,
+                                              float2(dPdx.x, dPdx.y).data,
+                                              float2(dPdy.x, dPdy.y).data);
+    return mapFrom<T>(texel);
+}
+
+/** Out-parameter form of tex2DLayeredGrad: stores the sampled value through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void tex2DLayeredGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+    *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
+}
+
+/**
+ * Placeholder for gradient-based layered cubemap sampling.
+ *
+ * No sampling is performed: the required device-library routine is not
+ * available (see TODO), so a value-initialized T is returned and the
+ * coordinate / layer / gradient arguments are ignored.
+ */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ T texCubemapLayeredGrad(
+    hipTextureObject_t textureObject, float x, float y, float z, int layer,
+    float4 dPdx, float4 dPdy)
+{
+    TEXTURE_OBJECT_PARAMETERS_INIT
+    // TODO missing in device libs.
+    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    // return mapFrom<T>(tmp);
+    return {};
+}
+
+/** Out-parameter form of texCubemapLayeredGrad: stores the (placeholder) result through ptr. */
+template <
+    typename T,
+    typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ __hip_img_chk__ void texCubemapLayeredGrad(
+    T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer,
+    float4 dPdx, float4 dPdy)
+{
+    *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
+}
+
+#endif
@@ -0,0 +1 @@
+amd_detail
@@ -0,0 +1 @@
+nvidia_detail
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#include "channel_descriptor.h"
+
+#endif
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
+
+
+/**
+ * Atomically raises *addr to val when *addr < val, using a compare-and-swap
+ * loop on the 32-bit representation.
+ *
+ * @param addr  Address of the float to update.
+ * @param val   Candidate maximum.
+ * @return The contents of *addr observed just before this call's update; when
+ *         no update is needed (the stored value is not below val, including
+ *         the NaN case where the comparison is false) the observed value.
+ */
+__device__ inline float atomicMax(float* addr, float val) {
+    unsigned int* uaddr = (unsigned int*)addr;
+    unsigned int observed = *uaddr;
+    unsigned int assumed;
+
+    do {
+        assumed = observed;
+        // Stop as soon as the stored value is no longer below val.
+        if (!(__uint_as_float(assumed) < val)) {
+            break;
+        }
+        // The swap takes effect only if the word still holds `assumed`;
+        // otherwise retry against the freshly observed contents.
+        observed = atomicCAS(uaddr, assumed, __float_as_uint(val));
+    } while (observed != assumed);
+
+    return __uint_as_float(observed);
+}
+
+/**
+ * Atomically raises *addr to val when *addr < val, using a compare-and-swap
+ * loop on the 64-bit representation.
+ *
+ * @param addr  Address of the double to update.
+ * @param val   Candidate maximum.
+ * @return The contents of *addr observed just before this call's update; when
+ *         no update is needed (the stored value is not below val, including
+ *         the NaN case where the comparison is false) the observed value.
+ */
+__device__ inline double atomicMax(double* addr, double val) {
+    unsigned long long* uaddr = (unsigned long long*)addr;
+    unsigned long long observed = *uaddr;
+    unsigned long long assumed;
+
+    do {
+        assumed = observed;
+        // Stop as soon as the stored value is no longer below val.
+        if (!(__longlong_as_double((long long)assumed) < val)) {
+            break;
+        }
+        // The swap takes effect only if the word still holds `assumed`;
+        // otherwise retry against the freshly observed contents.
+        observed = atomicCAS(uaddr, assumed,
+                             (unsigned long long)__double_as_longlong(val));
+    } while (observed != assumed);
+
+    return __longlong_as_double((long long)observed);
+}
+
+/**
+ * Atomically lowers *addr to val when *addr > val, using a compare-and-swap
+ * loop on the 32-bit representation.
+ *
+ * @param addr  Address of the float to update.
+ * @param val   Candidate minimum.
+ * @return The contents of *addr observed just before this call's update; when
+ *         no update is needed (the stored value is not above val, including
+ *         the NaN case where the comparison is false) the observed value.
+ */
+__device__ inline float atomicMin(float* addr, float val) {
+    unsigned int* uaddr = (unsigned int*)addr;
+    unsigned int observed = *uaddr;
+    unsigned int assumed;
+
+    do {
+        assumed = observed;
+        // Stop as soon as the stored value is no longer above val.
+        if (!(__uint_as_float(assumed) > val)) {
+            break;
+        }
+        // The swap takes effect only if the word still holds `assumed`;
+        // otherwise retry against the freshly observed contents.
+        observed = atomicCAS(uaddr, assumed, __float_as_uint(val));
+    } while (observed != assumed);
+
+    return __uint_as_float(observed);
+}
+
+/**
+ * Atomically lowers *addr to val when *addr > val, using a compare-and-swap
+ * loop on the 64-bit representation.
+ *
+ * @param addr  Address of the double to update.
+ * @param val   Candidate minimum.
+ * @return The contents of *addr observed just before this call's update; when
+ *         no update is needed (the stored value is not above val, including
+ *         the NaN case where the comparison is false) the observed value.
+ */
+__device__ inline double atomicMin(double* addr, double val) {
+    unsigned long long* uaddr = (unsigned long long*)addr;
+    unsigned long long observed = *uaddr;
+    unsigned long long assumed;
+
+    do {
+        assumed = observed;
+        // Stop as soon as the stored value is no longer above val.
+        if (!(__longlong_as_double((long long)assumed) > val)) {
+            break;
+        }
+        // The swap takes effect only if the word still holds `assumed`;
+        // otherwise retry against the freshly observed contents.
+        observed = atomicCAS(uaddr, assumed,
+                             (unsigned long long)__double_as_longlong(val));
+    } while (observed != assumed);
+
+    return __longlong_as_double((long long)observed);
+}
+
+#endif
@@ -0,0 +1,119 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+
+#include "cuComplex.h"
+
+/* Single-precision complex type and helpers, each forwarding to cuComplex.h. */
+typedef cuFloatComplex hipFloatComplex;
+
+/** Returns cuCrealf(z). */
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) {
+    return cuCrealf(z);
+}
+
+/** Returns cuCimagf(z). */
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) {
+    return cuCimagf(z);
+}
+
+/** Builds a hipFloatComplex from (a, b) via make_cuFloatComplex. */
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+    const hipFloatComplex built = make_cuFloatComplex(a, b);
+    return built;
+}
+
+/** Returns cuConjf(z). */
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+    return cuConjf(z);
+}
+
+/** Returns the square of cuCabsf(z). */
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+    const float magnitude = cuCabsf(z);
+    return magnitude * magnitude;
+}
+
+/** Returns cuCaddf(p, q). */
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+    const hipFloatComplex sum = cuCaddf(p, q);
+    return sum;
+}
+
+/** Returns cuCsubf(p, q). */
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+    const hipFloatComplex difference = cuCsubf(p, q);
+    return difference;
+}
+
+/** Returns cuCmulf(p, q). */
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+    const hipFloatComplex product = cuCmulf(p, q);
+    return product;
+}
+
+/** Returns cuCdivf(p, q). */
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+    const hipFloatComplex quotient = cuCdivf(p, q);
+    return quotient;
+}
+
+/** Returns cuCabsf(z). */
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) {
+    return cuCabsf(z);
+}
+
+/* Double-precision complex type and helpers, each forwarding to cuComplex.h. */
+typedef cuDoubleComplex hipDoubleComplex;
+
+/** Returns cuCreal(z). */
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) {
+    return cuCreal(z);
+}
+
+/** Returns cuCimag(z). */
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) {
+    return cuCimag(z);
+}
+
+/** Builds a hipDoubleComplex from (a, b) via make_cuDoubleComplex. */
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+    const hipDoubleComplex built = make_cuDoubleComplex(a, b);
+    return built;
+}
+
+/** Returns cuConj(z). */
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+    return cuConj(z);
+}
+
+/** Returns the square of cuCabs(z). */
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+    const double magnitude = cuCabs(z);
+    return magnitude * magnitude;
+}
+
+/** Returns cuCadd(p, q). */
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+    const hipDoubleComplex sum = cuCadd(p, q);
+    return sum;
+}
+
+/** Returns cuCsub(p, q). */
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+    const hipDoubleComplex difference = cuCsub(p, q);
+    return difference;
+}
+
+/** Returns cuCmul(p, q). */
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+    const hipDoubleComplex product = cuCmul(p, q);
+    return product;
+}
+
+/** Returns cuCdiv(p, q). */
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+    const hipDoubleComplex quotient = cuCdiv(p, q);
+    return quotient;
+}
+
+/** Returns cuCabs(z). */
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) {
+    return cuCabs(z);
+}
+
+typedef cuFloatComplex hipComplex;
+
+/**
+ * Builds a hipComplex from real part x and imaginary part y via make_cuComplex.
+ * Named consistently with make_hipFloatComplex / make_hipDoubleComplex.
+ */
+__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+    return make_cuComplex(x, y);
+}
+
+/** Original spelling, kept for existing callers; same result as make_hipComplex. */
+__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
+    return make_cuComplex(x, y);
+}
+
+/** Converts a double-precision complex to single precision via cuComplexDoubleToFloat. */
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+    const hipFloatComplex narrowed = cuComplexDoubleToFloat(z);
+    return narrowed;
+}
+
+/** Converts a single-precision complex to double precision via cuComplexFloatToDouble. */
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+    const hipDoubleComplex widened = cuComplexFloatToDouble(z);
+    return widened;
+}
+
+/** Returns cuCfmaf(p, q, r). */
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+    const hipComplex fused = cuCfmaf(p, q, r);
+    return fused;
+}
+
+/** Returns cuCfma(p, q, r). */
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
+    const hipDoubleComplex fused = cuCfma(p, q, r);
+    return fused;
+}
+
+#endif
@@ -0,0 +1,12 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+// Include CUDA headers
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+
+// Include HIP wrapper headers around CUDA
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef NVIDIA_HIP_MATH_CONSTANTS_H
+#define NVIDIA_HIP_MATH_CONSTANTS_H
+#include <math_constants.h>
+// Single-precision HIP_* math constants. Each macro is defined as the
+// CUDART_* macro of the same suffix (this header includes <math_constants.h>).
+#define HIP_INF_F            CUDART_INF_F
+#define HIP_NAN_F            CUDART_NAN_F
+#define HIP_MIN_DENORM_F     CUDART_MIN_DENORM_F
+#define HIP_MAX_NORMAL_F     CUDART_MAX_NORMAL_F
+#define HIP_NEG_ZERO_F       CUDART_NEG_ZERO_F
+#define HIP_ZERO_F           CUDART_ZERO_F
+#define HIP_ONE_F            CUDART_ONE_F
+#define HIP_SQRT_HALF_F      CUDART_SQRT_HALF_F
+#define HIP_SQRT_HALF_HI_F   CUDART_SQRT_HALF_HI_F
+#define HIP_SQRT_HALF_LO_F   CUDART_SQRT_HALF_LO_F
+#define HIP_SQRT_TWO_F       CUDART_SQRT_TWO_F
+#define HIP_THIRD_F          CUDART_THIRD_F
+#define HIP_PIO4_F           CUDART_PIO4_F
+#define HIP_PIO2_F           CUDART_PIO2_F
+#define HIP_3PIO4_F          CUDART_3PIO4_F
+#define HIP_2_OVER_PI_F      CUDART_2_OVER_PI_F
+#define HIP_SQRT_2_OVER_PI_F CUDART_SQRT_2_OVER_PI_F
+#define HIP_PI_F             CUDART_PI_F
+#define HIP_L2E_F            CUDART_L2E_F
+#define HIP_L2T_F            CUDART_L2T_F
+#define HIP_LG2_F            CUDART_LG2_F
+#define HIP_LGE_F            CUDART_LGE_F
+#define HIP_LN2_F            CUDART_LN2_F
+#define HIP_LNT_F            CUDART_LNT_F
+#define HIP_LNPI_F           CUDART_LNPI_F
+#define HIP_TWO_TO_M126_F    CUDART_TWO_TO_M126_F
+#define HIP_TWO_TO_126_F     CUDART_TWO_TO_126_F
+#define HIP_NORM_HUGE_F      CUDART_NORM_HUGE_F
+#define HIP_TWO_TO_23_F      CUDART_TWO_TO_23_F
+#define HIP_TWO_TO_24_F      CUDART_TWO_TO_24_F
+#define HIP_TWO_TO_31_F      CUDART_TWO_TO_31_F
+#define HIP_TWO_TO_32_F      CUDART_TWO_TO_32_F
+#define HIP_REMQUO_BITS_F    CUDART_REMQUO_BITS_F
+#define HIP_REMQUO_MASK_F    CUDART_REMQUO_MASK_F
+#define HIP_TRIG_PLOSS_F     CUDART_TRIG_PLOSS_F
+#endif
+
+
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+
+#include <cuda_runtime.h>
+
+#include <hip/hip_runtime_api.h>
+
+// Expands to its arguments unchanged.
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+
+typedef int hipLaunchParm;
+
+// Expands to a CUDA triple-chevron launch of kernelName configured with
+// numBlocks, numThreads, memPerBlock and streamId; the remaining arguments are
+// passed as the kernel's parameters. Wrapped in do/while(0) so the expansion
+// behaves as a single statement.
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
+    do {                                                                                           \
+        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
+    } while (0)
+
+// Public launch macro: parenthesizes kernelName and forwards everything else
+// to hipLaunchKernelGGLInternal.
+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+
+// HIP spelling of the CUDA read-mode enumerator.
+#define hipReadModeElementType cudaReadModeElementType
+
+// Architecture feature-test macros. They are defined only while __CUDA_ARCH__
+// is defined, and each one expands to a comparison of __CUDA_ARCH__ against a
+// minimum architecture number.
+#ifdef __CUDA_ARCH__
+
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
+
+// 64-bit Atomics:
+// NOTE(review): older CUDA programming guides appear to list 64-bit global
+// atomics from compute capability 1.2 and 64-bit shared atomics from 2.0, so
+// the two thresholds below may be swapped — confirm before relying on them.
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
+
+#endif
+
+#ifdef __CUDACC__
+
+#include "nvidia_hip_atomics.h"
+#include "nvidia_hip_unsafe_atomics.h"
+
+// HIP spellings of the CUDA built-in index/dimension variables; each macro
+// expands to the corresponding component of threadIdx, blockIdx, blockDim or
+// gridDim.
+#define hipThreadIdx_x threadIdx.x
+#define hipThreadIdx_y threadIdx.y
+#define hipThreadIdx_z threadIdx.z
+
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z
+
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z
+
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z
+
+// Expands to the address of X. Note the argument is not parenthesized.
+#define HIP_SYMBOL(X) &X
+
+/**
+ * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
+ * To be removed in a future release.
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+// Expands to nothing on this platform.
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+#ifdef __HIP_DEVICE_COMPILE__
+// Device-side abort: emits the `trap;` instruction through inline asm.
+#define abort_()                                                                                    \
+    { asm("trap;"); }
+#undef assert
+// Device-side replacement for assert: calls abort_() when COND is false.
+// COND is parenthesized before negation so that compound expressions such as
+// `a == b` or `p && q` are negated as a whole instead of `!` binding only to
+// the first operand.
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) {                                                                             \
+            abort_();                                                                               \
+        }                                                                                          \
+    }
+#endif
+
+// Double-underscore spellings that expand to the CUDA clock() / clock64() calls.
+#define __clock() clock()
+#define __clock64() clock64()
+
+#endif
+
+#endif
@@ -0,0 +1,6 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+
+#include <texture_types.h>
+
+#endif
@@ -0,0 +1,100 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+
+/** Forwards to atomicAdd(addr, value) and returns its result. */
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+    const float result = atomicAdd(addr, value);
+    return result;
+}
+
+/**
+ * Atomically adds value to *addr and returns the contents of *addr from
+ * before the addition.
+ *
+ * When __CUDA_ARCH__ is below 600 the addition is emulated with a 64-bit
+ * compare-and-swap loop; otherwise the call forwards to atomicAdd.
+ *
+ * @param addr   Address of the double to update.
+ * @param value  Amount to add.
+ * @return The previous contents of *addr as a double.
+ */
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long* addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value + __longlong_as_double(expected)));
+        // Compare the raw bit patterns: the swap succeeded exactly when the
+        // word still held `expected`.
+    } while (expected != old_val);
+    // Reinterpret the bits as a double rather than converting the integer numerically.
+    return __longlong_as_double(old_val);
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+/** Forwards to atomicMax(addr, value) for float and returns its result. */
+__device__ inline float unsafeAtomicMax(float* addr, float value) {
+    const float result = atomicMax(addr, value);
+    return result;
+}
+
+/** Forwards to atomicMax(addr, val) for double and returns its result. */
+__device__ inline double unsafeAtomicMax(double* addr, double val) {
+    const double result = atomicMax(addr, val);
+    return result;
+}
+
+/** Forwards to atomicMin(addr, value) for float and returns its result. */
+__device__ inline float unsafeAtomicMin(float* addr, float value) {
+    const float result = atomicMin(addr, value);
+    return result;
+}
+
+/** Forwards to atomicMin(addr, val) for double and returns its result. */
+__device__ inline double unsafeAtomicMin(double* addr, double val) {
+    const double result = atomicMin(addr, val);
+    return result;
+}
+
+/** Forwards to atomicAdd(addr, value) and returns its result. */
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+    const float result = atomicAdd(addr, value);
+    return result;
+}
+
+/**
+ * Atomically adds value to *addr and returns the contents of *addr from
+ * before the addition.
+ *
+ * When __CUDA_ARCH__ is below 600 the addition is emulated with a 64-bit
+ * compare-and-swap loop; otherwise the call forwards to atomicAdd.
+ *
+ * @param addr   Address of the double to update.
+ * @param value  Amount to add.
+ * @return The previous contents of *addr as a double.
+ */
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long* addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value + __longlong_as_double(expected)));
+        // Compare the raw bit patterns: the swap succeeded exactly when the
+        // word still held `expected`.
+    } while (expected != old_val);
+    // Reinterpret the bits as a double rather than converting the integer numerically.
+    return __longlong_as_double(old_val);
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+/** Forwards to atomicMax(addr, value) for float and returns its result. */
+__device__ inline float safeAtomicMax(float* addr, float value) {
+    const float result = atomicMax(addr, value);
+    return result;
+}
+
+/** Forwards to atomicMax(addr, val) for double and returns its result. */
+__device__ inline double safeAtomicMax(double* addr, double val) {
+    const double result = atomicMax(addr, val);
+    return result;
+}
+
+/** Forwards to atomicMin(addr, value) for float and returns its result. */
+__device__ inline float safeAtomicMin(float* addr, float value) {
+    const float result = atomicMin(addr, value);
+    return result;
+}
+
+/** Forwards to atomicMin(addr, val) for double and returns its result. */
+__device__ inline double safeAtomicMin(double* addr, double val) {
+    const double result = atomicMin(addr, val);
+    return result;
+}
+
+#endif
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+#include <cuda.h>
+#include <nvrtc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+#if !defined(_WIN32)
+#pragma GCC visibility push(default)
+#endif
+
+/*
+ * Result codes returned by the hiprtc* wrappers in this header. The converter
+ * functions below translate each enumerator to and from the NVRTC_* enumerator
+ * with the same suffix.
+ */
+typedef enum hiprtcResult {
+  HIPRTC_SUCCESS = 0,
+  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  HIPRTC_ERROR_INVALID_INPUT = 3,
+  HIPRTC_ERROR_INVALID_PROGRAM = 4,
+  HIPRTC_ERROR_INVALID_OPTION = 5,
+  HIPRTC_ERROR_COMPILATION = 6,
+  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
+
+/*
+ * Translates a hiprtcResult into the NVRTC_* enumerator with the same suffix.
+ * HIPRTC_ERROR_INTERNAL_ERROR and any value not listed map to
+ * NVRTC_ERROR_INTERNAL_ERROR.
+ */
+inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
+  /* Fallback used for internal errors and unrecognised codes. */
+  nvrtcResult converted = NVRTC_ERROR_INTERNAL_ERROR;
+
+  switch (result) {
+    case HIPRTC_SUCCESS:
+      converted = NVRTC_SUCCESS;
+      break;
+    case HIPRTC_ERROR_OUT_OF_MEMORY:
+      converted = NVRTC_ERROR_OUT_OF_MEMORY;
+      break;
+    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      converted = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+      break;
+    case HIPRTC_ERROR_INVALID_INPUT:
+      converted = NVRTC_ERROR_INVALID_INPUT;
+      break;
+    case HIPRTC_ERROR_INVALID_PROGRAM:
+      converted = NVRTC_ERROR_INVALID_PROGRAM;
+      break;
+    case HIPRTC_ERROR_INVALID_OPTION:
+      converted = NVRTC_ERROR_INVALID_OPTION;
+      break;
+    case HIPRTC_ERROR_COMPILATION:
+      converted = NVRTC_ERROR_COMPILATION;
+      break;
+    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      converted = NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+      break;
+    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      converted = NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+      break;
+    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      converted = NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+      break;
+    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      converted = NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+      break;
+    case HIPRTC_ERROR_INTERNAL_ERROR:
+    default:
+      break;
+  }
+
+  return converted;
+}
+
+/*
+ * Translates an nvrtcResult into the HIPRTC_* enumerator with the same suffix.
+ * NVRTC_ERROR_INTERNAL_ERROR and any value not listed map to
+ * HIPRTC_ERROR_INTERNAL_ERROR.
+ */
+inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
+  /* Fallback used for internal errors and unrecognised codes. */
+  hiprtcResult converted = HIPRTC_ERROR_INTERNAL_ERROR;
+
+  switch (result) {
+    case NVRTC_SUCCESS:
+      converted = HIPRTC_SUCCESS;
+      break;
+    case NVRTC_ERROR_OUT_OF_MEMORY:
+      converted = HIPRTC_ERROR_OUT_OF_MEMORY;
+      break;
+    case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      converted = HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+      break;
+    case NVRTC_ERROR_INVALID_INPUT:
+      converted = HIPRTC_ERROR_INVALID_INPUT;
+      break;
+    case NVRTC_ERROR_INVALID_PROGRAM:
+      converted = HIPRTC_ERROR_INVALID_PROGRAM;
+      break;
+    case NVRTC_ERROR_INVALID_OPTION:
+      converted = HIPRTC_ERROR_INVALID_OPTION;
+      break;
+    case NVRTC_ERROR_COMPILATION:
+      converted = HIPRTC_ERROR_COMPILATION;
+      break;
+    case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      converted = HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+      break;
+    case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      converted = HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+      break;
+    case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      converted = HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+      break;
+    case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      converted = HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+      break;
+    case NVRTC_ERROR_INTERNAL_ERROR:
+    default:
+      break;
+  }
+
+  return converted;
+}
+
+/* Returns nvrtcGetErrorString() for the NVRTC code corresponding to result. */
+inline static const char* hiprtcGetErrorString(hiprtcResult result) {
+  const nvrtcResult nativeCode = hiprtcResultTonvrtcResult(result);
+  return nvrtcGetErrorString(nativeCode);
+}
+
+/* Calls nvrtcVersion(major, minor) and converts its status to hiprtcResult. */
+inline static hiprtcResult hiprtcVersion(int* major, int* minor) {
+  const nvrtcResult status = nvrtcVersion(major, minor);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Program handle: the NVRTC program type under its HIP name. */
+typedef nvrtcProgram hiprtcProgram;
+
+/*
+ * Program-level wrappers. Each one calls the NVRTC function named in its body
+ * with the same arguments and converts the returned status to hiprtcResult.
+ */
+
+/* Wraps nvrtcAddNameExpression. */
+inline static hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
+  const nvrtcResult status = nvrtcAddNameExpression(prog, name_expression);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcCompileProgram. */
+inline static hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
+  const nvrtcResult status = nvrtcCompileProgram(prog, numOptions, options);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcCreateProgram. */
+inline static hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
+                                 int numHeaders, const char** headers, const char** includeNames) {
+  const nvrtcResult status =
+      nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcDestroyProgram. */
+inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
+  const nvrtcResult status = nvrtcDestroyProgram(prog);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcGetLoweredName. */
+inline static hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
+                                  const char** lowered_name) {
+  const nvrtcResult status = nvrtcGetLoweredName(prog, name_expression, lowered_name);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcGetProgramLog. */
+inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
+  const nvrtcResult status = nvrtcGetProgramLog(prog, log);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcGetProgramLogSize. */
+inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+  const nvrtcResult status = nvrtcGetProgramLogSize(prog, logSizeRet);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcGetPTX: on this platform the "code" is the generated PTX. */
+inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
+  const nvrtcResult status = nvrtcGetPTX(prog, code);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+/* Wraps nvrtcGetPTXSize. */
+inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
+  const nvrtcResult status = nvrtcGetPTXSize(prog, codeSizeRet);
+  return nvrtcResultTohiprtcResult(status);
+}
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif  // HIPRTC_H
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Parse command-line options
+# Option strings
+SHORT=h
+LONG=help,opencl:,hip:,rocclr:
+# read the options
+OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@")
+if [ $? != 0 ] ; then echo "Failed to parse options...exiting." >&2 ; exit 1 ; fi
+
+# Print the command-line synopsis to stdout and terminate the script with status 1.
+usage() {
+    printf '%s\n' "Usage: $0 --hip <PATH to the hip common src> --opencl <PATH to the opencl src> --rocclr <PATH to the rocclr src>"
+    exit 1
+}
+
+[ $# -eq 0 ] && usage
+
+eval set -- "$OPTS"
+
+# extract options and their arguments into variables.
+while true ; do
+  case "$1" in
+    --hip )
+      HIP_DIR="$2"
+      shift 2
+      ;;
+    --rocclr )
+      ROCCLR_DIR="$2"
+      shift 2
+      ;;
+    --opencl )
+      OPENCL_DIR="$2"
+      shift 2
+      ;;
+    -h | --help )
+      usage
+      shift
+      ;;
+    -- )
+      shift
+      break
+      ;;
+    *)
+      echo "Internal error!"
+      exit 1
+      ;;
+  esac
+done
+
+# Directory the script was started from; built packages are copied back here.
+WORKING_DIR=$PWD
+# Absolute directory containing this script; passed to cmake as the source tree.
+SRC_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Fresh scratch directory that holds the build tree.
+BUILD_ROOT="$( mktemp -d )"
+# Parallel make with one job per online processor.
+DASH_JAY="-j $(getconf _NPROCESSORS_ONLN)"
+# First word of the NAME= entry of /etc/os-release with the double quotes removed.
+OS_NAME="$(awk -F '=' '/^NAME/{print $2}' /etc/os-release | awk '{print $1}' | tr -d '"')"
+# Default ROCm location when ROCM_PATH is unset or empty.
+: "${ROCM_PATH:=/opt/rocm}"
+
+# Write the first argument followed by a period to stderr; when called without any
+# argument the text "Died" is used instead.
+err() {
+    printf '%s.\n' "${1-Died}" >&2
+}
+
+# Report a fatal error through err and terminate the script with exit status 1.
+# The arguments are forwarded with "$@" rather than "$1": forwarding "$1" hands err
+# an empty string when die is called without a message, which defeats err's
+# "Died" default and prints a lone ".".
+die() {
+    err "$@"
+    exit 1
+}
+
+# Quiet replacement for the pushd builtin: same arguments and exit status, but the
+# directory-stack listing normally printed on stdout is discarded.
+pushd () {
+    builtin pushd "$@" 1> /dev/null
+}
+
+# Quiet replacement for the popd builtin: same arguments and exit status, but the
+# directory-stack listing normally printed on stdout is discarded.
+popd () {
+    builtin popd "$@" 1> /dev/null
+}
+
+# Install the distribution packages required to build and package HIP.
+# Only Ubuntu and CentOS (as reported by $OS_NAME) are handled; on any other system
+# nothing is installed and the function succeeds. The return status is that of the
+# last package-manager command that ran.
+# yum is invoked through sudo, like apt-get here and dpkg/rpm in buildHIP, so the
+# script behaves the same for a non-root user on both distributions.
+function setupENV()
+{
+    case "$OS_NAME" in
+      Ubuntu )
+        sudo apt-get update
+        sudo apt-get install dpkg-dev rpm doxygen libelf-dev rename liburi-encode-perl \
+           libfile-basedir-perl libfile-copy-recursive-perl libfile-listing-perl
+        ;;
+      CentOS )
+        sudo yum install dpkg-dev rpm-build doxygen elfutils-libelf-devel prename \
+           perl-URI-Encode perl-File-Listing perl-File-BaseDir
+        ;;
+    esac
+}
+
+# Copy the packages found in the current directory to $WORKING_DIR and install them
+# with the package tool matching $OS_NAME (dpkg on Ubuntu, rpm on CentOS).
+# Returns non-zero when the copy or the installation fails; does nothing on other
+# systems.
+function installHIPPackages()
+{
+    if [ "$OS_NAME" == "Ubuntu" ]
+    then
+      cp hip-*.deb "$WORKING_DIR" &&
+        sudo dpkg -i -B hip-dev*.deb hip-runtime-amd*.deb hip-sample*.deb hip-doc*.deb
+    elif [ "$OS_NAME" == "CentOS" ]
+    then
+      cp hip-*.rpm "$WORKING_DIR" &&
+        sudo rpm -ivh --replacefiles --force hip-devel*.rpm hip-runtime-amd*.rpm hip-sample*.rpm \
+           hip-doc*.rpm
+    fi
+}
+
+# Configure, build and package HIP inside $BUILD_ROOT/hip_build, then hand the
+# packages to installHIPPackages.
+# The steps are chained so the first failing one stops the sequence and the function
+# returns 1; previously the status of the final "rm -rf" was returned, so the
+# caller's "|| die" could never trigger. $BUILD_ROOT is removed on success and on
+# failure, and the caller's working directory is restored before returning.
+function buildHIP()
+{
+    local rc=0
+    HIP_BUILD_DIR="$BUILD_ROOT/hip_build"
+    if mkdir -p "$HIP_BUILD_DIR" && pushd "$HIP_BUILD_DIR" ; then
+      # $DASH_JAY is intentionally unquoted: it holds two words ("-j" and the count).
+      cmake "$SRC_ROOT" -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH="$OPENCL_DIR" \
+            -DROCCLR_PATH="$ROCCLR_DIR" -DCMAKE_PREFIX_PATH="$ROCM_PATH" \
+            -DCMAKE_BUILD_TYPE=Release &&
+        make $DASH_JAY &&
+        make package &&
+        installHIPPackages || rc=1
+      popd
+    else
+      rc=1
+    fi
+    rm -rf "$BUILD_ROOT"
+    return $rc
+}
+
+echo "Preparing build environment"
+setupENV || die "setupENV failed"
+echo "Building and installing HIP packages"
+buildHIP || die "buildHIP failed"
+echo "Finished building HIP packages"
@@ -0,0 +1,251 @@
+# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+cmake_minimum_required(VERSION 3.16.8)
+
+# Components that make up the HIP packages.
+set(CPACK_COMPONENTS_ALL
+    binary
+    dev
+    doc
+    samples
+    runtime-nvidia)
+
+############### Install required files for all components ########
+
+# Enable per-component packages for both the DEB and the RPM generator.
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+set(CPACK_RPM_COMPONENT_INSTALL ON)
+
+### License ####
+# The license file is handed to CPack and is also installed into the documentation
+# directory as part of the binary component.
+set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+set(CPACK_RESOURCE_FILE_LICENSE ${hip_SOURCE_DIR}/LICENSE.txt)
+install(FILES ${CPACK_RESOURCE_FILE_LICENSE}
+        DESTINATION ${CMAKE_INSTALL_DOCDIR}
+        COMPONENT binary)
+
+#Begin binary files install
+# Everything in this section is only installed when HIP_PLATFORM is "amd".
+if(HIP_PLATFORM STREQUAL "amd" )
+   if(NOT BUILD_SHARED_LIBS)
+      # Static build: only the amdhip64 archive is shipped.
+      install(FILES ${CMAKE_BINARY_DIR}/lib/libamdhip64.a
+              DESTINATION ${CMAKE_INSTALL_LIBDIR}
+              COMPONENT binary)
+   else()
+      # Shared build: for each library ship the plain .so name, the major-version
+      # name and the full-version name.
+      foreach(_hip_shared_lib IN ITEMS amdhip64 hiprtc hiprtc-builtins)
+         install(FILES
+                    ${CMAKE_BINARY_DIR}/lib/lib${_hip_shared_lib}.so
+                    ${CMAKE_BINARY_DIR}/lib/lib${_hip_shared_lib}.so.${HIP_LIB_VERSION_MAJOR}
+                    ${CMAKE_BINARY_DIR}/lib/lib${_hip_shared_lib}.so.${HIP_LIB_VERSION_STRING}
+                 DESTINATION ${CMAKE_INSTALL_LIBDIR}
+                 COMPONENT binary)
+      endforeach()
+   endif()#End BUILD_SHARED_LIBS
+
+#TODO: This does not belong in the BINARY package.
+#Keeping it as is for now
+install(FILES ${CMAKE_BINARY_DIR}/.hipInfo
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        COMPONENT binary)
+
+# CMake package files for hip: config and version files plus the exported targets.
+install(FILES
+           ${CMAKE_BINARY_DIR}/hip-config.cmake
+           ${CMAKE_BINARY_DIR}/hip-config-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
+        COMPONENT binary)
+install(EXPORT hip-targets
+        FILE hip-targets.cmake
+        NAMESPACE hip::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
+        COMPONENT binary)
+
+# Same for hip-lang.
+install(FILES
+           ${CMAKE_BINARY_DIR}/src/hip-lang-config.cmake
+           ${CMAKE_BINARY_DIR}/src/hip-lang-config-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip-lang
+        COMPONENT binary)
+install(EXPORT hip-lang-targets
+        FILE hip-lang-targets.cmake
+        NAMESPACE hip-lang::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip-lang
+        COMPONENT binary)
+
+# Same for hiprtc.
+install(FILES
+           ${CMAKE_BINARY_DIR}/hiprtc-config.cmake
+           ${CMAKE_BINARY_DIR}/hiprtc-config-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hiprtc
+        COMPONENT binary)
+install(EXPORT hiprtc-targets
+        FILE hiprtc-targets.cmake
+        NAMESPACE hiprtc::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hiprtc
+        COMPONENT binary)
+
+endif()#End HIP_PLATFORM = "amd"
+#End binary files install
+
+#Begin dev files install
+# bin directory of the common HIP tree: source permissions are kept everywhere; on
+# non-Windows hosts the directory permissions are set explicitly and *.bat files
+# are left out.
+if(NOT WIN32)
+  install(DIRECTORY ${HIP_COMMON_DIR}/bin
+          DESTINATION .
+          COMPONENT dev
+          USE_SOURCE_PERMISSIONS
+          DIRECTORY_PERMISSIONS
+            OWNER_READ OWNER_WRITE OWNER_EXECUTE
+            GROUP_READ GROUP_EXECUTE
+            WORLD_READ WORLD_EXECUTE
+          PATTERN *.bat EXCLUDE)
+else()
+  install(DIRECTORY ${HIP_COMMON_DIR}/bin
+          DESTINATION .
+          COMPONENT dev
+          USE_SOURCE_PERMISSIONS)
+endif()
+
+# bin directory of this source tree, with the same explicit directory permissions.
+install(DIRECTORY ${hip_SOURCE_DIR}/bin
+        DESTINATION .
+        COMPONENT dev
+        USE_SOURCE_PERMISSIONS
+        DIRECTORY_PERMISSIONS
+          OWNER_READ OWNER_WRITE OWNER_EXECUTE
+          GROUP_READ GROUP_EXECUTE
+          WORLD_READ WORLD_EXECUTE)
+
+# Headers: the common include tree, the backend detail directories from this source
+# tree and the two headers taken from the build tree.
+install(DIRECTORY ${HIP_COMMON_DIR}/include
+        DESTINATION .
+        COMPONENT dev)
+install(DIRECTORY ${hip_SOURCE_DIR}/include/hip/amd_detail
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
+        COMPONENT dev)
+install(DIRECTORY ${hip_SOURCE_DIR}/include/hip/nvidia_detail
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
+        COMPONENT dev)
+install(FILES ${CMAKE_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/amd_detail
+        COMPONENT dev)
+install(FILES ${CMAKE_BINARY_DIR}/include/hip/hip_version.h
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
+        COMPONENT dev)
+
+# Version stamp next to the tools, and the CMake modules of the common tree.
+install(FILES ${CMAKE_BINARY_DIR}/.hipVersion
+        DESTINATION ${CMAKE_INSTALL_BINDIR}
+        COMPONENT dev)
+install(DIRECTORY ${HIP_COMMON_DIR}/cmake/
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
+        COMPONENT dev)
+#End dev files install
+
+#Begin doc files install
+# When doxygen is available, generate the documentation as part of the default build
+# and install the resulting RuntimeAPI/html tree from the current binary directory
+# into the doc component.
+find_program(DOXYGEN_EXE doxygen)
+if(DOXYGEN_EXE)
+    # Run the executable located by find_program instead of a bare "doxygen", and set
+    # HIP_PATH through "cmake -E env" so the command does not rely on shell-style
+    # VAR=value syntax. VERBATIM keeps argument escaping identical on every platform.
+    add_custom_target(build_doxygen ALL
+                  COMMAND ${CMAKE_COMMAND} -E env "HIP_PATH=${HIP_COMMON_DIR}"
+                          ${DOXYGEN_EXE} ${HIP_COMMON_DIR}/docs/doxygen-input/doxy.cfg
+                  VERBATIM)
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/RuntimeAPI/html
+            DESTINATION ${CMAKE_INSTALL_DOCDIR}/RuntimeAPI COMPONENT doc)
+endif()
+#End doc files install
+
+#Begin samples files install
+# The samples directory of the common HIP tree is shipped as-is under the hip data
+# directory.
+install(DIRECTORY ${HIP_COMMON_DIR}/samples
+        DESTINATION ${CMAKE_INSTALL_DATADIR}/hip
+        COMPONENT samples)
+#End samples files install
+
+
+##################################
+# Packaging steps COMMON Variables
+##################################
+set(CPACK_SET_DESTDIR TRUE)
+
+# Generators: binary package types (user-overridable cache entry) and source archive.
+set(CPACK_GENERATOR "TGZ;DEB;RPM" CACHE STRING "Package types to build")
+set(CPACK_SOURCE_GENERATOR "TGZ")
+
+# Package identity.
+set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
+set(CPACK_PACKAGE_CONTACT "HIP Support <hip.support@amd.com>")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP:Heterogenous-computing Interface for Portability")
+
+# Version: the individual fields use the HIP version numbers, while the combined
+# string takes its last field from HIP_PACKAGING_VERSION_PATCH.
+set(CPACK_PACKAGE_VERSION_MAJOR ${HIP_VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${HIP_VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION_PATCH ${HIP_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH})
+
+# File names follow each generator's default naming scheme.
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+
+# RPM specifics.
+set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
+if (CPACK_RPM_PACKAGE_RELEASE MATCHES "local" )
+  # For local builds the default value causes a build failure:
+  # debug-symbol packaging requires SOURCE_DIR to be small.
+  set(CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX ${CPACK_INSTALL_PREFIX})
+endif()
+
+
+#Begin Binary Packaging setting
+
+set(CPACK_BINARY_DEB "ON")
+set(CPACK_BINARY_RPM "ON")
+
+# The binary component is published as hip-runtime-amd by both generators.
+set(CPACK_DEBIAN_BINARY_PACKAGE_NAME "hip-runtime-amd")
+set(CPACK_RPM_BINARY_PACKAGE_NAME "hip-runtime-amd")
+set(CPACK_COMPONENT_BINARY_DESCRIPTION "HIP:Heterogenous-computing Interface for Portability [RUNTIME - AMD]")
+set(CPACK_RPM_BINARY_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
+
+if(FILE_REORG_BACKWARD_COMPATIBILITY)
+   # Maintainer scripts used for soft-linking the hip-target files; the same
+   # configured files serve the DEB control area and the RPM scriptlets.
+   configure_file(hip-runtime-amd.postinst ${CMAKE_CURRENT_BINARY_DIR}/binary/postinst @ONLY)
+   configure_file(hip-runtime-amd.prerm    ${CMAKE_CURRENT_BINARY_DIR}/binary/prerm @ONLY)
+   set(CPACK_DEBIAN_BINARY_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/binary/postinst;${CMAKE_CURRENT_BINARY_DIR}/binary/prerm")
+   set(CPACK_RPM_BINARY_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/binary/postinst")
+   set(CPACK_RPM_BINARY_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/binary/prerm")
+endif()
+
+# Debian relationships.
+set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "hsa-rocr-dev (>= 1.3), rocminfo,  comgr (>= 2.0), rocm-llvm, libc6, rocm-core")
+set(CPACK_DEBIAN_BINARY_PACKAGE_PROVIDES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
+set(CPACK_DEBIAN_BINARY_PACKAGE_REPLACES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
+
+# RPM relationships; HIP_BASE_VERSION is the package version with every '-'
+# replaced by '_'.
+string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
+set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "hsa-rocr-dev >= 1.3, rocminfo,  comgr >= 2.0, rocm-llvm, rocm-core")
+set(CPACK_RPM_BINARY_PACKAGE_PROVIDES "hip-rocclr = ${HIP_BASE_VERSION}")
+set(CPACK_RPM_BINARY_PACKAGE_OBSOLETES "hip-rocclr = ${HIP_BASE_VERSION}")
+#End Binary Packaging setting
+
+#Begin dev Packaging setting
+set(CPACK_DEV_DEB "ON")
+set(CPACK_DEV_RPM "ON")
+
+# The dev component is named hip-dev for DEB and hip-devel for RPM.
+set(CPACK_DEBIAN_DEV_PACKAGE_NAME "hip-dev")
+set(CPACK_RPM_DEV_PACKAGE_NAME "hip-devel")
+set(CPACK_COMPONENT_DEV_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [DEVELOPMENT]")
+
+# Maintainer scripts, configured once and shared by both generators.
+configure_file(hip-devel.postinst ${CMAKE_CURRENT_BINARY_DIR}/dev/postinst @ONLY)
+configure_file(hip-devel.prerm    ${CMAKE_CURRENT_BINARY_DIR}/dev/prerm @ONLY)
+set(CPACK_DEBIAN_DEV_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/dev/postinst;${CMAKE_CURRENT_BINARY_DIR}/dev/prerm")
+set(CPACK_RPM_DEV_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/dev/postinst")
+set(CPACK_RPM_DEV_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/dev/prerm")
+
+# Debian relationships; the package provides and replaces hip-base.
+set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "perl (>= 5.0), liburi-encode-perl, libfile-basedir-perl, libfile-copy-recursive-perl, libfile-listing-perl, libfile-which-perl, libc6, file, rocm-core")
+set(CPACK_DEBIAN_DEV_PACKAGE_PROVIDES "hip-base")
+set(CPACK_DEBIAN_DEV_PACKAGE_REPLACES "hip-base")
+
+# RPM relationships; the package provides and obsoletes hip-base.
+set(CPACK_RPM_DEV_PACKAGE_REQUIRES "perl >= 5.0, perl-File-Which, perl-File-Listing, perl-File-BaseDir, perl-URI-Encode, file, rocm-core")
+set(CPACK_RPM_DEV_PACKAGE_PROVIDES "hip-base")
+set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "hip-base")
+#End dev Packaging setting
+
+#Begin doc Packaging setting
+set(CPACK_DOC_DEB "ON")
+set(CPACK_DOC_RPM "ON")
+
+# The doc component is published as hip-doc by both generators.
+set(CPACK_DEBIAN_DOC_PACKAGE_NAME "hip-doc")
+set(CPACK_RPM_DOC_PACKAGE_NAME "hip-doc")
+set(CPACK_COMPONENT_DOC_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
+
+# Debian: requires the hip-dev package of exactly this version and release.
+set(CPACK_DEBIAN_DOC_PACKAGE_PROVIDES "hip-doc")
+set(CPACK_DEBIAN_DOC_PACKAGE_DEPENDS "hip-dev (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), rocm-core")
+
+# RPM: same requirement against hip-devel, using the package version with every '-'
+# replaced by '_'.
+string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
+set(CPACK_RPM_DOC_PACKAGE_REQUIRES "hip-devel = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, rocm-core")
+
+#End doc Packaging setting
+
+#Begin samples Packaging setting
+set(CPACK_SAMPLES_DEB "ON")
+set(CPACK_SAMPLES_RPM "ON")
+
+# The samples component is published as hip-samples by both generators.
+set(CPACK_DEBIAN_SAMPLES_PACKAGE_NAME "hip-samples")
+set(CPACK_RPM_SAMPLES_PACKAGE_NAME "hip-samples")
+set(CPACK_COMPONENT_SAMPLES_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
+
+# Debian: requires the hip-dev package of exactly this version and release.
+set(CPACK_DEBIAN_SAMPLES_PACKAGE_PROVIDES "hip-samples")
+set(CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS "hip-dev (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), rocm-core")
+
+# RPM: same requirement against hip-devel; HIP_BASE_VERSION is reused as set earlier
+# in this file.
+set(CPACK_RPM_SAMPLES_PACKAGE_REQUIRES "hip-devel = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, rocm-core")
+#End samples Packaging setting
+
+#Begin runtime-nvidia Packaging setting
+set(CPACK_RUNTIME-NVIDIA_DEB "ON")
+set(CPACK_RUNTIME-NVIDIA_RPM "ON")
+
+# The runtime-nvidia component is published as hip-runtime-nvidia by both generators.
+set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_NAME "hip-runtime-nvidia")
+set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_NAME "hip-runtime-nvidia")
+set(CPACK_COMPONENT_RUNTIME-NVIDIA_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [RUNTIME-NVIDIA]")
+
+# Debian relationships; the package provides and replaces hip-nvcc.
+set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_DEPENDS "cuda (>= 7.5), rocm-core")
+set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_PROVIDES "hip-nvcc")
+set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_REPLACES "hip-nvcc")
+
+# RPM relationships; the package provides and obsoletes hip-nvcc.
+set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_REQUIRES "cuda >= 7.5, rocm-core")
+set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_PROVIDES "hip-nvcc")
+set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_OBSOLETES "hip-nvcc")
+
+# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake
+if(NOT ROCM_DEP_ROCMCORE)
+    # Every dependency list that mentions rocm-core. The regex removes the entry
+    # together with an optional preceding comma and space. The current value is passed
+    # quoted so that an empty list is still a valid (empty) input argument instead of
+    # making string() fail with too few arguments.
+    foreach(_hip_dep_var IN ITEMS
+            CPACK_RPM_BINARY_PACKAGE_REQUIRES
+            CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS
+            CPACK_RPM_DEV_PACKAGE_REQUIRES
+            CPACK_DEBIAN_DEV_PACKAGE_DEPENDS
+            CPACK_RPM_DOC_PACKAGE_REQUIRES
+            CPACK_DEBIAN_DOC_PACKAGE_DEPENDS
+            CPACK_RPM_SAMPLES_PACKAGE_REQUIRES
+            CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS
+            CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_REQUIRES
+            CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_DEPENDS)
+        string(REGEX REPLACE ",? ?rocm-core" "" ${_hip_dep_var} "${${_hip_dep_var}}")
+    endforeach()
+endif()
+
+# Must come last: CPack reads the CPACK_* variables set above when it is included.
+include(CPack)
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+function die {
+    echo "${1-Died}." >&2
+    exit 1
+}
+
+function cleanup {
+    rm -rf "$workdir"
+}
+
+# parse arguments
+hip_srcdir=$1
+html_destdir=$2
+[ "$hip_srcdir" != "" ] || [ "$html_destdir" != "" ] || die "Invalid arguments!"
+
+# create temporary directory for grip settings; it is removed again by the EXIT trap
+workdir=$(mktemp -d)
+trap cleanup EXIT
+
+# setup grip: GRIPHOME points at the temporary directory holding settings.py,
+# GRIPURL at the HIP source directory
+export GRIPHOME=$workdir
+export GRIPURL=$hip_srcdir
+printf '%s\n' "CACHE_DIRECTORY = '$html_destdir/asset'" > $workdir/settings.py
+mkdir -p $html_destdir $html_destdir/docs/markdown
+
+# convert all md files to html
+# Stop if the source directory cannot be entered: otherwise grip would run on whatever
+# *.md files happen to be in the current directory.
+pushd "$hip_srcdir" || die "Cannot enter $hip_srcdir"
+for f in *.md docs/markdown/*.md; do
+    grip --export --no-inline "$f" "$html_destdir/${f%.*}.html"
+done
+popd
+
+# All remaining steps rewrite the generated HTML in place, so enter the output
+# directory once. The '.' of ".md" is escaped in the patterns below so that only a
+# literal ".md" is rewritten (an unescaped '.' also matches e.g. "cmd").
+pushd "$html_destdir" || die "Cannot enter $html_destdir"
+
+# convert absolute links to relative links
+for f in *.html; do sed -i "s?$GRIPURL/??g" "$f"; done
+for f in docs/markdown/*.html; do sed -i "s?$GRIPURL/?../../?g" "$f"; done
+
+# update document titles
+for f in *.html docs/markdown/*.html; do
+    sed -i 's?\.md - Grip??g' "$f"
+done
+
+# replace .md with .html in links
+for f in *.html docs/markdown/*.html; do
+    sed -i 's?\.md"?.html"?g' "$f"
+    sed -i 's?\.md#?.html#?g' "$f"
+done
+
+# replace github.io links
+sed -i "s?http://rocm-developer-tools.github.io/HIP?docs/RuntimeAPI/html/index.html?g" README.html
+sed -i "s?http://rocm-developer-tools.github.io/HIP?docs/RuntimeAPI/html/?g" RELEASE.html
+
+popd
+
+exit 0
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+ROCMDIR=@ROCM_PATH@
+HIPINCDIR=$ROCMDIR/@CMAKE_INSTALL_INCLUDEDIR@/hip
+CURRENTDIR=`pwd`
+# The following will be removed after upstream updation
+cd $HIPINCDIR
+ln -r -s -f amd_detail hcc_detail
+ln -r -s -f nvidia_detail nvcc_detail
+cd $CURRENTDIR
+
+#FILE_REORG_BACKWARD_COMPATIBILITY
+HIPINCDIR=$ROCMDIR/hip/include/hip
+if [ -d $HIPINCDIR ]; then
+  # The following will be removed after upstream updation
+  cd $HIPINCDIR
+  ln -r -s -f amd_detail hcc_detail
+  ln -r -s -f nvidia_detail nvcc_detail
+  cd $CURRENTDIR
+fi
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+ROCMDIR=@ROCM_PATH@
+CURRENTDIR=`pwd`
+
+HIPINCDIR=$ROCMDIR/@CMAKE_INSTALL_INCLUDEDIR@/hip
+([ ! -d $HIPINCDIR ]) && exit 0
+cd $HIPINCDIR
+rm hcc_detail
+rm nvcc_detail
+cd $CURRENTDIR
+
+#FILE_REORG_BACKWARD_COMPATIBILITY
+  #backward copatibility code , to be removed later
+HIPDIR=$ROCMDIR/hip
+HIPINCDIR=$ROCMDIR/hip/include/hip
+([ ! -d $HIPINCDIR ]) && exit 0
+cd $HIPINCDIR
+rm -f hcc_detail
+rm -f nvcc_detail
+cd $CURRENTDIR
+([ ! -d $HIPDIR ]) && exit 0
+rmdir --ignore-fail-on-non-empty $HIPDIR
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+ROCMDIR=@ROCM_PATH@
+ROCMCMAKEDIR=$ROCMDIR/@CMAKE_INSTALL_LIBDIR@/cmake
+HIPCMAKEDIR=$ROCMDIR/hip/lib/cmake
+CURRENTDIR=`pwd`
+
+mkdir -p $HIPCMAKEDIR/hip
+mkdir -p $HIPCMAKEDIR/hip-lang
+mkdir -p $HIPCMAKEDIR/hiprtc
+
+HIPTARGETFILES=$(ls -A $ROCMCMAKEDIR/hip | grep  "^hip-targets")
+cd $HIPCMAKEDIR/hip
+for f in $HIPTARGETFILES
+do
+    ln -s -r -f $ROCMCMAKEDIR/hip/$f $(basename $f)
+done
+cd $CURRENTDIR
+
+HIPLANGTARGETFILES=$(ls -A $ROCMCMAKEDIR/hip-lang | grep  "^hip-lang-targets")
+cd $HIPCMAKEDIR/hip-lang
+for f in $HIPLANGTARGETFILES
+do
+    ln -s -r -f $ROCMCMAKEDIR/hip-lang/$f $(basename $f)
+done
+cd $CURRENTDIR
+
+HIPRTCTARGETFILES=$(ls -A $ROCMCMAKEDIR/hiprtc | grep  "^hiprtc-targets")
+cd $HIPCMAKEDIR/hiprtc
+for f in $HIPRTCTARGETFILES
+do
+    ln -s -r -f $ROCMCMAKEDIR/hiprtc/$f $(basename $f)
+done
+cd $CURRENTDIR
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+ROCMDIR=@ROCM_PATH@
+HIPDIR=$ROCMDIR/hip
+HIPCMAKEDIR=$ROCMDIR/hip/lib/cmake/hip
+HIPLANGCMAKEDIR=$ROCMDIR/hip/lib/cmake/hip-lang
+HIPRTCCMAKEDIR=$ROCMDIR/hip/lib/cmake/hiprtc
+([ ! -d $ROCMDIR ] || [ ! -d $HIPDIR ]) && exit 0
+
+# remove_target_links <directory> <file-name-prefix>
+# Remove every entry of <directory> whose name starts with <file-name-prefix>, then
+# remove the directory itself if that left it empty.
+# A missing directory is skipped on its own, so it no longer prevents the cleanup of
+# the other directories. Entries are matched with a glob instead of parsing "ls"
+# output, and dangling symlinks are removed too.
+remove_target_links() {
+    local dir=$1
+    local prefix=$2
+    local f
+    [ -d "$dir" ] || return 0
+    for f in "$dir/$prefix"*; do
+        # An unmatched glob stays literal; skip it.
+        [ -e "$f" ] || [ -L "$f" ] || continue
+        rm "$f"
+    done
+    rmdir --ignore-fail-on-non-empty "$dir"
+}
+
+# Remove soft-links to hip-target
+remove_target_links "$HIPCMAKEDIR" hip-targets
+# Remove soft-links to hip-lang-target
+remove_target_links "$HIPLANGCMAKEDIR" hip-lang-targets
+# Remove soft-links to hiprtc-target
+remove_target_links "$HIPRTCCMAKEDIR" hiprtc-targets
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+ROCMDIR=@ROCM_PATH@
+HIPDIR=$ROCMDIR/hip
+
+if [ -d $ROCMDIR ] ; then
+    ln -s -f $ROCMDIR /opt/rocm
+fi
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Remove /opt/rocm, but only when it is a symbolic link; anything else is left
+# untouched and the script still ends successfully.
+if test -L "/opt/rocm"
+then
+   unlink /opt/rocm
+fi
@@ -0,0 +1,319 @@
+# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# NOTE(review): later parts of this file use execute_process(COMMAND_ECHO ...),
+# find_package(Python3 ...) and add_library() without sources, all of which were
+# introduced in CMake releases newer than 3.5.1 -- confirm the real minimum.
+cmake_minimum_required(VERSION 3.5.1)
+
+# Defines CMAKE_INSTALL_BINDIR / LIBDIR / INCLUDEDIR used by the install rules.
+include(GNUInstallDirs)
+
+# Copy the HIP major/minor version into amdhip-specific variable names.
+set(VERSION_MAJOR_AMDHIP ${HIP_VERSION_MAJOR})
+set(VERSION_MINOR_AMDHIP ${HIP_VERSION_MINOR})
+
+# User-facing switches. They are declared before any code that tests them, so
+# the documented defaults (in particular BUILD_SHARED_LIBS=ON) are already in
+# effect when the sanitizer block below picks the sanitizer runtime flavour.
+option(DISABLE_DIRECT_DISPATCH "Disable Direct Dispatch" OFF)
+option(BUILD_SHARED_LIBS "Build the shared library" ON)
+
+# AddressSanitizer instrumentation, enabled by setting ADDRESS_SANITIZER.
+if(ADDRESS_SANITIZER)
+  set(ASAN_LINKER_FLAGS "-fsanitize=address")
+  set(ASAN_COMPILER_FLAGS "-fno-omit-frame-pointer -fsanitize=address")
+
+  # For non-GCC compilers the sanitizer runtime is selected explicitly:
+  # shared runtime for a shared library build, static runtime otherwise.
+  if(NOT CMAKE_COMPILER_IS_GNUCC)
+    if(BUILD_SHARED_LIBS)
+      set(ASAN_COMPILER_FLAGS "${ASAN_COMPILER_FLAGS} -shared-libsan")
+      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -shared-libsan")
+    else()
+      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -static-libsan")
+    endif()
+  endif()
+
+  # Apply the sanitizer flags globally to C/C++ compilation and to linking.
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ASAN_COMPILER_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ASAN_COMPILER_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LINKER_FLAGS} -s -Wl,--build-id=sha1")
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LINKER_FLAGS} -Wl,--build-id=sha1")
+endif()
+
+# With GCC, warnings are errors, except deprecated-declaration warnings,
+# which stay plain warnings.
+if(CMAKE_COMPILER_IS_GNUCC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-error=deprecated-declarations")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
+endif()
+
+# Add this directory's cmake/ folder to the module search path before looking up ROCclr.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
+find_package(ROCclr)
+
+# Create the runtime library target. Its sources are attached further down
+# with target_sources().
+if(BUILD_SHARED_LIBS)
+  add_library(amdhip64 SHARED)
+  # Windows doesn't have a strip utility, so CMAKE_STRIP won't be set.
+  # When CMAKE_BUILD_TYPE is Release and a strip tool is known, the built
+  # library is stripped as a post-build step.
+  if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL ""))
+    add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_STRIP} $<TARGET_FILE:amdhip64>)
+  endif()
+else()
+  # The static library is built directly from rocclr's object files.
+  add_library(amdhip64 STATIC $<TARGET_OBJECTS:rocclr>)
+endif()
+
+# Language standard and code-generation settings for the runtime library.
+set_target_properties(amdhip64 PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+  CXX_EXTENSIONS OFF
+  POSITION_INDEPENDENT_CODE ON)
+
+# Workaround: many places in the HIP project hardcode
+# build/lib/libamdhip64.so, so library and archive outputs go to <build>/lib.
+set_target_properties(amdhip64 PROPERTIES
+  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+  ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+
+# The output file name depends on the pointer width of the target platform.
+if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+  set_property(TARGET amdhip64 PROPERTY OUTPUT_NAME "amdhip32")
+else()
+  set_property(TARGET amdhip64 PROPERTY OUTPUT_NAME "amdhip64")
+endif()
+
+# Versioning is disabled on Windows because HIP_LIB_VERSION_STRING and
+# HIP_LIB_VERSION_MAJOR are currently not being populated there.
+if(BUILD_SHARED_LIBS AND NOT WIN32)
+  set_target_properties(amdhip64 PROPERTIES
+    VERSION ${HIP_LIB_VERSION_STRING}
+    SOVERSION ${HIP_LIB_VERSION_MAJOR})
+endif()
+
+# Source files compiled into the runtime on every platform.
+target_sources(amdhip64 PRIVATE
+  cl_gl.cpp
+  fixme.cpp
+  hip_activity.cpp
+  hip_code_object.cpp
+  hip_context.cpp
+  hip_device_runtime.cpp
+  hip_device.cpp
+  hip_error.cpp
+  hip_event.cpp
+  hip_event_ipc.cpp
+  hip_fatbin.cpp
+  hip_global.cpp
+  hip_graph_internal.cpp
+  hip_graph.cpp
+  hip_hmm.cpp
+  hip_intercept.cpp
+  hip_memory.cpp
+  hip_mempool.cpp
+  hip_mempool_impl.cpp
+  hip_module.cpp
+  hip_peer.cpp
+  hip_platform.cpp
+  hip_profile.cpp
+  hip_stream_ops.cpp
+  hip_stream.cpp
+  hip_surface.cpp
+  hip_texture.cpp
+  hip_gl.cpp
+  hip_vm.cpp)
+
+# Additional sources that are only built on Windows.
+if(WIN32)
+  target_sources(amdhip64 PRIVATE
+    cl_d3d9.cpp
+    cl_d3d10.cpp
+    cl_d3d11.cpp
+    hip_runtime.cpp)
+endif()
+
+# Control which symbols the shared library exports: a module-definition file
+# on Windows, a linker version script elsewhere.
+if(BUILD_SHARED_LIBS)
+  if(WIN32)
+    target_sources(amdhip64 PRIVATE amdhip.def)
+  else()
+    target_link_libraries(amdhip64 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/hip_hcc.map.in")
+    # Relink when the version script changes.
+    set_target_properties(amdhip64 PROPERTIES LINK_DEPENDS "${CMAKE_CURRENT_LIST_DIR}/hip_hcc.map.in")
+  endif()
+endif()
+
+# On Windows, generate the resource script from its template (only @VAR@
+# references are substituted) and add it to the library.
+if(WIN32)
+  configure_file(hip_hcc_in.rc.in hip_hcc_info.rc @ONLY)
+  target_sources(amdhip64 PRIVATE hip_hcc_info.rc)
+endif()
+
+# Header search paths used to build the runtime itself (not propagated to
+# consumers).
+target_include_directories(amdhip64 PRIVATE
+  ${HIP_COMMON_INCLUDE_DIR}
+  ${PROJECT_SOURCE_DIR}/include
+  ${PROJECT_BINARY_DIR}/include)
+
+target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__)
+target_link_libraries(amdhip64 PRIVATE ${OPENGL_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# A shared build links against rocclr directly. A static build cannot: rocclr
+# would then also have to be exported and passed to the linker by hipcc, so
+# only its compile definitions and include directories are reused.
+if(NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,COMPILE_DEFINITIONS>)
+  target_include_directories(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,INCLUDE_DIRECTORIES>)
+else()
+  target_link_libraries(amdhip64 PRIVATE rocclr)
+endif()
+
+# Forward the DISABLE_DIRECT_DISPATCH option to the sources as a macro.
+if(DISABLE_DIRECT_DISPATCH)
+  target_compile_definitions(amdhip64 PRIVATE DISABLE_DIRECT_DISPATCH)
+endif()
+
+# Short-Term solution for pre-compiled headers for online compilation
+# Enable pre compiled header
+if(__HIP_ENABLE_PCH)
+  # Locate LLVM's CMake package, additionally searching ${ROCM_PATH}/llvm.
+  find_package(LLVM REQUIRED CONFIG
+    PATHS
+      ${ROCM_PATH}/llvm)
+  # find_package(LLVM) returns the lib/cmake/llvm location. We require the root.
+  # A caller-provided HIP_LLVM_ROOT takes precedence over the derived value.
+  if(NOT DEFINED HIP_LLVM_ROOT)
+    set(HIP_LLVM_ROOT "${LLVM_DIR}/../../..")
+  endif()
+
+  # Run hip_embed_pch.sh at configure time with the three include roots and the
+  # LLVM root as arguments, working in the top-level build directory.
+  # NOTE(review): the arguments are spliced unquoted into one `sh -c` string, so
+  # paths containing spaces would be split -- confirm this is acceptable.
+  execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/hip_embed_pch.sh ${HIP_COMMON_INCLUDE_DIR} ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${HIP_LLVM_ROOT}" COMMAND_ECHO STDERR RESULT_VARIABLE EMBED_PCH_RC WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  # Stop configuring if the script reported a failure.
+  if (EMBED_PCH_RC AND NOT EMBED_PCH_RC EQUAL 0)
+    message(FATAL_ERROR "Failed to embed PCH")
+  endif()
+
+  # Presumably hip_pch.o is the object produced by the script above -- TODO confirm.
+  target_compile_definitions(amdhip64 PRIVATE __HIP_ENABLE_PCH)
+  target_sources(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
+endif()
+
+# Start from an unset HIPRTC_OBJECTS, then process the hiprtc subdirectory
+# (presumably it fills the variable in for this scope -- confirm there).
+set(HIPRTC_OBJECTS)
+add_subdirectory(hiprtc)
+
+# For shared, non-Windows builds: link the hiprtc objects into the runtime,
+# build hiprtc-builtins before it, and install hiprtc-builtins.
+if(BUILD_SHARED_LIBS AND NOT WIN32)
+  target_link_libraries(amdhip64 PRIVATE ${HIPRTC_OBJECTS})
+  target_compile_definitions(amdhip64 PRIVATE __HIP_ENABLE_RTC)
+  add_dependencies(amdhip64 hiprtc-builtins)
+  install(
+    TARGETS hiprtc-builtins
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+endif()
+
+#############################
+# Profiling API support
+#############################
+# Generate profiling API macros/structures header
+option(USE_PROF_API "Enable roctracer integration" ON)
+# Enable profiling API
+if(USE_PROF_API)
+  # Output header written into the build tree.
+  set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h")
+  # Input header from the source tree. Anchored at PROJECT_SOURCE_DIR like
+  # every other include/ path in this file, so it is still found when this
+  # project is configured as part of a larger CMake tree (CMAKE_SOURCE_DIR
+  # would then name the outer project's root).
+  set(PROF_API_STR_IN "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_prof_str.h")
+  set(PROF_API_HDR "${HIP_COMMON_INCLUDE_DIR}/hip/hip_runtime_api.h")
+  set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}")
+  set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py")
+  set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt")
+
+  find_package(Python3 COMPONENTS Interpreter REQUIRED)
+
+  # Check at configure time that the CppHeaderParser module can be imported
+  # (presumably required by hip_prof_gen.py -- confirm).
+  execute_process(COMMAND ${Python3_EXECUTABLE} -c "import CppHeaderParser"
+                  RESULT_VARIABLE CPP_HEADER_PARSER
+                  OUTPUT_QUIET)
+
+  # The variable is tested by name: expanding it unquoted would make the if()
+  # itself malformed whenever the result is an error string instead of an
+  # exit code.
+  if(NOT CPP_HEADER_PARSER EQUAL 0)
+    message(FATAL_ERROR "\
+The \"CppHeaderParser\" Python3 package is not installed. \
+Please install it using the following command: \"pip3 install CppHeaderParser\".\
+")
+  endif()
+
+  # Build rule for the generated header; it is re-run when the input header,
+  # the API header or the generator script changes. VERBATIM keeps argument
+  # escaping identical across platforms.
+  add_custom_command(OUTPUT ${PROF_API_STR}
+    COMMAND ${Python3_EXECUTABLE} ${PROF_API_GEN} -v -t --priv ${PROF_API_HDR} ${PROF_API_SRC} ${PROF_API_STR_IN} ${PROF_API_STR}
+    DEPENDS ${PROF_API_STR_IN} ${PROF_API_HDR} ${PROF_API_GEN}
+    COMMENT "Generating profiling primitives: ${PROF_API_STR}"
+    VERBATIM)
+
+  # Named target that drives the generation rule above as part of "all".
+  add_custom_target(gen-prof-api-str-header ALL
+    DEPENDS ${PROF_API_STR}
+    SOURCES ${PROF_API_HDR})
+
+  # Install the generated header together with the library (PUBLIC_HEADER).
+  set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
+
+  # Look for prof_protocol.h: the user-supplied PROF_API_HEADER_PATH is tried
+  # as a hint, then ${ROCM_PATH}/roctracer, each also under include/ext.
+  find_path(PROF_API_HEADER_DIR prof_protocol.h
+    HINTS
+      ${PROF_API_HEADER_PATH}
+    PATHS
+      ${ROCM_PATH}/roctracer
+    PATH_SUFFIXES
+      include/ext)
+
+  if(NOT PROF_API_HEADER_DIR)
+    message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>")
+  else()
+    target_compile_definitions(amdhip64 PUBLIC USE_PROF_API=1)
+    target_include_directories(amdhip64 PUBLIC ${PROF_API_HEADER_DIR})
+    message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}")
+  endif()
+
+  # Make sure the header is generated before the runtime is built.
+  add_dependencies(amdhip64 gen-prof-api-str-header)
+endif()
+
+# After the library is built, copy .hipInfo next to it in <build>/lib and
+# copy the source include/ tree into the build tree's include/ directory.
+add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
+  ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo)
+add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
+  ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
+
+# Interface target "host": linking it pulls in amdhip64.
+add_library(host INTERFACE)
+target_link_libraries(host INTERFACE amdhip64)
+
+# Interface target "device": linking it pulls in "host".
+add_library(device INTERFACE)
+target_link_libraries(device INTERFACE host)
+
+# Packaging currently assumes that the HIP runtime is always installed in
+# ${ROCM_PATH}/lib. That assumption is not safe: some distros, such as CentOS,
+# use lib64 instead of lib, and letting CMake choose the library directory
+# then yields lib64. The result is a mismatch between where HIP is installed
+# and where CMake thinks it is.
+
+# Install the runtime and its interface targets into the hip-targets export set.
+install(
+  TARGETS amdhip64 host device
+  EXPORT hip-targets
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(
+  EXPORT hip-targets
+  DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
+  NAMESPACE hip::)
+
+# The same targets again, exported a second time under the hip-lang:: namespace.
+install(
+  TARGETS amdhip64 host device
+  EXPORT hip-lang-targets
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(
+  EXPORT hip-lang-targets
+  DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}
+  NAMESPACE hip-lang::)
+
+# Generate and install the hip-lang package config and version files
+# (skipped on Windows).
+if(NOT WIN32)
+  include(CMakePackageConfigHelpers)
+
+  # Expand the config template; the listed PATH_VARS are made relocatable.
+  configure_package_config_file(
+    ${HIP_COMMON_DIR}/hip-lang-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake
+    INSTALL_DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}
+    PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR)
+
+  # Consumers are compatible with any version that shares the major number.
+  write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake
+    VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
+    COMPATIBILITY SameMajorVersion)
+
+  install(
+    FILES
+      ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake
+      ${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake
+    DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}/)
+endif()
@@ -0,0 +1,135 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// This header file is partially copied from
+//   https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/ELF.h
+
+// AMDGPU OS for HSA compatible compute kernels.
+// OS/ABI identifiers for the three AMDGPU flavours (HSA, PAL, Mesa3D).
+enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 };
+
+// ABI version values for the AMDGPU HSA code object versions V2 through V5.
+enum {
+  ELFABIVERSION_AMDGPU_HSA_V2 = 0,
+  ELFABIVERSION_AMDGPU_HSA_V3 = 1,
+  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
+  ELFABIVERSION_AMDGPU_HSA_V5 = 3
+};
+
+// AMDGPU specific e_flags
+// The enum mixes selection masks (EF_AMDGPU_MACH, EF_AMDGPU_FEATURE_*_V4)
+// with the values that can appear under those masks.
+enum : unsigned {
+  // Processor selection mask for EF_AMDGPU_MACH_* values.
+  EF_AMDGPU_MACH = 0x0ff,
+  // AMDGPU processors
+  EF_AMDGPU_MACH_NONE = 0x000,
+  EF_AMDGPU_MACH_R600_R600 = 0x001,
+  EF_AMDGPU_MACH_R600_R630 = 0x002,
+  EF_AMDGPU_MACH_R600_RS880 = 0x003,
+  EF_AMDGPU_MACH_R600_RV670 = 0x004,
+  EF_AMDGPU_MACH_R600_RV710 = 0x005,
+  EF_AMDGPU_MACH_R600_RV730 = 0x006,
+  EF_AMDGPU_MACH_R600_RV770 = 0x007,
+  EF_AMDGPU_MACH_R600_CEDAR = 0x008,
+  EF_AMDGPU_MACH_R600_CYPRESS = 0x009,
+  EF_AMDGPU_MACH_R600_JUNIPER = 0x00a,
+  EF_AMDGPU_MACH_R600_REDWOOD = 0x00b,
+  EF_AMDGPU_MACH_R600_SUMO = 0x00c,
+  EF_AMDGPU_MACH_R600_BARTS = 0x00d,
+  EF_AMDGPU_MACH_R600_CAICOS = 0x00e,
+  EF_AMDGPU_MACH_R600_CAYMAN = 0x00f,
+  EF_AMDGPU_MACH_R600_TURKS = 0x010,
+  EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011,
+  EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f,
+  // First/last R600-based processors.
+  EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
+  EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
+
+  // AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_GFX600        = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601        = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700        = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701        = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702        = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703        = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704        = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801        = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802        = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803        = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810        = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900        = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902        = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904        = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906        = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908        = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909        = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C        = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010       = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011       = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012       = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030       = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031       = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032       = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033       = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602        = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705        = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805        = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035       = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034       = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A        = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940        = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100       = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013       = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103       = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036       = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101       = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102       = 0x047,
+
+  // First/last AMDGCN-based processors.
+  // NOTE: EF_AMDGPU_MACH_AMDGCN_LAST must be updated when a new processor is
+  // appended above.
+  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
+  // Indicates if the "sramecc" target feature is enabled for all code
+  // contained in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
+
+  // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
+  EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
+  EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
+  EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
+  EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
+
+  // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
+  EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
+  EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
+  EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
+  EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
+};
@@ -0,0 +1,446 @@
+EXPORTS
+hipChooseDevice
+hipCtxCreate
+hipCtxDestroy
+hipCtxDisablePeerAccess
+hipCtxEnablePeerAccess
+hipCtxGetApiVersion
+hipCtxGetCacheConfig
+hipCtxGetCurrent
+hipCtxGetDevice
+hipCtxGetFlags
+hipCtxGetSharedMemConfig
+hipCtxPopCurrent
+hipCtxPushCurrent
+hipCtxSetCacheConfig
+hipCtxSetCurrent
+hipCtxSetSharedMemConfig
+hipCtxSynchronize
+hipDeviceCanAccessPeer
+hipDeviceComputeCapability
+hipDeviceDisablePeerAccess
+hipDeviceEnablePeerAccess
+hipDeviceGet
+hipDeviceGetAttribute
+hipDeviceGetByPCIBusId
+hipDeviceGetCacheConfig
+hipDeviceGetStreamPriorityRange
+hipDeviceGetLimit
+hipDeviceGetName
+hipDeviceGetUuid
+hipDeviceGetPCIBusId
+hipDeviceGetSharedMemConfig
+hipDeviceGetP2PAttribute
+hipDevicePrimaryCtxGetState
+hipDevicePrimaryCtxRelease
+hipDevicePrimaryCtxReset
+hipDevicePrimaryCtxRetain
+hipDevicePrimaryCtxSetFlags
+hipDeviceReset
+hipDeviceSetCacheConfig
+hipDeviceSetSharedMemConfig
+hipDeviceSynchronize
+hipDeviceTotalMem
+hipDriverGetVersion
+hipEventCreate
+hipEventCreateWithFlags
+hipEventDestroy
+hipEventElapsedTime
+hipEventQuery
+hipEventRecord
+hipEventSynchronize
+hipExtGetLinkTypeAndHopCount
+hipExtLaunchMultiKernelMultiDevice
+hipExtMallocWithFlags
+hipExtModuleLaunchKernel
+hipExtLaunchKernel
+hipFree
+hipFreeArray
+hipFuncSetAttribute
+hipFuncSetCacheConfig
+hipFuncSetSharedMemConfig
+hipGetDevice
+hipGetDeviceCount
+hipGetDeviceProperties
+hipGetErrorName
+hipGetErrorString
+hipGetLastError
+hipMemAllocHost
+hipHostAlloc
+hipHostFree
+hipHostGetDevicePointer
+hipHostGetFlags
+hipHostMalloc
+hipHostRegister
+hipHostUnregister
+hipInit
+hipIpcCloseMemHandle
+hipIpcGetMemHandle
+hipIpcOpenMemHandle
+hipIpcGetEventHandle
+hipIpcOpenEventHandle
+hipMalloc
+hipMalloc3D
+hipMalloc3DArray
+hipMallocManaged
+hipDeviceGetDefaultMemPool
+hipDeviceSetMemPool
+hipDeviceGetMemPool
+hipMallocAsync
+hipFreeAsync
+hipMemPoolTrimTo
+hipMemPoolSetAttribute
+hipMemPoolGetAttribute
+hipMemPoolSetAccess
+hipMemPoolGetAccess
+hipMemPoolCreate
+hipMemPoolDestroy
+hipMallocFromPoolAsync
+hipMemPoolExportToShareableHandle
+hipMemPoolImportFromShareableHandle
+hipMemPoolExportPointer
+hipMemPoolImportPointer
+hipArrayCreate
+hipArray3DCreate
+hipArrayDestroy
+hipArrayGetInfo
+hipArrayGetDescriptor
+hipArray3DGetDescriptor
+hipMallocArray
+hipMemAdvise
+hipMemAllocPitch
+hipMallocPitch
+hipMemcpy
+hipMemcpyWithStream
+hipMemcpyParam2D
+hipMemcpy2D
+hipMemcpy2DAsync
+hipMemcpy2DToArray
+hipMemcpy2DToArrayAsync
+hipMemcpy3D
+hipMemcpy3DAsync
+hipDrvMemcpy3D
+hipDrvMemcpy3DAsync
+hipMemcpyAsync
+hipMemcpyDtoD
+hipMemcpyDtoDAsync
+hipMemcpyDtoH
+hipMemcpyDtoHAsync
+hipMemcpyFromSymbol
+hipMemcpyFromSymbolAsync
+hipMemcpyHtoD
+hipMemcpyHtoDAsync
+hipMemcpyPeer
+hipMemcpyPeerAsync
+hipMemcpyToArray
+hipMemcpyFromArray
+hipMemcpyToSymbol
+hipMemcpyToSymbolAsync
+hipMemGetAddressRange
+hipGetSymbolAddress
+hipGetSymbolSize
+hipMemGetInfo
+hipMemPrefetchAsync
+hipMemPtrGetInfo
+hipMemRangeGetAttribute
+hipMemRangeGetAttributes
+hipMemset
+hipMemsetAsync
+hipMemsetD8
+hipMemsetD8Async
+hipMemsetD16
+hipMemsetD16Async
+hipMemsetD32
+hipMemsetD32Async
+hipMemset2D
+hipMemset2DAsync
+hipMemset3D
+hipMemset3DAsync
+hipModuleGetFunction
+hipModuleGetGlobal
+hipModuleGetTexRef
+hipModuleLaunchKernel
+hipModuleLaunchKernelExt
+hipModuleLaunchCooperativeKernel
+hipModuleLaunchCooperativeKernelMultiDevice
+hipLaunchCooperativeKernel
+hipLaunchCooperativeKernelMultiDevice
+hipHccModuleLaunchKernel
+hipModuleLoad
+hipModuleLoadData
+hipModuleLoadDataEx
+hipModuleUnload
+hipModuleOccupancyMaxPotentialBlockSize
+hipModuleOccupancyMaxPotentialBlockSizeWithFlags
+hipModuleOccupancyMaxActiveBlocksPerMultiprocessor
+hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+hipOccupancyMaxPotentialBlockSize
+hipOccupancyMaxActiveBlocksPerMultiprocessor
+hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+hipFuncGetAttribute
+hipFuncGetAttributes
+hipPeekAtLastError
+hipPointerGetAttributes
+hipProfilerStart
+hipProfilerStop
+hipRuntimeGetVersion
+hipGetDeviceFlags
+hipSetDevice
+hipSetDeviceFlags
+hipStreamAddCallback
+hipStreamAttachMemAsync
+hipStreamCreate
+hipStreamCreateWithFlags
+hipStreamCreateWithPriority
+hipStreamDestroy
+hipStreamGetDevice
+hipStreamGetFlags
+hipStreamQuery
+hipStreamSynchronize
+hipStreamWaitEvent
+__hipPopCallConfiguration
+__hipPushCallConfiguration
+__hipRegisterFatBinary
+__hipRegisterFunction
+__hipRegisterVar
+__hipRegisterSurface
+__hipRegisterTexture
+__hipRegisterManagedVar
+__hipUnregisterFatBinary
+hipConfigureCall
+hipSetupArgument
+hipLaunchByPtr
+hipLaunchKernel
+hipRegisterTracerCallback
+hipApiName
+hipKernelNameRef
+hipBindTexture
+hipBindTexture2D
+hipBindTextureToArray
+hipBindTextureToMipmappedArray
+hipGetTextureAlignmentOffset
+hipGetTextureReference
+hipUnbindTexture
+hipCreateChannelDesc
+hipCreateTextureObject
+hipDestroyTextureObject
+hipGetChannelDesc
+hipGetTextureObjectResourceDesc
+hipGetTextureObjectResourceViewDesc
+hipGetTextureObjectTextureDesc
+hipTexRefGetAddress
+hipTexRefGetAddressMode
+hipTexRefGetArray
+hipTexRefGetBorderColor
+hipTexRefGetFilterMode
+hipTexRefGetFlags
+hipTexRefGetFormat
+hipTexRefGetMaxAnisotropy
+hipTexRefGetMipmapFilterMode
+hipTexRefGetMipmapLevelBias
+hipTexRefGetMipmapLevelClamp
+hipTexRefGetMipmappedArray
+hipTexRefSetAddress
+hipTexRefSetAddress2D
+hipTexRefSetAddressMode
+hipTexRefSetArray
+hipTexRefSetBorderColor
+hipTexRefSetFilterMode
+hipTexRefSetFlags
+hipTexRefSetFormat
+hipTexRefSetMaxAnisotropy
+hipTexRefSetMipmapFilterMode
+hipTexRefSetMipmapLevelBias
+hipTexRefSetMipmapLevelClamp
+hipTexRefSetMipmappedArray
+hipCreateSurfaceObject
+hipDestroySurfaceObject
+hipGetCmdName
+hipMipmappedArrayCreate
+hipMallocMipmappedArray
+hipMipmappedArrayDestroy
+hipFreeMipmappedArray
+hipMipmappedArrayGetLevel
+hipGetMipmappedArrayLevel
+hipMallocHost
+hipFreeHost
+hipTexObjectCreate
+hipTexObjectDestroy
+hipTexObjectGetResourceDesc
+hipTexObjectGetResourceViewDesc
+hipTexObjectGetTextureDesc
+hipExtStreamCreateWithCUMask
+hipStreamGetPriority
+hipMemcpy2DFromArray
+hipMemcpy2DFromArrayAsync
+hipDrvMemcpy2DUnaligned
+hipMemcpyAtoH
+hipMemcpyHtoA
+hipMemcpyParam2DAsync
+__gnu_h2f_ieee
+__gnu_f2h_ieee
+hipExtStreamGetCUMask
+hipImportExternalMemory
+hipExternalMemoryGetMappedBuffer
+hipDestroyExternalMemory
+hipGraphCreate
+hipGraphDestroy
+hipGraphAddKernelNode
+hipGraphAddMemsetNode
+hipGraphAddMemcpyNode
+hipGraphAddMemcpyNode1D
+hipGraphInstantiate
+hipGraphLaunch
+hipStreamIsCapturing
+hipStreamBeginCapture
+hipStreamEndCapture
+hipGraphExecDestroy
+hipPointerGetAttribute
+hipDrvPointerGetAttributes
+hipImportExternalSemaphore
+hipSignalExternalSemaphoresAsync
+hipWaitExternalSemaphoresAsync
+hipDestroyExternalSemaphore
+hipGLGetDevices
+hipGraphicsGLRegisterBuffer
+hipGraphicsGLRegisterImage
+hipGraphicsMapResources
+hipGraphicsResourceGetMappedPointer
+hipGraphicsSubResourceGetMappedArray
+hipGraphicsUnmapResources
+hipGraphicsUnregisterResource
+hipGraphGetNodes
+hipGraphGetRootNodes
+hipGraphKernelNodeGetParams
+hipGraphKernelNodeSetParams
+hipGraphKernelNodeSetAttribute
+hipGraphKernelNodeGetAttribute
+hipGraphMemcpyNodeGetParams
+hipGraphMemcpyNodeSetParams
+hipGraphMemsetNodeGetParams
+hipGraphMemsetNodeSetParams
+hipGraphAddDependencies
+hipGraphExecKernelNodeSetParams
+hipGraphAddEmptyNode
+hipStreamGetCaptureInfo
+hipStreamGetCaptureInfo_v2
+hipStreamUpdateCaptureDependencies
+hipGraphRemoveDependencies
+hipGraphGetEdges
+hipGraphNodeGetDependencies
+hipGraphNodeGetDependentNodes
+hipGraphNodeGetType
+hipGraphDestroyNode
+hipGraphClone
+hipGraphNodeFindInClone
+hipGraphAddChildGraphNode
+hipGraphChildGraphNodeGetGraph
+hipGraphExecChildGraphNodeSetParams
+hipGraphAddMemcpyNodeFromSymbol
+hipGraphMemcpyNodeSetParamsFromSymbol
+hipGraphExecMemcpyNodeSetParamsFromSymbol
+hipGraphAddMemcpyNodeToSymbol
+hipGraphMemcpyNodeSetParamsToSymbol
+hipGraphExecMemcpyNodeSetParamsToSymbol
+hipGraphExecMemcpyNodeSetParams
+hipGraphMemcpyNodeSetParams1D
+hipGraphExecMemcpyNodeSetParams1D
+hipGraphAddEventRecordNode
+hipGraphEventRecordNodeGetEvent
+hipGraphEventRecordNodeSetEvent
+hipGraphExecEventRecordNodeSetEvent
+hipGraphAddEventWaitNode
+hipGraphEventWaitNodeGetEvent
+hipGraphEventWaitNodeSetEvent
+hipGraphExecEventWaitNodeSetEvent
+hipGraphAddHostNode
+hipGraphHostNodeGetParams
+hipGraphHostNodeSetParams
+hipGraphExecHostNodeSetParams
+hipGraphExecUpdate
+hipGraphInstantiateWithFlags
+hipGraphExecMemsetNodeSetParams
+hipDeviceGetGraphMemAttribute
+hipDeviceSetGraphMemAttribute
+hipDeviceGraphMemTrim
+amd_dbgapi_get_build_name
+amd_dbgapi_get_git_hash
+amd_dbgapi_get_build_id
+hipThreadExchangeStreamCaptureMode
+hipMemAddressFree
+hipMemAddressReserve
+hipMemCreate
+hipMemExportToShareableHandle
+hipMemGetAccess
+hipMemGetAllocationGranularity
+hipMemGetAllocationPropertiesFromHandle
+hipMemImportFromShareableHandle
+hipMemMap
+hipMemMapArrayAsync
+hipMemRelease
+hipMemRetainAllocationHandle
+hipMemSetAccess
+hipMemUnmap
+hipMemcpy_spt
+hipMemcpyAsync_spt
+hipStreamSynchronize_spt
+hipMemcpyToSymbol_spt
+hipMemcpyFromSymbol_spt
+hipMemcpy2D_spt
+hipMemcpy2DToArray_spt
+hipMemcpy2DFromArray_spt
+hipMemcpy3D_spt
+hipMemset_spt
+hipMemset2D_spt
+hipMemset3D_spt
+hipStreamQuery_spt
+hipStreamGetFlags_spt
+hipStreamGetPriority_spt
+hipStreamWaitEvent_spt
+hipEventRecord_spt
+hipLaunchKernel_spt
+hipLaunchCooperativeKernel_spt
+hipStreamWriteValue32
+hipStreamWriteValue64
+hipStreamWaitValue32
+hipStreamWaitValue64
+hipDeviceSetLimit
+hipGetStreamDeviceId
+hipGraphLaunch_spt
+hipStreamBeginCapture_spt
+hipStreamEndCapture_spt
+hipStreamIsCapturing_spt
+hipStreamGetCaptureInfo_spt
+hipStreamGetCaptureInfo_v2_spt
+hipStreamAddCallback_spt
+hipMemsetAsync_spt
+hipMemset2DAsync_spt
+hipMemset3DAsync_spt
+hipMemcpy3DAsync_spt
+hipMemcpy2DAsync_spt
+hipMemcpyFromSymbolAsync_spt
+hipMemcpyToSymbolAsync_spt
+hipMemcpyFromArray_spt
+hipMemcpy2DFromArrayAsync_spt
+hipMemcpy2DToArrayAsync_spt
+hipDrvGetErrorName
+hipDrvGetErrorString
+hipUserObjectCreate
+hipUserObjectRelease
+hipUserObjectRetain
+hipGraphRetainUserObject
+hipGraphReleaseUserObject
+hipLaunchHostFunc
+hipLaunchHostFunc_spt
+hipGraphDebugDotPrint
+hipGraphKernelNodeCopyAttributes
+hipGraphNodeGetEnabled
+hipGraphNodeSetEnabled
+hipGraphUpload
+hipGraphAddMemAllocNode
+hipGraphMemAllocNodeGetParams
+hipGraphAddMemFreeNode
+hipGraphMemFreeNodeGetParams
@@ -0,0 +1,854 @@
+/* Copyright (c) 2012 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifdef _WIN32
+
+#include "top.hpp"
+
+#include "cl_d3d9_amd.hpp"
+#include "platform/command.hpp"
+
+#include <cstring>
+#include <utility>
+
+// FOURCC-based D3DFORMAT codes for the YUV surface formats handled in this file.
+// Each macro packs its four characters with MAKEFOURCC and casts the result to D3DFORMAT so
+// that it can be compared against D3DSURFACE_DESC::Format and used as a switch case label.
+#define D3DFMT_NV_12 static_cast<D3DFORMAT>(MAKEFOURCC('N', 'V', '1', '2'))
+#define D3DFMT_P010  static_cast<D3DFORMAT>(MAKEFOURCC('P', '0', '1', '0'))
+#define D3DFMT_YV_12 static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'V', '1', '2'))
+#define D3DFMT_YUY2  static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'U', 'Y', '2'))
+
+
+// Queries the GPU devices that can interoperate with the given DX9 media adapters.
+//
+// platform             NULL or the AMD platform; anything else yields CL_INVALID_PLATFORM.
+// num_media_adapters   Number of entries in media_adapters_type and media_adapters.
+// media_adapters_type  Adapter type of each entry; only CL_ADAPTER_D3D9EX_KHR entries are
+//                      considered for interop.
+// media_adapters       Array of num_media_adapters IDirect3DDevice9Ex pointers.
+// media_adapter_set    CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR or
+//                      CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR (both are handled identically).
+// num_entries          Capacity of devices.
+// devices              Optional output array; unused trailing entries are set to 0.
+// num_devices          Optional output; receives the number of compatible devices.
+//
+// Returns CL_SUCCESS, CL_INVALID_PLATFORM, CL_INVALID_VALUE, CL_DEVICE_NOT_FOUND or the error
+// reported by clGetDeviceIDs.
+RUNTIME_ENTRY(cl_int, clGetDeviceIDsFromDX9MediaAdapterKHR,
+              (cl_platform_id platform, cl_uint num_media_adapters,
+               cl_dx9_media_adapter_type_khr* media_adapters_type, void* media_adapters,
+               cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
+               cl_device_id* devices, cl_uint* num_devices)) {
+  cl_int errcode;
+  // The spec describes media_adapters as an array of num_media_adapters entries, so it is
+  // treated as an array of DX9 device pointers here.
+  IDirect3DDevice9Ex** d3d9_device = static_cast<IDirect3DDevice9Ex**>(media_adapters);
+  cl_device_id* gpu_devices = NULL;
+  cl_uint num_gpu_devices = 0;
+  static const bool VALIDATE_ONLY = true;
+
+  if (platform != NULL && platform != AMD_PLATFORM) {
+    LogWarning("\"platrform\" is not a valid AMD platform");
+    return CL_INVALID_PLATFORM;
+  }
+  // Check that the input parameters are consistent
+  if ((num_media_adapters == 0) || (media_adapters_type == NULL) || (media_adapters == NULL) ||
+      (media_adapter_set != CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR &&
+       media_adapter_set != CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR) ||
+      (num_entries == 0 && devices != NULL)) {
+    return CL_INVALID_VALUE;
+  }
+  // Get the number of GPU devices
+  errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 0, NULL, &num_gpu_devices);
+  if (errcode != CL_SUCCESS && errcode != CL_DEVICE_NOT_FOUND) {
+    return CL_INVALID_VALUE;
+  }
+
+  if (!num_gpu_devices) {
+    *not_null(num_devices) = 0;
+    return CL_DEVICE_NOT_FOUND;
+  }
+
+  switch (media_adapter_set) {
+    case CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR:
+    case CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR: {
+      gpu_devices = new cl_device_id[num_gpu_devices];
+      errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, num_gpu_devices, gpu_devices, NULL);
+      if (errcode != CL_SUCCESS) {
+        break;
+      }
+
+      std::vector<amd::Device*> compatible_devices;
+      for (cl_uint i = 0; i < num_gpu_devices; ++i) {
+        cl_device_id device = gpu_devices[i];
+        if (!is_valid(device)) {
+          continue;
+        }
+
+        // Several DX9 adapters may be passed in, so interoperability is validated against each
+        // of them. The adapter type is looked up with the adapter index j: the type array has
+        // num_media_adapters entries and must not be indexed with the GPU index.
+        for (cl_uint j = 0; j < num_media_adapters; ++j) {
+          // Only D3D9Ex adapters are accepted, so the D3D9Ex context flag and slot are used.
+          if (media_adapters_type[j] != CL_ADAPTER_D3D9EX_KHR) {
+            continue;
+          }
+
+          void* external_device[amd::Context::DeviceFlagIdx::LastDeviceFlagIdx] = {};
+          external_device[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx] = d3d9_device[j];
+
+          if (as_amd(device)->bindExternalDevice(amd::Context::Flags::D3D9DeviceEXKhr,
+                                                 external_device, NULL, VALIDATE_ONLY)) {
+            compatible_devices.push_back(as_amd(device));
+            // Report each GPU once, even if it is compatible with several adapters.
+            break;
+          }
+        }
+      }
+      if (compatible_devices.size() == 0) {
+        *not_null(num_devices) = 0;
+        errcode = CL_DEVICE_NOT_FOUND;
+        break;
+      }
+
+      const cl_uint num_compatible = static_cast<cl_uint>(compatible_devices.size());
+
+      // devices may be NULL when the caller only wants the device count.
+      if (devices != NULL) {
+        const cl_uint num_to_copy = std::min(num_entries, num_compatible);
+        for (cl_uint k = 0; k < num_to_copy; ++k) {
+          devices[k] = as_cl(compatible_devices[k]);
+        }
+        // Clear the entries of the output array that were not filled.
+        for (cl_uint k = num_to_copy; k < num_entries; ++k) {
+          devices[k] = (cl_device_id)0;
+        }
+      }
+
+      *not_null(num_devices) = num_compatible;
+    } break;
+
+    default:
+      LogWarning("\"d3d9_device_set\" is invalid");
+      errcode = CL_INVALID_VALUE;
+  }
+
+  delete[] gpu_devices;
+  return errcode;
+}
+RUNTIME_EXIT
+
+// Creates an OpenCL image object from a DX9 media surface (or one plane of it).
+//
+// context       Valid OpenCL context; at least one of its devices must support images.
+// flags         Memory flags; 0 is treated as CL_MEM_READ_WRITE.
+// adapter_type  CL_ADAPTER_D3D9_KHR, CL_ADAPTER_D3D9EX_KHR or CL_ADAPTER_DXVA_KHR.
+// surface_info  Pointer to a cl_dx9_surface_info_khr describing the surface.
+// plane         Plane index: 0..1 for NV12/P010, 0..2 for YV12, 0 for every other format.
+// errcode_ret   Optional output for the error code.
+//
+// Returns the new cl_mem object, or NULL on failure with *errcode_ret set accordingly.
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromDX9MediaSurfaceKHR,
+                  (cl_context context, cl_mem_flags flags,
+                   cl_dx9_media_adapter_type_khr adapter_type, void* surface_info, cl_uint plane,
+                   cl_int* errcode_ret)) {
+  cl_mem clMemObj = NULL;
+
+  if (!is_valid(context)) {
+    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
+    LogWarning("invalid parameter \"context\"");
+    return clMemObj;
+  }
+
+  if (!flags) flags = CL_MEM_READ_WRITE;
+  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
+        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
+        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("invalid parameter \"flags\"");
+    return clMemObj;
+  }
+
+  if ((adapter_type != CL_ADAPTER_D3D9_KHR) && (adapter_type != CL_ADAPTER_D3D9EX_KHR) &&
+      (adapter_type != CL_ADAPTER_DXVA_KHR)) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    return clMemObj;
+  }
+
+  if (!surface_info) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("parameter \"surface_info\" is a NULL pointer");
+    return clMemObj;
+  }
+
+  cl_dx9_surface_info_khr* cl_surf_info = static_cast<cl_dx9_surface_info_khr*>(surface_info);
+  IDirect3DSurface9* pD3D9Resource = cl_surf_info->resource;
+
+  if (!pD3D9Resource) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    LogWarning("parameter \"pD3DResource\" is a NULL pointer");
+    return clMemObj;
+  }
+
+  // The description is needed to validate the plane index; do not read it if the query failed.
+  D3DSURFACE_DESC Desc;
+  if (D3D_OK != pD3D9Resource->GetDesc(&Desc)) {
+    *not_null(errcode_ret) = CL_INVALID_D3D9_RESOURCE_KHR;
+    LogWarning("cannot get the description of the D3D9 surface");
+    return clMemObj;
+  }
+
+  // Number of planes addressable for the surface format
+  cl_uint numPlanes;
+  switch (Desc.Format) {
+    case D3DFMT_NV_12:
+    case D3DFMT_P010:
+      numPlanes = 2;
+      break;
+    case D3DFMT_YV_12:
+      numPlanes = 3;
+      break;
+    default:
+      numPlanes = 1;
+      break;
+  }
+
+  if (plane >= numPlanes) {
+    *not_null(errcode_ret) = CL_INVALID_VALUE;
+    if (numPlanes == 1) {
+      LogWarning("The plane has to be Zero if the surface format is non-planar !");
+    } else {
+      LogWarning("The plane index is out of range for the planar surface format");
+    }
+    return clMemObj;
+  }
+
+  // Check for image support: one image-capable device in the context is enough
+  const std::vector<amd::Device*>& devices = as_amd(context)->devices();
+  bool supportPass = false;
+  for (const auto& it : devices) {
+    if (it->info().imageSupport_) {
+      supportPass = true;
+      break;
+    }
+  }
+  if (!supportPass) {
+    *not_null(errcode_ret) = CL_INVALID_OPERATION;
+    LogWarning("there are no devices in context to support images");
+    return (cl_mem)0;
+  }
+
+  // Create the 2D image object backed by the D3D9 surface
+  return amd::clCreateImage2DFromD3D9ResourceAMD(*as_amd(context), flags, adapter_type,
+                                                 cl_surf_info, plane, errcode_ret);
+}
+RUNTIME_EXIT
+
+// Enqueues the acquisition of DX9 media surfaces by forwarding every argument to
+// amd::clEnqueueAcquireExtObjectsAMD, tagged with CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR.
+RUNTIME_ENTRY(cl_int, clEnqueueAcquireDX9MediaSurfacesKHR,
+              (cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
+               cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
+  const cl_int status = amd::clEnqueueAcquireExtObjectsAMD(
+      command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event,
+      CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR);
+  return status;
+}
+RUNTIME_EXIT
+
+// Enqueues the release of DX9 media surfaces by forwarding every argument to
+// amd::clEnqueueReleaseExtObjectsAMD, tagged with CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR.
+RUNTIME_ENTRY(cl_int, clEnqueueReleaseDX9MediaSurfacesKHR,
+              (cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
+               cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
+  const cl_int status = amd::clEnqueueReleaseExtObjectsAMD(
+      command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event,
+      CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR);
+  return status;
+}
+RUNTIME_EXIT
+
+//
+//
+//          namespace amd
+//
+//
+namespace amd {
+/*! @}
+ *  \addtogroup CL-D3D9 interop helper functions
+ *  @{
+ */
+//
+// Class D3D9Object implementation
+//
+// Registry of surfaces already used for interop. initD3D9Object() appends one pair per
+// successfully initialized object: first = the application's surface info and plane,
+// second = the resource/shared handle actually used by the object and the same plane.
+std::vector<std::pair<TD3D9RESINFO, TD3D9RESINFO>> D3D9Object::resources_;
+// Lock held by initD3D9Object() while it reads and updates resources_.
+Monitor D3D9Object::resLock_;
+
+//
+//      clCreateImage2DFromD3D9ResourceAMD
+//
+// Creates a 2D image memory object that wraps one plane of a D3D9 surface.
+//
+// amdContext    Context the image is created in.
+// flags         Memory flags passed to the image constructor.
+// adapter_type  Media adapter type, validated by D3D9Object::initD3D9Object.
+// surface_info  Surface description (resource pointer and shared handle).
+// plane         Plane of the surface to expose.
+// errcode_ret   Optional output for the error code.
+//
+// Returns the cl_mem handle on success. On failure returns 0 and reports the error of
+// initD3D9Object, CL_OUT_OF_HOST_MEMORY or CL_MEM_OBJECT_ALLOCATION_FAILURE.
+cl_mem clCreateImage2DFromD3D9ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                          cl_dx9_media_adapter_type_khr adapter_type,
+                                          cl_dx9_surface_info_khr* surface_info, cl_uint plane,
+                                          int* errcode_ret) {
+  // Validate the surface and collect the interop information in a temporary object
+  D3D9Object obj;
+  cl_int errcode = D3D9Object::initD3D9Object(amdContext, adapter_type, surface_info, plane, obj);
+  if (CL_SUCCESS != errcode) {
+    *not_null(errcode_ret) = errcode;
+    return (cl_mem)0;
+  }
+
+  Image2DD3D9* pImage2DD3D9 = new (amdContext) Image2DD3D9(amdContext, flags, obj);
+  if (!pImage2DD3D9) {
+    *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
+    return (cl_mem)0;
+  }
+  if (!pImage2DD3D9->create()) {
+    *not_null(errcode_ret) = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    // Drop the reference to the image object that could not be created
+    pImage2DD3D9->release();
+    return (cl_mem)0;
+  }
+
+  *not_null(errcode_ret) = CL_SUCCESS;
+  return as_cl<Memory>(pImage2DD3D9);
+}
+
+//
+// Helper function SyncD3D9Objects
+//
+// Waits until the D3D9 work issued so far has completed, using the event query of the first
+// memory object in memObjects.
+//
+// The function only logs a warning and returns when the list is empty or when the first
+// entry has no memory object, interop object, D3D9 object or query.
+void SyncD3D9Objects(std::vector<amd::Memory*>& memObjects) {
+  // front() on an empty vector is undefined behaviour
+  if (memObjects.empty()) {
+    LogWarning("\nEmpty memory object list\n");
+    return;
+  }
+  Memory* mem = memObjects.front();
+  if (!mem) {
+    LogWarning("\nNULL memory object\n");
+    return;
+  }
+  InteropObject* interop = mem->getInteropObj();
+  if (!interop) {
+    LogWarning("\nNULL interop object\n");
+    return;
+  }
+  D3D9Object* d3d9Obj = interop->asD3D9Object();
+  if (!d3d9Obj) {
+    LogWarning("\nNULL D3D9 object\n");
+    return;
+  }
+  IDirect3DQuery9* query = d3d9Obj->getQuery();
+  if (!query) {
+    LogWarning("\nNULL IDirect3DQuery9\n");
+    return;
+  }
+  ScopedLock sl(d3d9Obj->getResLock());
+  query->Issue(D3DISSUE_END);
+  BOOL data = FALSE;
+  HRESULT hr;
+  // Poll only while the query is still pending (S_FALSE). Any other result, including an
+  // error such as a lost device, ends the wait instead of spinning forever.
+  do {
+    hr = query->GetData(&data, sizeof(BOOL), D3DGETDATA_FLUSH);
+  } while (hr == S_FALSE);
+  if (hr != S_OK) {
+    LogWarning("\nIDirect3DQuery9::GetData failed\n");
+  }
+}
+
+//
+// Class D3D9Object implementation (member functions)
+//
+// Returns the number of bytes per pixel for the given D3D9 format.
+//
+// d3d9Format  Surface format.
+// plane       Plane index; only consulted for the NV12 and P010 formats.
+//
+// Returns 0 for the formats listed in the first group below, for unknown formats (which also
+// trigger an assertion) and for an NV12/P010 plane other than 0 or 1.
+size_t D3D9Object::getElementBytes(D3DFORMAT d3d9Format, cl_uint plane) {
+  // Start from 0 so that an unexpected plane index never returns an indeterminate value
+  size_t bytesPerPixel = 0;
+
+  switch (d3d9Format) {
+    case D3DFMT_UNKNOWN:
+    case D3DFMT_UYVY:
+    case D3DFMT_DXT1:
+    case D3DFMT_DXT2:
+    case D3DFMT_DXT3:
+    case D3DFMT_DXT4:
+    case D3DFMT_DXT5:
+    case D3DFMT_VERTEXDATA:
+    case D3DFMT_D32:
+    case D3DFMT_D15S1:
+    case D3DFMT_D24S8:
+    case D3DFMT_D24X8:
+    case D3DFMT_D24X4S4:
+    case D3DFMT_D16:
+    case D3DFMT_INDEX16:
+    case D3DFMT_INDEX32:
+    case D3DFMT_MULTI2_ARGB8:
+    case D3DFMT_CxV8U8:
+      // Less than 1 byte per pixel - needs special consideration
+      bytesPerPixel = 0;
+      break;
+
+    case D3DFMT_R3G3B2:
+    case D3DFMT_P8:
+    case D3DFMT_A8:
+    case D3DFMT_L8:
+    case D3DFMT_A4L4:
+      bytesPerPixel = 1;
+      break;
+
+    case D3DFMT_R16F:
+    case D3DFMT_R5G6B5:
+    case D3DFMT_X1R5G5B5:
+    case D3DFMT_A1R5G5B5:
+    case D3DFMT_A4R4G4B4:
+    case D3DFMT_A8R3G3B2:
+    case D3DFMT_X4R4G4B4:
+    case D3DFMT_A8P8:
+    case D3DFMT_A8L8:
+    case D3DFMT_V8U8:
+    case D3DFMT_L6V5U5:
+    case D3DFMT_D16_LOCKABLE:
+    case D3DFMT_L16:
+      bytesPerPixel = 2;
+      break;
+
+    case D3DFMT_R8G8B8:
+    case D3DFMT_D24FS8:
+      bytesPerPixel = 3;
+      break;
+
+    case D3DFMT_D32F_LOCKABLE:
+    case D3DFMT_A8R8G8B8:
+    case D3DFMT_R32F:
+    case D3DFMT_X8R8G8B8:
+    case D3DFMT_A2B10G10R10:
+    case D3DFMT_A8B8G8R8:
+    case D3DFMT_X8B8G8R8:
+    case D3DFMT_G16R16:
+    case D3DFMT_A2R10G10B10:
+    case D3DFMT_Q8W8V8U8:
+    case D3DFMT_X8L8V8U8:
+    case D3DFMT_V16U16:
+    case D3DFMT_A2W10V10U10:
+    case D3DFMT_R8G8_B8G8:
+    case D3DFMT_G8R8_G8B8:
+    case D3DFMT_G16R16F:
+    case D3DFMT_YUY2:
+      bytesPerPixel = 4;
+      break;
+
+    case D3DFMT_G32R32F:
+    case D3DFMT_A16B16G16R16:
+    case D3DFMT_A16B16G16R16F:
+    case D3DFMT_Q16W16V16U16:
+      bytesPerPixel = 8;
+      break;
+    case D3DFMT_A32B32G32R32F:
+      bytesPerPixel = 16;
+      break;
+    //#if !defined(D3D_DISABLE_9EX)
+    // case D3DFMT_D32_LOCKABLE:
+    // case D3DFMT_S8_LOCKABLE:
+    //#endif // !D3D_DISABLE_9EX
+    case D3DFMT_NV_12:
+      if (plane == 0) {
+        bytesPerPixel = 1;
+      } else if (plane == 1) {
+        bytesPerPixel = 2;
+      }  // any other plane is not expected here and keeps the 0 default
+      break;
+    case D3DFMT_P010:
+      if (plane == 0) {
+        bytesPerPixel = 2;
+      } else if (plane == 1) {
+        bytesPerPixel = 4;
+      }  // any other plane is not expected here and keeps the 0 default
+      break;
+    case D3DFMT_YV_12:
+      bytesPerPixel = 1;
+      break;
+
+    default:
+      bytesPerPixel = 0;
+      _ASSERT(FALSE);
+      break;
+  }
+  return bytesPerPixel;
+}
+
+// Fills objDesc from the D3D9 surface description resDesc for the requested plane.
+//
+// Pool, type, usage and format are copied as they are. The object size and the surface
+// rectangle depend on the format:
+//   - NV12 / P010: planes 0 and 1 are handled; other planes leave size, right and bottom as is.
+//   - YV12:        planes 0, 1 and 2 are handled; other planes only get surfRect_.left set.
+//   - otherwise:   the whole surface is used, with the width halved for YUY2.
+void setObjDesc(amd::D3D9ObjDesc_t& objDesc, D3DSURFACE_DESC& resDesc, cl_uint plane) {
+  objDesc.d3dPool_ = resDesc.Pool;
+  objDesc.resType_ = resDesc.Type;
+  objDesc.usage_ = resDesc.Usage;
+  objDesc.d3dFormat_ = resDesc.Format;
+
+  const bool twoPlaneFormat =
+      (resDesc.Format == D3DFMT_NV_12) || (resDesc.Format == D3DFMT_P010);
+
+  if (twoPlaneFormat) {
+    objDesc.surfRect_.left = 0;
+    objDesc.surfRect_.top = 0;
+    if (plane == 0 || plane == 1) {
+      // Plane 0 has the full surface size, plane 1 has half of it in both dimensions
+      const UINT divisor = (plane == 0) ? 1 : 2;
+      objDesc.objSize_.Height = resDesc.Height / divisor;
+      objDesc.objSize_.Width = resDesc.Width / divisor;
+      // Both planes share the same rectangle
+      objDesc.surfRect_.right = resDesc.Width;
+      objDesc.surfRect_.bottom = 3 * resDesc.Height / 2;
+    }
+    return;
+  }
+
+  if (resDesc.Format == D3DFMT_YV_12) {
+    objDesc.surfRect_.left = 0;
+    if (plane == 0) {
+      objDesc.objSize_.Height = resDesc.Height;
+      objDesc.objSize_.Width = resDesc.Width;
+      objDesc.surfRect_.top = 0;
+      objDesc.surfRect_.right = resDesc.Width - 1;
+      objDesc.surfRect_.bottom = resDesc.Height - 1;
+    } else if (plane == 1 || plane == 2) {
+      // Planes 1 and 2 have the same half-size and differ only in their vertical range
+      objDesc.objSize_.Height = resDesc.Height / 2;
+      objDesc.objSize_.Width = resDesc.Width / 2;
+      objDesc.surfRect_.right = resDesc.Width / 2 - 1;
+      if (plane == 1) {
+        objDesc.surfRect_.top = resDesc.Height;
+        objDesc.surfRect_.bottom = 3 * resDesc.Height / 2 - 1;
+      } else {
+        objDesc.surfRect_.top = 3 * resDesc.Height / 2;
+        objDesc.surfRect_.bottom = 2 * resDesc.Height - 1;
+      }
+    }
+    return;
+  }
+
+  // Every other format: the object covers the whole surface
+  objDesc.objSize_.Height = resDesc.Height;
+  objDesc.objSize_.Width = resDesc.Width;
+  objDesc.surfRect_.left = 0;
+  objDesc.surfRect_.top = 0;
+  objDesc.surfRect_.right = resDesc.Width - 1;
+  objDesc.surfRect_.bottom = resDesc.Height - 1;
+  if (resDesc.Format == D3DFMT_YUY2) {
+    objDesc.objSize_.Width >>= 1;
+  }
+}
+
+// Initializes obj with the interop information for one plane of a D3D9 surface and records
+// the surface in resources_.
+//
+// amdContext    Context used to check that the resulting image format is supported.
+// adapter_type  Only CL_ADAPTER_D3D9EX_KHR is supported; D3D9 and DXVA are rejected.
+// cl_surf_info  Surface description (resource pointer and optional shared handle).
+// plane         Plane of the surface to expose.
+// obj           Object to fill in.
+//
+// Returns CL_SUCCESS, CL_INVALID_DX9_MEDIA_ADAPTER_KHR, CL_INVALID_D3D9_RESOURCE_KHR,
+// CL_OUT_OF_RESOURCES or CL_INVALID_IMAGE_FORMAT_DESCRIPTOR.
+int D3D9Object::initD3D9Object(const Context& amdContext,
+                               cl_dx9_media_adapter_type_khr adapter_type,
+                               cl_dx9_surface_info_khr* cl_surf_info, cl_uint plane,
+                               D3D9Object& obj) {
+  ScopedLock sl(resLock_);
+
+  IDirect3DDevice9Ex* pDev9Ex = NULL;
+  cl_int errcode = CL_SUCCESS;
+
+  IDirect3DSurface9* pD3D9res = cl_surf_info->resource;
+  HANDLE shared_handle = cl_surf_info->shared_handle;
+
+  if ((adapter_type == CL_ADAPTER_D3D9_KHR) || (adapter_type == CL_ADAPTER_DXVA_KHR)) {
+    return CL_INVALID_DX9_MEDIA_ADAPTER_KHR;  // Not supported yet
+  }
+
+  // Reject the request if this plane of this resource has already been used for interop
+  for (const auto& it : resources_) {
+    if (it.first.surfInfo.resource == cl_surf_info->resource && it.first.surfPlane == plane) {
+      return CL_INVALID_D3D9_RESOURCE_KHR;
+    }
+  }
+
+  D3DSURFACE_DESC resDesc;
+  if (D3D_OK != pD3D9res->GetDesc(&resDesc)) {
+    return CL_INVALID_D3D9_RESOURCE_KHR;
+  }
+
+  // The surface must belong to a D3D9Ex device
+  HRESULT hr = pD3D9res->GetContainer(IID_IDirect3DDevice9Ex, (void**)&pDev9Ex);
+  if (hr != D3D_OK || pDev9Ex == NULL) {
+    return CL_INVALID_D3D9_RESOURCE_KHR;  // d3d9ex should be supported
+  }
+
+  // The event query is used later to wait for D3D9 work; without it the object is unusable,
+  // so fail here instead of leaving a NULL query behind.
+  hr = pDev9Ex->CreateQuery(D3DQUERYTYPE_EVENT, &(obj.pQuery_));
+  if (hr != D3D_OK) {
+    pDev9Ex->Release();
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  obj.handleShared_ = shared_handle;
+  obj.surfPlane_ = plane;
+  obj.surfInfo_ = *cl_surf_info;
+  obj.adapterType_ = adapter_type;
+
+  // Init defaults
+  setObjDesc(obj.objDescOrig_, resDesc, plane);
+  obj.objDesc_ = obj.objDescOrig_;
+
+  if (NULL == shared_handle) {
+    // The application surface has no shared handle. Reuse the shared surface already
+    // registered for another plane of the same resource, or create a new shared surface.
+    bool found = false;
+    for (const auto& it : resources_) {
+      if (it.first.surfInfo.resource == cl_surf_info->resource &&
+          it.first.surfPlane != plane) {
+        obj.handleShared_ = it.second.surfInfo.shared_handle;
+        obj.pD3D9Res_ = it.second.surfInfo.resource;
+        obj.pD3D9Res_->AddRef();
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      obj.handleShared_ = 0;
+      hr = pDev9Ex->CreateOffscreenPlainSurface(resDesc.Width, resDesc.Height, resDesc.Format,
+                                                resDesc.Pool, &obj.pD3D9Res_, &obj.handleShared_);
+
+      if (D3D_OK != hr) {
+        errcode = CL_INVALID_D3D9_RESOURCE_KHR;
+      }
+    }
+
+    // Keep the original surface in the object
+    obj.pD3D9ResOrig_ = pD3D9res;
+    obj.pD3D9ResOrig_->AddRef();  // addRef in case lost the resource
+  } else {
+    // Share the original resource
+    obj.pD3D9ResOrig_ = NULL;
+    obj.pD3D9Res_ = pD3D9res;
+    obj.pD3D9Res_->AddRef();
+  }
+
+  // Release the Ex interface obtained from GetContainer
+  pDev9Ex->Release();
+
+  // Check for CL format compatibility
+  if (obj.objDesc_.resType_ == D3DRTYPE_SURFACE) {
+    cl_image_format clFmt = obj.getCLFormatFromD3D9(obj.objDesc_.d3dFormat_, plane);
+    amd::Image::Format imageFormat(clFmt);
+    if (!imageFormat.isSupported(amdContext)) {
+      return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    }
+  }
+
+  // Register the original/shared pair only when everything succeeded
+  if (errcode == CL_SUCCESS) {
+    TD3D9RESINFO d3d9ObjOri = {*cl_surf_info, plane};
+    TD3D9RESINFO d3d9ObjShared = {{obj.pD3D9Res_, obj.handleShared_}, plane};
+    resources_.push_back({d3d9ObjOri, d3d9ObjShared});
+  }
+
+  return errcode;
+}
+// Maps the original surface format to a miscellaneous flag value:
+// 1 for NV12 and P010, 2 for YV12, 3 for YUY2 and 0 for every other format.
+cl_uint D3D9Object::getMiscFlag() {
+  cl_uint miscFlag = 0;
+  switch (objDescOrig_.d3dFormat_) {
+    case D3DFMT_NV_12:
+    case D3DFMT_P010:
+      miscFlag = 1;
+      break;
+    case D3DFMT_YV_12:
+      miscFlag = 2;
+      break;
+    case D3DFMT_YUY2:
+      miscFlag = 3;
+      break;
+    default:
+      break;
+  }
+  return miscFlag;
+}
+
+// Returns the CL image format for this object's current D3D9 format and surface plane.
+cl_image_format D3D9Object::getCLFormatFromD3D9() {
+  const cl_image_format clFormat = getCLFormatFromD3D9(objDesc_.d3dFormat_, surfPlane_);
+  return clFormat;
+}
+
+// Translates a D3D9 surface format (and plane, for NV12/P010) into a CL image format.
+//
+// Formats without a CL equivalent trigger an assertion and return a format whose channel
+// order and data type are both 0. For NV12/P010 a plane other than 0 or 1 sets only the data
+// type and leaves the channel order at 0.
+cl_image_format D3D9Object::getCLFormatFromD3D9(D3DFORMAT d3d9Fmt, cl_uint plane) {
+  cl_uint order = 0;     // 0 means "no channel order selected"
+  cl_uint dataType = 0;  // 0 means "no channel data type selected"
+
+  switch (d3d9Fmt) {
+    // Single-channel formats
+    case D3DFMT_R32F:
+      order = CL_R;
+      dataType = CL_FLOAT;
+      break;
+    case D3DFMT_R16F:
+      order = CL_R;
+      dataType = CL_HALF_FLOAT;
+      break;
+    case D3DFMT_L16:
+      order = CL_R;
+      dataType = CL_UNORM_INT16;
+      break;
+    case D3DFMT_L8:
+    case D3DFMT_YV_12:
+      order = CL_R;
+      dataType = CL_UNORM_INT8;
+      break;
+    case D3DFMT_A8:
+      order = CL_A;
+      dataType = CL_UNORM_INT8;
+      break;
+
+    // Two-channel formats
+    case D3DFMT_G32R32F:
+      order = CL_RG;
+      dataType = CL_FLOAT;
+      break;
+    case D3DFMT_G16R16F:
+      order = CL_RG;
+      dataType = CL_HALF_FLOAT;
+      break;
+    case D3DFMT_G16R16:
+      order = CL_RG;
+      dataType = CL_UNORM_INT16;
+      break;
+    case D3DFMT_A8L8:
+      order = CL_RG;
+      dataType = CL_UNORM_INT8;
+      break;
+
+    // Four-channel formats
+    case D3DFMT_A32B32G32R32F:
+      order = CL_RGBA;
+      dataType = CL_FLOAT;
+      break;
+    case D3DFMT_A16B16G16R16F:
+      order = CL_RGBA;
+      dataType = CL_HALF_FLOAT;
+      break;
+    case D3DFMT_A16B16G16R16:
+      order = CL_RGBA;
+      dataType = CL_UNORM_INT16;
+      break;
+    case D3DFMT_A8B8G8R8:
+    case D3DFMT_X8B8G8R8:
+      order = CL_RGBA;
+      dataType = CL_UNORM_INT8;
+      break;
+    case D3DFMT_A8R8G8B8:
+    case D3DFMT_X8R8G8B8:
+      order = CL_BGRA;
+      dataType = CL_UNORM_INT8;
+      break;
+    case D3DFMT_YUY2:
+      order = CL_RGBA;
+      dataType = CL_UNSIGNED_INT8;
+      break;
+
+    // Two-plane YUV formats: the channel order depends on the plane
+    case D3DFMT_NV_12:
+    case D3DFMT_P010:
+      dataType = (d3d9Fmt == D3DFMT_NV_12) ? CL_UNORM_INT8 : CL_UNORM_INT16;
+      if (plane == 0) {
+        order = CL_R;
+      } else if (plane == 1) {
+        order = CL_RG;
+      }
+      break;
+
+    // Formats that are known but not supported; they are handled like any unknown format
+    case D3DFMT_UNKNOWN:
+    case D3DFMT_R8G8B8:
+    case D3DFMT_R5G6B5:
+    case D3DFMT_X1R5G5B5:
+    case D3DFMT_A1R5G5B5:
+    case D3DFMT_A4R4G4B4:
+    case D3DFMT_R3G3B2:
+    case D3DFMT_A8R3G3B2:
+    case D3DFMT_X4R4G4B4:
+    case D3DFMT_A2B10G10R10:
+    case D3DFMT_A2R10G10B10:
+    case D3DFMT_A8P8:
+    case D3DFMT_P8:
+    case D3DFMT_A4L4:
+    case D3DFMT_V8U8:
+    case D3DFMT_L6V5U5:
+    case D3DFMT_X8L8V8U8:
+    case D3DFMT_Q8W8V8U8:
+    case D3DFMT_V16U16:
+    case D3DFMT_A2W10V10U10:
+    case D3DFMT_UYVY:
+    case D3DFMT_R8G8_B8G8:
+    case D3DFMT_G8R8_G8B8:
+    case D3DFMT_DXT1:
+    case D3DFMT_DXT2:
+    case D3DFMT_DXT3:
+    case D3DFMT_DXT4:
+    case D3DFMT_DXT5:
+    case D3DFMT_D16_LOCKABLE:
+    case D3DFMT_D32:
+    case D3DFMT_D15S1:
+    case D3DFMT_D24S8:
+    case D3DFMT_D24X8:
+    case D3DFMT_D24X4S4:
+    case D3DFMT_D16:
+    case D3DFMT_D32F_LOCKABLE:
+    case D3DFMT_D24FS8:
+    case D3DFMT_D32_LOCKABLE:
+    case D3DFMT_S8_LOCKABLE:
+    case D3DFMT_VERTEXDATA:
+    case D3DFMT_INDEX16:
+    case D3DFMT_INDEX32:
+    case D3DFMT_Q16W16V16U16:
+    case D3DFMT_MULTI2_ARGB8:
+    case D3DFMT_CxV8U8:
+    case D3DFMT_A1:
+    case D3DFMT_A2B10G10R10_XR_BIAS:
+    case D3DFMT_BINARYBUFFER:
+    default:
+      _ASSERT(FALSE);  // not supported
+      break;
+  }
+
+  cl_image_format fmt;
+  fmt.image_channel_order = order;
+  fmt.image_channel_data_type = dataType;
+  return fmt;
+}
+
+// Copies the original application surface into the shared surface and waits for the copy to
+// complete.
+//
+// Returns true when there is no original surface (nothing to copy) or when the copy and the
+// following flush succeeded. Returns false when the D3D9 device cannot be obtained, when
+// StretchRect fails, when no event query is available, or when the flush reports an error.
+bool D3D9Object::copyOrigToShared() {
+  // Don't copy if there is no orig
+  if (NULL == getD3D9ResOrig()) return true;
+
+  ScopedLock sl(getResLock());
+
+  IDirect3DSurface9* srcSurf = getD3D9ResOrig();
+  IDirect3DSurface9* dstSurf = getD3D9Resource();
+
+  IDirect3DDevice9Ex* d3dDev = NULL;
+  HRESULT hr = dstSurf->GetContainer(IID_IDirect3DDevice9Ex, (void**)&d3dDev);
+  if (hr != D3D_OK || !d3dDev) {
+    LogError("\nCannot get D3D9 device from D3D9 surface\n");
+    return false;
+  }
+
+  hr = d3dDev->StretchRect(srcSurf, NULL, dstSurf, NULL, D3DTEXF_NONE);
+  if (hr != D3D_OK) {
+    LogError("\ncopy original surface to shared surface failed\n");
+    d3dDev->Release();  // do not leak the reference obtained from GetContainer
+    return false;
+  }
+
+  // Flush D3D queues and make sure D3D stuff is finished
+  bool flushed = false;
+  if (pQuery_ != NULL) {
+    pQuery_->Issue(D3DISSUE_END);
+    BOOL data = FALSE;
+    // Poll only while the query is still pending (S_FALSE); stop on success or on an error
+    do {
+      hr = pQuery_->GetData(&data, sizeof(BOOL), D3DGETDATA_FLUSH);
+    } while (hr == S_FALSE);
+    flushed = (hr == S_OK);
+  }
+  if (!flushed) {
+    LogError("\ncannot flush D3D9 queue after copying to shared surface\n");
+  }
+
+  d3dDev->Release();
+  return flushed;
+}
+
+// Copies the shared surface back into the original application surface.
+//
+// Returns true when there is no original surface (nothing to copy) or when the copy was
+// issued successfully. Returns false when the D3D9 device cannot be obtained or when
+// StretchRect fails.
+bool D3D9Object::copySharedToOrig() {
+  // Don't copy if there is no orig
+  if (NULL == getD3D9ResOrig()) return true;
+
+  ScopedLock sl(getResLock());
+
+  IDirect3DDevice9Ex* d3dDev = NULL;
+  HRESULT hr = getD3D9Resource()->GetContainer(IID_IDirect3DDevice9Ex, (void**)&d3dDev);
+  if (hr != D3D_OK || !d3dDev) {
+    LogError("\nCannot get D3D9 device from D3D9 surface\n");
+    return false;
+  }
+
+  hr = d3dDev->StretchRect(getD3D9Resource(), NULL, getD3D9ResOrig(), NULL, D3DTEXF_NONE);
+
+  // The device reference obtained from GetContainer is released on every path
+  d3dDev->Release();
+
+  if (hr != D3D_OK) {
+    LogError("\ncopy shared surface to original surface failed\n");
+    return false;
+  }
+  return true;
+}
+
+// Points deviceMemories_ at the bytes immediately following this Image2DD3D9 object and
+// zero-fills one DeviceMemory entry per device of the context.
+void Image2DD3D9::initDeviceMemory() {
+  char* const afterObject = reinterpret_cast<char*>(this) + sizeof(Image2DD3D9);
+  deviceMemories_ = reinterpret_cast<DeviceMemory*>(afterObject);
+
+  const size_t tableBytes = context_().devices().size() * sizeof(DeviceMemory);
+  memset(deviceMemories_, 0, tableBytes);
+}
+
+}  // namespace amd
+
+#endif  //_WIN32
@@ -0,0 +1,398 @@
+/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef CL_GL_AMD_HPP_
+#define CL_GL_AMD_HPP_
+
+#ifdef _WIN32
+#include <windows.h>
+#else //!_WIN32
+#include <dlfcn.h>
+#endif //!_WIN32
+
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "CL/cl_gl.h"
+#ifndef _WIN32
+#include <GL/glx.h>
+#endif //!_WIN32
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <EGL/eglplatform.h>
+
+#include "platform/context.hpp"
+#include "platform/command.hpp"
+
+namespace amd
+{
+
+//! Class GLObject keeps all the info about the GL object
+//! from which the CL object is created
+class GLObject : public InteropObject {
+ protected:
+  cl_gl_object_type clGLType_;  //!< CL GL object type
+  GLenum glTarget_;             //!< GL target given at construction
+  GLuint gluiName_;             //!< GL object name given at construction
+  GLint gliMipLevel_;           //!< GL mip level given at construction
+  GLenum glInternalFormat_;     //!< GL internal format given at construction
+  GLint gliWidth_;              //!< Width; also reported by getGLSize()
+  GLint gliHeight_;             //!< Height
+  GLint gliDepth_;              //!< Depth
+  GLenum glCubemapFace_;        //!< Cubemap face given at construction
+  GLsizei glNumSamples_;        //!< Sample count given at construction
+
+ public:
+  //! GLObject constructor: copies every argument into the member of the matching name
+  GLObject(GLenum glTarget, GLuint gluiName, GLint gliMipLevel, GLenum glInternalFormat,
+           GLint gliWidth, GLint gliHeight, GLint gliDepth, cl_gl_object_type clGLType,
+           GLenum glCubemapFace, GLsizei glNumSamples)
+      : clGLType_(clGLType),
+        glTarget_(glTarget),
+        gluiName_(gluiName),
+        gliMipLevel_(gliMipLevel),
+        glInternalFormat_(glInternalFormat),
+        gliWidth_(gliWidth),
+        gliHeight_(gliHeight),
+        gliDepth_(gliDepth),
+        glCubemapFace_(glCubemapFace),
+        glNumSamples_(glNumSamples) {}
+
+  virtual ~GLObject() {}
+
+  //! Returns this object as a GLObject
+  virtual GLObject* asGLObject() { return this; }
+
+  //! Read-only accessors for the stored GL info
+  GLenum getGLTarget() const { return glTarget_; }
+  GLuint getGLName() const { return gluiName_; }
+  GLint getGLMipLevel() const { return gliMipLevel_; }
+  GLenum getGLInternalFormat() const { return glInternalFormat_; }
+  //! Same value as getGLWidth(); both return the stored width
+  GLint getGLSize() const { return gliWidth_; }
+  GLint getGLWidth() const { return gliWidth_; }
+  GLint getGLHeight() const { return gliHeight_; }
+  GLint getGLDepth() const { return gliDepth_; }
+  cl_gl_object_type getCLGLObjectType() const { return clGLType_; }
+  GLenum getCubemapFace() const { return glCubemapFace_; }
+  GLsizei getNumSamples() const { return glNumSamples_; }
+};
+
+
+//! Class BufferGL is derived from classes Buffer and GLObject
+//! where the former keeps all data for the CL object and
+//! the latter keeps all data for the GL object
+class BufferGL : public Buffer, public GLObject {
+ protected:
+  //! Initializes the device memory array which is nested
+  //! after the 'BufferGL' object in memory layout.
+  virtual void initDeviceMemory();
+
+ public:
+  //! BufferGL constructor forwards its arguments to the base class constructors and
+  //! registers this object as its own interop object.
+  BufferGL(Context& amdContext, cl_mem_flags clFlags, size_t uiSizeInBytes, GLenum glTarget,
+           GLuint gluiName)
+      : Buffer(amdContext, clFlags, uiSizeInBytes),
+        GLObject(glTarget, gluiName,
+                 0,                // Mipmap level default
+                 GL_ARRAY_BUFFER,  // Just init to some value
+                 // The byte size is stored in the GLObject width slot, narrowed to GLint.
+                 static_cast<GLint>(uiSizeInBytes),
+                 1,  // height
+                 1,  // depth
+                 CL_GL_OBJECT_BUFFER,
+                 0,  // cubemap face
+                 0   // number of samples
+        ) {
+    setInteropObj(this);
+  }
+
+  virtual ~BufferGL() {}
+
+  //! Returns this object as a BufferGL
+  virtual BufferGL* asBufferGL() { return this; }
+};
+
+
+//! Class ImageGL is derived from classes Image and GLObject
+//! where the former keeps all data for the CL object and
+//! the latter keeps all data for the GL object
+class ImageGL : public Image, public GLObject {
+ public:
+  //! ImageGL constructor forwards its arguments to the base class constructors and
+  //! registers this object as its own interop object.
+  //! The two sizes handed to Image after 'depth' are elementSize * width and
+  //! elementSize * width * depth.
+  // NOTE(review): if the last Image argument is a slice pitch, width * height would be the
+  // expected factor instead of width * depth -- confirm against the Image constructor.
+  ImageGL(Context& amdContext, cl_mem_object_type clType, cl_mem_flags clFlags,
+          const Format& format, size_t width, size_t height, size_t depth, GLenum glTarget,
+          GLuint gluiName, GLint gliMipLevel, GLenum glInternalFormat,
+          cl_gl_object_type clGLType, GLsizei numSamples, GLenum glCubemapFace = 0)
+      : Image(amdContext, clType, clFlags, format, width, height, depth,
+              Format(format).getElementSize() * width,
+              Format(format).getElementSize() * width * depth),
+        GLObject(glTarget, gluiName, gliMipLevel, glInternalFormat, static_cast<GLint>(width),
+                 static_cast<GLint>(height), static_cast<GLint>(depth), clGLType, glCubemapFace,
+                 numSamples) {
+    setInteropObj(this);
+  }
+
+  virtual ~ImageGL() {}
+
+ protected:
+  //! Initializes the device memory array which is nested
+  //! after the 'ImageGL' object in memory layout.
+  virtual void initDeviceMemory();
+};
+
+    // Function-pointer type matching eglGetCurrentContext (no arguments, returns EGLContext).
+    typedef EGLContext (*PFN_eglGetCurrentContext) ();
+#ifdef _WIN32
+// Windows: GL entry points use the WINAPI calling convention and are looked up through
+// GetProcAddress / "wglGetProcAddress".
+#define APICALL WINAPI
+#define GETPROCADDRESS      GetProcAddress
+#define API_GETPROCADDR     "wglGetProcAddress"
+#define FCN_STR_TYPE        LPCSTR
+    typedef PROC (WINAPI* PFN_xxxGetProcAddress) (LPCSTR fcnName);
+    // Function-pointer types for the WGL context-management calls.
+    typedef HGLRC (APICALL* PFN_wglCreateContext) (HDC hdc);
+    typedef HGLRC (APICALL* PFN_wglGetCurrentContext) (void);
+    typedef HDC   (APICALL* PFN_wglGetCurrentDC) (void);
+    typedef BOOL  (APICALL* PFN_wglDeleteContext) (HGLRC hglrc);
+    typedef BOOL  (APICALL* PFN_wglMakeCurrent) (HDC hdc, HGLRC hglrc);
+    typedef BOOL  (APICALL* PFN_wglShareLists) (HGLRC hglrc1, HGLRC hglrc2);
+#else //!_WIN32
+// Non-Windows: APICALL and WINAPI expand to nothing, symbols are looked up with dlsym /
+// "glXGetProcAddress", and PROC / HMODULE are mapped to void*.
+// Note that WINAPI and PROC are plain macros here, so they affect every later use of
+// those identifiers in files that include this header.
+#define APICALL // __stdcall   //??? todo odintsov
+#define API_GETPROCADDR     "glXGetProcAddress"
+#define GETPROCADDRESS      dlsym
+#define FCN_STR_TYPE        const GLubyte*
+#define WINAPI
+#define PROC void*
+    typedef void* (*PFN_xxxGetProcAddress) (const GLubyte* procName);
+    // X11 typedef
+    typedef Display* (*PFNXOpenDisplay)(_Xconst char* display_name );
+    typedef int (*PFNXCloseDisplay)(Display* display );
+
+    //glx typedefs
+    typedef GLXDrawable (*PFNglXGetCurrentDrawable)();
+    typedef Display* (*PFNglXGetCurrentDisplay)();
+    typedef GLXContext (*PFNglXGetCurrentContext)( void );
+    typedef XVisualInfo* (*PFNglXChooseVisual)(Display *dpy, int screen, int *attribList);
+    typedef GLXContext(*PFNglXCreateContext)(Display* dpy,XVisualInfo* vis,GLXContext shareList,Bool direct);
+    typedef void(*PFNglXDestroyContext)(Display* dpy, GLXContext ctx);
+    typedef Bool(*PFNglXMakeCurrent)( Display* dpy, GLXDrawable drawable, GLXContext ctx);
+    typedef void* HMODULE;
+#endif //!_WIN32
+
+// First expansion of GLPREFIX: each entry of gl_functions.hpp becomes a typedef named
+// PFN_<fcn> for a pointer to a function with the given return type and argument list.
+#define GLPREFIX(rtype, fcn, dclargs) \
+    typedef rtype (APICALL* PFN_##fcn) dclargs;
+
+// Declare prototypes for GL functions
+#include "gl_functions.hpp"
+
+//! Holds the handle of the GL library, the resolved WGL/GLX/EGL/GL function pointers and
+//! the context/display handles (original, internal and temporary) declared below.
+class GLFunctions
+{
+public:
+    //! Locks any access to the virtual GPUs
+    // NOTE(review): the constructor and destructor are defined elsewhere; presumably they
+    // pair setIntEnv() with restoreEnv() on the given GLFunctions -- confirm.
+    class SetIntEnv : public amd::StackObject {
+    public:
+        //! Default constructor
+        SetIntEnv(GLFunctions* env);
+
+        //! Destructor
+        ~SetIntEnv();
+
+        //! Checks if the environment setup was successful
+        bool isValid() const { return isValid_; }
+
+    private:
+        GLFunctions*    env_;       //!< GL environment
+        bool            isValid_;   //!< If TRUE, then it's a valid setup
+    };
+
+private:
+    HMODULE libHandle_;   // Library handle passed to the constructor (see GLFunctions(HMODULE, bool))
+    int missed_;    // Indicates how many GL functions not init'ed, if any
+
+    amd::Monitor lock_;   // Exposed through getLock()
+
+    // EGL state; eglDisplay_ and eglOriginalContext_ are what isAssociated() tests when
+    // isEGL_ is set.
+    EGLDisplay eglDisplay_;
+    EGLContext eglOriginalContext_;
+    EGLContext eglInternalContext_;
+    EGLContext eglTempContext_;
+    bool isEGL_;
+    PFN_eglGetCurrentContext eglGetCurrentContext_;
+
+#ifdef _WIN32
+    // WGL state; hDC_ and hOrigGLRC_ are what isAssociated() tests on Windows.
+    HGLRC       hOrigGLRC_;
+    HDC         hDC_;
+    HGLRC       hIntGLRC_;  // handle for internal GLRC to access shared context
+    HDC         tempDC_;
+    HGLRC       tempGLRC_;
+
+public:
+    // Pointers to the WGL context-management functions.
+    PFN_wglCreateContext     wglCreateContext_;
+    PFN_wglGetCurrentContext wglGetCurrentContext_;
+    PFN_wglGetCurrentDC      wglGetCurrentDC_;
+    PFN_wglDeleteContext     wglDeleteContext_;
+    PFN_wglMakeCurrent       wglMakeCurrent_;
+    PFN_wglShareLists        wglShareLists_;
+#else
+public:
+    // GLX/X11 state; Dpy_ and origCtx_ are what isAssociated() tests on non-Windows builds.
+    // Unlike the Windows handles above, these members are public.
+    Display*    Dpy_;
+    GLXDrawable Drawable_;
+    GLXContext  origCtx_;
+    Display*    intDpy_;
+    Window      intDrawable_;
+    GLXContext  intCtx_;
+    Display*    tempDpy_;
+    GLXDrawable tempDrawable_;
+    GLXContext  tempCtx_;
+
+    //pointers to X11 functions
+    PFNXOpenDisplay XOpenDisplay_;
+    PFNXCloseDisplay XCloseDisplay_;
+
+    //pointers to GLX functions
+    PFNglXGetCurrentDrawable glXGetCurrentDrawable_;
+    PFNglXGetCurrentDisplay glXGetCurrentDisplay_;
+    PFNglXGetCurrentContext glXGetCurrentContext_;
+    PFNglXChooseVisual glXChooseVisual_;
+    PFNglXCreateContext glXCreateContext_;
+    PFNglXDestroyContext glXDestroyContext_;
+    PFNglXMakeCurrent glXMakeCurrent_;
+#endif
+public:
+
+    GLFunctions(HMODULE h, bool isEGL);
+    ~GLFunctions();
+
+    bool update(intptr_t hglrc);
+    // Returns true when info.hCtx_ is non-null and equal to the context reported as current
+    // by the EGL, WGL or GLX "get current context" function pointer.
+    // Only the EGL pointer is checked for nullptr before it is called; the WGL/GLX pointers
+    // are called unconditionally.
+    bool IsCurrentGlContext(const amd::Context::Info& info) const {
+      if (isEGL_) {
+        return ((info.hCtx_ != nullptr) && (eglGetCurrentContext_ != nullptr) &&
+                (info.hCtx_ == eglGetCurrentContext_()));
+      } else {
+#ifdef _WIN32
+        return ((info.hCtx_ != nullptr) && (info.hCtx_ == wglGetCurrentContext_()));
+#else
+        return ((info.hCtx_ != nullptr) && (info.hCtx_ == glXGetCurrentContext_()));
+#endif  // _WIN32
+      }
+    }
+
+    void WaitCurrentGlContext(const amd::Context::Info& info) const;
+
+    // Query CL-GL context association
+    // True when the EGL display and original context are both set (EGL mode), or when the
+    // platform's display/DC handle and original context handle are both set.
+    bool isAssociated() const
+    {
+        if (isEGL_ && eglDisplay_ && eglOriginalContext_) return true;
+#ifdef _WIN32
+        if(hDC_ && hOrigGLRC_) return true;
+#else //!_WIN32
+        if(Dpy_ && origCtx_) return true;
+#endif //!_WIN32
+        return false;
+    }
+    // Returns the EGL flag stored in isEGL_
+    bool isEGL() const
+    {
+        return isEGL_;
+    }
+    // Accessor methods
+    // NOTE(review): the EGL accessors exist only in the non-Windows branch even though the
+    // EGL members are declared for every platform -- confirm this is intended.
+#ifdef _WIN32
+    HGLRC getOrigGLRC() const {return hOrigGLRC_;}
+    HDC getDC() const {return hDC_;}
+    HGLRC getIntGLRC() const {return hIntGLRC_;}
+#else //!_WIN32
+    Display* getDpy() const {return Dpy_;}
+    GLXDrawable getDrawable() const {return Drawable_;}
+    GLXContext getOrigCtx() const {return origCtx_;}
+
+    Display* getIntDpy() const {return intDpy_;}
+    GLXDrawable getIntDrawable() const {return intDrawable_;}
+    GLXContext getIntCtx() const {return intCtx_;}
+
+    EGLDisplay getEglDpy() const { return eglDisplay_; }
+    EGLContext getEglOrigCtx() const { return eglOriginalContext_; }
+#endif //!_WIN32
+
+    // Initialize GL dynamic library and function pointers
+    bool init(intptr_t hdc, intptr_t hglrc);
+
+    // Return true if successful, false - if error occurred
+    bool setIntEnv();
+    bool restoreEnv();
+
+    amd::Monitor& getLock() { return lock_; }
+
+    // Pointer to the platform's GetProcAddress-style lookup function
+    PFN_xxxGetProcAddress GetProcAddress_;
+
+// Second expansion of GLPREFIX: each entry of gl_functions.hpp becomes a public member
+// named <fcn>_ of type PFN_<fcn>.
+// NOTE(review): GLPREFIX is defined again here without a visible #undef; presumably
+// gl_functions.hpp undefines it at its end -- confirm.
+#define GLPREFIX(rtype, fcn, dclargs)   \
+    PFN_##fcn fcn##_;
+// Declare pointers to GL functions
+#include "gl_functions.hpp"
+};
+
+//! Functions for executing the GL related stuff
+//! Each clCreateFromGL*AMD function takes the context, the CL memory flags and the GL
+//! object identification, returns a cl_mem and reports a status through errcode_ret.
+//! The definitions are not part of this header.
+cl_mem clCreateFromGLBufferAMD(Context& amdContext, cl_mem_flags flags,
+    GLuint bufobj, cl_int* errcode_ret);
+cl_mem clCreateFromGLTextureAMD(Context& amdContext, cl_mem_flags flags,
+    GLenum target, GLint miplevel, GLuint texture, int* errcode_ret);
+cl_mem clCreateFromGLRenderbufferAMD(Context& amdContext, cl_mem_flags flags,
+    GLuint renderbuffer, int* errcode_ret);
+
+//! Declared with a GL internal format as input and a CL image format plus a
+//! bytes-per-pixel value as outputs; returns bool.
+// NOTE(review): presumably false means the GL format has no CL equivalent for the given
+// flags -- confirm against the definition.
+bool
+getCLFormatFromGL(
+    const Context& amdContext,
+    GLint gliInternalFormat,
+    cl_image_format* pclImageFormat,
+    int* piBytesPerPixel,
+    cl_mem_flags flags
+);
+
+} //namespace amd
+
+#endif //CL_GL_AMD_HPP_
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Nothing to do when an earlier call already located ROCclr.
+if(ROCCLR_FOUND)
+  return()
+endif()
+
+# Locate the directory that contains top.hpp. ROCCLR_PATH is tried first as a hint; after
+# that the known checkout names are probed next to (and up to two levels above) the
+# top-level source directory, each with an "include" suffix.
+find_path(ROCCLR_INCLUDE_DIR top.hpp
+  HINTS
+    ${ROCCLR_PATH}
+  PATHS
+    # gerrit repo name
+    ${CMAKE_SOURCE_DIR}/vdi
+    ${CMAKE_SOURCE_DIR}/../vdi
+    ${CMAKE_SOURCE_DIR}/../../vdi
+    # github repo name
+    ${CMAKE_SOURCE_DIR}/ROCclr
+    ${CMAKE_SOURCE_DIR}/../ROCclr
+    ${CMAKE_SOURCE_DIR}/../../ROCclr
+    # jenkins repo name
+    ${CMAKE_SOURCE_DIR}/rocclr
+    ${CMAKE_SOURCE_DIR}/../rocclr
+    ${CMAKE_SOURCE_DIR}/../../rocclr
+  PATH_SUFFIXES
+    include)
+
+# Report the result (and fail for REQUIRED lookups) in the standard find-module way.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ROCclr
+  "\nROCclr not found"
+  ROCCLR_INCLUDE_DIR)
+mark_as_advanced(ROCCLR_INCLUDE_DIR)
+
+# Only pull in ROCclr's own CMake module when the include directory was actually found.
+# Without this guard a non-REQUIRED lookup that fails would append a "...-NOTFOUND/../cmake"
+# entry to CMAKE_MODULE_PATH and then abort in include(ROCclr).
+if(ROCCLR_INCLUDE_DIR)
+  list(APPEND CMAKE_MODULE_PATH "${ROCCLR_INCLUDE_DIR}/../cmake")
+  include(ROCclr)
+endif()
@@ -0,0 +1,40 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "vdi_common.hpp"
+#ifdef _WIN32
+#include <windows.h>
+#include <d3d9.h>
+#include <d3d10_1.h>
+#include <CL/cl_d3d10.h>
+#include <CL/cl_d3d11.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#include <CL/cl_icd.h>
+
+// Definition of the static ICD dispatch table: a one-element array whose entry is
+// zero-initialized.
+cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {0};
+// Definition of the static platform object, initialized with the dispatch table above.
+amd::PlatformIDS amd::PlatformID::Platform = {amd::ICDDispatchedObject::icdVendorDispatch_};
+
+// Stub implementation of clGetDeviceIDs: the body ignores every argument, writes nothing
+// to 'devices' or 'num_devices', and returns CL_SUCCESS.
+// NOTE(review): RUNTIME_ENTRY / RUNTIME_EXIT are macros defined elsewhere; whatever they
+// add around the body is not visible here.
+RUNTIME_ENTRY(cl_int, clGetDeviceIDs,
+              (cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
+               cl_device_id* devices, cl_uint* num_devices)) {
+  return CL_SUCCESS;
+}
+RUNTIME_EXIT
@@ -0,0 +1,26 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "platform/activity.hpp"
+#include <hip/hip_runtime_api.h>
+
+// C-linkage helper: converts the raw operation id to cl_command_type and returns the
+// string getOclCommandKindString() yields for it.
+extern "C" const char* hipGetCmdName(unsigned op) {
+  const cl_command_type commandKind = static_cast<cl_command_type>(op);
+  return getOclCommandKindString(commandKind);
+}
@@ -0,0 +1,910 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "hip_code_object.hpp"
+#include "amd_hsa_elf.hpp"
+
+#include <cstring>
+
+#include <hip/driver_types.h>
+#include "hip/hip_runtime_api.h"
+#include "hip/hip_runtime.h"
+#include "hip_internal.hpp"
+#include "platform/program.hpp"
+#include <elf/elf.hpp>
+
+// Forward declaration; the definition lives outside this file.
+hipError_t ihipFree(void* ptr);
+// forward declaration of methods required for managed variables
+// ('align' defaults to 0 when the caller does not pass it)
+hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0);
+namespace {
+// Compile-time length of a NUL-terminated string, not counting the terminator.
+size_t constexpr strLiteralLength(char const* str) {
+  return *str ? 1 + strLiteralLength(str + 1) : 0;
+}
+// Magic string found at the start of a clang offload bundle.
+constexpr char const* CLANG_OFFLOAD_BUNDLER_MAGIC_STR = "__CLANG_OFFLOAD_BUNDLE__";
+// Offload kinds accepted in a bundle entry id.
+constexpr char const* OFFLOAD_KIND_HIP = "hip";
+constexpr char const* OFFLOAD_KIND_HIPV4 = "hipv4";
+constexpr char const* OFFLOAD_KIND_HCC = "hcc";
+// Target triple prefix; note that it already ends with one '-'.
+constexpr char const* AMDGCN_TARGET_TRIPLE = "amdgcn-amd-amdhsa-";
+
+// ClangOFFLOADBundle info.
+static constexpr size_t bundle_magic_string_size =
+    strLiteralLength(CLANG_OFFLOAD_BUNDLER_MAGIC_STR);
+
+// Clang Offload bundler description & Header.
+// One entry of the bundle's table: where the code object sits, how large it is and the
+// length of the entry id, whose first byte is bundleEntryId[0].
+struct __ClangOffloadBundleInfo {
+  uint64_t offset;
+  uint64_t size;
+  uint64_t bundleEntryIdSize;
+  const char bundleEntryId[1];
+};
+
+// Start of a bundle: the magic string, the number of entries and the first entry.
+struct __ClangOffloadBundleHeader {
+  // The field covers the whole magic string. It used to be one byte shorter and only lined
+  // up with the data because of the alignment padding in front of numOfCodeObjects; the
+  // offsets of the following members and the struct size are unchanged.
+  const char magic[bundle_magic_string_size];
+  uint64_t numOfCodeObjects;
+  __ClangOffloadBundleInfo desc[1];
+};
+}  // namespace
+
+namespace hip {
+
+// Returns true when the first bundle_magic_string_size bytes at 'data' are exactly the
+// clang offload bundler magic string.
+bool CodeObject::IsClangOffloadMagicBundle(const void* data) {
+  return std::memcmp(data, CLANG_OFFLOAD_BUNDLER_MAGIC_STR, bundle_magic_string_size) == 0;
+}
+
+// Returns the size amd::Elf::getElfSize() reports for the ELF image at 'emi'.
+uint64_t CodeObject::ElfSize(const void* emi) {
+  const uint64_t elfBytes = amd::Elf::getElfSize(emi);
+  return elfBytes;
+}
+
+// Translates the EF_AMDGPU_MACH field of an ELF e_flags word into the processor name and
+// reports whether that processor has the xnack and sramecc target features.
+// Returns true and fills all three outputs for a known machine value; returns false and
+// leaves the outputs untouched otherwise.
+static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupported,
+                        bool& sramEccSupported) {
+  struct ProcEntry {
+    uint32_t mach;     // EF_AMDGPU_MACH_* value
+    const char* name;  // processor name
+    bool xnack;        // xnack feature available
+    bool sramecc;      // sramecc feature available
+  };
+
+  static const ProcEntry kKnownProcs[] = {
+      {EF_AMDGPU_MACH_AMDGCN_GFX700, "gfx700", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX701, "gfx701", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX702, "gfx702", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX703, "gfx703", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX704, "gfx704", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX705, "gfx705", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX801, "gfx801", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX802, "gfx802", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX803, "gfx803", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX805, "gfx805", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX810, "gfx810", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX900, "gfx900", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX902, "gfx902", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX904, "gfx904", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX906, "gfx906", true, true},
+      {EF_AMDGPU_MACH_AMDGCN_GFX908, "gfx908", true, true},
+      {EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a", true, true},
+      {EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940", true, true},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1013, "gfx1013", true, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1030, "gfx1030", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1031, "gfx1031", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1032, "gfx1032", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1033, "gfx1033", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1034, "gfx1034", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1035, "gfx1035", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1036, "gfx1036", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1100, "gfx1100", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1101, "gfx1101", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1102, "gfx1102", false, false},
+      {EF_AMDGPU_MACH_AMDGCN_GFX1103, "gfx1103", false, false},
+  };
+
+  const uint32_t mach = EFlags & EF_AMDGPU_MACH;
+  for (const ProcEntry& entry : kKnownProcs) {
+    if (entry.mach != mach) continue;
+    xnackSupported = entry.xnack;
+    sramEccSupported = entry.sramecc;
+    proc_name = entry.name;
+    return true;
+  }
+  return false;
+}
+
+// Builds the triple target id ("amdgcn-amd-amdhsa--<processor>[:sramecc±][:xnack±]") of an
+// AMDGPU HSA code object from its ELF header.
+// Returns false when code_object is null, when the ELF is not an AMDGPU/HSA object, when
+// the processor is unknown, or when the ABI version is V2 or unrecognized. In the V2 and
+// unrecognized cases target_id has already been overwritten with the feature-less id.
+static bool getTripleTargetIDFromCodeObject(const void* code_object, std::string& target_id) {
+  if (!code_object) return false;
+  const Elf64_Ehdr* ehdr = reinterpret_cast<const Elf64_Ehdr*>(code_object);
+  if (ehdr->e_machine != EM_AMDGPU) return false;
+  if (ehdr->e_ident[EI_OSABI] != ELFOSABI_AMDGPU_HSA) return false;
+
+  bool isXnackSupported{false}, isSramEccSupported{false};
+
+  std::string proc_name;
+  if (!getProcName(ehdr->e_flags, proc_name, isXnackSupported, isSramEccSupported)) return false;
+  // AMDGCN_TARGET_TRIPLE already ends with '-'; the extra '-' gives the "--" separator.
+  target_id = std::string(AMDGCN_TARGET_TRIPLE) + '-' + proc_name;
+
+  switch (ehdr->e_ident[EI_ABIVERSION]) {
+    case ELFABIVERSION_AMDGPU_HSA_V2: {
+      LogPrintfInfo("[Code Object V2, target id:%s]", target_id.c_str());
+      return false;
+    }
+
+    case ELFABIVERSION_AMDGPU_HSA_V3: {
+      LogPrintfInfo("[Code Object V3, target id:%s]", target_id.c_str());
+      // V3 stores each feature as a single on/off bit, so a setting is always emitted for
+      // every feature the processor supports.
+      if (isSramEccSupported) {
+        if (ehdr->e_flags & EF_AMDGPU_FEATURE_SRAMECC_V3)
+          target_id += ":sramecc+";
+        else
+          target_id += ":sramecc-";
+      }
+      if (isXnackSupported) {
+        if (ehdr->e_flags & EF_AMDGPU_FEATURE_XNACK_V3)
+          target_id += ":xnack+";
+        else
+          target_id += ":xnack-";
+      }
+      break;
+    }
+
+    case ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELFABIVERSION_AMDGPU_HSA_V5: {
+      // Compare for equality: a bitwise AND with the V4 constant cannot tell V4 and V5
+      // apart and made V5 objects show up as V4 in the log.
+      if (ehdr->e_ident[EI_ABIVERSION] == ELFABIVERSION_AMDGPU_HSA_V4) {
+        LogPrintfInfo("[Code Object V4, target id:%s]", target_id.c_str());
+      } else {
+        LogPrintfInfo("[Code Object V5, target id:%s]", target_id.c_str());
+      }
+      // V4+ uses a multi-valued field per feature; a setting is appended only for the
+      // explicit OFF / ON values.
+      unsigned co_sram_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_SRAMECC_V4;
+      if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_OFF_V4)
+        target_id += ":sramecc-";
+      else if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
+        target_id += ":sramecc+";
+
+      unsigned co_xnack_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_XNACK_V4;
+      if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_OFF_V4)
+        target_id += ":xnack-";
+      else if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_ON_V4)
+        target_id += ":xnack+";
+      break;
+    }
+
+    default: {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Strips the prefix 'consume_' from the front of 'input'.
+// eg: input = amdgcn-amd-amdhsa--gfx908 and consume_ is amdgcn-amd-amdhsa--
+// input will become gfx908.
+// Returns true when the prefix was present (and removed); otherwise returns false and
+// leaves 'input' unchanged.
+static bool consume(std::string& input, std::string consume_) {
+  const std::string::size_type prefix_len = consume_.size();
+  if (input.compare(0, prefix_len, consume_) == 0) {
+    input.erase(0, prefix_len);
+    return true;
+  }
+  return false;
+}
+
+// Splits 'input' at the first occurrence of 'trim' and returns the part in front of it.
+// The remainder, starting WITH the trim character, is left in 'input'.
+// example: input is gfx908:sramecc+ and trim char is ':'
+// the function returns gfx908 and input becomes :sramecc+.
+// When 'trim' does not occur, the whole string is returned and 'input' becomes empty.
+static std::string trimName(std::string& input, char trim) {
+  const std::string::size_type cut = input.find(trim);
+  if (cut == std::string::npos) {
+    // No delimiter: hand back everything and leave nothing behind.
+    std::string whole;
+    whole.swap(input);
+    return whole;
+  }
+  std::string head(input, 0, cut);
+  input.erase(0, cut);
+  return head;
+}
+
+// Reads the setting of one target feature from the front of 'input'.
+// When 'input' starts with 'feature' (e.g. ":xnack"), the feature name and the single
+// character after it are removed from 'input' and that character is returned.
+// Returns ' ' when 'input' does not start with 'feature' ('input' is left unchanged).
+// Returns '\0' when the feature name is the last thing in 'input', i.e. the setting
+// character is missing; callers treat anything other than ' ', '+' or '-' as invalid.
+static char getFeatureValue(std::string& input, std::string feature) {
+  char res = ' ';
+  if (consume(input, std::move(feature))) {
+    // Nothing left after the feature name: report an invalid setting instead of calling
+    // substr(1) on an empty string, which would throw std::out_of_range.
+    if (input.empty()) return '\0';
+    res = input[0];
+    input = input.substr(1);
+  }
+  return res;
+}
+
+// Parses "<processor>[:sramecc<+|->][:xnack<+|->]" from the front of 'input'.
+// 'processor' receives the text before the first ':'; each feature output receives '+',
+// '-' or ' ' (feature not present). The parsed parts are removed from 'input'.
+// Returns false as soon as a feature setting is something other than ' ', '+' or '-';
+// xnack_value is not written when the sramecc setting is already invalid.
+static bool getTargetIDValue(std::string& input, std::string& processor, char& sramecc_value,
+                             char& xnack_value) {
+  const auto isValidSetting = [](char setting) {
+    return setting == ' ' || setting == '+' || setting == '-';
+  };
+
+  processor = trimName(input, ':');
+
+  sramecc_value = getFeatureValue(input, std::string(":sramecc"));
+  if (!isValidSetting(sramecc_value)) return false;
+
+  xnack_value = getFeatureValue(input, std::string(":xnack"));
+  return isValidSetting(xnack_value);
+}
+
+// Determines the triple target id of one bundle entry.
+// 'bundled_co_entry_id' is "<offload kind>-<triple target id>". For the offload kinds
+// "hip" and "hcc" the id is derived from the code object's ELF header; for "hipv4" it is
+// taken from the entry id itself.
+// Returns false for an unknown offload kind, when the ELF-based lookup fails, or when a
+// "hipv4" entry id has nothing after the offload kind.
+static bool getTripleTargetID(std::string bundled_co_entry_id, const void* code_object,
+                              std::string& co_triple_target_id) {
+  std::string offload_kind = trimName(bundled_co_entry_id, '-');
+  if (offload_kind != OFFLOAD_KIND_HIPV4 && offload_kind != OFFLOAD_KIND_HIP &&
+      offload_kind != OFFLOAD_KIND_HCC)
+    return false;
+
+  if (offload_kind != OFFLOAD_KIND_HIPV4)
+    return getTripleTargetIDFromCodeObject(code_object, co_triple_target_id);
+
+  // trimName() leaves an empty string behind when the entry id contains no '-'; substr(1)
+  // on it would throw std::out_of_range, so reject such an id instead.
+  if (bundled_co_entry_id.empty()) return false;
+
+  // For code object V4 onwards the bundled code object entry ID correctly
+  // specifies the target triple.
+  // The remainder still starts with the '-' separator, which is skipped here.
+  co_triple_target_id = bundled_co_entry_id.substr(1);
+  return true;
+}
+
+// Decides whether a code object with the given triple target id can run on an agent with
+// the given triple target id.
+// Identical ids are compatible. Otherwise both ids must parse completely as
+// "amdgcn-amd-amdhsa--<processor>[:sramecc±][:xnack±]", the processors must match, and
+// every feature setting the code object specifies must equal the agent's setting; a
+// feature the code object leaves unspecified matches any agent setting.
+static bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
+                                             std::string agent_triple_target_id) {
+  // Primitive Check
+  if (co_triple_target_id == agent_triple_target_id) return true;
+
+  struct ParsedTarget {
+    std::string processor;
+    char sram_ecc;
+    char xnack;
+  };
+
+  const std::string triple_prefix = std::string(AMDGCN_TARGET_TRIPLE) + '-';
+
+  // Strips the triple prefix, extracts processor and feature settings, and requires that
+  // nothing is left over afterwards.
+  const auto parse = [&triple_prefix](std::string& id, ParsedTarget& out) {
+    return consume(id, triple_prefix) &&
+           getTargetIDValue(id, out.processor, out.sram_ecc, out.xnack) && id.empty();
+  };
+
+  // Parse code object triple target id
+  ParsedTarget co;
+  if (!parse(co_triple_target_id, co)) return false;
+
+  // Parse agent isa triple target id
+  ParsedTarget agent;
+  if (!parse(agent_triple_target_id, agent)) return false;
+
+  // Check for compatibility
+  if (agent.processor != co.processor) return false;
+  const bool sram_ecc_ok = (co.sram_ecc == ' ') || (co.sram_ecc == agent.sram_ecc);
+  const bool xnack_ok = (co.xnack == ' ') || (co.xnack == agent.xnack);
+  return sram_ecc_ok && xnack_ok;
+}
+
+// This will be moved to COMGR eventually
+hipError_t CodeObject::ExtractCodeObjectFromFile(
+    amd::Os::FileDesc fdesc, size_t fsize, const void** image,
+    const std::vector<std::string>& device_names,
+    std::vector<std::pair<const void*, size_t>>& code_objs) {
+  // A negative descriptor means the file was never opened.
+  if (fdesc < 0) {
+    return hipErrorFileNotFound;
+  }
+
+  // Map the whole file (offset 0) into memory; the mapping address is handed back through
+  // `image`. Per the original note, the file is unmapped in ModuleUnload.
+  const bool mapped = amd::Os::MemoryMapFileDesc(fdesc, fsize, 0, image);
+  if (!mapped) {
+    return hipErrorInvalidValue;
+  }
+
+  // Fill code_objs{binary_image, binary_size} for the requested devices.
+  return extractCodeObjectFromFatBinary(*image, device_names, code_objs);
+}
+
+// This will be moved to COMGR eventually
+hipError_t CodeObject::ExtractCodeObjectFromMemory(
+    const void* data, const std::vector<std::string>& device_names,
+    std::vector<std::pair<const void*, size_t>>& code_objs, std::string& uri) {
+  // Resolve the URI describing `data` first; without it the image is rejected.
+  const bool have_uri = amd::Os::GetURIFromMemory(data, 0, uri);
+  if (have_uri) {
+    return extractCodeObjectFromFatBinary(data, device_names, code_objs);
+  }
+  return hipErrorInvalidValue;
+}
+
+// This will be moved to COMGR eventually
+hipError_t CodeObject::extractCodeObjectFromFatBinary(
+    const void* data, const std::vector<std::string>& agent_triple_target_ids,
+    std::vector<std::pair<const void*, size_t>>& code_objs) {
+  // Selects, for every agent in agent_triple_target_ids, the first compatible code object
+  // of the clang offload bundle at `data`. One {image, size} pair per agent is appended to
+  // code_objs (nullptr/0 when nothing matched).
+  // Returns hipErrorInvalidKernelFile when the bundle magic is missing, hipSuccess when every
+  // agent got a code object, and hipErrorNoBinaryForGpu (after logging details) otherwise.
+  std::string magic((const char*)data, bundle_magic_string_size);
+  if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) {
+    return hipErrorInvalidKernelFile;
+  }
+
+  // Initialize Code objects
+  code_objs.reserve(agent_triple_target_ids.size());
+  for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
+    code_objs.push_back(std::make_pair(nullptr, 0));
+  }
+
+  const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
+  const auto* desc = &obheader->desc[0];
+  // Number of agents still without a code object.
+  // NOTE(review): this is code_objs.size(), so it (and the [dev] indexing below) assumes the
+  // caller passed an empty code_objs vector - confirm.
+  size_t num_code_objs = code_objs.size();
+  // Descriptors are variable length: the next one starts right after the current entry ID.
+  for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
+                desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
+                    reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
+                    desc->bundleEntryIdSize)) {
+    // desc->offset is relative to the start of the bundle header.
+    const void* image =
+        reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
+    const size_t image_size = desc->size;
+
+    // Every agent is satisfied; no need to look at the remaining entries.
+    if (num_code_objs == 0) break;
+    std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
+
+    std::string co_triple_target_id;
+    // Entries whose target ID cannot be determined are skipped.
+    if (!getTripleTargetID(bundleEntryId, image, co_triple_target_id)) continue;
+
+    for (size_t dev = 0; dev < agent_triple_target_ids.size(); ++dev) {
+      // First match wins: an agent that already has a code object keeps it.
+      if (code_objs[dev].first) continue;
+      if (isCodeObjectCompatibleWithDevice(co_triple_target_id, agent_triple_target_ids[dev])) {
+        code_objs[dev] = std::make_pair(image, image_size);
+        --num_code_objs;
+      }
+    }
+  }
+  if (num_code_objs == 0) {
+    return hipSuccess;
+  } else {
+    // Failure path: log which agents were matched and what the bundle contains.
+    LogPrintfError("%s",
+                   "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
+    LogPrintfError("%s", "  Devices:");
+    for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
+      LogPrintfError("    %s - [%s]", agent_triple_target_ids[i].c_str(),
+                     ((code_objs[i].first) ? "Found" : "Not Found"));
+    }
+    const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
+    const auto* desc = &obheader->desc[0];
+    LogPrintfError("%s", "  Bundled Code Objects:");
+    // Same descriptor walk as above, this time only for reporting.
+    for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
+                  desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
+                      reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
+                      desc->bundleEntryIdSize)) {
+      std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
+      const void* image =
+          reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
+
+      std::string co_triple_target_id;
+      bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id);
+
+      if (valid_co) {
+        LogPrintfError("    %s - [code object targetID is %s]", bundleEntryId.c_str(),
+                       co_triple_target_id.c_str());
+      } else {
+        LogPrintfError("    %s - [Unsupported]", bundleEntryId.c_str());
+      }
+    }
+
+    LogPrintfError("hipErrorNoBinaryForGpu: Unable to find code object for all current devices! - %d",hipErrorNoBinaryForGpu);
+    return hipErrorNoBinaryForGpu;
+  }
+}
+
+// Loads a dynamic code object from `fname` / `image` for the current device, builds its
+// program immediately and registers its global variables and functions.
+// Returns the first failing step's error, hipSuccess otherwise.
+hipError_t DynCO::loadCodeObject(const char* fname, const void* image) {
+  amd::ScopedLock lock(dclock_);
+
+  fb_info_ = new FatBinaryInfo(fname, image);
+
+  // A dynamic code object only ever targets one device: the current one.
+  std::vector<hip::Device*> devices;
+  devices.push_back(g_devices[ihipGetDevice()]);
+  IHIP_RETURN_ONFAIL(fb_info_->ExtractFatBinary(devices));
+
+  // DynCO does not load lazily, so the program is built right away.
+  IHIP_RETURN_ONFAIL(fb_info_->BuildProgram(ihipGetDevice()));
+
+  // Register global variables, then global functions.
+  IHIP_RETURN_ONFAIL(populateDynGlobalVars());
+  IHIP_RETURN_ONFAIL(populateDynGlobalFuncs());
+
+  return hipSuccess;
+}
+
+// Dynamic Code Object
+DynCO::~DynCO() {
+  amd::ScopedLock lock(dclock_);
+
+  // Managed variables own a managed allocation that is freed before the Var itself.
+  for (auto& entry : vars_) {
+    auto* var = entry.second;
+    if (var->getVarKind() == Var::DVK_Managed) {
+      hipError_t err = ihipFree(var->getManagedVarPtr());
+      assert(err == hipSuccess);
+    }
+    delete var;
+  }
+  vars_.clear();
+
+  for (auto& entry : functions_) {
+    auto* function = entry.second;
+    delete function;
+  }
+  functions_.clear();
+
+  delete fb_info_;
+}
+
+// Looks up `var_name` in this module and returns its device variable through `dvar`.
+// Returns hipErrorNotFound when the module has no variable of that name.
+hipError_t DynCO::getDeviceVar(DeviceVar** dvar, std::string var_name) {
+  amd::ScopedLock lock(dclock_);
+
+  CheckDeviceIdMatch();
+
+  const auto found = vars_.find(var_name);
+  if (found != vars_.end()) {
+    return found->second->getDeviceVar(dvar, device_id_, module());
+  }
+
+  LogPrintfError("Cannot find the Var: %s ", var_name.c_str());
+  return hipErrorNotFound;
+}
+
+// Looks up `func_name` in this module and returns its handle through `hfunc`.
+// Returns hipErrorInvalidValue for a null `hfunc` and hipErrorNotFound for an unknown name.
+hipError_t DynCO::getDynFunc(hipFunction_t* hfunc, std::string func_name) {
+  amd::ScopedLock lock(dclock_);
+
+  CheckDeviceIdMatch();
+
+  if (hfunc == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  const auto found = functions_.find(func_name);
+  if (found != functions_.end()) {
+    /* See if this could be solved */
+    return found->second->getDynFunc(hfunc, module());
+  }
+
+  LogPrintfError("Cannot find the function: %s ", func_name.c_str());
+  return hipErrorNotFound;
+}
+
+// Sets up one managed variable of a dynamically loaded module:
+//   1. reads the size of the "<managedVar>.managed" symbol,
+//   2. allocates managed memory of that size and records it on the Var entry,
+//   3. copies the symbol's initial value into the managed memory,
+//   4. writes the managed pointer into the "<managedVar>" device symbol.
+// Returns hipErrorNotFound when either symbol is unknown to this module,
+// hipErrorInvalidResourceHandle when no null stream is available, or the error of the
+// failing step.
+hipError_t DynCO::initDynManagedVars(const std::string& managedVar) {
+  amd::ScopedLock lock(dclock_);
+  DeviceVar* dvar = nullptr;
+  void* pointer = nullptr;
+  hipError_t status = hipSuccess;
+  // To get size of the managed variable
+  status = getDeviceVar(&dvar, managedVar + ".managed");
+  if (status != hipSuccess) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to get .managed device variable:%s",
+            status, managedVar.c_str());
+    return status;
+  }
+  // Resolve the Var entry before allocating anything: the entry only exists when the code
+  // object also defines the plain "<managedVar>" symbol, and dereferencing end() is
+  // undefined behavior.
+  auto it = vars_.find(managedVar);
+  if (it == vars_.end()) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to find managed variable:%s",
+            hipErrorNotFound, managedVar.c_str());
+    return hipErrorNotFound;
+  }
+  // Allocate managed memory for these symbols
+  status = ihipMallocManaged(&pointer, dvar->size());
+  if (status != hipSuccess) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to allocate managed memory", status);
+    guarantee(false, "Error during allocation of managed memory!");
+    // Only reached if guarantee() does not terminate; never continue with a null pointer.
+    return status;
+  }
+  // update as managed variable and set managed memory pointer and size
+  it->second->setManagedVarInfo(pointer, dvar->size());
+
+  // copy initial value of the managed variable to the managed memory allocated
+  hip::Stream* stream = hip::getNullStream();
+  if (stream == nullptr) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
+    return hipErrorInvalidResourceHandle;
+  }
+  status = ihipMemcpy(pointer, reinterpret_cast<address>(dvar->device_ptr()), dvar->size(),
+                      hipMemcpyDeviceToDevice, *stream);
+  if (status != hipSuccess) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status,
+            managedVar.c_str());
+    return status;
+  }
+
+  // Get device ptr to initialize with managed memory pointer
+  status = getDeviceVar(&dvar, managedVar);
+  if (status != hipSuccess) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to get managed device variable:%s",
+            status, managedVar.c_str());
+    return status;
+  }
+  // copy managed memory pointer to the managed device variable
+  status = ihipMemcpy(reinterpret_cast<address>(dvar->device_ptr()), &pointer, dvar->size(),
+                      hipMemcpyHostToDevice, *stream);
+  if (status != hipSuccess) {
+    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status,
+            managedVar.c_str());
+    return status;
+  }
+  return status;
+}
+
+// Registers every global variable found in the module's code object and initializes the
+// managed ones (symbols whose name ends in ".managed").
+// Returns hipErrorSharedObjectSymbolNotFound when the variable list cannot be read, the
+// first managed-variable initialization error if any, hipSuccess otherwise.
+hipError_t DynCO::populateDynGlobalVars() {
+  amd::ScopedLock lock(dclock_);
+  hipError_t err = hipSuccess;
+  std::vector<std::string> var_names;
+  const std::string managedVarExt = ".managed";
+  // For Dynamic Modules there is only one hipFatBinaryDevInfo_
+  device::Program* dev_program = fb_info_->GetProgram(ihipGetDevice())
+                                     ->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  // module() is a handle (pointer), so it is printed with %p rather than %x.
+  if (dev_program == nullptr || !dev_program->getGlobalVarFromCodeObj(&var_names)) {
+    LogPrintfError("Could not get Global vars from Code Obj for Module: %p \n",
+                   static_cast<void*>(module()));
+    return hipErrorSharedObjectSymbolNotFound;
+  }
+
+  for (auto& elem : var_names) {
+    vars_.insert(
+        std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
+  }
+
+  for (auto& elem : var_names) {
+    // Only a trailing ".managed" marks a managed variable. Matching it anywhere in the name
+    // and then erasing the last characters would mangle names that merely contain it.
+    const bool has_managed_suffix =
+        elem.length() >= managedVarExt.length() &&
+        elem.compare(elem.length() - managedVarExt.length(), managedVarExt.length(),
+                     managedVarExt) == 0;
+    if (!has_managed_suffix) {
+      continue;
+    }
+    const std::string managedVar = elem.substr(0, elem.length() - managedVarExt.length());
+    const hipError_t init_err = initDynManagedVars(managedVar);
+    // Keep processing the remaining variables, but report the first failure instead of
+    // letting a later success overwrite it.
+    if (init_err != hipSuccess && err == hipSuccess) {
+      err = init_err;
+    }
+  }
+  return err;
+}
+
+// Registers every global function found in the module's code object.
+// Returns hipErrorSharedObjectSymbolNotFound when the function list cannot be read,
+// hipSuccess otherwise.
+hipError_t DynCO::populateDynGlobalFuncs() {
+  amd::ScopedLock lock(dclock_);
+
+  std::vector<std::string> func_names;
+  device::Program* dev_program = fb_info_->GetProgram(ihipGetDevice())
+                                     ->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
+
+  // Get all the global func names from COMGR.
+  // module() is a handle (pointer), so it is printed with %p rather than %x.
+  if (dev_program == nullptr || !dev_program->getGlobalFuncFromCodeObj(&func_names)) {
+    LogPrintfError("Could not get Global Funcs from Code Obj for Module: %p \n",
+                   static_cast<void*>(module()));
+    return hipErrorSharedObjectSymbolNotFound;
+  }
+
+  for (auto& elem : func_names) {
+    functions_.insert(std::make_pair(elem, new Function(elem)));
+  }
+
+  return hipSuccess;
+}
+
+// Static Code Object
+// Nothing to set up beyond the members' own default initialization.
+StatCO::StatCO() = default;
+
+// Deletes every registered Function and Var object.
+// NOTE(review): entries of managedVars_ and modules_ are not released here - confirm that
+// they are owned/cleaned up elsewhere (e.g. removeFatBinary).
+StatCO::~StatCO() {
+  amd::ScopedLock lock(sclock_);
+
+  for (auto& entry : functions_) {
+    auto* function = entry.second;
+    delete function;
+  }
+  functions_.clear();
+
+  for (auto& entry : vars_) {
+    auto* var = entry.second;
+    delete var;
+  }
+  vars_.clear();
+}
+
+// Creates the FatBinaryInfo for `data` and extracts it for all devices, unless `programs`
+// is already set, in which case nothing is done.
+// NOTE(review): when extraction fails `programs` stays non-null, so a later call reports
+// hipSuccess without retrying - confirm this is intended.
+hipError_t StatCO::digestFatBinary(const void* data, FatBinaryInfo*& programs) {
+  amd::ScopedLock lock(sclock_);
+
+  if (programs == nullptr) {
+    // Create a new fat binary object and extract the fat binary for all devices.
+    programs = new FatBinaryInfo(nullptr, data);
+    IHIP_RETURN_ONFAIL(programs->ExtractFatBinary(g_devices));
+  }
+
+  return hipSuccess;
+}
+
+// Returns the address of the modules_ slot for `data`, creating the slot if needed.
+// When `initialized` is true the fat binary is digested right away.
+FatBinaryInfo** StatCO::addFatBinary(const void* data, bool initialized) {
+  amd::ScopedLock lock(sclock_);
+
+  FatBinaryInfo*& slot = modules_[data];
+  if (initialized) {
+    hipError_t err = digestFatBinary(data, slot);
+    assert(err == hipSuccess);
+  }
+  return &slot;
+}
+
+// Removes everything registered against `module`: its variables, the device copies of its
+// managed variables, its functions and finally the modules_ entry itself.
+// Returns early with the failing error if a managed variable's device copy cannot be
+// resolved; hipSuccess otherwise.
+hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
+  amd::ScopedLock lock(sclock_);
+
+  for (auto var_it = vars_.begin(); var_it != vars_.end();) {
+    if (var_it->second->moduleInfo() != module) {
+      ++var_it;
+      continue;
+    }
+    delete var_it->second;
+    var_it = vars_.erase(var_it);
+  }
+
+  for (auto managed_it = managedVars_.begin(); managed_it != managedVars_.end();) {
+    if ((*managed_it)->moduleInfo() != module) {
+      ++managed_it;
+      continue;
+    }
+    for (auto dev : g_devices) {
+      DeviceVar* dvar = nullptr;
+      IHIP_RETURN_ONFAIL((*managed_it)->getStatDeviceVar(&dvar, dev->deviceId()));
+      // free also deletes the device ptr
+      hipError_t err = ihipFree(dvar->device_ptr());
+      assert(err == hipSuccess);
+    }
+    managed_it = managedVars_.erase(managed_it);
+  }
+
+  for (auto func_it = functions_.begin(); func_it != functions_.end();) {
+    if (func_it->second->moduleInfo() != module) {
+      ++func_it;
+      continue;
+    }
+    delete func_it->second;
+    func_it = functions_.erase(func_it);
+  }
+
+  // `module` is the address of a modules_ value slot, so slots are compared by address.
+  for (auto mod_it = modules_.begin(); mod_it != modules_.end();) {
+    if (&mod_it->second != module) {
+      ++mod_it;
+      continue;
+    }
+    delete mod_it->second;
+    mod_it = modules_.erase(mod_it);
+  }
+
+  return hipSuccess;
+}
+
+// Registers `func` under the host-side function pointer `hostFunction`.
+// An already registered pointer keeps its existing entry; the duplicate is only logged.
+// Always returns hipSuccess.
+hipError_t StatCO::registerStatFunction(const void* hostFunction, Function* func) {
+  amd::ScopedLock lock(sclock_);
+
+  // insert() leaves an existing entry untouched and reports that through `.second`, so one
+  // lookup is enough. The pointer is printed with %p; %x expects an unsigned int.
+  const bool inserted = functions_.insert(std::make_pair(hostFunction, func)).second;
+  if (!inserted) {
+    DevLogPrintfError("hostFunctionPtr: %p already exists", hostFunction);
+  }
+
+  return hipSuccess;
+}
+
+// Returns the name of the function registered for `hostFunction`, or nullptr when the
+// pointer is unknown.
+const char* StatCO::getStatFuncName(const void* hostFunction) {
+  amd::ScopedLock lock(sclock_);
+
+  const auto found = functions_.find(hostFunction);
+  if (found != functions_.end()) {
+    return found->second->name().c_str();
+  }
+  return nullptr;
+}
+
+// Resolves the function registered for `hostFunction` on device `deviceId`.
+// Returns hipErrorInvalidSymbol when the host pointer was never registered.
+hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId) {
+  amd::ScopedLock lock(sclock_);
+
+  const auto found = functions_.find(hostFunction);
+  if (found != functions_.end()) {
+    return found->second->getStatFunc(hfunc, deviceId);
+  }
+
+  return hipErrorInvalidSymbol;
+}
+
+// Fetches the attributes of the function registered for `hostFunction` on `deviceId`.
+// Returns hipErrorInvalidSymbol when the host pointer was never registered.
+hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction,
+                                   int deviceId) {
+  amd::ScopedLock lock(sclock_);
+
+  const auto found = functions_.find(hostFunction);
+  if (found != functions_.end()) {
+    return found->second->getStatFuncAttr(func_attr, deviceId);
+  }
+
+  return hipErrorInvalidSymbol;
+}
+
+// Registers `var` under the host-side variable pointer `hostVar`.
+// Returns hipErrorInvalidSymbol when `hostVar` is already registered (the existing entry is
+// kept), hipSuccess otherwise.
+hipError_t StatCO::registerStatGlobalVar(const void* hostVar, Var* var) {
+  amd::ScopedLock lock(sclock_);
+
+  // insert() refuses to replace an existing key and reports that through `.second`.
+  const bool inserted = vars_.insert(std::make_pair(hostVar, var)).second;
+  if (inserted) {
+    return hipSuccess;
+  }
+  return hipErrorInvalidSymbol;
+}
+
+// Returns the device pointer and size of the variable registered for `hostVar` on device
+// `deviceId`. Returns hipErrorInvalidSymbol when `hostVar` is unknown, or the error from
+// resolving the per-device variable.
+hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+                                    size_t* size_ptr) {
+  amd::ScopedLock lock(sclock_);
+
+  const auto found = vars_.find(hostVar);
+  if (found == vars_.end()) {
+    return hipErrorInvalidSymbol;
+  }
+
+  DeviceVar* device_var = nullptr;
+  IHIP_RETURN_ONFAIL(found->second->getStatDeviceVar(&device_var, deviceId));
+
+  *size_ptr = device_var->size();
+  *dev_ptr = device_var->device_ptr();
+  return hipSuccess;
+}
+
+// Appends `var` to the list of managed variables. Always returns hipSuccess.
+hipError_t StatCO::registerStatManagedVar(Var* var) {
+  // Every other StatCO member that touches managedVars_ (removeFatBinary,
+  // initStatManagedVarDevicePtr) holds sclock_; take it here as well so the vector is
+  // never modified while another thread walks it.
+  amd::ScopedLock lock(sclock_);
+
+  managedVars_.emplace_back(var);
+  return hipSuccess;
+}
+
+// Copies, once per device, the managed pointer of every registered managed variable into
+// that variable's device symbol on `deviceId`.
+// Returns hipSuccess when the device was already initialized or every copy succeeded,
+// hipErrorInvalidResourceHandle when the device has no null stream, or the error of the
+// first failing step. The device is only marked initialized after full success, so a
+// failed attempt is retried on the next call.
+hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
+  amd::ScopedLock lock(sclock_);
+
+  const auto state = managedVarsDevicePtrInitalized_.find(deviceId);
+  if (state != managedVarsDevicePtrInitalized_.end() && state->second) {
+    return hipSuccess;
+  }
+
+  for (auto var : managedVars_) {
+    DeviceVar* dvar = nullptr;
+    IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId));
+
+    hip::Stream* stream = g_devices.at(deviceId)->NullStream();
+    if (stream == nullptr) {
+      ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
+      return hipErrorInvalidResourceHandle;
+    }
+
+    // Stop at the first failed copy; otherwise a later successful copy would hide the
+    // error and the device would be flagged as initialized with stale symbols.
+    IHIP_RETURN_ONFAIL(ihipMemcpy(reinterpret_cast<address>(dvar->device_ptr()),
+                                  var->getManagedVarPtr(), dvar->size(),
+                                  hipMemcpyHostToDevice, *stream));
+  }
+
+  managedVarsDevicePtrInitalized_[deviceId] = true;
+  return hipSuccess;
+}
+};  // namespace hip
@@ -0,0 +1,168 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_CODE_OBJECT_HPP
+#define HIP_CODE_OBJECT_HPP
+
+#include "hip_global.hpp"
+
+#include <cstring>
+#include <unordered_map>
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include "hip_internal.hpp"
+#include "device/device.hpp"
+#include "platform/program.hpp"
+
+//Forward Declaration for friend usage
+class PlatformState;
+
+namespace hip {
+
+//Code Object base class
+class CodeObject {
+  // Common base of DynCO and StatCO. It only offers static helpers for locating code
+  // objects inside a fat binary; it can be constructed by derived classes only.
+ public:
+  virtual ~CodeObject() {}
+
+  // Functions to add_dev_prog and build
+  static hipError_t add_program(int deviceId, hipModule_t hmod, const void* binary_ptr,
+                                size_t binary_size);
+  static hipError_t build_module(hipModule_t hmod, const std::vector<amd::Device*>& devices);
+
+  // Given a file descriptor and file size, extracts the code objects for the corresponding
+  // devices. Returns code_objs{binary_ptr, binary_size}, which can be used to determine the
+  // file offset. The mapped file address is returned through image.
+  static hipError_t ExtractCodeObjectFromFile(amd::Os::FileDesc fdesc, size_t fsize,
+                    const void ** image, const std::vector<std::string>& device_names,
+                    std::vector<std::pair<const void*, size_t>>& code_objs);
+
+  // Given a pointer to memory, extracts the code objects for the corresponding devices.
+  // Returns code_objs{binary_ptr, binary_size} and the uniform resource indicator (uri).
+  static hipError_t ExtractCodeObjectFromMemory(const void* data,
+                    const std::vector<std::string>& device_names,
+                    std::vector<std::pair<const void*, size_t>>& code_objs,
+                    std::string& uri);
+
+  // NOTE(review): presumably the size in bytes of the ELF image at emi - confirm against
+  // the definition.
+  static uint64_t ElfSize(const void* emi);
+
+  // NOTE(review): presumably true when data starts with the clang offload bundle magic
+  // string - confirm against the definition.
+  static bool IsClangOffloadMagicBundle(const void* data);
+
+protected:
+  // Given a pointer to an image or mapped file, extracts the code objects
+  // for the corresponding devices.
+  static hipError_t extractCodeObjectFromFatBinary(const void*,
+                    const std::vector<std::string>&,
+                    std::vector<std::pair<const void*, size_t>>&);
+
+  CodeObject() {}
+private:
+  friend const std::vector<hipModule_t>& modules();
+};
+
+//Dynamic Code Object
+class DynCO : public CodeObject {
+  // Recursive-capable monitor guarding this object.
+  // NOTE(review): the `true` argument is assumed to enable recursion - confirm.
+  amd::Monitor dclock_{"Guards Dynamic Code object", true};
+
+public:
+  // Remembers the device that is current at construction time; see CheckDeviceIdMatch().
+  DynCO() : device_id_(ihipGetDevice()), fb_info_(nullptr) {}
+  virtual ~DynCO();
+
+  // Loads the code object and its data
+  hipError_t loadCodeObject(const char* fname, const void* image=nullptr);
+  // Module handle for the device that is current at call time. Requires fb_info_ to be set,
+  // i.e. loadCodeObject() must have been called.
+  hipModule_t module() const { return fb_info_->Module(ihipGetDevice()); };
+
+  // Gets GlobalVar/Functions from a dynamically loaded code object
+  hipError_t getDynFunc(hipFunction_t* hfunc, std::string func_name);
+  hipError_t getDeviceVar(DeviceVar** dvar, std::string var_name);
+
+  // Writes the managed pointer and size of `name` to the outputs when `name` is a known
+  // managed variable. Otherwise the outputs are left untouched. Always returns hipSuccess,
+  // so callers must pre-initialize *pointer to detect the "not managed" case.
+  hipError_t getManagedVarPointer(std::string name, void** pointer, size_t* size_ptr) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end() && it->second->getVarKind() == Var::DVK_Managed) {
+      *pointer = it->second->getManagedVarPtr();
+      *size_ptr = it->second->getSize();
+    }
+    return hipSuccess;
+  }
+  // Device ID Check to check if module is launched in the same device it was loaded.
+  inline void CheckDeviceIdMatch() const {
+    if (device_id_ != ihipGetDevice()) {
+      guarantee(false, "Device mismatch from where this module is loaded");
+    }
+  }
+
+private:
+  // Device that was current when this object was constructed.
+  int device_id_;
+  // Fat binary backing this module; created in loadCodeObject(), deleted in the destructor.
+  FatBinaryInfo* fb_info_;
+
+  // Maps for vars/funcs, keyed by the symbol name
+  std::unordered_map<std::string, Function*> functions_;
+  std::unordered_map<std::string, Var*> vars_;
+
+  // Populate Global Vars/Funcs from a code object (at module load)
+  hipError_t populateDynGlobalFuncs();
+  hipError_t populateDynGlobalVars();
+  hipError_t initDynManagedVars(const std::string& managedVar);
+};
+
+//Static Code Object
+class StatCO: public CodeObject {
+  // Monitor guarding the registration maps below.
+  // NOTE(review): the `true` argument is assumed to enable recursion - confirm.
+  amd::Monitor sclock_{"Guards Static Code object", true};
+public:
+  StatCO();
+  virtual ~StatCO();
+
+  // Add/Remove/Digest Fat Binaries passed to us from "__hipRegisterFatBinary"
+  FatBinaryInfo** addFatBinary(const void* data, bool initialized);
+  hipError_t removeFatBinary(FatBinaryInfo** module);
+  hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
+
+  // Register vars/funcs given to us from __hipRegister[Var/Func/ManagedVar]
+  hipError_t registerStatFunction(const void* hostFunction, Function* func);
+  hipError_t registerStatGlobalVar(const void* hostVar, Var* var);
+  hipError_t registerStatManagedVar(Var *var);
+
+  // Retrieve Vars/Funcs for a given hostSidePtr(const void*), unless stated otherwise.
+  const char* getStatFuncName(const void* hostFunction);
+  hipError_t getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId);
+  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
+  hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
+                              size_t* size_ptr);
+
+  // A managed variable is a defined symbol in the code object; the pointer to the
+  // allocated managed memory has to be copied to the address of that symbol.
+  hipError_t initStatManagedVarDevicePtr(int deviceId);
+private:
+  friend class ::PlatformState;
+  // Populated during __hipRegisterFatBinary
+  std::unordered_map<const void*, FatBinaryInfo*> modules_;
+  // Populated during __hipRegisterFuncs
+  std::unordered_map<const void*, Function*> functions_;
+  // Populated during __hipRegisterVars
+  std::unordered_map<const void*, Var*> vars_;
+  // Populated during __hipRegisterManagedVar
+  std::vector<Var*> managedVars_;
+  // Per device id: whether initStatManagedVarDevicePtr() already completed for that device.
+  std::unordered_map<int, bool> managedVarsDevicePtrInitalized_;
+};
+
+}; // namespace hip
+
+#endif /* HIP_CODE_OBJECT_HPP */
@@ -0,0 +1,402 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+#include "hip_internal.hpp"
+#include "hip_platform.hpp"
+#include "platform/runtime.hpp"
+#include "utils/flags.hpp"
+#include "utils/versions.hpp"
+
+// All HIP devices created by hip::init(); the position of a device in this vector is the
+// index it was constructed with.
+std::vector<hip::Device*> g_devices;
+
+namespace hip {
+// Per-thread runtime state (current device, context stack, ...).
+thread_local TlsAggregator tls;
+// Context spanning all GPU devices; assigned once in init().
+amd::Context* host_context = nullptr;
+
+// init() performs the one-time HIP runtime initialization and is only to be called from
+// the HIP_INIT macro. It initializes the AMD runtime, creates one context and one
+// hip::Device per usable GPU (appended to g_devices), creates the host context spanning
+// all GPUs and initializes PlatformState.
+// Returns false when the AMD runtime or a hip::Device cannot be initialized. A GPU whose
+// context cannot be created is skipped.
+bool init() {
+  amd::IS_HIP = true;
+  GPU_NUM_MEM_DEPENDENCY = 0;
+#if DISABLE_DIRECT_DISPATCH
+  constexpr bool kDirectDispatch = false;
+#else
+  constexpr bool kDirectDispatch = IS_LINUX;
+#endif
+  // Only apply the platform default when the user did not set the flag explicitly.
+  AMD_DIRECT_DISPATCH = flagIsDefault(AMD_DIRECT_DISPATCH) ? kDirectDispatch : AMD_DIRECT_DISPATCH;
+  if (!amd::Runtime::init()) {
+    return false;
+  }
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH);
+
+
+  const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
+
+  for (unsigned int i=0; i<devices.size(); i++) {
+    const std::vector<amd::Device*> device(1, devices[i]);
+    amd::Context* context = new amd::Context(device, amd::Context::Info());
+    if (!context) return false;
+
+    // Enable active wait on the device by default
+    devices[i]->SetActiveWait(true);
+
+    if (CL_SUCCESS != context->create(nullptr)) {
+      context->release();
+    } else {
+      auto hip_device = new Device(context, i);
+      if ((hip_device == nullptr) || !hip_device->Create()) {
+        return false;
+      }
+      g_devices.push_back(hip_device);
+    }
+  }
+
+  amd::Context* hContext = new amd::Context(devices, amd::Context::Info());
+  if (!hContext) return false;
+
+  if (CL_SUCCESS != hContext->create(nullptr)) {
+    // The reference was just given up, so the pointer must not be published:
+    // host_context would otherwise refer to a released context.
+    hContext->release();
+    hContext = nullptr;
+  }
+  host_context = hContext;
+
+  PlatformState::instance().init();
+  return true;
+}
+
+// Returns the calling thread's current device as stored in the thread-local state.
+// The result may be null; getNullStream() checks for that.
+Device* getCurrentDevice() {
+  return tls.device_;
+}
+
+// Makes g_devices[index] the calling thread's current device and passes that device's
+// preferred NUMA node on to the OS layer. `index` must be a valid device index; this is
+// only checked by an assert.
+void setCurrentDevice(unsigned int index) {
+  assert(index<g_devices.size());
+  Device* selected = g_devices[index];
+  tls.device_ = selected;
+
+  const uint32_t numa_node = selected->devices()[0]->getPreferredNumaNode();
+  amd::Os::setPreferredNumaNode(numa_node);
+}
+
+// Maps a public stream handle to the internal stream object. A null handle selects the
+// current device's null stream. For a stream without hipStreamNonBlocking,
+// iHipWaitActiveStreams is called first with WaitNullStreamOnly set.
+hip::Stream* getStream(hipStream_t stream) {
+  if (stream != nullptr) {
+    hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(stream);
+    const bool is_blocking = (hip_stream->Flags() & hipStreamNonBlocking) == 0;
+    if (is_blocking) {
+      constexpr bool WaitNullStreamOnly = true;
+      iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly);
+    }
+    return hip_stream;
+  }
+
+  return getNullStream();
+}
+
+// ================================================================================================
+// Returns the null stream of the device whose context is `ctx`. For the host context the
+// current device's null stream is used; any other context yields nullptr.
+hip::Stream* getNullStream(amd::Context& ctx) {
+  for (auto& device : g_devices) {
+    if (device->asContext() != &ctx) {
+      continue;
+    }
+    return device->NullStream();
+  }
+
+  // If it's a pure SVM allocation with system memory access, then it shouldn't matter which
+  // device the runtime selects by default, so fall back to the current one.
+  const bool is_host_context = (hip::host_context == &ctx);
+  return is_host_context ? getNullStream() : nullptr;
+}
+
+// ================================================================================================
+// Returns the id of the device whose context is `ctx`, or -1 when no device owns it.
+int getDeviceID(amd::Context& ctx) {
+  int device_id = -1;
+  for (auto& device : g_devices) {
+    if (device->asContext() == &ctx) {
+      device_id = device->deviceId();
+      break;
+    }
+  }
+  return device_id;
+}
+
+// ================================================================================================
+// Returns the null stream of the calling thread's current device, or nullptr when the
+// thread has no current device.
+hip::Stream* getNullStream() {
+  Device* current = getCurrentDevice();
+  if (current == nullptr) {
+    return nullptr;
+  }
+  return current->NullStream();
+}
+
+};
+
+using namespace hip;
+
+// Explicit runtime initialization entry point; the work itself is done by HIP_INIT_API.
+// `flags` must be 0, anything else yields hipErrorInvalidValue.
+hipError_t hipInit(unsigned int flags) {
+  HIP_INIT_API(hipInit, flags);
+
+  if (flags == 0) {
+    HIP_RETURN(hipSuccess);
+  }
+
+  HIP_RETURN(hipErrorInvalidValue);
+}
+
+// Returns the primary context of `device` through `ctx`, retains it and pushes it onto
+// the calling thread's context stack. `flags` is not used.
+// Returns hipErrorInvalidValue for a null `ctx` or an out-of-range `device`.
+hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device) {
+  HIP_INIT_API(hipCtxCreate, ctx, flags, device);
+
+  // The result is written through ctx, so a null output pointer must be rejected
+  // (hipDevicePrimaryCtxRetain treats its output pointer the same way).
+  if (ctx == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // The cast also turns negative ordinals into out-of-range values.
+  if (static_cast<size_t>(device) >= g_devices.size()) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *ctx = reinterpret_cast<hipCtx_t>(g_devices[device]);
+
+  // Increment ref count for device primary context
+  g_devices[device]->retain();
+  tls.ctxt_stack_.push(g_devices[device]);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Replaces the top of the calling thread's context stack with `ctx` and makes it the
+// current device. A null `ctx` only pops the current top, if there is one.
+hipError_t hipCtxSetCurrent(hipCtx_t ctx) {
+  HIP_INIT_API(hipCtxSetCurrent, ctx);
+
+  const bool has_new_ctx = (ctx != nullptr);
+  if (has_new_ctx) {
+    hip::tls.device_ = reinterpret_cast<hip::Device*>(ctx);
+  }
+
+  // In both cases the previous top entry, if any, is dropped.
+  if (!tls.ctxt_stack_.empty()) {
+    tls.ctxt_stack_.pop();
+  }
+
+  if (has_new_ctx) {
+    tls.ctxt_stack_.push(hip::getCurrentDevice());
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Returns the calling thread's current device as a context handle through `ctx`.
+// Returns hipErrorInvalidValue when `ctx` is null.
+hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {
+  HIP_INIT_API(hipCtxGetCurrent, ctx);
+
+  // The handle is written through ctx; reject a null output pointer instead of crashing.
+  if (ctx == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *ctx = reinterpret_cast<hipCtx_t>(hip::getCurrentDevice());
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Reports the shared memory bank configuration; this implementation always reports
+// hipSharedMemBankSizeFourByte.
+// Returns hipErrorInvalidValue when `pConfig` is null.
+hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {
+  HIP_INIT_API(hipCtxGetSharedMemConfig, pConfig);
+
+  // The result is written through pConfig; reject a null output pointer instead of
+  // crashing.
+  if (pConfig == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *pConfig = hipSharedMemBankSizeFourByte;
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Writes the HIP_VERSION macro value to `runtimeVersion`.
+// Returns hipErrorInvalidValue when `runtimeVersion` is null.
+hipError_t hipRuntimeGetVersion(int *runtimeVersion) {
+  HIP_INIT_API_NO_RETURN(hipRuntimeGetVersion, runtimeVersion);
+
+  if (runtimeVersion != nullptr) {
+    *runtimeVersion = HIP_VERSION;
+    HIP_RETURN(hipSuccess);
+  }
+
+  HIP_RETURN(hipErrorInvalidValue);
+}
+
+// Drops one reference on the device behind `ctx` and removes it from the top of the
+// calling thread's context stack if it is there.
+// Returns hipErrorInvalidValue for a null `ctx`.
+hipError_t hipCtxDestroy(hipCtx_t ctx) {
+  HIP_INIT_API(hipCtxDestroy, ctx);
+
+  hip::Device* dev = reinterpret_cast<hip::Device*>(ctx);
+  if (dev == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Only the calling thread's stack is touched, and only when ctx is its top entry.
+  const bool is_top = !tls.ctxt_stack_.empty() && tls.ctxt_stack_.top() == dev;
+  if (is_top) {
+    tls.ctxt_stack_.pop();
+  }
+
+  // Release the reference only for handles that refer to a known device.
+  for (hip::Device* known : g_devices) {
+    if (known == dev) {
+      // Decrement ref count for device primary context
+      dev->release();
+    }
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Pops the top entry of the calling thread's context stack and, when `ctx` is not null,
+// returns it through `ctx`.
+// Returns hipErrorInvalidContext when the stack is empty.
+hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {
+  HIP_INIT_API(hipCtxPopCurrent, ctx);
+
+  if (tls.ctxt_stack_.empty()) {
+    DevLogError("Context Stack empty \n");
+    HIP_RETURN(hipErrorInvalidContext);
+  }
+
+  if (ctx != nullptr) {
+    *ctx = reinterpret_cast<hipCtx_t>(tls.ctxt_stack_.top());
+  }
+  tls.ctxt_stack_.pop();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Makes `ctx` the calling thread's current device and pushes it onto the context stack.
+// Returns hipErrorInvalidContext for a null `ctx`.
+hipError_t hipCtxPushCurrent(hipCtx_t ctx) {
+  HIP_INIT_API(hipCtxPushCurrent, ctx);
+
+  if (ctx == nullptr) {
+    HIP_RETURN(hipErrorInvalidContext);
+  }
+
+  hip::tls.device_ = reinterpret_cast<hip::Device*>(ctx);
+  tls.ctxt_stack_.push(hip::getCurrentDevice());
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Writes the HIP_VERSION macro value to `driverVersion`.
+// Returns hipErrorInvalidValue when `driverVersion` is null.
+hipError_t hipDriverGetVersion(int* driverVersion) {
+  HIP_INIT_API_NO_RETURN(hipDriverGetVersion, driverVersion);
+
+  if (driverVersion != nullptr) {
+    *driverVersion = HIP_VERSION;
+    HIP_RETURN(hipSuccess);
+  }
+
+  HIP_RETURN(hipErrorInvalidValue);
+}
+
+// Writes the id of the calling thread's current device to `device`.
+// Returns hipErrorInvalidValue when `device` is null and hipErrorInvalidContext when the
+// thread has no current device.
+hipError_t hipCtxGetDevice(hipDevice_t* device) {
+  HIP_INIT_API(hipCtxGetDevice, device);
+
+  if (device == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // getCurrentDevice() may return null (getNullStream() guards against it too). This is
+  // the case the previously unreachable hipErrorInvalidContext return was meant for.
+  hip::Device* current = hip::getCurrentDevice();
+  if (current == nullptr) {
+    HIP_RETURN(hipErrorInvalidContext);
+  }
+
+  *device = current->deviceId();
+  HIP_RETURN(hipSuccess);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {
+  // Pass every parameter to the API entry macro, as all other entry points in this file do;
+  // ctx was missing from the list.
+  HIP_INIT_API(hipCtxGetApiVersion, ctx, apiVersion);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+// `cacheConfig` is never written.
+hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig) {
+  HIP_INIT_API(hipCtxGetCacheConfig, cacheConfig);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+// `cacheConfig` is ignored.
+hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig) {
+  HIP_INIT_API(hipCtxSetCacheConfig, cacheConfig);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+// `config` is ignored.
+hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {
+  HIP_INIT_API(hipCtxSetSharedMemConfig, config);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+hipError_t hipCtxSynchronize(void) {
+  // The function has no parameters; the literal 1 is only a placeholder argument for the
+  // API entry macro.
+  HIP_INIT_API(hipCtxSynchronize, 1);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Not implemented: asserts in debug builds and returns hipErrorNotSupported.
+// `flags` is never written.
+hipError_t hipCtxGetFlags(unsigned int* flags) {
+  HIP_INIT_API(hipCtxGetFlags, flags);
+
+  assert(0 && "Unimplemented");
+
+  HIP_RETURN(hipErrorNotSupported);
+}
+
+// Reports the state of the primary context of `dev`: `flags` (if given) is always set to
+// 0 and `active` (if given) to 1 or 0 according to the device's active status.
+// Returns hipErrorInvalidDevice for an out-of-range `dev`.
+hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active) {
+  HIP_INIT_API(hipDevicePrimaryCtxGetState, dev, flags, active);
+
+  const bool dev_in_range = static_cast<unsigned int>(dev) < g_devices.size();
+  if (!dev_in_range) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  // Both outputs are optional.
+  if (flags != nullptr) {
+    *flags = 0;
+  }
+  if (active != nullptr) {
+    if (g_devices[dev]->GetActiveStatus()) {
+      *active = 1;
+    } else {
+      *active = 0;
+    }
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Validates `dev` only; no reference is released here.
+// Returns hipErrorInvalidDevice for an out-of-range `dev`.
+hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {
+  HIP_INIT_API(hipDevicePrimaryCtxRelease, dev);
+
+  if (static_cast<unsigned int>(dev) < g_devices.size()) {
+    HIP_RETURN(hipSuccess);
+  }
+
+  HIP_RETURN(hipErrorInvalidDevice);
+}
+
+// Returns the primary context handle of `dev` through `pctx`; no reference is taken here.
+// Returns hipErrorInvalidDevice for an out-of-range `dev` and hipErrorInvalidValue for a
+// null `pctx` (the device check comes first).
+hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {
+  HIP_INIT_API(hipDevicePrimaryCtxRetain, pctx, dev);
+
+  const bool dev_in_range = static_cast<unsigned int>(dev) < g_devices.size();
+  if (!dev_in_range) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  if (pctx != nullptr) {
+    *pctx = reinterpret_cast<hipCtx_t>(g_devices[dev]);
+    HIP_RETURN(hipSuccess);
+  }
+
+  HIP_RETURN(hipErrorInvalidValue);
+}
+
+// Does nothing beyond the API entry bookkeeping and returns hipSuccess; `dev` is not
+// validated and no state is reset here.
+hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {
+  HIP_INIT_API(hipDevicePrimaryCtxReset, dev);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Never changes any flags: returns hipErrorInvalidDevice for an out-of-range `dev` and
+// hipErrorContextAlreadyInUse for every valid one.
+hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {
+  HIP_INIT_API(hipDevicePrimaryCtxSetFlags, dev, flags);
+
+  const bool dev_in_range = static_cast<unsigned int>(dev) < g_devices.size();
+  if (dev_in_range) {
+    HIP_RETURN(hipErrorContextAlreadyInUse);
+  }
+
+  HIP_RETURN(hipErrorInvalidDevice);
+}
@@ -0,0 +1,944 @@
+/*
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/driver_types.h>
+#include <hip/texture_types.h>
+
+namespace hip
+{
+// Translate a HIP array format and texture read mode into a CL channel data type.
+// 8- and 16-bit integer formats become UNORM/SNORM types when read as normalized float;
+// 32-bit integer, half and float formats map to the same CL type in both read modes.
+// Any combination not listed yields a value-initialized cl_channel_type.
+inline
+cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
+                                 const hipTextureReadMode hipReadMode) {
+  const bool asElement = (hipReadMode == hipReadModeElementType);
+  const bool asNormalized = (hipReadMode == hipReadModeNormalizedFloat);
+  if (!asElement && !asNormalized) {
+    //error scenario
+    return {};
+  }
+
+  switch (hipFormat) {
+    case HIP_AD_FORMAT_UNSIGNED_INT8:
+      return asElement ? CL_UNSIGNED_INT8 : CL_UNORM_INT8;
+    case HIP_AD_FORMAT_SIGNED_INT8:
+      return asElement ? CL_SIGNED_INT8 : CL_SNORM_INT8;
+    case HIP_AD_FORMAT_UNSIGNED_INT16:
+      return asElement ? CL_UNSIGNED_INT16 : CL_UNORM_INT16;
+    case HIP_AD_FORMAT_SIGNED_INT16:
+      return asElement ? CL_SIGNED_INT16 : CL_SNORM_INT16;
+    case HIP_AD_FORMAT_UNSIGNED_INT32:
+      return CL_UNSIGNED_INT32;
+    case HIP_AD_FORMAT_SIGNED_INT32:
+      return CL_SIGNED_INT32;
+    case HIP_AD_FORMAT_HALF:
+      return CL_HALF_FLOAT;
+    case HIP_AD_FORMAT_FLOAT:
+      return CL_FLOAT;
+    default:
+      break;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Pick the CL channel order for a 1-, 2- or 4-channel image. Only the 4-channel case
+// honours sRGB (sRGB == 1 selects CL_sRGBA). Other counts yield a value-initialized result.
+inline
+cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
+                                   const int sRGB) {
+  if (hipNumChannels == 1) {
+    return CL_R;
+  }
+  if (hipNumChannels == 2) {
+    return CL_RG;
+  }
+  if (hipNumChannels == 4) {
+    if (sRGB == 1) {
+      return CL_sRGBA;
+    }
+    return CL_RGBA;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Derive the CL image object type from the array extents and creation flags.
+// A zero extent means "dimension not used". Only flags equal to hipArrayDefault or
+// hipArrayLayered are recognised; every unsupported combination returns
+// CL_MEM_OBJECT_ALLOCATION_FAILURE.
+inline
+cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
+                                      const unsigned int hipHeight,
+                                      const unsigned int hipDepth,
+                                      const unsigned int flags) {
+  const bool hasHeight = (hipHeight != 0);
+  const bool hasDepth = (hipDepth != 0);
+
+  // Every supported shape needs a non-zero width.
+  if (hipWidth != 0) {
+    if (flags == hipArrayDefault) {
+      if (!hasDepth) {
+        return hasHeight ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D;
+      }
+      if (hasHeight) {
+        return CL_MEM_OBJECT_IMAGE3D;
+      }
+    } else if (flags == hipArrayLayered) {
+      // For layered arrays a non-zero depth is required.
+      if (hasDepth) {
+        return hasHeight ? CL_MEM_OBJECT_IMAGE2D_ARRAY : CL_MEM_OBJECT_IMAGE1D_ARRAY;
+      }
+    }
+  }
+
+  // error scenario. ShouldNotReachHere()
+  return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+}
+
+// Map a HIP texture address mode onto the CL sampler addressing mode.
+// Values outside the four known modes yield a value-initialized result.
+inline
+cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
+  if (hipAddressMode == hipAddressModeWrap) {
+    return CL_ADDRESS_REPEAT;
+  }
+  if (hipAddressMode == hipAddressModeClamp) {
+    return CL_ADDRESS_CLAMP_TO_EDGE;
+  }
+  if (hipAddressMode == hipAddressModeMirror) {
+    return CL_ADDRESS_MIRRORED_REPEAT;
+  }
+  if (hipAddressMode == hipAddressModeBorder) {
+    return CL_ADDRESS_CLAMP;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Map a HIP texture filter mode onto the CL sampler filter mode.
+// Unknown values yield a value-initialized result.
+inline
+cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
+  if (hipFilterMode == hipFilterModePoint) {
+    return CL_FILTER_NEAREST;
+  }
+  if (hipFilterMode == hipFilterModeLinear) {
+    return CL_FILTER_LINEAR;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Map a HIP resource type onto a CL image object type. Only linear and pitch2D
+// resources are handled here; any other type yields a value-initialized result.
+inline
+cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
+  if (hipResType == hipResourceTypeLinear) {
+    return CL_MEM_OBJECT_IMAGE1D_BUFFER;
+  }
+  if (hipResType == hipResourceTypePitch2D) {
+    return CL_MEM_OBJECT_IMAGE2D;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Map a CL channel data type back to the HIP array format with the same storage layout.
+// Normalized CL types (UNORM/SNORM) map to the integer HIP format of the same width and
+// signedness, so every type produced by getCLChannelType() converts back to the format
+// it came from. Unknown types keep the historical fallback of HIP_AD_FORMAT_UNSIGNED_INT8.
+inline
+hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
+  switch (type) {
+    case CL_SNORM_INT8:
+    case CL_SIGNED_INT8:
+      return HIP_AD_FORMAT_SIGNED_INT8;
+
+    case CL_UNORM_INT16:
+    case CL_UNSIGNED_INT16:
+      return HIP_AD_FORMAT_UNSIGNED_INT16;
+
+    case CL_SNORM_INT16:
+    case CL_SIGNED_INT16:
+      return HIP_AD_FORMAT_SIGNED_INT16;
+
+    case CL_SIGNED_INT32:
+      return HIP_AD_FORMAT_SIGNED_INT32;
+
+    case CL_UNSIGNED_INT32:
+      return HIP_AD_FORMAT_UNSIGNED_INT32;
+
+    case CL_HALF_FLOAT:
+      return HIP_AD_FORMAT_HALF;
+
+    case CL_FLOAT:
+      return HIP_AD_FORMAT_FLOAT;
+
+    case CL_UNSIGNED_INT8:
+    case CL_UNORM_INT8:
+    case CL_UNORM_INT_101010:
+    default:
+      return HIP_AD_FORMAT_UNSIGNED_INT8;
+  }
+}
+// Size in bytes of one element of `array`: bytes per channel (from array->Format)
+// multiplied by array->NumChannels. An unrecognised format gives 0.
+// `array` is dereferenced without a null check.
+inline
+size_t getElementSize(const hipArray_const_t array) {
+  int bytesPerChannel = 0;
+
+  switch (array->Format) {
+    case HIP_AD_FORMAT_UNSIGNED_INT8:
+    case HIP_AD_FORMAT_SIGNED_INT8:
+      bytesPerChannel = 1;
+      break;
+    case HIP_AD_FORMAT_UNSIGNED_INT16:
+    case HIP_AD_FORMAT_SIGNED_INT16:
+    case HIP_AD_FORMAT_HALF:
+      bytesPerChannel = 2;
+      break;
+    case HIP_AD_FORMAT_UNSIGNED_INT32:
+    case HIP_AD_FORMAT_SIGNED_INT32:
+    case HIP_AD_FORMAT_FLOAT:
+      bytesPerChannel = 4;
+      break;
+    default:
+      //error scenario
+      return {};
+  }
+
+  return bytesPerChannel * array->NumChannels;
+}
+
+// Build a hipChannelFormatDesc for `numChannels` channels of `arrayFormat`.
+// The format determines the per-channel bit width and the channel kind; the channel
+// count (1, 2 or 4) determines how many of x/y/z/w carry that width.
+// Any other format or channel count yields a value-initialized descriptor.
+//
+// The previous implementation nested one switch per format without `break`, so an
+// unsupported channel count fell through every following format case. The result was
+// only correct because the same count was re-tested each time; the two decisions are
+// now made independently.
+inline
+hipChannelFormatDesc getChannelFormatDesc(int numChannels,
+                                          hipArray_Format arrayFormat) {
+  int bits = 0;
+  hipChannelFormatKind kind = {};
+
+  switch (arrayFormat) {
+    case HIP_AD_FORMAT_UNSIGNED_INT8:
+      bits = 8;
+      kind = hipChannelFormatKindUnsigned;
+      break;
+    case HIP_AD_FORMAT_SIGNED_INT8:
+      bits = 8;
+      kind = hipChannelFormatKindSigned;
+      break;
+    case HIP_AD_FORMAT_UNSIGNED_INT16:
+      bits = 16;
+      kind = hipChannelFormatKindUnsigned;
+      break;
+    case HIP_AD_FORMAT_SIGNED_INT16:
+      bits = 16;
+      kind = hipChannelFormatKindSigned;
+      break;
+    case HIP_AD_FORMAT_UNSIGNED_INT32:
+      bits = 32;
+      kind = hipChannelFormatKindUnsigned;
+      break;
+    case HIP_AD_FORMAT_SIGNED_INT32:
+      bits = 32;
+      kind = hipChannelFormatKindSigned;
+      break;
+    case HIP_AD_FORMAT_HALF:
+      bits = 16;
+      kind = hipChannelFormatKindFloat;
+      break;
+    case HIP_AD_FORMAT_FLOAT:
+      bits = 32;
+      kind = hipChannelFormatKindFloat;
+      break;
+    default:
+      //error scenario
+      return {};
+  }
+
+  switch (numChannels) {
+    case 1:
+      return {bits, 0, 0, 0, kind};
+    case 2:
+      return {bits, bits, 0, 0, kind};
+    case 4:
+      return {bits, bits, bits, bits, kind};
+    default:
+      break;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Count how many of the x/y/z/w components of `desc` are non-zero.
+inline
+unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
+  unsigned int populated = 0;
+  if (desc.x != 0) {
+    ++populated;
+  }
+  if (desc.y != 0) {
+    ++populated;
+  }
+  if (desc.z != 0) {
+    ++populated;
+  }
+  if (desc.w != 0) {
+    ++populated;
+  }
+  return populated;
+}
+
+// Validate the per-channel bit widths of `desc`. The descriptor is accepted when
+//   - x is non-zero,
+//   - every non-zero y/z/w equals x, and
+//   - no positive channel follows a zero channel (checked for y and then z).
+inline
+bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
+  if (desc.x == 0) {
+    return false;
+  }
+
+  // Every populated trailing channel has to use the same width as x.
+  const bool yMismatch = (desc.y != 0) && (desc.y != desc.x);
+  const bool zMismatch = (desc.z != 0) && (desc.z != desc.x);
+  const bool wMismatch = (desc.w != 0) && (desc.w != desc.x);
+  if (yMismatch || zMismatch || wMismatch) {
+    return false;
+  }
+
+  // The bit channel description should not allow any channels after a zero channel
+  if (desc.y == 0) {
+    return (desc.z <= 0) && (desc.w <= 0);
+  }
+  if (desc.z == 0) {
+    return desc.w <= 0;
+  }
+
+  return true;
+}
+
+// Derive the HIP array format from a channel descriptor, using the channel kind
+// (desc.f) and the bit width of the first channel (desc.x).
+// Unsupported kind/width pairs yield a value-initialized hipArray_Format.
+//
+// Each kind now ends with an explicit `break`. Previously an unsupported width fell
+// through into the following kinds and only reached the error return by accident.
+inline
+hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
+  switch (desc.f) {
+    case hipChannelFormatKindUnsigned:
+      switch (desc.x) {
+        case 8:
+          return HIP_AD_FORMAT_UNSIGNED_INT8;
+        case 16:
+          return HIP_AD_FORMAT_UNSIGNED_INT16;
+        case 32:
+          return HIP_AD_FORMAT_UNSIGNED_INT32;
+        default:
+          break;
+      }
+      break;
+    case hipChannelFormatKindSigned:
+      switch (desc.x) {
+        case 8:
+          return HIP_AD_FORMAT_SIGNED_INT8;
+        case 16:
+          return HIP_AD_FORMAT_SIGNED_INT16;
+        case 32:
+          return HIP_AD_FORMAT_SIGNED_INT32;
+        default:
+          break;
+      }
+      break;
+    case hipChannelFormatKindFloat:
+      switch (desc.x) {
+        case 16:
+          return HIP_AD_FORMAT_HALF;
+        case 32:
+          return HIP_AD_FORMAT_FLOAT;
+        default:
+          break;
+      }
+      break;
+    default:
+      break;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Number of channels (1, 2 or 4) encoded in a resource view format.
+// Formats not listed below report 0.
+inline
+int getNumChannels(const hipResourceViewFormat hipFormat) {
+  int channelCount = 0;
+
+  switch (hipFormat) {
+    // Four-component formats.
+    case hipResViewFormatUnsignedChar4:
+    case hipResViewFormatSignedChar4:
+    case hipResViewFormatUnsignedShort4:
+    case hipResViewFormatSignedShort4:
+    case hipResViewFormatUnsignedInt4:
+    case hipResViewFormatSignedInt4:
+    case hipResViewFormatHalf4:
+    case hipResViewFormatFloat4:
+      channelCount = 4;
+      break;
+    // Two-component formats.
+    case hipResViewFormatUnsignedChar2:
+    case hipResViewFormatSignedChar2:
+    case hipResViewFormatUnsignedShort2:
+    case hipResViewFormatSignedShort2:
+    case hipResViewFormatUnsignedInt2:
+    case hipResViewFormatSignedInt2:
+    case hipResViewFormatHalf2:
+    case hipResViewFormatFloat2:
+      channelCount = 2;
+      break;
+    // Single-component formats.
+    case hipResViewFormatUnsignedChar1:
+    case hipResViewFormatSignedChar1:
+    case hipResViewFormatUnsignedShort1:
+    case hipResViewFormatSignedShort1:
+    case hipResViewFormatUnsignedInt1:
+    case hipResViewFormatSignedInt1:
+    case hipResViewFormatHalf1:
+    case hipResViewFormatFloat1:
+      channelCount = 1;
+      break;
+    default:
+      //error scenario
+      break;
+  }
+
+  return channelCount;
+}
+
+// HIP array format matching the element type of a resource view format, regardless
+// of its channel count. Formats not listed yield a value-initialized hipArray_Format.
+inline
+hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
+  hipArray_Format arrayFormat = {};
+
+  switch (hipFormat) {
+    case hipResViewFormatFloat1:
+    case hipResViewFormatFloat2:
+    case hipResViewFormatFloat4:
+      arrayFormat = HIP_AD_FORMAT_FLOAT;
+      break;
+    case hipResViewFormatHalf1:
+    case hipResViewFormatHalf2:
+    case hipResViewFormatHalf4:
+      arrayFormat = HIP_AD_FORMAT_HALF;
+      break;
+    case hipResViewFormatSignedInt1:
+    case hipResViewFormatSignedInt2:
+    case hipResViewFormatSignedInt4:
+      arrayFormat = HIP_AD_FORMAT_SIGNED_INT32;
+      break;
+    case hipResViewFormatUnsignedInt1:
+    case hipResViewFormatUnsignedInt2:
+    case hipResViewFormatUnsignedInt4:
+      arrayFormat = HIP_AD_FORMAT_UNSIGNED_INT32;
+      break;
+    case hipResViewFormatSignedShort1:
+    case hipResViewFormatSignedShort2:
+    case hipResViewFormatSignedShort4:
+      arrayFormat = HIP_AD_FORMAT_SIGNED_INT16;
+      break;
+    case hipResViewFormatUnsignedShort1:
+    case hipResViewFormatUnsignedShort2:
+    case hipResViewFormatUnsignedShort4:
+      arrayFormat = HIP_AD_FORMAT_UNSIGNED_INT16;
+      break;
+    case hipResViewFormatSignedChar1:
+    case hipResViewFormatSignedChar2:
+    case hipResViewFormatSignedChar4:
+      arrayFormat = HIP_AD_FORMAT_SIGNED_INT8;
+      break;
+    case hipResViewFormatUnsignedChar1:
+    case hipResViewFormatUnsignedChar2:
+    case hipResViewFormatUnsignedChar4:
+      arrayFormat = HIP_AD_FORMAT_UNSIGNED_INT8;
+      break;
+    default:
+      //error scenario
+      break;
+  }
+
+  return arrayFormat;
+}
+
+// Derive the resource view format from a channel descriptor. The channel kind (desc.f)
+// and the bit width of the first channel (desc.x) select the element type; the number
+// of populated channels (1, 2 or 4) selects the vector width.
+// Any unsupported kind, width or channel count yields a value-initialized result.
+//
+// The previous implementation relied on implicit fall-through at two switch levels and
+// reached the error return only because the same values were re-tested in every later
+// case. Each decision is now explicit and terminated with `break`.
+inline
+hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
+  const unsigned int channels = getNumChannels(desc);
+
+  // Choose between the 1-, 2- and 4-channel variants of one element type.
+  const auto pick = [channels](const hipResourceViewFormat one,
+                               const hipResourceViewFormat two,
+                               const hipResourceViewFormat four) -> hipResourceViewFormat {
+    switch (channels) {
+      case 1:
+        return one;
+      case 2:
+        return two;
+      case 4:
+        return four;
+      default:
+        //error scenario
+        return hipResourceViewFormat{};
+    }
+  };
+
+  switch (desc.f) {
+    case hipChannelFormatKindUnsigned:
+      switch (desc.x) {
+        case 8:
+          return pick(hipResViewFormatUnsignedChar1, hipResViewFormatUnsignedChar2,
+                      hipResViewFormatUnsignedChar4);
+        case 16:
+          return pick(hipResViewFormatUnsignedShort1, hipResViewFormatUnsignedShort2,
+                      hipResViewFormatUnsignedShort4);
+        case 32:
+          return pick(hipResViewFormatUnsignedInt1, hipResViewFormatUnsignedInt2,
+                      hipResViewFormatUnsignedInt4);
+        default:
+          break;
+      }
+      break;
+    case hipChannelFormatKindSigned:
+      switch (desc.x) {
+        case 8:
+          return pick(hipResViewFormatSignedChar1, hipResViewFormatSignedChar2,
+                      hipResViewFormatSignedChar4);
+        case 16:
+          return pick(hipResViewFormatSignedShort1, hipResViewFormatSignedShort2,
+                      hipResViewFormatSignedShort4);
+        case 32:
+          return pick(hipResViewFormatSignedInt1, hipResViewFormatSignedInt2,
+                      hipResViewFormatSignedInt4);
+        default:
+          break;
+      }
+      break;
+    case hipChannelFormatKindFloat:
+      switch (desc.x) {
+        case 16:
+          return pick(hipResViewFormatHalf1, hipResViewFormatHalf2, hipResViewFormatHalf4);
+        case 32:
+          return pick(hipResViewFormatFloat1, hipResViewFormatFloat2, hipResViewFormatFloat4);
+        default:
+          break;
+      }
+      break;
+    default:
+      break;
+  }
+
+  //error scenario
+  return {};
+}
+
+// Build a hipTextureDesc from the sampling state stored in a textureReference.
+// The descriptor starts value-initialized, so members not copied here stay zero.
+// `texRef` is dereferenced without a null check.
+inline
+hipTextureDesc getTextureDesc(const textureReference* texRef) {
+  hipTextureDesc result = {};
+
+  // Sampling behaviour.
+  std::memcpy(result.addressMode, texRef->addressMode, sizeof(result.addressMode));
+  result.filterMode = texRef->filterMode;
+  result.readMode = texRef->readMode;
+  result.sRGB = texRef->sRGB;
+  result.normalizedCoords = texRef->normalized;
+  result.maxAnisotropy = texRef->maxAnisotropy;
+
+  // Mipmap behaviour.
+  result.mipmapFilterMode = texRef->mipmapFilterMode;
+  result.mipmapLevelBias = texRef->mipmapLevelBias;
+  result.minMipmapLevelClamp = texRef->minMipmapLevelClamp;
+  result.maxMipmapLevelClamp = texRef->maxMipmapLevelClamp;
+
+  return result;
+}
+
+// Describe a view over the whole of `array` using the given view format.
+// The mipmap and layer ranges are all reported as 0.
+// `array` is dereferenced without a null check.
+inline
+hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
+                                        const hipResourceViewFormat format) {
+  hipResourceViewDesc view = {};
+
+  view.format = format;
+
+  // Extents are taken straight from the array.
+  view.width = array->width;
+  view.height = array->height;
+  view.depth = array->depth;
+
+  // A single mip level and a single layer.
+  view.firstMipmapLevel = 0;
+  view.lastMipmapLevel = 0;
+  view.firstLayer = 0;
+  view.lastLayer = 0; /* TODO add hipArray::numLayers */
+
+  return view;
+}
+
+// Describe a view over a mipmapped array using the given view format.
+// The mipmap and layer ranges are all reported as 0.
+// `array` is dereferenced without a null check.
+inline
+hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
+                                        const hipResourceViewFormat format) {
+  hipResourceViewDesc view = {};
+
+  view.format = format;
+
+  // Extents are taken straight from the mipmapped array.
+  view.width = array->width;
+  view.height = array->height;
+  view.depth = array->depth;
+
+  // Level range.
+  view.firstMipmapLevel = 0;
+  view.lastMipmapLevel = 0; /* TODO add hipMipmappedArray::numMipLevels */
+
+  // Layer range.
+  view.firstLayer = 0;
+  view.lastLayer = 0; /* TODO add hipArray::numLayers */
+
+  return view;
+}
+
+// Split a hipMemcpyKind into the {source, destination} memory types it implies.
+// hipMemcpyDefault reports unified memory on both sides; an unknown kind yields a
+// value-initialized pair.
+inline
+std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
+  using MemoryTypes = std::pair<hipMemoryType, hipMemoryType>;
+
+  if (kind == hipMemcpyHostToHost) {
+    return MemoryTypes{hipMemoryTypeHost, hipMemoryTypeHost};
+  }
+  if (kind == hipMemcpyHostToDevice) {
+    return MemoryTypes{hipMemoryTypeHost, hipMemoryTypeDevice};
+  }
+  if (kind == hipMemcpyDeviceToHost) {
+    return MemoryTypes{hipMemoryTypeDevice, hipMemoryTypeHost};
+  }
+  if (kind == hipMemcpyDeviceToDevice) {
+    return MemoryTypes{hipMemoryTypeDevice, hipMemoryTypeDevice};
+  }
+  if (kind == hipMemcpyDefault) {
+    return MemoryTypes{hipMemoryTypeUnified, hipMemoryTypeUnified};
+  }
+
+  //error scenario
+  return {};
+}
+
+// Widen a 2D copy descriptor into the equivalent 3D descriptor: the copy becomes a
+// single slice (Depth == 1) at Z offset 0, LOD 0, with both slice heights left at 0.
+inline
+HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
+  HIP_MEMCPY3D out = {};
+
+  // Copy extent: one slice of the 2D region.
+  out.WidthInBytes = desc2D.WidthInBytes;
+  out.Height = desc2D.Height;
+  out.Depth = 1;
+
+  // Source side.
+  out.srcMemoryType = desc2D.srcMemoryType;
+  out.srcHost = desc2D.srcHost;
+  out.srcDevice = desc2D.srcDevice;
+  out.srcArray = desc2D.srcArray;
+  out.srcPitch = desc2D.srcPitch;
+  out.srcXInBytes = desc2D.srcXInBytes;
+  out.srcY = desc2D.srcY;
+  out.srcZ = 0;
+  out.srcLOD = 0;
+  out.srcHeight = 0;
+
+  // Destination side.
+  out.dstMemoryType = desc2D.dstMemoryType;
+  out.dstHost = desc2D.dstHost;
+  out.dstDevice = desc2D.dstDevice;
+  out.dstArray = desc2D.dstArray;
+  out.dstPitch = desc2D.dstPitch;
+  out.dstXInBytes = desc2D.dstXInBytes;
+  out.dstY = desc2D.dstY;
+  out.dstZ = 0;
+  out.dstLOD = 0;
+  out.dstHeight = 0;
+
+  return out;
+}
+
+// Convert hipMemcpy3DParms into a HIP_MEMCPY3D descriptor.
+// For each side the array (if set) is applied first and the pointer (if set) second,
+// so when both are present the pointer decides the memory type. X offsets and the
+// copy width are given in elements when an array is involved and are scaled to bytes
+// here; when both sides are arrays the destination element size is used for the width.
+inline
+HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
+  HIP_MEMCPY3D out = {};
+
+  const bool srcIsArray = (desc.srcArray != nullptr);
+  const bool dstIsArray = (desc.dstArray != nullptr);
+
+  // Extent and offsets, copied verbatim before any element-size scaling.
+  out.WidthInBytes = desc.extent.width;
+  out.Height = desc.extent.height;
+  out.Depth = desc.extent.depth;
+
+  out.srcXInBytes = desc.srcPos.x;
+  out.srcY = desc.srcPos.y;
+  out.srcZ = desc.srcPos.z;
+  out.srcLOD = 0;
+
+  out.dstXInBytes = desc.dstPos.x;
+  out.dstY = desc.dstPos.y;
+  out.dstZ = desc.dstPos.z;
+  out.dstLOD = 0;
+
+  // Source side.
+  if (srcIsArray) {
+    out.srcMemoryType = hipMemoryTypeArray;
+    out.srcArray = desc.srcArray;
+    // When referring to array memory, hipPos::x is in elements.
+    out.srcXInBytes *= getElementSize(desc.srcArray);
+  }
+  if (desc.srcPtr.ptr != nullptr) {
+    out.srcMemoryType = std::get<0>(hip::getMemoryType(desc.kind));
+    out.srcHost = desc.srcPtr.ptr;
+    out.srcDevice = desc.srcPtr.ptr;
+    out.srcPitch = desc.srcPtr.pitch;
+    out.srcHeight = desc.srcPtr.ysize;
+  }
+
+  // Destination side.
+  if (dstIsArray) {
+    out.dstMemoryType = hipMemoryTypeArray;
+    out.dstArray = desc.dstArray;
+    // When referring to array memory, hipPos::x is in elements.
+    out.dstXInBytes *= getElementSize(desc.dstArray);
+  }
+  if (desc.dstPtr.ptr != nullptr) {
+    out.dstMemoryType = std::get<1>(getMemoryType(desc.kind));
+    out.dstHost = desc.dstPtr.ptr;
+    out.dstDevice = desc.dstPtr.ptr;
+    out.dstPitch = desc.dstPtr.pitch;
+    out.dstHeight = desc.dstPtr.ysize;
+  }
+
+  // If a HIP array is participating in the copy, the extent is defined in terms of that
+  // array's elements. The destination array takes precedence when both sides are arrays.
+  if (dstIsArray) {
+    out.WidthInBytes *= getElementSize(desc.dstArray);
+  } else if (srcIsArray) {
+    out.WidthInBytes *= getElementSize(desc.srcArray);
+  }
+
+  return out;
+}
+
+// Convert a HIPresourcetype value to hipResourceType by numeric value.
+// These two enums should be isomorphic.
+inline
+hipResourceType getResourceType(const HIPresourcetype resType) {
+  const auto converted = static_cast<hipResourceType>(resType);
+  return converted;
+}
+
+// Convert a hipResourceType value to HIPresourcetype by numeric value.
+// These two enums should be isomorphic.
+inline
+HIPresourcetype getResourceType(const hipResourceType resType) {
+  const auto converted = static_cast<HIPresourcetype>(resType);
+  return converted;
+}
+
+// Convert a HIP_RESOURCE_DESC into the equivalent hipResourceDesc.
+// Only the union member selected by the resource type is filled in. The result is
+// value-initialized first, so an unrecognised resource type (and any member not
+// assigned below) is returned zeroed rather than with indeterminate contents.
+inline
+hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
+  hipResourceDesc desc = {};
+
+  desc.resType = getResourceType(resDesc.resType);
+  switch (desc.resType) {
+  case hipResourceTypeArray:
+    desc.res.array.array = resDesc.res.array.hArray;
+    break;
+  case hipResourceTypeMipmappedArray:
+    desc.res.mipmap.mipmap = resDesc.res.mipmap.hMipmappedArray;
+    break;
+  case hipResourceTypeLinear:
+    desc.res.linear.devPtr = resDesc.res.linear.devPtr;
+    // The input stores channel count + array format; the output uses a channel descriptor.
+    desc.res.linear.desc = getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
+    desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
+    break;
+  case hipResourceTypePitch2D:
+    desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
+    desc.res.pitch2D.desc = getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
+    desc.res.pitch2D.width = resDesc.res.pitch2D.width;
+    desc.res.pitch2D.height = resDesc.res.pitch2D.height;
+    desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
+    break;
+  default:
+    // Unknown resource type: only resType is meaningful, the rest stays zero.
+    break;
+  }
+
+  return desc;
+}
+
+// Convert a hipResourceDesc into the equivalent HIP_RESOURCE_DESC.
+// Only the union member selected by the resource type is filled in. The result is
+// value-initialized first, so an unrecognised resource type (and any member not
+// assigned below) is returned zeroed rather than with indeterminate contents.
+inline
+HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
+  HIP_RESOURCE_DESC desc = {};
+
+  desc.resType = getResourceType(resDesc.resType);
+  switch (desc.resType) {
+  case HIP_RESOURCE_TYPE_ARRAY:
+    desc.res.array.hArray = resDesc.res.array.array;
+    break;
+  case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
+    desc.res.mipmap.hMipmappedArray = resDesc.res.mipmap.mipmap;
+    break;
+  case HIP_RESOURCE_TYPE_LINEAR:
+    desc.res.linear.devPtr = resDesc.res.linear.devPtr;
+    // The input uses a channel descriptor; the output stores channel count + array format.
+    desc.res.linear.numChannels = getNumChannels(resDesc.res.linear.desc);
+    desc.res.linear.format = getArrayFormat(resDesc.res.linear.desc);
+    desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
+    break;
+  case HIP_RESOURCE_TYPE_PITCH2D:
+    desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
+    desc.res.pitch2D.numChannels = getNumChannels(resDesc.res.pitch2D.desc);
+    desc.res.pitch2D.format = getArrayFormat(resDesc.res.pitch2D.desc);
+    desc.res.pitch2D.width = resDesc.res.pitch2D.width;
+    desc.res.pitch2D.height = resDesc.res.pitch2D.height;
+    desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
+    break;
+  default:
+    // Unknown resource type: only resType is meaningful, the rest stays zero.
+    break;
+  }
+
+  return desc;
+}
+
+// Convert a HIPaddress_mode value to hipTextureAddressMode by numeric value.
+// These two enums should be isomorphic.
+inline
+hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
+  const auto converted = static_cast<hipTextureAddressMode>(mode);
+  return converted;
+}
+
+// Convert a hipTextureAddressMode value to HIPaddress_mode by numeric value.
+// These two enums should be isomorphic.
+inline
+HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
+  const auto converted = static_cast<HIPaddress_mode>(mode);
+  return converted;
+}
+
+// Convert a HIPfilter_mode value to hipTextureFilterMode by numeric value.
+// These two enums should be isomorphic.
+inline
+hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
+  const auto converted = static_cast<hipTextureFilterMode>(mode);
+  return converted;
+}
+
+// Convert a hipTextureFilterMode value to HIPfilter_mode by numeric value.
+// These two enums should be isomorphic.
+inline
+HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
+  const auto converted = static_cast<HIPfilter_mode>(mode);
+  return converted;
+}
+
+// Decode the read mode from texture flags: HIP_TRSF_READ_AS_INTEGER set means the
+// element type is read as-is, otherwise reads are normalized float.
+inline
+hipTextureReadMode getReadMode(const unsigned int flags) {
+  return (flags & HIP_TRSF_READ_AS_INTEGER) ? hipReadModeElementType
+                                            : hipReadModeNormalizedFloat;
+}
+
+// Encode a read mode as texture flags: only hipReadModeElementType sets a bit
+// (HIP_TRSF_READ_AS_INTEGER); every other mode contributes 0.
+inline
+unsigned int getReadMode(const hipTextureReadMode mode) {
+  return (mode == hipReadModeElementType) ? HIP_TRSF_READ_AS_INTEGER : 0;
+}
+
+// Decode the sRGB setting from texture flags: 1 when HIP_TRSF_SRGB is set, else 0.
+inline
+int getsRGB(const unsigned int flags) {
+  return (flags & HIP_TRSF_SRGB) ? 1 : 0;
+}
+
+// Encode the sRGB setting as texture flags: exactly 1 maps to HIP_TRSF_SRGB,
+// any other value contributes 0.
+inline
+unsigned int getsRGB(const int sRGB) {
+  return (sRGB == 1) ? HIP_TRSF_SRGB : 0;
+}
+
+// Decode the normalized-coordinates setting from texture flags:
+// 1 when HIP_TRSF_NORMALIZED_COORDINATES is set, else 0.
+inline
+int getNormalizedCoords(const unsigned int flags) {
+  return (flags & HIP_TRSF_NORMALIZED_COORDINATES) ? 1 : 0;
+}
+
+// Encode the normalized-coordinates setting as texture flags: exactly 1 maps to
+// HIP_TRSF_NORMALIZED_COORDINATES, any other value contributes 0.
+inline
+unsigned int getNormalizedCoords(const int normalizedCoords) {
+  return (normalizedCoords == 1) ? HIP_TRSF_NORMALIZED_COORDINATES : 0;
+}
+
+// Convert a HIP_TEXTURE_DESC into the equivalent hipTextureDesc.
+// Read mode, sRGB and normalized-coordinate settings are decoded from the packed
+// `flags` member of the input. The result is value-initialized first, matching the
+// other descriptor builders in this header, so no member is ever left indeterminate.
+inline
+hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
+  hipTextureDesc desc = {};
+
+  desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
+  desc.addressMode[1] = getAddressMode(texDesc.addressMode[1]);
+  desc.addressMode[2] = getAddressMode(texDesc.addressMode[2]);
+  desc.filterMode = getFilterMode(texDesc.filterMode);
+  // The three settings below share the single `flags` word on the input side.
+  desc.readMode = getReadMode(texDesc.flags);
+  desc.sRGB = getsRGB(texDesc.flags);
+  std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor));
+  desc.normalizedCoords = getNormalizedCoords(texDesc.flags);
+  desc.maxAnisotropy = texDesc.maxAnisotropy;
+  desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode);
+  desc.mipmapLevelBias = texDesc.mipmapLevelBias;
+  desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp;
+  desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp;
+
+  return desc;
+}
+
+// Convert a hipTextureDesc into the equivalent HIP_TEXTURE_DESC.
+// Read mode, sRGB and normalized-coordinate settings are packed into the `flags`
+// member of the result. The result is value-initialized first, so any member not
+// assigned below is returned zeroed rather than with indeterminate contents.
+inline
+HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
+  HIP_TEXTURE_DESC desc = {};
+
+  desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
+  desc.addressMode[1] = getAddressMode(texDesc.addressMode[1]);
+  desc.addressMode[2] = getAddressMode(texDesc.addressMode[2]);
+  desc.filterMode = getFilterMode(texDesc.filterMode);
+  // Pack the three boolean-like settings into one flags word.
+  desc.flags = 0;
+  desc.flags |= getReadMode(texDesc.readMode);
+  desc.flags |= getsRGB(texDesc.sRGB);
+  desc.flags |= getNormalizedCoords(texDesc.normalizedCoords);
+  desc.maxAnisotropy = texDesc.maxAnisotropy;
+  desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode);
+  desc.mipmapLevelBias = texDesc.mipmapLevelBias;
+  desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp;
+  desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp;
+  std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor));
+
+  return desc;
+}
+
+// Convert a HIPresourceViewFormat value to hipResourceViewFormat by numeric value.
+// These two enums should be isomorphic.
+inline
+hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
+  const auto converted = static_cast<hipResourceViewFormat>(format);
+  return converted;
+}
+
+// Convert a hipResourceViewFormat value to HIPresourceViewFormat by numeric value.
+// These two enums should be isomorphic.
+inline
+HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
+  const auto converted = static_cast<HIPresourceViewFormat>(format);
+  return converted;
+}
+
+// Convert a HIP_RESOURCE_VIEW_DESC into the equivalent hipResourceViewDesc, copying
+// the format, extents, mipmap range and layer range. The result is value-initialized
+// first, matching the other descriptor builders in this header, so no member can be
+// returned with indeterminate contents.
+inline
+hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
+  hipResourceViewDesc desc = {};
+
+  desc.format = getResourceViewFormat(resViewDesc.format);
+  desc.width = resViewDesc.width;
+  desc.height = resViewDesc.height;
+  desc.depth = resViewDesc.depth;
+  desc.firstMipmapLevel = resViewDesc.firstMipmapLevel;
+  desc.lastMipmapLevel = resViewDesc.lastMipmapLevel;
+  desc.firstLayer = resViewDesc.firstLayer;
+  desc.lastLayer = resViewDesc.lastLayer;
+
+  return desc;
+}
+
+// Convert a hipResourceViewDesc into the equivalent HIP_RESOURCE_VIEW_DESC, copying
+// the format, extents, mipmap range and layer range. The result is value-initialized
+// first, so any member not assigned below is returned zeroed rather than with
+// indeterminate contents.
+inline
+HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
+  HIP_RESOURCE_VIEW_DESC desc = {};
+
+  desc.format = getResourceViewFormat(resViewDesc.format);
+  desc.width = resViewDesc.width;
+  desc.height = resViewDesc.height;
+  desc.depth = resViewDesc.depth;
+  desc.firstMipmapLevel = resViewDesc.firstMipmapLevel;
+  desc.lastMipmapLevel = resViewDesc.lastMipmapLevel;
+  desc.firstLayer = resViewDesc.firstLayer;
+  desc.lastLayer = resViewDesc.lastLayer;
+
+  return desc;
+}
+
+// Size in bytes of one element described by `desc`: the width of the first channel
+// in whole bytes (desc.x / 8) multiplied by the number of populated channels.
+inline
+size_t getElementSize(const hipChannelFormatDesc &desc) {
+  const auto bytesPerChannel = desc.x / 8;
+  const auto channelCount = getNumChannels(desc);
+  return bytesPerChannel * channelCount;
+}
+};
@@ -0,0 +1,382 @@
+/* Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+
+#include "hip_internal.hpp"
+#include "hip_mempool_impl.hpp"
+
+namespace hip {
+
+// ================================================================================================
+// Returns this device's null stream, or nullptr when none exists.
+// The stream is created on first use unless `skip_alloc` is true. Whenever a
+// stream is returned, iHipWaitActiveStreams() is called on it first.
+hip::Stream* Device::NullStream(bool skip_alloc) {
+  const bool may_create = !skip_alloc;
+  if (may_create && null_stream_ == nullptr) {
+    null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true);
+  }
+
+  if (null_stream_ != nullptr) {
+    // Wait for all active streams before executing commands on the default stream
+    iHipWaitActiveStreams(null_stream_);
+  }
+  return null_stream_;
+}
+
+// ================================================================================================
+// Creates the default and graph memory pools for this device and makes the
+// default pool the current one. Returns false if either pool pointer is null
+// after allocation, true otherwise.
+bool Device::Create() {
+  // Default memory pool
+  default_mem_pool_ = new MemoryPool(this);
+  if (!default_mem_pool_) {
+    return false;
+  }
+
+  // Graph memory pool
+  graph_mem_pool_ = new MemoryPool(this);
+  if (!graph_mem_pool_) {
+    return false;
+  }
+
+  // Use the maximum value as the release threshold so the graph pool holds on to
+  // memory, because the current implementation doesn't support VM.
+  // Note: the call for the threshold is always successful, so its result is ignored.
+  uint64_t release_threshold = std::numeric_limits<uint64_t>::max();
+  static_cast<void>(
+      graph_mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &release_threshold));
+
+  // After device creation the default pool is the current pool
+  current_mem_pool_ = default_mem_pool_;
+  return true;
+}
+
+// ================================================================================================
+// Registers `pool` in this device's pool collection (under lock_) unless it is
+// already present.
+void Device::AddMemoryPool(MemoryPool* pool) {
+  amd::ScopedLock lock(lock_);
+  const bool already_known = (mem_pools_.find(pool) != mem_pools_.end());
+  if (already_known) {
+    return;
+  }
+  mem_pools_.insert(pool);
+}
+
+// ================================================================================================
+// Unregisters `pool` from this device's pool collection (under lock_).
+// Does nothing when the pool is not registered.
+void Device::RemoveMemoryPool(MemoryPool* pool) {
+  amd::ScopedLock lock(lock_);
+  auto pos = mem_pools_.find(pool);
+  if (pos == mem_pools_.end()) {
+    return;
+  }
+  mem_pools_.erase(pos);
+}
+
+// ================================================================================================
+// Offers `memory` to every registered pool in turn (under lock_) and stops at
+// the first pool whose FreeMemory() call returns true.
+// Returns true if some pool accepted the memory, false otherwise.
+bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
+  amd::ScopedLock lock(lock_);
+  for (auto pos = mem_pools_.begin(), last = mem_pools_.end(); pos != last; ++pos) {
+    auto pool = *pos;
+    if (pool->FreeMemory(memory, stream)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// ================================================================================================
+// Forwards ReleaseFreedMemory(stream) to every registered pool (under lock_).
+void Device::ReleaseFreedMemory(Stream* stream) {
+  amd::ScopedLock lock(lock_);
+  for (auto pos = mem_pools_.begin(), last = mem_pools_.end(); pos != last; ++pos) {
+    auto pool = *pos;
+    pool->ReleaseFreedMemory(stream);
+  }
+}
+
+// ================================================================================================
+// Notifies every registered pool (under lock_) that `stream` has been
+// destroyed by calling RemoveStream(stream) on each of them.
+void Device::RemoveStreamFromPools(Stream* stream) {
+  amd::ScopedLock lock(lock_);
+  for (auto pos = mem_pools_.begin(), last = mem_pools_.end(); pos != last; ++pos) {
+    auto pool = *pos;
+    pool->RemoveStream(stream);
+  }
+}
+
+// ================================================================================================
+// Resets the device: releases and deletes every registered memory pool, restores
+// the default flags, destroys all streams of this device, purges the memory
+// object map for devices()[0] and finally recreates the pools via Create().
+void Device::Reset() {
+  {
+    amd::ScopedLock lock(lock_);
+    for (auto next = mem_pools_.begin(); next != mem_pools_.end();) {
+      // Step to the following element before destroying the current one.
+      // NOTE(review): presumably deleting a pool can remove it from mem_pools_,
+      // which would invalidate an iterator still pointing at it -- confirm.
+      auto victim = next++;
+      (*victim)->ReleaseAllMemory();
+      delete *victim;
+    }
+    mem_pools_.clear();
+  }
+  flags_ = hipDeviceScheduleSpin;
+  hip::Stream::destroyAllStreams(deviceId_);
+  amd::MemObjMap::Purge(devices()[0]);
+  Create();
+}
+
+// ================================================================================================
+// Calls release() on the default pool, the graph pool and the null stream, in
+// that order, skipping any of them that is null.
+Device::~Device() {
+  auto release_if_set = [](auto* object) {
+    if (object != nullptr) {
+      object->release();
+    }
+  };
+
+  release_if_set(default_mem_pool_);
+  release_if_set(graph_mem_pool_);
+  release_if_set(null_stream_);
+}
+
+}
+
+// Deletes every device object stored in g_devices. The container itself is
+// left untouched.
+void ihipDestroyDevice() {
+  for (auto pos = g_devices.begin(), last = g_devices.end(); pos != last; ++pos) {
+    delete *pos;
+  }
+}
+
+// Validates `deviceId` and stores it in `*device`.
+// Returns hipErrorInvalidValue when `device` is null, hipErrorInvalidDevice when
+// `deviceId` is not an index into g_devices, and hipSuccess otherwise.
+hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId) {
+  if (device == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  const bool in_range = (deviceId >= 0) && (static_cast<size_t>(deviceId) < g_devices.size());
+  if (!in_range) {
+    return hipErrorInvalidDevice;
+  }
+
+  *device = deviceId;
+  return hipSuccess;
+}
+
+// Public entry point: runs the API prologue, then delegates to ihipDeviceGet().
+hipError_t hipDeviceGet(hipDevice_t* device, int deviceId) {
+  HIP_INIT_API(hipDeviceGet, device, deviceId);
+
+  const hipError_t status = ihipDeviceGet(device, deviceId);
+  HIP_RETURN(status);
+}
+
+// Stores the global memory size reported by the device's info structure in
+// `*bytes`.
+// Returns hipErrorInvalidDevice for an out-of-range `device`,
+// hipErrorInvalidValue for a null `bytes`, and hipSuccess otherwise.
+hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {
+  HIP_INIT_API(hipDeviceTotalMem, bytes, device);
+
+  // The device index is validated before the output pointer.
+  const bool valid_device = (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
+  if (!valid_device) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  if (bytes == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *bytes = g_devices[device]->devices()[0]->info().globalMemSize_;
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Stores the ISA major/minor version of `device` in `*major` and `*minor`.
+// Returns hipErrorInvalidDevice for an out-of-range `device`,
+// hipErrorInvalidValue when either output pointer is null, and hipSuccess
+// otherwise.
+hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) {
+  HIP_INIT_API(hipDeviceComputeCapability, major, minor, device);
+
+  // The device index is validated before the output pointers.
+  const bool valid_device = (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
+  if (!valid_device) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  if (major == nullptr || minor == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const auto& isa = g_devices[device]->devices()[0]->isa();
+  *major = isa.versionMajor();
+  *minor = isa.versionMinor();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Public entry point: runs the API prologue, then delegates to
+// ihipDeviceGetCount().
+hipError_t hipDeviceGetCount(int* count) {
+  HIP_INIT_API(hipDeviceGetCount, count);
+
+  const hipError_t status = ihipDeviceGetCount(count);
+  HIP_RETURN(status);
+}
+
+// Stores the number of entries in g_devices in `*count`.
+// Returns hipErrorInvalidValue for a null `count`, hipErrorNoDevice when the
+// stored count is below one (the count is still written), and hipSuccess
+// otherwise.
+hipError_t ihipDeviceGetCount(int* count) {
+  if (count == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  // Number of all available devices
+  *count = g_devices.size();
+
+  return (*count < 1) ? hipErrorNoDevice : hipSuccess;
+}
+
+// Copies the board name of `device` into `name`, truncating it to fit into a
+// buffer of `len` bytes; the result is always NUL-terminated.
+// Returns hipErrorInvalidDevice for an out-of-range `device`,
+// hipErrorInvalidValue for a null `name` or non-positive `len`, and hipSuccess
+// otherwise.
+hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) {
+  HIP_INIT_API(hipDeviceGetName, (void*)name, len, device);
+
+  const bool valid_device = (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
+  if (!valid_device) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  if (name == nullptr || len <= 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const auto& info = g_devices[device]->devices()[0]->info();
+  const size_t source_len = ::strlen(info.boardName_);
+
+  // Copy the whole name when it fits together with its trailing zero byte;
+  // otherwise copy only as much as leaves room for the terminator.
+  // `len` is known to be positive here, so the conversion to size_t is safe.
+  size_t copy_len = source_len;
+  if (static_cast<size_t>(len) <= source_len + 1) {
+    copy_len = static_cast<size_t>(len) - 1;
+  }
+  ::memcpy(name, info.boardName_, copy_len);
+  name[copy_len] = '\0';
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Copies up to 16 bytes of the device's uuid_ string into `uuid->bytes` using
+// strncpy semantics.
+// Returns hipErrorInvalidDevice for an out-of-range `device`,
+// hipErrorInvalidValue for a null `uuid`, and hipSuccess otherwise.
+hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device) {
+  HIP_INIT_API(hipDeviceGetUuid, reinterpret_cast<void*>(uuid), device);
+
+  const bool valid_device = (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
+  if (!valid_device) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  if (uuid == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const auto& info = g_devices[device]->devices()[0]->info();
+  ::strncpy(uuid->bytes, info.uuid_, 16);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Fills `*props` with the properties of `device`.
+//
+// The values are gathered into a zero-initialized local structure from the
+// device's info() and isa() objects and copied to `*props` only at the end, so
+// `*props` is untouched on failure.
+//
+// Returns hipErrorInvalidValue for a null `props`, hipErrorInvalidDevice when
+// `device` is not an index into g_devices (negative values are rejected through
+// the unsigned conversion), and hipSuccess otherwise.
+hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) {
+  if (props == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  if (unsigned(device) >= g_devices.size()) {
+    return hipErrorInvalidDevice;
+  }
+  auto* deviceHandle = g_devices[device]->devices()[0];
+
+  // Upper bound used to clamp 64-bit quantities that are reported through
+  // narrower fields.
+  constexpr auto int32_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
+  hipDeviceProp_t deviceProps = {0};
+
+  const auto& info = deviceHandle->info();
+  const auto& isa = deviceHandle->isa();
+  ::strncpy(deviceProps.name, info.boardName_, 128);
+  deviceProps.totalGlobalMem = info.globalMemSize_;
+  deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
+  deviceProps.regsPerBlock = info.availableRegistersPerCU_;
+  deviceProps.warpSize = info.wavefrontWidth_;
+  deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
+  deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
+  deviceProps.maxThreadsDim[1] = info.maxWorkItemSizes_[1];
+  deviceProps.maxThreadsDim[2] = info.maxWorkItemSizes_[2];
+  deviceProps.maxGridSize[0] = int32_max;
+  deviceProps.maxGridSize[1] = int32_max;
+  deviceProps.maxGridSize[2] = int32_max;
+  deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000;
+  deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000;
+  deviceProps.memoryBusWidth = info.globalMemChannels_;
+  deviceProps.totalConstMem = std::min(info.maxConstantBufferSize_, int32_max);
+  deviceProps.major = isa.versionMajor();
+  deviceProps.minor = isa.versionMinor();
+  deviceProps.multiProcessorCount = info.maxComputeUnits_;
+  deviceProps.l2CacheSize = info.l2CacheSize_;
+  deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_;
+  deviceProps.computeMode = 0;
+  deviceProps.clockInstructionRate = info.timeStampFrequency_;
+  deviceProps.arch.hasGlobalInt32Atomics = 1;
+  deviceProps.arch.hasGlobalFloatAtomicExch = 1;
+  deviceProps.arch.hasSharedInt32Atomics = 1;
+  deviceProps.arch.hasSharedFloatAtomicExch = 1;
+  deviceProps.arch.hasFloatAtomicAdd = 1;
+  deviceProps.arch.hasGlobalInt64Atomics = 1;
+  deviceProps.arch.hasSharedInt64Atomics = 1;
+  deviceProps.arch.hasDoubles = 1;
+  deviceProps.arch.hasWarpVote = 1;
+  deviceProps.arch.hasWarpBallot = 1;
+  deviceProps.arch.hasWarpShuffle = 1;
+  deviceProps.arch.hasFunnelShift = 0;
+  deviceProps.arch.hasThreadFenceSystem = 1;
+  deviceProps.arch.hasSyncThreadsExt = 0;
+  deviceProps.arch.hasSurfaceFuncs = 0;
+  deviceProps.arch.has3dGrid = 1;
+  deviceProps.arch.hasDynamicParallelism = 0;
+  deviceProps.concurrentKernels = 1;
+  deviceProps.pciDomainID = info.pciDomainID;
+  deviceProps.pciBusID = info.deviceTopology_.pcie.bus;
+  deviceProps.pciDeviceID = info.deviceTopology_.pcie.device;
+  deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_;
+  deviceProps.canMapHostMemory = 1;
+  // FIXME: This should be removed, targets can have character names as well.
+  deviceProps.gcnArch = isa.versionMajor() * 100 + isa.versionMinor() * 10 + isa.versionStepping();
+  // Bounded copy: never write past the end of the fixed-size gcnArchName
+  // buffer, whatever the length of the target id string.
+  snprintf(deviceProps.gcnArchName, sizeof(deviceProps.gcnArchName), "%s", isa.targetId());
+  deviceProps.cooperativeLaunch = info.cooperativeGroups_;
+  deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
+
+  // All "unmatched" multi-device cooperative capabilities mirror the
+  // multi-device cooperative groups capability.
+  deviceProps.cooperativeMultiDeviceUnmatchedFunc = info.cooperativeMultiDeviceGroups_;
+  deviceProps.cooperativeMultiDeviceUnmatchedGridDim = info.cooperativeMultiDeviceGroups_;
+  deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_;
+  deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_;
+
+  deviceProps.maxTexture1DLinear = std::min(16 * info.imageMaxBufferSize_, int32_max);  // Max pixel size is 16 bytes
+  deviceProps.maxTexture1D    = std::min(info.image1DMaxWidth_, int32_max);
+  deviceProps.maxTexture2D[0] = std::min(info.image2DMaxWidth_, int32_max);
+  deviceProps.maxTexture2D[1] = std::min(info.image2DMaxHeight_, int32_max);
+  deviceProps.maxTexture3D[0] = std::min(info.image3DMaxWidth_, int32_max);
+  deviceProps.maxTexture3D[1] = std::min(info.image3DMaxHeight_, int32_max);
+  deviceProps.maxTexture3D[2] = std::min(info.image3DMaxDepth_, int32_max);
+  deviceProps.hdpMemFlushCntl = info.hdpMemFlushCntl;
+  deviceProps.hdpRegFlushCntl = info.hdpRegFlushCntl;
+
+  deviceProps.memPitch = std::min(info.maxMemAllocSize_, int32_max);
+  deviceProps.textureAlignment = info.imageBaseAddressAlignment_;
+  deviceProps.texturePitchAlignment = info.imagePitchAlignment_;
+  deviceProps.kernelExecTimeoutEnabled = 0;
+  deviceProps.ECCEnabled = info.errorCorrectionSupport_ ? 1 : 0;
+  deviceProps.isLargeBar = info.largeBar_ ? 1 : 0;
+  deviceProps.asicRevision = info.asicRevision_;
+
+  // HMM capabilities
+  deviceProps.managedMemory = info.hmmSupported_;
+  deviceProps.concurrentManagedAccess = info.hmmSupported_;
+  deviceProps.directManagedMemAccessFromHost = info.hmmDirectHostAccess_;
+  deviceProps.pageableMemoryAccess = info.hmmCpuMemoryAccessible_;
+  deviceProps.pageableMemoryAccessUsesHostPageTables = info.hostUnifiedMemory_;
+
+  // Publish the fully populated structure in one assignment.
+  *props = deviceProps;
+  return hipSuccess;
+}
+
+// Public entry point: runs the API prologue, then delegates to
+// ihipGetDeviceProperties().
+hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) {
+  HIP_INIT_API(hipGetDeviceProperties, props, device);
+
+  const hipError_t status = ihipGetDeviceProperties(props, device);
+  HIP_RETURN(status);
+}
@@ -0,0 +1,632 @@
+/* Copyright (c) 2018 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+
+#include "hip_internal.hpp"
+
+// Selects the device that best satisfies the requested `properties` and stores
+// its index in `*device`.
+//
+// Only the non-zero fields of `properties` listed below are treated as
+// requirements; a device satisfies a requirement when its own value is greater
+// than or equal to the requested one. A device is a candidate only if it
+// satisfies every requirement, and the first candidate that satisfies more
+// requirements than any earlier candidate wins. When no device qualifies (or
+// no requirement is given), `*device` stays 0.
+//
+// Devices whose properties cannot be queried are skipped instead of being
+// compared against zeroed properties.
+//
+// Returns hipErrorInvalidValue when either pointer is null, the error from
+// ihipDeviceGetCount() if counting the devices fails, and hipSuccess otherwise.
+hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) {
+
+  HIP_INIT_API(hipChooseDevice, device, properties);
+
+  if (device == nullptr || properties == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  *device = 0;
+  cl_uint maxMatchedCount = 0;
+  int count = 0;
+  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));
+
+  for (cl_int i = 0; i < count; ++i) {
+    hipDeviceProp_t currentProp = {0};
+    if (ihipGetDeviceProperties(&currentProp, i) != hipSuccess) {
+      // Nothing reliable to compare against for this device.
+      continue;
+    }
+
+    cl_uint validPropCount = 0;
+    cl_uint matchedCount = 0;
+    // Counts `requested` as a requirement when it is non-zero, and as a match
+    // when the device's `available` value is at least as large.
+    auto consider = [&validPropCount, &matchedCount](const auto& requested,
+                                                     const auto& available) {
+      if (requested != 0) {
+        ++validPropCount;
+        if (available >= requested) {
+          ++matchedCount;
+        }
+      }
+    };
+
+    consider(properties->major, currentProp.major);
+    consider(properties->minor, currentProp.minor);
+    consider(properties->totalGlobalMem, currentProp.totalGlobalMem);
+    consider(properties->sharedMemPerBlock, currentProp.sharedMemPerBlock);
+    consider(properties->maxThreadsPerBlock, currentProp.maxThreadsPerBlock);
+    consider(properties->totalConstMem, currentProp.totalConstMem);
+    consider(properties->multiProcessorCount, currentProp.multiProcessorCount);
+    consider(properties->maxThreadsPerMultiProcessor, currentProp.maxThreadsPerMultiProcessor);
+    consider(properties->memoryClockRate, currentProp.memoryClockRate);
+    consider(properties->memoryBusWidth, currentProp.memoryBusWidth);
+    consider(properties->l2CacheSize, currentProp.l2CacheSize);
+    consider(properties->regsPerBlock, currentProp.regsPerBlock);
+    consider(properties->maxSharedMemoryPerMultiProcessor,
+             currentProp.maxSharedMemoryPerMultiProcessor);
+    consider(properties->warpSize, currentProp.warpSize);
+
+    // Only a device that meets every requirement can be selected; among those,
+    // keep the first one with the highest number of matched requirements.
+    if (validPropCount == matchedCount) {
+      *device = matchedCount > maxMatchedCount ? i : *device;
+      maxMatchedCount = std::max(matchedCount, maxMatchedCount);
+    }
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Looks up a single attribute `attr` of `device` and stores it through `pi`.
+//
+// Most attributes are read from a hipDeviceProp_t filled by
+// ihipGetDeviceProperties(); a few are read directly from the device's info()
+// structure or are constants.
+//
+// Returns hipErrorInvalidValue for a null `pi` or an attribute not handled by
+// the switch, hipErrorInvalidDevice for an out-of-range `device`, any error
+// reported by ihipDeviceGetCount() / ihipGetDeviceProperties(), and hipSuccess
+// otherwise.
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {
+
+  HIP_INIT_API(hipDeviceGetAttribute, pi, attr, device);
+
+  if (pi == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  int count = 0;
+  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));
+
+  if (device < 0 || device >= count) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  //FIXME: should we cache the props, or just select from deviceHandle->info_?
+  hipDeviceProp_t prop = {0};
+  HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
+
+  // Upper bound used when a size_t property is narrowed to the int result.
+  constexpr auto int32_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
+
+  switch (attr) {
+  case hipDeviceAttributeMaxThreadsPerBlock:
+    *pi = prop.maxThreadsPerBlock;
+    break;
+  case hipDeviceAttributeMaxBlockDimX:
+    *pi = prop.maxThreadsDim[0];
+    break;
+  case hipDeviceAttributeMaxBlockDimY:
+    *pi = prop.maxThreadsDim[1];
+    break;
+  case hipDeviceAttributeMaxBlockDimZ:
+    *pi = prop.maxThreadsDim[2];
+    break;
+  case hipDeviceAttributeMaxGridDimX:
+    *pi = prop.maxGridSize[0];
+    break;
+  case hipDeviceAttributeMaxGridDimY:
+    *pi = prop.maxGridSize[1];
+    break;
+  case hipDeviceAttributeMaxGridDimZ:
+    *pi = prop.maxGridSize[2];
+    break;
+  case hipDeviceAttributeMaxSharedMemoryPerBlock:
+    *pi = prop.sharedMemPerBlock;
+    break;
+  case hipDeviceAttributeTotalConstantMemory:
+    // size_t to int casting
+    *pi = std::min(prop.totalConstMem, int32_max);
+    break;
+  case hipDeviceAttributeWarpSize:
+    *pi = prop.warpSize;
+    break;
+  case hipDeviceAttributeMaxRegistersPerBlock:
+    *pi = prop.regsPerBlock;
+    break;
+  case hipDeviceAttributeClockRate:
+    *pi = prop.clockRate;
+    break;
+  case hipDeviceAttributeWallClockRate:
+    // Not part of hipDeviceProp_t here; read straight from the device info.
+    *pi = g_devices[device]->devices()[0]->info().wallClockFrequency_;
+    break;
+  case hipDeviceAttributeMemoryClockRate:
+    *pi = prop.memoryClockRate;
+    break;
+  case hipDeviceAttributeMemoryBusWidth:
+    *pi = prop.memoryBusWidth;
+    break;
+  case hipDeviceAttributeMultiprocessorCount:
+    *pi = prop.multiProcessorCount;
+    break;
+  case hipDeviceAttributeComputeMode:
+    *pi = prop.computeMode;
+    break;
+  case hipDeviceAttributeL2CacheSize:
+    *pi = prop.l2CacheSize;
+    break;
+  case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+    *pi = prop.maxThreadsPerMultiProcessor;
+    break;
+  case hipDeviceAttributeComputeCapabilityMajor:
+    *pi = prop.major;
+    break;
+  case hipDeviceAttributeComputeCapabilityMinor:
+    *pi = prop.minor;
+    break;
+  case hipDeviceAttributePciBusId:
+    *pi = prop.pciBusID;
+    break;
+  case hipDeviceAttributeConcurrentKernels:
+    *pi = prop.concurrentKernels;
+    break;
+  case hipDeviceAttributePciDeviceId:
+    *pi = prop.pciDeviceID;
+    break;
+  case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
+    *pi = prop.maxSharedMemoryPerMultiProcessor;
+    break;
+  case hipDeviceAttributeIsMultiGpuBoard:
+    *pi = prop.isMultiGpuBoard;
+    break;
+  case hipDeviceAttributeCooperativeLaunch:
+    *pi = prop.cooperativeLaunch;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+    *pi = prop.cooperativeMultiDeviceLaunch;
+    break;
+  case hipDeviceAttributeIntegrated:
+    *pi = prop.integrated;
+    break;
+  case hipDeviceAttributeMaxTexture1DWidth:
+    *pi = prop.maxTexture1D;
+    break;
+  case hipDeviceAttributeMaxTexture2DWidth:
+    *pi = prop.maxTexture2D[0];
+    break;
+  case hipDeviceAttributeMaxTexture2DHeight:
+    *pi = prop.maxTexture2D[1];
+    break;
+  case hipDeviceAttributeMaxTexture3DWidth:
+    *pi = prop.maxTexture3D[0];
+    break;
+  case hipDeviceAttributeMaxTexture3DHeight:
+    *pi = prop.maxTexture3D[1];
+    break;
+  case hipDeviceAttributeMaxTexture3DDepth:
+    *pi = prop.maxTexture3D[2];
+    break;
+  case hipDeviceAttributeHdpMemFlushCntl:
+    // Stores a pointer value through `pi`, which is reinterpreted as
+    // unsigned int** for this attribute.
+    // NOTE(review): the caller must therefore pass storage large enough for a
+    // pointer, not just an int -- confirm against the public API contract.
+    *reinterpret_cast<unsigned int**>(pi) = prop.hdpMemFlushCntl;
+    break;
+  case hipDeviceAttributeHdpRegFlushCntl:
+    // Same pointer-through-`pi` convention as hipDeviceAttributeHdpMemFlushCntl.
+    *reinterpret_cast<unsigned int**>(pi) = prop.hdpRegFlushCntl;
+    break;
+  case hipDeviceAttributeMaxPitch:
+    // size_t to int casting
+    *pi = std::min(prop.memPitch, int32_max);
+    break;
+  case hipDeviceAttributeTextureAlignment:
+    *pi = prop.textureAlignment;
+    break;
+  case hipDeviceAttributeTexturePitchAlignment:
+    *pi = prop.texturePitchAlignment;
+    break;
+  case hipDeviceAttributeKernelExecTimeout:
+    *pi = prop.kernelExecTimeoutEnabled;
+    break;
+  case hipDeviceAttributeCanMapHostMemory:
+    *pi = prop.canMapHostMemory;
+    break;
+  case hipDeviceAttributeEccEnabled:
+    *pi = prop.ECCEnabled;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc:
+    *pi = prop.cooperativeMultiDeviceUnmatchedFunc;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim:
+    *pi = prop.cooperativeMultiDeviceUnmatchedGridDim;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim:
+    *pi = prop.cooperativeMultiDeviceUnmatchedBlockDim;
+    break;
+  case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
+    *pi = prop.cooperativeMultiDeviceUnmatchedSharedMem;
+    break;
+  case hipDeviceAttributeAsicRevision:
+    *pi = prop.asicRevision;
+    break;
+  case hipDeviceAttributeManagedMemory:
+    *pi = prop.managedMemory;
+    break;
+  case hipDeviceAttributeDirectManagedMemAccessFromHost:
+    *pi = prop.directManagedMemAccessFromHost;
+    break;
+  case hipDeviceAttributeConcurrentManagedAccess:
+    *pi = prop.concurrentManagedAccess;
+    break;
+  case hipDeviceAttributePageableMemoryAccess:
+    *pi = prop.pageableMemoryAccess;
+    break;
+  case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
+    *pi = prop.pageableMemoryAccessUsesHostPageTables;
+    break;
+  case hipDeviceAttributeUnifiedAddressing:
+    // HIP runtime always uses SVM for host memory allocations.
+    // Note: Host registered memory isn't covered by this feature
+    // and still requires hipMemHostGetDevicePointer() call
+    *pi = true;
+    break;
+  case hipDeviceAttributeCanUseStreamWaitValue:
+    // hipStreamWaitValue64() and hipStreamWaitValue32() support
+    *pi = g_devices[device]->devices()[0]->info().aqlBarrierValue_;
+    break;
+  case hipDeviceAttributeImageSupport:
+    *pi = static_cast<int>(g_devices[device]->devices()[0]->info().imageSupport_);
+    break;
+  case hipDeviceAttributePhysicalMultiProcessorCount:
+    *pi = g_devices[device]->devices()[0]->info().maxPhysicalComputeUnits_;
+    break;
+  case hipDeviceAttributeFineGrainSupport:
+    *pi = static_cast<int>(g_devices[device]->devices()[0]->isFineGrainSupported());
+    break;
+  case hipDeviceAttributeMemoryPoolsSupported:
+    *pi = HIP_MEM_POOL_SUPPORT;
+    break;
+  case hipDeviceAttributeVirtualMemoryManagementSupported:
+    *pi = static_cast<int>(g_devices[device]->devices()[0]->info().virtualMemoryManagement_);
+    break;
+  default:
+    // Attribute not handled above; `*pi` is left unmodified.
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Finds the device whose PCI domain, bus and device ids match the ones encoded
+// in `pciBusIdstr` (parsed as three colon-separated hexadecimal fields) and
+// stores its index in `*device`.
+//
+// Returns hipErrorInvalidValue when a pointer is null, the string cannot be
+// parsed, or no device matches; errors from the device count / device get /
+// property queries are forwarded; hipSuccess on a match.
+hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusIdstr) {
+  HIP_INIT_API(hipDeviceGetByPCIBusId, device, pciBusIdstr);
+
+  if (device == nullptr || pciBusIdstr == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  int pciBusID = -1;
+  int pciDeviceID = -1;
+  int pciDomainID = -1;
+
+  // All three fields must be parsed successfully.
+  const int parsedFields =
+      sscanf (pciBusIdstr, "%04x:%02x:%02x", reinterpret_cast<unsigned int*>(&pciDomainID),
+              reinterpret_cast<unsigned int*>(&pciBusID),
+              reinterpret_cast<unsigned int*>(&pciDeviceID));
+  if (parsedFields != 0x3) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  int count = 0;
+  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));
+
+  for (cl_int index = 0; index < count; index++) {
+    hipDevice_t dev;
+    hipDeviceProp_t prop;
+    HIP_RETURN_ONFAIL(ihipDeviceGet(&dev, index));
+    HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, dev));
+
+    const bool sameBus = (pciBusID == prop.pciBusID);
+    const bool sameDomain = (pciDomainID == prop.pciDomainID);
+    const bool sameDevice = (pciDeviceID == prop.pciDeviceID);
+    if (sameBus && sameDomain && sameDevice) {
+      // First match wins.
+      *device = index;
+      HIP_RETURN(hipSuccess);
+    }
+  }
+
+  // No device carries the requested PCI identifiers.
+  HIP_RETURN(hipErrorInvalidValue);
+}
+
+// Stores a value-initialized hipFuncCache_t in `*cacheConfig`.
+// Returns hipErrorInvalidValue for a null pointer, hipSuccess otherwise.
+hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig) {
+  HIP_INIT_API(hipDeviceGetCacheConfig, cacheConfig);
+
+  if (cacheConfig == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const hipFuncCache_t defaultConfig = hipFuncCache_t();
+  *cacheConfig = defaultConfig;
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Reports the current value of `limit` through `*pValue`.
+//
+// hipLimitMallocHeapSize yields the total global memory of the current device;
+// hipLimitStackSize yields the StackSize() of the current device.
+//
+// Returns hipErrorInvalidValue for a null `pValue` or a `limit` at or beyond
+// hipLimitRange, hipErrorUnsupportedLimit for any other limit that is not
+// handled, errors from the property query, and hipSuccess otherwise.
+hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
+  HIP_INIT_API(hipDeviceGetLimit, pValue, limit);
+
+  if (pValue == nullptr || limit >= hipLimitRange) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (limit == hipLimitMallocHeapSize) {
+    hipDeviceProp_t prop;
+    HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, ihipGetDevice()));
+    *pValue = prop.totalGlobalMem;
+  } else if (limit == hipLimitStackSize) {
+    *pValue = hip::getCurrentDevice()->devices()[0]->StackSize();
+  } else {
+    LogPrintfError("UnsupportedLimit = %d is passed", limit);
+    HIP_RETURN(hipErrorUnsupportedLimit);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Writes the PCI identifier of `device` into `pciBusId` using the format
+// "%04x:%02x:%02x.0" (domain, bus, device).
+//
+// Returns the error from ihipDeviceGetCount() if counting fails,
+// hipErrorInvalidDevice for an out-of-range `device`, hipErrorInvalidValue for
+// a null buffer or a `len` below 13, errors from the property query, and
+// hipSuccess otherwise.
+hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) {
+  HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device);
+
+  int count;
+  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));
+
+  const bool validDevice = (device >= 0) && (device < count);
+  if (!validDevice) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  // The buffer has to hold 13 characters including the NULL-terminator.
+  if (pciBusId == nullptr || len < 13) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceProp_t prop;
+  HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
+  snprintf(pciBusId, len, "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID,
+           prop.pciDeviceID);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Always reports hipSharedMemBankSizeFourByte through `*pConfig`.
+// Returns hipErrorInvalidValue for a null pointer, hipSuccess otherwise.
+hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig) {
+  HIP_INIT_API(hipDeviceGetSharedMemConfig, pConfig);
+
+  if (pConfig == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const hipSharedMemConfig reported = hipSharedMemBankSizeFourByte;
+  *pConfig = reported;
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Calls Reset() on the current device and returns hipSuccess.
+hipError_t hipDeviceReset(void) {
+  HIP_INIT_API(hipDeviceReset);
+
+  hip::Device* current = hip::getCurrentDevice();
+  current->Reset();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Setting the cache configuration is not implemented: the request is ignored
+// and hipErrorNotSupported is returned.
+hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) {
+  HIP_INIT_API(hipDeviceSetCacheConfig, cacheConfig);
+
+  // There is currently no way to set the cache config.
+  const hipError_t status = hipErrorNotSupported;
+  HIP_RETURN(status);
+}
+
+// Sets `limit` to `value` on the current device.
+//
+// hipLimitStackSize is forwarded to UpdateStackSize() and
+// hipLimitMallocHeapSize to UpdateInitialHeapSize() of the current device.
+//
+// Returns hipErrorInvalidValue when `limit` is at or beyond hipLimitRange or
+// the device rejects the value, hipErrorUnsupportedLimit for any other limit
+// that is not handled, and hipSuccess otherwise.
+hipError_t hipDeviceSetLimit(hipLimit_t limit, size_t value) {
+  HIP_INIT_API(hipDeviceSetLimit, limit, value);
+
+  if (limit >= hipLimitRange) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (limit == hipLimitStackSize) {
+    // need to query device size and take action
+    const bool accepted = hip::getCurrentDevice()->devices()[0]->UpdateStackSize(value);
+    if (!accepted) {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+  } else if (limit == hipLimitMallocHeapSize) {
+    const bool accepted = hip::getCurrentDevice()->devices()[0]->UpdateInitialHeapSize(value);
+    if (!accepted) {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+  } else {
+    LogPrintfError("UnsupportedLimit = %d is passed", limit);
+    HIP_RETURN(hipErrorUnsupportedLimit);
+  }
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Setting the shared memory configuration is not implemented: the request is
+// ignored and hipErrorNotSupported is returned.
+hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {
+  HIP_INIT_API(hipDeviceSetSharedMemConfig, config);
+
+  // There is currently no way to set the shared memory config.
+  const hipError_t status = hipErrorNotSupported;
+  HIP_RETURN(status);
+}
+
+// Finishes the null stream and then syncs the non-blocking streams of the
+// current device.
+//
+// Returns hipErrorOutOfMemory when no null stream is available,
+// hipErrorStreamCaptureUnsupported while a stream capture is ongoing, and
+// hipSuccess otherwise.
+hipError_t hipDeviceSynchronize(void) {
+  HIP_INIT_API(hipDeviceSynchronize);
+
+  // The null stream is obtained before the capture check, as the order of
+  // these two steps determines which error is reported.
+  hip::Stream* nullStream = hip::getNullStream();
+  if (nullStream == nullptr) {
+    HIP_RETURN(hipErrorOutOfMemory);
+  }
+
+  const bool capturing = (hip::Stream::StreamCaptureOngoing() == true);
+  if (capturing) {
+    HIP_RETURN(hipErrorStreamCaptureUnsupported);
+  }
+
+  nullStream->finish();
+
+  hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId());
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Returns the id of the current device, or -1 when there is no current device.
+int ihipGetDevice() {
+  hip::Device* current = hip::getCurrentDevice();
+  return (current == nullptr) ? -1 : current->deviceId();
+}
+
+// Stores the id of the current device in `*deviceId`.
+// Returns hipErrorInvalidValue for a null pointer, hipErrorNoDevice when there
+// is no current device, and hipSuccess otherwise.
+hipError_t hipGetDevice(int* deviceId) {
+  HIP_INIT_API(hipGetDevice, deviceId);
+
+  if (deviceId == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  const int current = ihipGetDevice();
+  if (current == -1) {
+    HIP_RETURN(hipErrorNoDevice);
+  }
+
+  *deviceId = current;
+  HIP_RETURN(hipSuccess);
+}
+
+// Public entry point: runs the no-return variant of the API prologue, then
+// delegates to ihipDeviceGetCount().
+hipError_t hipGetDeviceCount(int* count) {
+  HIP_INIT_API_NO_RETURN(hipGetDeviceCount, count);
+
+  const hipError_t status = ihipDeviceGetCount(count);
+  HIP_RETURN(status);
+}
+
+// Stores the flags of the current device in `*flags`.
+// Returns hipErrorInvalidValue for a null pointer, hipSuccess otherwise.
+hipError_t hipGetDeviceFlags(unsigned int* flags) {
+  HIP_INIT_API(hipGetDeviceFlags, flags);
+
+  if (flags == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hip::Device* current = hip::getCurrentDevice();
+  *flags = current->getFlags();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Makes `device` the current device.
+// Returns hipErrorInvalidDevice when `device` is not an index into g_devices
+// (negative values are rejected through the unsigned conversion), hipSuccess
+// otherwise.
+hipError_t hipSetDevice(int device) {
+  HIP_INIT_API(hipSetDevice, device);
+
+  const bool outOfRange = !(static_cast<unsigned int>(device) < g_devices.size());
+  if (outOfRange) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  hip::setCurrentDevice(device);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Applies scheduling `flags` to the current device.
+//
+// The scheduling part of `flags` must be exactly one of auto, spin, yield or
+// blocking sync, and no bit outside the supported set may be present;
+// otherwise hipErrorInvalidValue is returned. The selected policy is turned
+// into an active-wait setting on the underlying device, and the scheduling
+// bits are stored as the device flags.
+hipError_t hipSetDeviceFlags(unsigned int flags) {
+  HIP_INIT_API(hipSetDeviceFlags, flags);
+
+  constexpr uint32_t supportedFlags =
+      hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
+  constexpr uint32_t mutualExclusiveFlags =
+      hipDeviceScheduleSpin | hipDeviceScheduleYield | hipDeviceScheduleBlockingSync;
+  // Only one scheduling flag allowed a time
+  uint32_t scheduleFlag = flags & hipDeviceScheduleMask;
+
+  const auto exclusiveBits = scheduleFlag & mutualExclusiveFlags;
+  const bool singlePolicy = (exclusiveBits == hipDeviceScheduleSpin) ||
+                            (exclusiveBits == hipDeviceScheduleYield) ||
+                            (exclusiveBits == hipDeviceScheduleBlockingSync) ||
+                            (exclusiveBits == hipDeviceScheduleAuto);
+  if (!singlePolicy) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if ((flags & ~supportedFlags) != 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  amd::Device* device = hip::getCurrentDevice()->devices()[0];
+  switch (scheduleFlag) {
+    case hipDeviceScheduleAuto:
+      // Current behavior is different from the spec, due to MT usage in runtime:
+      // wait actively only while there are fewer devices than hardware threads.
+      device->SetActiveWait(hip::host_context->devices().size() <
+                            std::thread::hardware_concurrency());
+      break;
+    case hipDeviceScheduleSpin:
+    case hipDeviceScheduleYield:
+      // Both options are handled the same way, because of MT usage in runtime
+      device->SetActiveWait(true);
+      break;
+    case hipDeviceScheduleBlockingSync:
+      device->SetActiveWait(false);
+      break;
+    default:
+      break;
+  }
+  hip::getCurrentDevice()->setFlags(flags & hipDeviceScheduleMask);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Not implemented: asserts in builds with assertions enabled and otherwise
+// returns hipErrorNotSupported. Both arguments are only passed to the API
+// prologue.
+hipError_t hipSetValidDevices(int* device_arr, int len) {
+  HIP_INIT_API(hipSetValidDevices, device_arr, len);
+
+  assert(0 && "Unimplemented");
+
+  const hipError_t status = hipErrorNotSupported;
+  HIP_RETURN(status);
+}
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Print the command-line synopsis and the supported options to stdout.
+# Always returns 0.
+printUsage() {
+  cat <<EOF
+
+Usage: $(basename "$0") HIP_BUILD_INC_DIR HIP_INC_DIR HIP_AMD_INC_DIR LLVM_DIR [option] [RTC_LIB_OUTPUT]
+
+Options:
+  -p,  --generate_pch  Generate pre-compiled header (default)
+  -r,  --generate_rtc  Generate preprocessor expansion (hiprtc_header.o)
+  -h,  --help          Prints this help
+
+
+EOF
+  return 0
+}
+
+# Without any argument there is nothing to do: show the usage text and leave.
+if [ "$1" == "" ]; then
+  printUsage
+  exit 0
+fi
+
+# Arguments 1-4 are the include directories and the LLVM installation root.
+HIP_BUILD_INC_DIR="$1"
+HIP_INC_DIR="$2"
+HIP_AMD_INC_DIR="$3"
+LLVM_DIR="$4"
+# By default, generate pch
+TARGET="generatepch"
+
+# Argument 5 optionally selects what to generate. Every branch either exits or
+# breaks out of the loop, so at most one option is consumed.
+while [ "$5" != "" ];
+do
+  case "$5" in
+    -h | --help )
+        printUsage ; exit 0 ;;
+    -p | --generate_pch )
+        TARGET="generatepch" ; break ;;
+    -r | --generate_rtc )
+        TARGET="generatertc" ; break ;;
+    *)
+        # Report the option that was actually rejected (argument 5).
+        echo " UNEXPECTED ERROR Parm : [$5] ">&2 ; exit 20 ;;
+  esac
+  shift 1
+done
+
+# Allow hiprtc lib name to be set by argument 6
+if [[ "$6" != "" ]]; then
+  rtc_shared_lib_out="$6"
+else
+  # NOTE(review): msys is treated as Windows below but still gets the .so
+  # default name here - confirm whether that is intended.
+  if [[ "$OSTYPE" == cygwin ]]; then
+    rtc_shared_lib_out=hiprtc-builtins64.dll
+  else
+    rtc_shared_lib_out=libhiprtc-builtins.so
+  fi
+fi
+
+# Windows shells keep temporary files in the current directory, others in /tmp.
+if [[ "$OSTYPE" == cygwin || "$OSTYPE" == msys ]]; then
+  isWindows=1
+  tmpdir=.
+else
+  isWindows=0
+  tmpdir=/tmp
+fi
+
+# Write the HIP qualifier and __launch_bounds__ macro definitions to a file.
+# Expected first argument $1 to be output file name; an existing file is
+# overwritten. The name is quoted so that paths containing whitespace work.
+create_hip_macro_file() {
+cat >"$1" <<EOF
+#define __device__ __attribute__((device))
+#define __host__ __attribute__((host))
+#define __global__ __attribute__((global))
+#define __constant__ __attribute__((constant))
+#define __shared__ __attribute__((shared))
+
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
+                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...)                                                                     \
+    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
+
+EOF
+}
+
+generate_pch() {
+  tmp=$tmpdir/hip_pch.$$
+  mkdir -p $tmp
+
+  create_hip_macro_file $tmp/hip_macros.h
+
+cat >$tmp/hip_pch.h <<EOF
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+EOF
+
+cat >$tmp/hip_pch.mcin <<EOF
+  .type __hip_pch_wave32,@object
+  .section .hip_pch_wave32,"aMS",@progbits,1
+  .data
+  .globl __hip_pch_wave32
+  .globl __hip_pch_wave32_size
+  .p2align 3
+__hip_pch_wave32:
+  .incbin "$tmp/hip_wave32.pch"
+__hip_pch_wave32_size:
+  .long __hip_pch_wave32_size - __hip_pch_wave32
+  .type __hip_pch_wave64,@object
+  .section .hip_pch_wave64,"aMS",@progbits,1
+  .data
+  .globl __hip_pch_wave64
+  .globl __hip_pch_wave64_size
+  .p2align 3
+__hip_pch_wave64:
+  .incbin "$tmp/hip_wave64.pch"
+__hip_pch_wave64_size:
+  .long __hip_pch_wave64_size - __hip_pch_wave64
+EOF
+
+  set -x
+
+  # For gfx10/Navi devices
+  $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui &&
+
+  cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui &&
+
+  $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
+
+  # For other devices
+  $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui &&
+
+  cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui &&
+
+  $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
+
+  $LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
+
+  rm -rf $tmp
+}
+
+generate_rtc_header() {
+  tmp=$tmpdir/hip_rtc.$$
+  mkdir -p $tmp
+  local macroFile="$tmp/hip_macros.h"
+  local headerFile="$tmp/hipRTC_header.h"
+  local mcinFile="$tmp/hipRTC_header.mcin"
+
+  create_hip_macro_file $macroFile
+
+cat >$headerFile <<EOF
+#pragma push_macro("CHAR_BIT")
+#pragma push_macro("INT_MAX")
+#define CHAR_BIT __CHAR_BIT__
+#define INT_MAX __INTMAX_MAX__
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+
+#pragma pop_macro("CHAR_BIT")
+#pragma pop_macro("INT_MAX")
+EOF
+
+  echo "// Automatically generated script for HIP RTC." > $mcinFile
+  if [[ $isWindows -eq 0 ]]; then
+    echo "  .type __hipRTC_header,@object" >> $mcinFile
+    echo "  .type __hipRTC_header_size,@object" >> $mcinFile
+  fi
+cat >>$mcinFile <<EOF
+  .section .hipRTC_header,"a"
+  .globl __hipRTC_header
+  .globl __hipRTC_header_size
+  .p2align 3
+__hipRTC_header:
+  .incbin "$tmp/hiprtc"
+__hipRTC_header_size:
+  .long __hipRTC_header_size - __hipRTC_header
+EOF
+
+  set -x
+  $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc &&
+  cat $macroFile >> $tmp/hiprtc &&
+  $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj &&
+  $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared &&
+  $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib -nogpuinc -emit-llvm -c -o $tmp/tmp.bc --cuda-device-only -D__HIPCC_RTC__ --offload-arch=gfx906 -x hip-cpp-output $tmp/hiprtc &&
+  rm -rf $tmp
+}
+
+# Run the generator selected during argument parsing. The script defines no
+# `die` helper, so an unknown target is reported on stderr and the script
+# exits with a failure status explicitly.
+case $TARGET in
+    (generatertc) generate_rtc_header ;;
+    (generatepch) generate_pch ;;
+    (*) echo "Invalid target $TARGET" >&2 ; exit 1 ;;
+esac
+
@@ -0,0 +1,382 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+
+#include "hip_internal.hpp"
+
+hipError_t hipGetLastError()
+{
+  HIP_INIT_API(hipGetLastError);
+  hipError_t err = hip::tls.last_error_;
+  hip::tls.last_error_ = hipSuccess;
+  return err;
+}
+
+hipError_t hipPeekAtLastError()
+{
+  HIP_INIT_API(hipPeekAtLastError);
+  hipError_t err = hip::tls.last_error_;
+  HIP_RETURN(err);
+}
+
+const char *ihipGetErrorName(hipError_t hip_error)
+{
+  switch (hip_error) {
+    case hipSuccess:
+        return "hipSuccess";
+    case hipErrorInvalidValue:
+        return "hipErrorInvalidValue";
+    case hipErrorOutOfMemory:
+        return "hipErrorOutOfMemory";
+    case hipErrorNotInitialized:
+        return "hipErrorNotInitialized";
+    case hipErrorDeinitialized:
+        return "hipErrorDeinitialized";
+    case hipErrorProfilerDisabled:
+        return "hipErrorProfilerDisabled";
+    case hipErrorProfilerNotInitialized:
+        return "hipErrorProfilerNotInitialized";
+    case hipErrorProfilerAlreadyStarted:
+        return "hipErrorProfilerAlreadyStarted";
+    case hipErrorProfilerAlreadyStopped:
+        return "hipErrorProfilerAlreadyStopped";
+    case hipErrorInvalidConfiguration:
+        return "hipErrorInvalidConfiguration";
+    case hipErrorInvalidSymbol:
+        return "hipErrorInvalidSymbol";
+    case hipErrorInvalidDevicePointer:
+        return "hipErrorInvalidDevicePointer";
+    case hipErrorInvalidMemcpyDirection:
+        return "hipErrorInvalidMemcpyDirection";
+    case hipErrorInsufficientDriver:
+        return "hipErrorInsufficientDriver";
+    case hipErrorMissingConfiguration:
+        return "hipErrorMissingConfiguration";
+    case hipErrorPriorLaunchFailure:
+        return "hipErrorPriorLaunchFailure";
+    case hipErrorInvalidDeviceFunction:
+        return "hipErrorInvalidDeviceFunction";
+    case hipErrorNoDevice:
+        return "hipErrorNoDevice";
+    case hipErrorInvalidDevice:
+        return "hipErrorInvalidDevice";
+    case hipErrorInvalidPitchValue:
+        return "hipErrorInvalidPitchValue";
+    case hipErrorInvalidImage:
+        return "hipErrorInvalidImage";
+    case hipErrorInvalidContext:
+        return "hipErrorInvalidContext";
+    case hipErrorContextAlreadyCurrent:
+        return "hipErrorContextAlreadyCurrent";
+    case hipErrorMapFailed:
+        return "hipErrorMapFailed";
+    case hipErrorUnmapFailed:
+        return "hipErrorUnmapFailed";
+    case hipErrorArrayIsMapped:
+        return "hipErrorArrayIsMapped";
+    case hipErrorAlreadyMapped:
+        return "hipErrorAlreadyMapped";
+    case hipErrorNoBinaryForGpu:
+        return "hipErrorNoBinaryForGpu";
+    case hipErrorAlreadyAcquired:
+        return "hipErrorAlreadyAcquired";
+    case hipErrorNotMapped:
+        return "hipErrorNotMapped";
+    case hipErrorNotMappedAsArray:
+        return "hipErrorNotMappedAsArray";
+    case hipErrorNotMappedAsPointer:
+        return "hipErrorNotMappedAsPointer";
+    case hipErrorECCNotCorrectable:
+        return "hipErrorECCNotCorrectable";
+    case hipErrorUnsupportedLimit:
+        return "hipErrorUnsupportedLimit";
+    case hipErrorContextAlreadyInUse:
+        return "hipErrorContextAlreadyInUse";
+    case hipErrorPeerAccessUnsupported:
+        return "hipErrorPeerAccessUnsupported";
+    case hipErrorInvalidKernelFile:
+        return "hipErrorInvalidKernelFile";
+    case hipErrorInvalidGraphicsContext:
+        return "hipErrorInvalidGraphicsContext";
+    case hipErrorInvalidSource:
+        return "hipErrorInvalidSource";
+    case hipErrorFileNotFound:
+        return "hipErrorFileNotFound";
+    case hipErrorSharedObjectSymbolNotFound:
+        return "hipErrorSharedObjectSymbolNotFound";
+    case hipErrorSharedObjectInitFailed:
+        return "hipErrorSharedObjectInitFailed";
+    case hipErrorOperatingSystem:
+        return "hipErrorOperatingSystem";
+    case hipErrorInvalidHandle:
+        return "hipErrorInvalidHandle";
+    case hipErrorIllegalState:
+        return "hipErrorIllegalState";
+    case hipErrorNotFound:
+        return "hipErrorNotFound";
+    case hipErrorNotReady:
+        return "hipErrorNotReady";
+    case hipErrorIllegalAddress:
+        return "hipErrorIllegalAddress";
+    case hipErrorLaunchOutOfResources:
+        return "hipErrorLaunchOutOfResources";
+    case hipErrorLaunchTimeOut:
+        return "hipErrorLaunchTimeOut";
+    case hipErrorPeerAccessAlreadyEnabled:
+        return "hipErrorPeerAccessAlreadyEnabled";
+    case hipErrorPeerAccessNotEnabled:
+        return "hipErrorPeerAccessNotEnabled";
+    case hipErrorSetOnActiveProcess:
+        return "hipErrorSetOnActiveProcess";
+    case hipErrorContextIsDestroyed:
+        return "hipErrorContextIsDestroyed";
+    case hipErrorAssert:
+        return "hipErrorAssert";
+    case hipErrorHostMemoryAlreadyRegistered:
+        return "hipErrorHostMemoryAlreadyRegistered";
+    case hipErrorHostMemoryNotRegistered:
+        return "hipErrorHostMemoryNotRegistered";
+    case hipErrorLaunchFailure:
+        return "hipErrorLaunchFailure";
+    case hipErrorNotSupported:
+        return "hipErrorNotSupported";
+    case hipErrorUnknown:
+        return "hipErrorUnknown";
+    case hipErrorRuntimeMemory:
+        return "hipErrorRuntimeMemory";
+    case hipErrorRuntimeOther:
+        return "hipErrorRuntimeOther";
+    case hipErrorCooperativeLaunchTooLarge:
+        return "hipErrorCooperativeLaunchTooLarge";
+    case hipErrorStreamCaptureUnsupported:
+        return "hipErrorStreamCaptureUnsupported";
+    case hipErrorStreamCaptureInvalidated:
+        return "hipErrorStreamCaptureInvalidated";
+    case hipErrorStreamCaptureMerge:
+        return "hipErrorStreamCaptureMerge";
+    case hipErrorStreamCaptureUnmatched:
+        return "hipErrorStreamCaptureUnmatched";
+    case hipErrorStreamCaptureUnjoined:
+        return "hipErrorStreamCaptureUnjoined";
+    case hipErrorStreamCaptureIsolation:
+        return "hipErrorStreamCaptureIsolation";
+    case hipErrorStreamCaptureImplicit:
+        return "hipErrorStreamCaptureImplicit";
+    case hipErrorCapturedEvent:
+        return "hipErrorCapturedEvent";
+    case hipErrorStreamCaptureWrongThread:
+        return "hipErrorStreamCaptureWrongThread";
+    case hipErrorGraphExecUpdateFailure:
+        return "hipErrorGraphExecUpdateFailure";
+    case hipErrorTbd:
+        return "hipErrorTbd";
+    default:
+        return "hipErrorUnknown";
+    };
+}
+
+// Maps a hipError_t value to a human-readable description. hipErrorUnknown and every value
+// without a dedicated case yield "unknown error".
+const char *ihipGetErrorString(hipError_t hip_error) {
+  switch (hip_error) {
+    case hipSuccess: return "no error";
+    case hipErrorInvalidValue: return "invalid argument";
+    case hipErrorOutOfMemory: return "out of memory";
+    case hipErrorNotInitialized: return "initialization error";
+    case hipErrorDeinitialized: return "driver shutting down";
+    case hipErrorProfilerDisabled: return "profiler disabled while using external profiling tool";
+    case hipErrorProfilerNotInitialized: return "profiler is not initialized";
+    case hipErrorProfilerAlreadyStarted: return "profiler already started";
+    case hipErrorProfilerAlreadyStopped: return "profiler already stopped";
+    case hipErrorInvalidConfiguration: return "invalid configuration argument";
+    case hipErrorInvalidPitchValue: return "invalid pitch argument";
+    case hipErrorInvalidSymbol: return "invalid device symbol";
+    case hipErrorInvalidDevicePointer: return "invalid device pointer";
+    case hipErrorInvalidMemcpyDirection: return "invalid copy direction for memcpy";
+    case hipErrorInsufficientDriver: return "driver version is insufficient for runtime version";
+    case hipErrorMissingConfiguration: return "__global__ function call is not configured";
+    case hipErrorPriorLaunchFailure: return "unspecified launch failure in prior launch";
+    case hipErrorInvalidDeviceFunction: return "invalid device function";
+    case hipErrorNoDevice: return "no ROCm-capable device is detected";
+    case hipErrorInvalidDevice: return "invalid device ordinal";
+    case hipErrorInvalidImage: return "device kernel image is invalid";
+    case hipErrorInvalidContext: return "invalid device context";
+    case hipErrorContextAlreadyCurrent: return "context is already current context";
+    case hipErrorMapFailed: return "mapping of buffer object failed";
+    case hipErrorUnmapFailed: return "unmapping of buffer object failed";
+    case hipErrorArrayIsMapped: return "array is mapped";
+    case hipErrorAlreadyMapped: return "resource already mapped";
+    case hipErrorNoBinaryForGpu: return "no kernel image is available for execution on the device";
+    case hipErrorAlreadyAcquired: return "resource already acquired";
+    case hipErrorNotMapped: return "resource not mapped";
+    case hipErrorNotMappedAsArray: return "resource not mapped as array";
+    case hipErrorNotMappedAsPointer: return "resource not mapped as pointer";
+    case hipErrorECCNotCorrectable: return "uncorrectable ECC error encountered";
+    case hipErrorUnsupportedLimit: return "limit is not supported on this architecture";
+    case hipErrorContextAlreadyInUse: return "exclusive-thread device already in use by a different thread";
+    case hipErrorPeerAccessUnsupported: return "peer access is not supported between these two devices";
+    case hipErrorInvalidKernelFile: return "invalid kernel file";
+    case hipErrorInvalidGraphicsContext: return "invalid OpenGL or DirectX context";
+    case hipErrorInvalidSource: return "device kernel image is invalid";
+    case hipErrorFileNotFound: return "file not found";
+    case hipErrorSharedObjectSymbolNotFound: return "shared object symbol not found";
+    case hipErrorSharedObjectInitFailed: return "shared object initialization failed";
+    case hipErrorOperatingSystem: return "OS call failed or operation not supported on this OS";
+    case hipErrorInvalidHandle: return "invalid resource handle";
+    case hipErrorIllegalState: return "the operation cannot be performed in the present state";
+    case hipErrorNotFound: return "named symbol not found";
+    case hipErrorNotReady: return "device not ready";
+    case hipErrorIllegalAddress: return "an illegal memory access was encountered";
+    case hipErrorLaunchOutOfResources: return "too many resources requested for launch";
+    case hipErrorLaunchTimeOut: return "the launch timed out and was terminated";
+    case hipErrorPeerAccessAlreadyEnabled: return "peer access is already enabled";
+    case hipErrorPeerAccessNotEnabled: return "peer access has not been enabled";
+    case hipErrorSetOnActiveProcess: return "cannot set while device is active in this process";
+    case hipErrorContextIsDestroyed: return "context is destroyed";
+    case hipErrorAssert: return "device-side assert triggered";
+    case hipErrorHostMemoryAlreadyRegistered: return "part or all of the requested memory range is already mapped";
+    case hipErrorHostMemoryNotRegistered: return "pointer does not correspond to a registered memory region";
+    case hipErrorLaunchFailure: return "unspecified launch failure";
+    case hipErrorCooperativeLaunchTooLarge: return "too many blocks in cooperative launch";
+    case hipErrorNotSupported: return "operation not supported";
+    case hipErrorStreamCaptureUnsupported: return "operation not permitted when stream is capturing";
+    case hipErrorStreamCaptureInvalidated: return "operation failed due to a previous error during capture";
+    case hipErrorStreamCaptureMerge: return "operation would result in a merge of separate capture sequences";
+    case hipErrorStreamCaptureUnmatched: return "capture was not ended in the same stream as it began";
+    case hipErrorStreamCaptureUnjoined: return "capturing stream has unjoined work";
+    case hipErrorStreamCaptureIsolation: return "dependency created on uncaptured work in another stream";
+    case hipErrorStreamCaptureImplicit: return "operation would make the legacy stream depend on a capturing blocking stream";
+    case hipErrorCapturedEvent: return "operation not permitted on an event last recorded in a capturing stream";
+    case hipErrorStreamCaptureWrongThread: return "attempt to terminate a thread-local capture sequence from another thread";
+    case hipErrorGraphExecUpdateFailure: return "the graph update was not performed because it included changes which violated constraints specific to instantiated graph update";
+    case hipErrorRuntimeMemory: return "runtime memory call returned error";
+    case hipErrorRuntimeOther: return "runtime call other than memory returned error";
+    case hipErrorUnknown:
+    default:
+      return "unknown error";
+  }
+}
+
+// Public entry point; forwards to ihipGetErrorName().
+const char* hipGetErrorName(hipError_t hip_error)
+{
+  const char* name = ihipGetErrorName(hip_error);
+  return name;
+}
+
+// Public entry point; forwards to ihipGetErrorString().
+const char *hipGetErrorString(hipError_t hip_error)
+{
+  const char* description = ihipGetErrorString(hip_error);
+  return description;
+}
+
+// Stores the enumerator name of `hip_error` in `*errStr`.
+//
+// Returns hipErrorInvalidValue when `errStr` is null, or when `hip_error` is not
+// hipErrorUnknown yet ihipGetErrorName() produced the "hipErrorUnknown" fallback (in that
+// case `*errStr` has still been written). Returns hipSuccess otherwise.
+hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr)
+{
+  if (errStr == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  *errStr = ihipGetErrorName(hip_error);
+
+  const bool isUnrecognized =
+      (hip_error != hipErrorUnknown) && (strcmp(*errStr, "hipErrorUnknown") == 0);
+  if (isUnrecognized) {
+    return hipErrorInvalidValue;
+  }
+  return hipSuccess;
+}
+
+// Stores the description of `hip_error` in `*errStr`.
+//
+// Returns hipErrorInvalidValue when `errStr` is null, or when `hip_error` is not
+// hipErrorUnknown yet ihipGetErrorString() produced the "unknown error" fallback (in that
+// case `*errStr` has still been written). Returns hipSuccess otherwise.
+hipError_t hipDrvGetErrorString(hipError_t hip_error, const char** errStr)
+{
+  if (errStr == nullptr) {
+    return hipErrorInvalidValue;
+  }
+
+  *errStr = ihipGetErrorString(hip_error);
+
+  const bool isUnrecognized =
+      (hip_error != hipErrorUnknown) && (strcmp(*errStr, "unknown error") == 0);
+  if (isUnrecognized) {
+    return hipErrorInvalidValue;
+  }
+  return hipSuccess;
+}
@@ -0,0 +1,429 @@
+/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+
+#include "hip_event.hpp"
+#if !defined(_MSC_VER)
+#include <unistd.h>
+#endif
+
+namespace hip {
+
+static amd::Monitor eventSetLock{"Guards global event set"};
+static std::unordered_set<hipEvent_t> eventSet;
+
+// Reports whether the recorded event has completed. When the event's status is not yet
+// CL_COMPLETE its command queue is notified before the check.
+bool Event::ready(eventType type) {
+  const bool stillPending = (event_->status() != CL_COMPLETE);
+  if (stillPending) {
+    event_->notifyCmdQueue();
+  }
+  // Check HW status of the ROCclr event first and fall back to the event status.
+  // Note: not all ROCclr modes support HW status
+  return CheckHwEvent(type) || (event_->status() == CL_COMPLETE);
+}
+
+// Reports whether the recorded event has completed, without notifying the command queue.
+bool EventDD::ready(eventType type) {
+  // Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
+  if (CheckHwEvent(type)) {
+    return true;
+  }
+  // FIXME: Remove status check entirely
+  return event_->status() == CL_COMPLETE;
+}
+
+// Non-blocking completion query: hipSuccess when nothing has been recorded or the recorded
+// event is ready, hipErrorNotReady otherwise.
+hipError_t Event::query() {
+  amd::ScopedLock lock(lock_);
+
+  // If event is not recorded, event_ is null and the query succeeds right away
+  const bool finished = (event_ == nullptr) || ready(Query);
+  if (finished) {
+    return hipSuccess;
+  }
+  return hipErrorNotReady;
+}
+
+// Waits for the recorded event while holding the event lock.
+//
+// Returns hipSuccess when nothing has been recorded or once waiting has finished. A failure
+// reported by recordCommand() is now propagated instead of being ignored before the command
+// pointer is dereferenced.
+hipError_t Event::synchronize() {
+  amd::ScopedLock lock(lock_);
+
+  // If event is not recorded, event_ is null, hence return hipSuccess
+  if (event_ == nullptr) {
+    return hipSuccess;
+  }
+
+  // Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
+  static constexpr bool kWaitCompletion = true;
+  if (!g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, kWaitCompletion)) {
+    if (event_->HwEvent() != nullptr) {
+      // Submit a fresh record command on the event's queue and wait for that one instead.
+      amd::Command* command = nullptr;
+      const hipError_t status = recordCommand(command, event_->command().queue(), flags);
+      if (status != hipSuccess) {
+        return status;
+      }
+      command->enqueue();
+      g_devices[deviceId()]->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion);
+      command->release();
+    } else {
+      event_->awaitCompletion();
+    }
+  }
+
+  return hipSuccess;
+}
+
+// Waits on the recorded event via awaitCompletion() and returns its result.
+// event_ is dereferenced without a null check, so it must be non-null here.
+bool Event::awaitEventCompletion() {
+  return event_->awaitCompletion();
+}
+
+// Asks the first device of this event's HIP device whether the HW event is ready, passing
+// `true` for the wait flag (the same argument Event::synchronize() names kWaitCompletion).
+// event_ is dereferenced without a null check, so it must be non-null here.
+bool EventDD::awaitEventCompletion() {
+  return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, true);
+}
+
+// Computes the time between this (start) event and `eStop`, stored in `ms` after dividing
+// the timestamp difference by 1000000.
+//
+// Returns hipErrorInvalidHandle when either event has nothing recorded or was created with
+// hipEventDisableTiming, hipErrorNotReady when either event has not completed yet, and
+// hipSuccess otherwise. On the error paths of the two-event case `ms` is left untouched.
+hipError_t Event::elapsedTime(Event& eStop, float& ms) {
+  amd::ScopedLock startLock(lock_);
+  // Same event object passed twice: the elapsed time is zero by definition. This case is
+  // handled before eStop's lock is taken, because that lock is the one already held.
+  if (this == &eStop) {
+    ms = 0.f;
+    if (event_ == nullptr) {
+      return hipErrorInvalidHandle;
+    }
+
+    if (flags & hipEventDisableTiming) {
+      return hipErrorInvalidHandle;
+    }
+
+    if (!ready(ElapsedTime)) {
+      return hipErrorNotReady;
+    }
+
+    return hipSuccess;
+  }
+  // NOTE(review): the locks are taken in start -> stop order; two threads measuring the same
+  // pair in opposite directions would acquire them in opposite order - confirm this cannot
+  // happen or is tolerated by amd::Monitor.
+  amd::ScopedLock stopLock(eStop.lock());
+
+  if (event_ == nullptr || eStop.event() == nullptr) {
+    return hipErrorInvalidHandle;
+  }
+
+  if ((flags | eStop.flags) & hipEventDisableTiming) {
+    return hipErrorInvalidHandle;
+  }
+
+  if (!ready(ElapsedTime) || !eStop.ready(ElapsedTime)) {
+    return hipErrorNotReady;
+  }
+
+  if (event_ == eStop.event_) {
+    // Events are the same, which indicates the stream is empty and likely
+    // eventRecord is called on another stream. For such cases insert and measure a
+    // marker.
+    amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush);
+    command->enqueue();
+    command->awaitCompletion();
+    // Marker end timestamp minus this event's end timestamp.
+    ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) - time(false)) /
+        1000000.f;
+    command->release();
+  } else {
+    // Note: with direct dispatch eStop.ready() relies on HW event, but CPU status can be delayed.
+    // Hence for now make sure CPU status is updated by calling awaitCompletion();
+    awaitEventCompletion();
+    eStop.awaitEventCompletion();
+    if (unrecorded_ && eStop.isUnRecorded()) {
+      // Both the events are not recorded, just need the end and start of stop event
+      ms = static_cast<float>(eStop.time(false) - eStop.time(true)) / 1000000.f;
+    } else {
+      // Regular case: end of the stop event minus end of the start event.
+      ms = static_cast<float>(eStop.time(false) - time(false)) / 1000000.f;
+    }
+  }
+  return hipSuccess;
+}
+
+// Returns the start (getStartTs == true) or end timestamp taken from the recorded event's
+// profiling info. event_ must be non-null.
+int64_t Event::time(bool getStartTs) const {
+  assert(event_ != nullptr);
+  return getStartTs ? static_cast<int64_t>(event_->profilingInfo().start_)
+                    : static_cast<int64_t>(event_->profilingInfo().end_);
+}
+
+// Returns the start (getStartTs == true) or end timestamp reported by getHwEventTime() for
+// the recorded event. When either reported value is zero the result of Event::time() is
+// returned instead. event_ must be non-null.
+int64_t EventDD::time(bool getStartTs) const {
+  assert(event_ != nullptr);
+
+  uint64_t hwStart = 0;
+  uint64_t hwEnd = 0;
+  g_devices[deviceId()]->devices()[0]->getHwEventTime(*event_, &hwStart, &hwEnd);
+
+  // FIXME: This is only needed if the command had to wait CL_COMPLETE status
+  const bool hwTimesValid = (hwStart != 0) && (hwEnd != 0);
+  if (!hwTimesValid) {
+    return Event::time(getStartTs);
+  }
+  return static_cast<int64_t>(getStartTs ? hwStart : hwEnd);
+}
+
+// Creates a marker command on `stream` and returns it through `command`. The marker's wait
+// list contains the recorded event when there is one and is empty otherwise.
+// Returns hipErrorOutOfMemory when the resulting pointer is null, hipSuccess otherwise.
+hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
+  amd::Command::EventWaitList waitList;
+  if (event_ != nullptr) {
+    waitList.push_back(event_);
+  }
+
+  command = new amd::Marker(*stream, kMarkerDisableFlush, waitList);
+  return (command == NULL) ? hipErrorOutOfMemory : hipSuccess;
+}
+
+// Enqueues the given wait command and returns hipSuccess. `stream` is not used by this
+// implementation; `command` must be non-null.
+hipError_t Event::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command) {
+  command->enqueue();
+  return hipSuccess;
+}
+
+// Makes `stream` wait for this event. The `flags` parameter is not used.
+//
+// Returns hipSuccess without enqueuing anything when nothing has been recorded, when the
+// event was recorded on the very same stream, or when it is already complete. Returns
+// hipErrorLaunchOutOfResources when notifying the event's command queue fails; otherwise the
+// status of creating/enqueuing the wait command.
+hipError_t Event::streamWait(hipStream_t stream, uint flags) {
+  hip::Stream* waitingStream = hip::getStream(stream);
+  // Access to event_ object must be lock protected
+  amd::ScopedLock lock(lock_);
+
+  if (event_ == nullptr) {
+    return hipSuccess;
+  }
+  if (event_->command().queue() == waitingStream) {
+    return hipSuccess;
+  }
+  if (ready(StreamWait)) {
+    return hipSuccess;
+  }
+  if (!event_->notifyCmdQueue()) {
+    return hipErrorLaunchOutOfResources;
+  }
+
+  amd::Command* waitCommand = nullptr;
+  hipError_t status = streamWaitCommand(waitCommand, waitingStream);
+  if (status == hipSuccess) {
+    status = enqueueStreamWaitCommand(stream, waitCommand);
+    if (status == hipSuccess) {
+      // The command is only released here after a successful enqueue.
+      waitCommand->release();
+    }
+  }
+  return status;
+}
+
+// Creates the EventMarker used to record this event on `stream` unless the caller already
+// supplied a command (non-null `command`), in which case nothing is done.
+//
+// `ext_flags`, when non-zero, replaces the event's own flags for choosing the cache state:
+// kCacheStateIgnore if hipEventDisableSystemFence is set, kCacheStateInvalid otherwise.
+// Always returns hipSuccess.
+hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
+                                uint32_t ext_flags ) {
+  if (command != nullptr) {
+    return hipSuccess;
+  }
+
+  const auto effectiveFlags = (ext_flags == 0) ? flags : ext_flags;
+  const bool skipSystemFence = (effectiveFlags & hipEventDisableSystemFence) != 0;
+
+  int32_t cacheState = amd::Device::kCacheStateInvalid;
+  if (skipSystemFence) {
+    cacheState = amd::Device::kCacheStateIgnore;
+  }
+
+  // Always submit a EventMarker.
+  command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, cacheState);
+  return hipSuccess;
+}
+
+// Enqueues `command` and makes its event the one tracked by this object. A previously
+// tracked, different event is released first, and unrecorded_ is set to !record. When the
+// command's event is already the tracked one, nothing besides the enqueue happens.
+// `stream` is not used by this implementation. Always returns hipSuccess.
+hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
+  command->enqueue();
+
+  auto* recordedEvent = &command->event();
+  if (event_ != recordedEvent) {
+    if (event_ != nullptr) {
+      event_->release();
+    }
+    event_ = recordedEvent;
+    unrecorded_ = !record;
+  }
+
+  return hipSuccess;
+}
+
+// Records this event on `stream`: obtains the record command (created by recordCommand()
+// when `command` is null) and enqueues it under the event lock.
+//
+// Returns the status of recordCommand() when it fails - previously such a failure was
+// reported as hipSuccess - and the status of enqueueRecordCommand() otherwise.
+hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
+  amd::ScopedLock lock(lock_);
+  hipError_t status = recordCommand(command, hip_stream);
+  if (status != hipSuccess) {
+    return status;
+  }
+  status = enqueueRecordCommand(stream, command, record);
+  return status;
+}
+
+// ================================================================================================
+// Returns true for a null handle and for handles currently present in the global event set
+// (looked up under eventSetLock); false for any other handle.
+bool isValid(hipEvent_t event) {
+  // NULL event is always valid
+  if (event == nullptr) {
+    return true;
+  }
+
+  amd::ScopedLock lock(eventSetLock);
+  const bool isRegistered = (eventSet.find(event) != eventSet.end());
+  return isRegistered;
+}
+
+}  // namespace hip
+// ================================================================================================
+// Validates `flags`, allocates the matching event object and registers it in hip::eventSet.
+//
+// Rejected with hipErrorInvalidValue:
+//   - any bit outside the supported set,
+//   - more than one of the release flags,
+//   - hipEventInterprocess without hipEventDisableTiming.
+// On success *event receives the new handle and hipSuccess is returned.
+hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
+  const unsigned releaseMask =
+      hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence;
+  const unsigned supportedMask = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
+                                 hipEventInterprocess | releaseMask;
+
+  // can't set any unsupported flags.
+  if ((flags & ~supportedMask) != 0) {
+    return hipErrorInvalidValue;
+  }
+  // can set only one of the release flags: clearing the lowest set bit must leave nothing.
+  const unsigned requestedRelease = flags & releaseMask;
+  if ((requestedRelease & (requestedRelease - 1)) != 0) {
+    return hipErrorInvalidValue;
+  }
+  // if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
+  if ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming)) {
+    return hipErrorInvalidValue;
+  }
+
+  hip::Event* created = nullptr;
+  if (flags & hipEventInterprocess) {
+    created = new hip::IPCEvent();
+  } else if (AMD_DIRECT_DISPATCH) {
+    created = new hip::EventDD(flags);
+  } else {
+    created = new hip::Event(flags);
+  }
+  if (created == nullptr) {
+    return hipErrorOutOfMemory;
+  }
+
+  *event = reinterpret_cast<hipEvent_t>(created);
+  amd::ScopedLock lock(hip::eventSetLock);
+  hip::eventSet.insert(*event);
+  return hipSuccess;
+}
+
+// Public entry point: creates an event with the given flags.
+// Returns hipErrorInvalidValue when `event` is null, otherwise the status of
+// ihipEventCreateWithFlags().
+hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
+  HIP_INIT_API(hipEventCreateWithFlags, event, flags);
+
+  if (event == nullptr) {
+    // Leave through HIP_RETURN like every other exit of the public event APIs in this file.
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(ihipEventCreateWithFlags(event, flags), *event);
+}
+
+// Public entry point: creates an event with default (0) flags.
+// Returns hipErrorInvalidValue when `event` is null, otherwise the status of
+// ihipEventCreateWithFlags().
+hipError_t hipEventCreate(hipEvent_t* event) {
+  HIP_INIT_API(hipEventCreate, event);
+
+  if (event == nullptr) {
+    // Leave through HIP_RETURN like every other exit of the public event APIs in this file.
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  HIP_RETURN(ihipEventCreateWithFlags(event, 0), *event);
+}
+
+// Public entry point: removes `event` from hip::eventSet and deletes it.
+//
+// Returns hipErrorInvalidHandle for a null handle and hipErrorContextIsDestroyed when the
+// handle is not present in hip::eventSet; hipSuccess otherwise.
+hipError_t hipEventDestroy(hipEvent_t event) {
+  HIP_INIT_API(hipEventDestroy, event);
+
+  if (event == nullptr) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+
+  amd::ScopedLock lock(hip::eventSetLock);
+  if (hip::eventSet.erase(event) == 0) {
+    // Leave through HIP_RETURN like the other exits of this function.
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  hip::Event* e = reinterpret_cast<hip::Event*>(event);
+  // Detach the event from the stream that captured it before it goes away.
+  if (e->GetCaptureStream() != nullptr) {
+    reinterpret_cast<hip::Stream*>(e->GetCaptureStream())->EraseCaptureEvent(event);
+  }
+  delete e;
+  HIP_RETURN(hipSuccess);
+}
+
+// Public entry point: writes into *ms the result of start->elapsedTime(stop).
+//
+// Returns hipErrorInvalidValue when `ms` is null, and hipErrorInvalidHandle when either event
+// is null or the two events report different device ids.
+hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
+  HIP_INIT_API(hipEventElapsedTime, ms, start, stop);
+
+  if (ms == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hip::Event* const first = reinterpret_cast<hip::Event*>(start);
+  hip::Event* const last = reinterpret_cast<hip::Event*>(stop);
+
+  // Null check short-circuits before either event is dereferenced.
+  const bool missingHandle = (first == nullptr) || (last == nullptr);
+  if (missingHandle || first->deviceId() != last->deviceId()) {
+    HIP_RETURN(hipErrorInvalidHandle);
+  }
+
+  HIP_RETURN(first->elapsedTime(*last, *ms), "Elapsed Time = ", *ms);
+}
+
+// Shared implementation of hipEventRecord / hipEventRecord_spt.
+// After the STREAM_CAPTURE hook, rejects a null event and an event whose device differs from
+// the stream's device (both hipErrorInvalidHandle), then records via Event::addMarker().
+hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
+  STREAM_CAPTURE(hipEventRecord, stream, event);
+
+  if (event == nullptr) {
+    return hipErrorInvalidHandle;
+  }
+
+  hip::Event* const target = reinterpret_cast<hip::Event*>(event);
+  hip::Stream* const queue = hip::getStream(stream);
+  const bool sameDevice = (g_devices[target->deviceId()]->devices()[0] == &queue->device());
+  if (!sameDevice) {
+    return hipErrorInvalidHandle;
+  }
+
+  return target->addMarker(stream, nullptr, true);
+}
+
+// Public entry point: records `event` on `stream` by delegating to hipEventRecord_common().
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
+  HIP_INIT_API(hipEventRecord, event, stream);
+  HIP_RETURN(hipEventRecord_common(event, stream));
+}
+
+// Variant of hipEventRecord that applies PER_THREAD_DEFAULT_STREAM to `stream` before
+// delegating to hipEventRecord_common().
+// NOTE(review): the API is initialised under the hipEventRecord id, not a separate _spt id —
+// presumably intentional; confirm.
+hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream) {
+  HIP_INIT_API(hipEventRecord, event, stream);
+  PER_THREAD_DEFAULT_STREAM(stream);
+  HIP_RETURN(hipEventRecord_common(event, stream));
+}
+
+// Public entry point: returns the result of Event::synchronize() for `event`,
+// or hipErrorInvalidHandle when the handle is null.
+hipError_t hipEventSynchronize(hipEvent_t event) {
+  HIP_INIT_API(hipEventSynchronize, event);
+
+  if (event != nullptr) {
+    HIP_RETURN(reinterpret_cast<hip::Event*>(event)->synchronize());
+  }
+
+  HIP_RETURN(hipErrorInvalidHandle);
+}
+
+// Internal query helper: hipErrorInvalidHandle for a null handle, otherwise Event::query().
+hipError_t ihipEventQuery(hipEvent_t event) {
+  return (event == nullptr) ? hipErrorInvalidHandle
+                            : reinterpret_cast<hip::Event*>(event)->query();
+}
+
+// Public entry point: wraps ihipEventQuery() with the standard API init/return macros.
+hipError_t hipEventQuery(hipEvent_t event) {
+  HIP_INIT_API(hipEventQuery, event);
+  HIP_RETURN(ihipEventQuery(event));
+}
@@ -0,0 +1,253 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#ifndef HIP_EVENT_H
+#define HIP_EVENT_H
+
+#include "hip_internal.hpp"
+#include "thread/monitor.hpp"
+
+// Internal structure for stream callback handler.
+// Abstract base: stores the opaque user pointer and exposes a single callback() hook that
+// derived classes implement.
+class StreamCallback {
+protected:
+  void* userData_;  // opaque pointer handed back to the user callback
+ public:
+  StreamCallback(void* userData)
+      : userData_(userData) {}
+
+  // Instances are handled through StreamCallback*; a virtual destructor makes deleting a
+  // derived object through that base pointer well-defined.
+  virtual ~StreamCallback() = default;
+
+  virtual void CL_CALLBACK callback() = 0;
+};
+
+// Callback wrapper for a hipStreamCallback_t: invokes it with the stored stream, a hipSuccess
+// status and the user data.
+class StreamAddCallback : public StreamCallback {
+  hipStreamCallback_t callBack_;
+  hipStream_t stream_;
+
+ public:
+  StreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData)
+      : StreamCallback(userData), callBack_(callback), stream_(stream) {}
+
+  void CL_CALLBACK callback() {
+    const hipError_t reported = hipSuccess;
+    callBack_(stream_, reported, userData_);
+  }
+};
+
+// Callback wrapper for a hipHostFn_t: invokes it with the stored user data only.
+class LaunchHostFuncCallback : public StreamCallback {
+  hipHostFn_t callBack_;
+
+ public:
+  LaunchHostFuncCallback(hipHostFn_t callback, void* userData)
+      : StreamCallback(userData), callBack_(callback) {}
+
+  void CL_CALLBACK callback() {
+    callBack_(userData_);
+  }
+};
+
+void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status, void* user_data);
+
+namespace hip {
+
+// Number of signal slots in the shared-memory block of one IPC event; slots are indexed
+// modulo this value.
+#define IPC_SIGNALS_PER_EVENT 32
+// Layout of the shared-memory block backing an interprocess event.
+typedef struct ihipIpcEventShmem_s {
+  std::atomic<int> owners;             // count of IPCEvent objects attached to this block
+  std::atomic<int> owners_device_id;   // device id written by the recording/exporting side
+  std::atomic<int> owners_process_id;  // process id written by IPCEvent::GetHandle
+  std::atomic<int> read_index;         // index of the latest published record; starts at -1
+  std::atomic<int> write_index;        // next record index to hand out; starts at 0
+  uint32_t signal[IPC_SIGNALS_PER_EVENT];  // per-record flag: 1 while pending, 0 once done
+} ihipIpcEventShmem_t;
+
+// amd::Marker specialisation used when recording HIP events: enables profiling info on the
+// command and applies the requested event scope.
+class EventMarker : public amd::Marker {
+ public:
+  // stream       - queue the marker is created for.
+  // disableFlush - forwarded unchanged to amd::Marker.
+  // markerTs     - stored in profilingInfo_.marker_ts_.
+  // scope        - value passed to setEventScope(); defaults to kCacheStateInvalid.
+  EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
+              int32_t scope = amd::Device::kCacheStateInvalid)
+      : amd::Marker(stream, disableFlush) {
+    profilingInfo_.enabled_ = true;
+    profilingInfo_.callback_ = nullptr;
+    profilingInfo_.marker_ts_ = markerTs;
+    // NOTE(review): clear() runs after the fields above are set — presumably it only resets
+    // timestamps and leaves them intact; confirm against amd::Command's profiling info.
+    profilingInfo_.clear();
+    setEventScope(scope);
+  }
+};
+
+// Kind of operation asking whether an event is ready; Event::CheckHwEvent uses the forced-wait
+// hardware check for Query and the plain check for the other values.
+enum eventType { Query, StreamWait, ElapsedTime };
+// Runtime object behind a hipEvent_t handle. Tracks the amd::Event of the last command bound
+// to it, the device it was created on, and stream-capture bookkeeping.
+class Event {
+  /// event recorded on stream where capture is active
+  bool onCapture_;
+  /// capture stream where event is recorded
+  hipStream_t captureStream_ = nullptr;
+  /// Previous captured nodes before event record
+  std::vector<hipGraphNode_t> nodesPrevToRecorded_;
+ protected:
+  // Asks the first device of this event's device entry whether the hardware event behind
+  // event_ is ready. Query uses the forced-wait variant; every other type the plain one.
+  // Dereferences event_, so it must be non-null when this is called.
+  bool CheckHwEvent(eventType type) {
+    bool ready;
+    if (type == Query) {
+      ready = g_devices[deviceId()]->devices()[0]->IsHwEventReadyForcedWait(*event_);
+    } else {
+      ready = g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_);
+    }
+    return ready;
+  }
+
+ public:
+  // Creates an event bound to the device that is current at construction time.
+  // NOTE(review): the initializer list is not in member declaration order (stream_ is declared
+  // before event_); members are still initialised in declaration order.
+  Event(unsigned int flags) : flags(flags), lock_("hipEvent_t", true),
+                              event_(nullptr), unrecorded_(false), stream_(nullptr) {
+    // No need to init event_ here as addMarker does that
+    onCapture_ = false;
+    device_id_ = hip::getCurrentDevice()->deviceId();  // Created in current device ctx
+  }
+
+  // Drops the reference held on the tracked amd::Event, if any.
+  virtual ~Event() {
+    if (event_ != nullptr) {
+      event_->release();
+    }
+  }
+  // hipEvent* creation flags supplied to the constructor.
+  unsigned int flags;
+
+  virtual hipError_t query();
+  virtual hipError_t synchronize();
+  hipError_t elapsedTime(Event& eStop, float& ms);
+
+  virtual hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
+  virtual hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
+  virtual hipError_t streamWait(hipStream_t stream, uint flags);
+
+  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
+                                   uint32_t flags = 0);
+  virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
+  hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
+
+  // Makes `command`'s amd::Event the one tracked by this event without enqueuing anything:
+  // releases the previously tracked event, stores !record in unrecorded_, and retains the
+  // command.
+  void BindCommand(amd::Command& command, bool record) {
+    amd::ScopedLock lock(lock_);
+    if (event_ != nullptr) {
+      event_->release();
+    }
+    event_ = &command.event();
+    unrecorded_ = !record;
+    command.retain();
+  }
+
+  bool isUnRecorded() const { return unrecorded_; }
+  amd::Monitor& lock() { return lock_; }
+  const int deviceId() const { return device_id_; }
+  void setDeviceId(int id) { device_id_ = id; }
+  amd::Event* event() { return event_; }
+
+  /// End capture on this event
+  void EndCapture() {
+    onCapture_ = false;
+    captureStream_ = nullptr;
+  }
+  /// Start capture when waited on this event
+  void StartCapture(hipStream_t stream) {
+    onCapture_ = true;
+    captureStream_ = stream;
+  }
+  /// Get capture status of the graph
+  bool GetCaptureStatus() const { return onCapture_; }
+  /// Get capture stream where event is recorded
+  hipStream_t GetCaptureStream() const { return captureStream_; }
+  /// Set capture stream where event is recorded
+  void SetCaptureStream(hipStream_t stream) { captureStream_ = stream; }
+  /// Returns previous captured nodes before event record
+  std::vector<hipGraphNode_t> GetNodesPrevToRecorded() const { return nodesPrevToRecorded_; }
+  /// Set last captured graph node before event record
+  void SetNodesPrevToRecorded(std::vector<hipGraphNode_t>& graphNode) {
+    nodesPrevToRecorded_ = graphNode;
+  }
+  // IPC handle export/import is not supported by the base event; IPCEvent overrides both.
+  virtual hipError_t GetHandle(ihipIpcEventHandle_t* handle) {
+    return hipErrorInvalidConfiguration;
+  }
+  virtual hipError_t OpenHandle(ihipIpcEventHandle_t* handle) {
+    return hipErrorInvalidConfiguration;
+  }
+  virtual bool awaitEventCompletion();
+  virtual bool ready(eventType type);
+  virtual int64_t time(bool getStartTs) const;
+
+ protected:
+  amd::Monitor lock_;    // guards event_/unrecorded_ updates (see BindCommand, addMarker)
+  hip::Stream* stream_;  // initialised to nullptr by the constructor
+  amd::Event* event_;    // amd event of the last bound command; nullptr until one is bound
+  int device_id_;        // id of the device that was current at construction
+  //! Flag to indicate hipEventRecord has not been called. This is needed for
+  //! hip*ModuleLaunchKernel API which takes start and stop events so no
+  //! hipEventRecord is called. Cleanup needed once those APIs are deprecated.
+  bool unrecorded_;
+};
+
+// Event flavour created when AMD_DIRECT_DISPATCH is enabled; supplies its own completion,
+// readiness and timestamp implementations.
+class EventDD : public Event {
+ public:
+  EventDD(unsigned int flags) : Event(flags) {}
+  ~EventDD() override {}
+
+  bool awaitEventCompletion() override;
+  bool ready(eventType type) override;
+  int64_t time(bool getStartTs) const override;
+};
+
+// Interprocess event: state is kept in a named shared-memory block (ihipIpcEventShmem_t) so
+// that another process can open the event by name.
+class IPCEvent : public Event {
+  // IPC Events
+  struct ihipIpcEvent_t {
+    std::string ipc_name_;            // name of the shared-memory object
+    int ipc_fd_;
+    ihipIpcEventShmem_t* ipc_shmem_;  // mapped block; nullptr until created or opened
+    ihipIpcEvent_t() : ipc_name_("dummy"), ipc_fd_(0), ipc_shmem_(nullptr) {}
+    void setipcname(const char* name) { ipc_name_ = std::string(name); }
+  };
+  ihipIpcEvent_t ipc_evt_;
+
+ public:
+  // When a shared block is attached: drops this object's ownership count, waits for the
+  // event, unregisters the signal array from the device and unmaps the block.
+  // NOTE(review): `owners` and `status` are computed but never inspected.
+  ~IPCEvent() {
+    if (ipc_evt_.ipc_shmem_) {
+      int owners = --ipc_evt_.ipc_shmem_->owners;
+      // Make sure event is synchronized
+      hipError_t status = synchronize();
+      status  = ihipHostUnregister(&ipc_evt_.ipc_shmem_->signal);
+      if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
+        // print hipErrorInvalidHandle;
+      }
+    }
+  }
+  // IPC events are always constructed with the hipEventInterprocess flag.
+  IPCEvent() : Event(hipEventInterprocess) {}
+  bool createIpcEventShmemIfNeeded();
+  hipError_t GetHandle(ihipIpcEventHandle_t* handle);
+  hipError_t OpenHandle(ihipIpcEventHandle_t* handle);
+  hipError_t synchronize();
+  hipError_t query();
+
+  hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
+  hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
+  hipError_t streamWait(hipStream_t stream, uint flags);
+
+  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
+  hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
+};
+
+};  // namespace hip
+
+// Payload handed to the stream-wait callback of an IPC event
+// (built in IPCEvent::enqueueStreamWaitCommand).
+struct CallbackData {
+  int previous_read_index;          // read_index value captured when the wait was enqueued
+  hip::ihipIpcEventShmem_t* shmem;  // shared block of the event being waited on
+};
+
+#endif  // HIP_EVENT_H
@@ -0,0 +1,250 @@
+/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include <hip/hip_runtime.h>
+
+#include "hip_event.hpp"
+#if !defined(_MSC_VER)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
+// ================================================================================================
+
+hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
+
+namespace hip {
+
+// Lazily creates and initialises the shared-memory block backing this IPC event.
+//
+// A unique name is derived from a temporary-file template; its first five characters are
+// replaced with "/hip_" to form the shared-memory name. On success the block is mapped,
+// owners is set to 1, read_index to -1, write_index to 0, all signals are cleared, and the
+// signal array is registered with ihipHostRegister().
+//
+// Returns true when the block already exists or was set up, false when mapping or
+// registration fails.
+bool IPCEvent::createIpcEventShmemIfNeeded() {
+  if (ipc_evt_.ipc_shmem_) {
+    // ipc_shmem_ already created, no need to create it again
+    return true;
+  }
+
+  char name_template[] = "/tmp/eventXXXXXX";
+#if !defined(_MSC_VER)
+  int temp_fd = mkstemp(name_template);
+#else
+  _mktemp_s(name_template, sizeof(name_template));
+#endif
+
+  ipc_evt_.ipc_name_ = name_template;
+  ipc_evt_.ipc_name_.replace(0, 5, "/hip_");
+  const bool mapped = amd::Os::MemoryMapFileTruncated(
+      ipc_evt_.ipc_name_.c_str(),
+      const_cast<const void**>(reinterpret_cast<void**>(&(ipc_evt_.ipc_shmem_))),
+      sizeof(hip::ihipIpcEventShmem_t));
+
+#if !defined(_MSC_VER)
+  // The descriptor was only needed to reserve a unique name; close it on every path so a
+  // failed mapping does not leak it.
+  if (temp_fd >= 0) {
+    close(temp_fd);
+  }
+#endif
+
+  if (!mapped) {
+    return false;
+  }
+
+  ipc_evt_.ipc_shmem_->owners = 1;
+  ipc_evt_.ipc_shmem_->read_index = -1;
+  ipc_evt_.ipc_shmem_->write_index = 0;
+  for (uint32_t sig_idx = 0; sig_idx < IPC_SIGNALS_PER_EVENT; ++sig_idx) {
+    ipc_evt_.ipc_shmem_->signal[sig_idx] = 0;
+  }
+
+  // device sets 0 to this ptr when the ipc event is completed
+  hipError_t status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
+                                       sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
+                                       0);
+  if (status != hipSuccess) {
+    return false;
+  }
+  return true;
+}
+
+// Non-blocking status check for an IPC event.
+//
+// Returns hipErrorNotReady while the signal slot of the most recently published record is
+// still non-zero, and hipSuccess otherwise — including when no shared block is attached or
+// nothing has been recorded yet (read_index still -1).
+hipError_t IPCEvent::query() {
+  if (ipc_evt_.ipc_shmem_) {
+    int prev_read_idx = ipc_evt_.ipc_shmem_->read_index;
+    // read_index starts at -1; without this guard the modulo below yields -1 and the signal
+    // array is read out of bounds. synchronize() applies the same guard.
+    if (prev_read_idx >= 0) {
+      int offset = (prev_read_idx % IPC_SIGNALS_PER_EVENT);
+      if (ipc_evt_.ipc_shmem_->read_index < prev_read_idx + IPC_SIGNALS_PER_EVENT &&
+          ipc_evt_.ipc_shmem_->signal[offset] != 0) {
+        return hipErrorNotReady;
+      }
+    }
+  }
+  return hipSuccess;
+}
+
+// Blocks (polling with amd::Os::sleep(1)) until the signal slot of the record that was
+// current on entry reads 0, or until read_index has advanced a full ring past it.
+// Returns hipSuccess in every case, including when nothing was ever recorded.
+hipError_t IPCEvent::synchronize() {
+  ihipIpcEventShmem_t* const shmem = ipc_evt_.ipc_shmem_;
+  if (shmem == nullptr) {
+    return hipSuccess;
+  }
+
+  const int startIdx = shmem->read_index;
+  if (startIdx < 0) {
+    return hipSuccess;
+  }
+
+  const int slot = startIdx % IPC_SIGNALS_PER_EVENT;
+  const int wrapLimit = startIdx + IPC_SIGNALS_PER_EVENT;
+  for (;;) {
+    if (!(shmem->read_index < wrapLimit)) break;
+    if (shmem->signal[slot] == 0) break;
+    amd::Os::sleep(1);
+  }
+  return hipSuccess;
+}
+
+// Creates the marker command used to make `stream` wait on this IPC event.
+// `command` receives the new amd::Marker; hipErrorOutOfMemory is reported if it is null.
+hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
+  command = new amd::Marker(*stream, false);
+  return (command != NULL) ? hipSuccess : hipErrorOutOfMemory;
+}
+
+// Attaches a completion callback (WaitThenDecrementSignal, fed a CallbackData snapshot of the
+// current read_index and shared block) to `command`, enqueues it and waits for completion.
+//
+// The reference on `command` is released on every path. Returns hipErrorInvalidHandle when
+// the callback cannot be attached, hipSuccess otherwise.
+// Dereferences ipc_evt_.ipc_shmem_, so a shared block must already be attached.
+hipError_t IPCEvent::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command) {
+  auto* cb_data = new CallbackData{ipc_evt_.ipc_shmem_->read_index, ipc_evt_.ipc_shmem_};
+  auto* add_cb = new StreamAddCallback(
+      stream, reinterpret_cast<hipStreamCallback_t>(WaitThenDecrementSignal), cb_data);
+  // The callback machinery receives the object as a StreamCallback*.
+  StreamCallback* cbo = add_cb;
+  if (!command->setCallback(CL_COMPLETE, ihipStreamCallback, cbo)) {
+    // The callback was never registered, so nothing else will free these objects.
+    delete add_cb;
+    delete cb_data;
+    command->release();
+    return hipErrorInvalidHandle;
+  }
+  command->enqueue();
+  // Wait before dropping our reference so the command cannot be destroyed underneath us.
+  command->awaitCompletion();
+  command->release();
+  return hipSuccess;
+}
+
+// Makes `stream` wait on this IPC event. When query() already reports completion nothing is
+// enqueued; otherwise a wait command is created and enqueued under the event lock.
+// `flags` is not used by this implementation.
+hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+
+  amd::ScopedLock lock(lock_);
+  if (query() == hipSuccess) {
+    return hipSuccess;
+  }
+
+  amd::Command* waitCmd;
+  const hipError_t created = streamWaitCommand(waitCmd, hip_stream);
+  return (created == hipSuccess) ? enqueueStreamWaitCommand(stream, waitCmd) : created;
+}
+
+// Builds the command used to record this IPC event.
+//
+// While isUnRecorded() is true a plain amd::Marker (created with kMarkerDisableFlush) is
+// stored in `command`; otherwise the work is delegated to Event::recordCommand().
+// `flags` is forwarded to the base implementation so per-call flags are not silently dropped.
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
+  bool unrecorded = isUnRecorded();
+  if (unrecorded) {
+    command = new amd::Marker(*stream, kMarkerDisableFlush);
+  } else {
+    return Event::recordCommand(command, stream, flags);
+  }
+  return hipSuccess;
+}
+
+// Enqueues the record command for this IPC event.
+//
+// While isUnRecorded() is true the record goes through the shared block: a signal slot is
+// claimed (waiting for it to become free), set to 1, the command is enqueued, a stream
+// write-value operation is queued to clear the slot, and read_index is advanced to publish
+// the record. Otherwise the base Event implementation is used.
+//
+// Returns hipErrorInvalidValue when the shared block cannot be created, the status of
+// ihipStreamOperation() when that fails, and hipSuccess otherwise.
+hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
+  bool unrecorded = isUnRecorded();
+  if (unrecorded) {
+    // The shared block is dereferenced below; bail out if it could not be set up.
+    if (!createIpcEventShmemIfNeeded()) {
+      return hipErrorInvalidValue;
+    }
+    int write_index = ipc_evt_.ipc_shmem_->write_index++;
+    int offset = write_index % IPC_SIGNALS_PER_EVENT;
+    // Wait for the slot to be released by the record that used it one ring earlier.
+    while (ipc_evt_.ipc_shmem_->signal[offset] != 0) {
+      amd::Os::sleep(1);
+    }
+    // Lock signal.
+    ipc_evt_.ipc_shmem_->signal[offset] = 1;
+    ipc_evt_.ipc_shmem_->owners_device_id = deviceId();
+    command->enqueue();
+
+    // device writes 0 to signal after the hipEventRecord command is completed
+    // the signal value is checked by WaitThenDecrementSignal cb
+    hipError_t status = ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE,
+                                 &(ipc_evt_.ipc_shmem_->signal[offset]),
+                                 0,
+                                 0, 0, sizeof(uint32_t));
+    if (status != hipSuccess) {
+      return status;
+    }
+
+    // Update read index to indicate new signal.
+    // NOTE(review): compare_exchange_weak overwrites `expected` with the observed value on
+    // failure, so after one failed attempt the exchange succeeds regardless of ordering —
+    // confirm whether strict ordering of publishers is intended.
+    int expected = write_index - 1;
+    while (!ipc_evt_.ipc_shmem_->read_index.compare_exchange_weak(expected, write_index)) {
+      amd::Os::sleep(1);
+    }
+  } else {
+    return Event::enqueueRecordCommand(stream, command, record);
+  }
+  return hipSuccess;
+}
+
+// Exports this event: makes sure the shared block exists, stamps it with the current device
+// and process ids, and writes the shared-memory name into `handle`.
+//
+// Returns hipErrorInvalidValue when the shared block cannot be created, hipSuccess otherwise.
+hipError_t IPCEvent::GetHandle(ihipIpcEventHandle_t* handle) {
+  if (!createIpcEventShmemIfNeeded()) {
+    return hipErrorInvalidValue;
+  }
+  ipc_evt_.ipc_shmem_->owners_device_id = deviceId();
+  ipc_evt_.ipc_shmem_->owners_process_id = amd::Os::getProcessId();
+  memset(handle->shmem_name, 0, HIP_IPC_HANDLE_SIZE);
+  // Copy at most HIP_IPC_HANDLE_SIZE - 1 characters so the zero-filled buffer always keeps
+  // its terminating NUL and is never overrun, whatever the stored name length.
+  ipc_evt_.ipc_name_.copy(handle->shmem_name, HIP_IPC_HANDLE_SIZE - 1);
+  return hipSuccess;
+}
+
+// Attaches this event to the shared block named in `handle`.
+//
+// Returns hipErrorInvalidValue when the block cannot be mapped, hipErrorInvalidContext when
+// the block was exported by this same process, and otherwise the status of registering the
+// signal array with ihipHostRegister().
+hipError_t IPCEvent::OpenHandle(ihipIpcEventHandle_t* handle) {
+  ipc_evt_.ipc_name_ = handle->shmem_name;
+  if (!amd::Os::MemoryMapFileTruncated(ipc_evt_.ipc_name_.c_str(),
+                                       (const void**)&(ipc_evt_.ipc_shmem_),
+                                       sizeof(ihipIpcEventShmem_t))) {
+    return hipErrorInvalidValue;
+  }
+
+  if (amd::Os::getProcessId() == ipc_evt_.ipc_shmem_->owners_process_id.load()) {
+    // If this is in the same process, return error.
+    // Detach first: this object never took ownership, so leaving ipc_shmem_ set would make
+    // the destructor decrement `owners` and unregister memory it never registered.
+    amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(ihipIpcEventShmem_t));
+    ipc_evt_.ipc_shmem_ = nullptr;
+    return hipErrorInvalidContext;
+  }
+
+  ipc_evt_.ipc_shmem_->owners += 1;
+  // device sets 0 to this ptr when the ipc event is completed
+  hipError_t status = hipSuccess;
+  status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
+                            sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
+                            0);
+  return status;
+}
+
+}  // namespace hip
+
+// ================================================================================================
+
+// Public entry point: exports `event` into `handle` via Event::GetHandle().
+// Returns hipErrorInvalidValue when either argument is null.
+hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {
+  HIP_INIT_API(hipIpcGetEventHandle, handle, event);
+
+  const bool badArgs = (handle == nullptr) || (event == nullptr);
+  if (badArgs) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  ihipIpcEventHandle_t* const ipcHandle = reinterpret_cast<ihipIpcEventHandle_t*>(handle);
+  HIP_RETURN(reinterpret_cast<hip::Event*>(event)->GetHandle(ipcHandle));
+}
+
+// Public entry point: creates a new event with hipEventDisableTiming | hipEventInterprocess
+// and attaches it to the shared block named in `handle` via Event::OpenHandle().
+//
+// Returns hipErrorInvalidValue when `event` is null, the creation status when creation
+// fails, and otherwise the status of OpenHandle().
+// NOTE(review): when OpenHandle() fails, the event created here stays stored in *event and
+// is not destroyed by this function.
+hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {
+  HIP_INIT_API(hipIpcOpenEventHandle, event, handle);
+
+  hipError_t hip_err = hipSuccess;
+  if (event == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  hip_err = ihipEventCreateWithFlags(event, hipEventDisableTiming | hipEventInterprocess);
+  if (hip_err != hipSuccess) {
+    HIP_RETURN(hip_err);
+  }
+  hip::Event* e = reinterpret_cast<hip::Event*>(*event);
+  // The public handle is reinterpreted as the runtime's internal handle layout.
+  ihipIpcEventHandle_t* iHandle = reinterpret_cast<ihipIpcEventHandle_t*>(&handle);
+  HIP_RETURN(e->OpenHandle(iHandle));
+}
@@ -0,0 +1,345 @@
+#include "hip_fatbin.hpp"
+
+#include <unordered_map>
+#include "hip_code_object.hpp"
+
+namespace hip {
+
+// Unloads and releases the per-device program, if one was created.
+FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
+  if (program_ == nullptr) {
+    return;
+  }
+  program_->unload();
+  program_->release();
+  program_ = nullptr;
+}
+
+// Stores the file name (empty when `fname` is null) and image pointer of a fat binary and
+// sizes the per-device table to g_devices.size() null entries. Nothing is opened or mapped
+// here.
+FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd::Os::FDescInit()),
+                             fsize_(0), foffset_(0), image_(image), image_mapped_(false),
+                             uri_(std::string()) {
+  fname_ = (fname != nullptr) ? std::string(fname) : std::string();
+
+  fatbin_dev_info_.resize(g_devices.size(), nullptr);
+}
+
+// Deletes every per-device entry, then — when a file descriptor is held — unmaps the image
+// (only if this object mapped it) and closes the descriptor. All fields are reset afterwards.
+FatBinaryInfo::~FatBinaryInfo() {
+  // Deleting a null entry is a no-op, so no per-entry check is needed.
+  for (FatBinaryDeviceInfo* dev_info : fatbin_dev_info_) {
+    delete dev_info;
+  }
+
+  if (fdesc_ > 0) {
+    const bool owns_mapping = fsize_ && image_mapped_;
+    if (owns_mapping && !amd::Os::MemoryUnmapFile(image_, fsize_)) {
+      guarantee(false, "Cannot unmap file");
+    }
+    const bool closed = amd::Os::CloseFileHandle(fdesc_);
+    if (!closed) {
+      guarantee(false, "Cannot close file");
+    }
+  }
+
+  fname_ = std::string();
+  fdesc_ = amd::Os::FDescInit();
+  fsize_ = 0;
+  image_ = nullptr;
+  uri_ = std::string();
+}
+
+// Locates, for every device in `devices`, the code object inside this fat binary and creates
+// the matching FatBinaryDeviceInfo / amd::Program entry, using COMGR to look up the bundle
+// entries.
+//
+// Input comes either from fname_ (the file is opened and, when no image pointer was given,
+// mapped) or from image_ (the backing file is looked up from the address when possible).
+// An image that is not a clang offload bundle is used as-is for every device.
+//
+// Returns hipSuccess on success; hipErrorInvalidValue, hipErrorFileNotFound or
+// hipErrorOutOfMemory on failure. On failure after the file was opened, the mapping and file
+// handle acquired here are released.
+hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices) {
+  amd_comgr_data_t data_object;
+  // Tracks whether data_object holds a live COMGR handle that must be released.
+  bool data_object_created = false;
+  amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS;
+  hipError_t hip_status = hipSuccess;
+  amd_comgr_code_object_info_t* query_list_array = nullptr;
+
+  // If image was passed as a pointer to our hipMod* api, we can try to extract the file name
+  // if it was mapped by the app. Otherwise use the COMGR data API.
+  if (fname_.size() == 0) {
+    if (image_ == nullptr) {
+      LogError("Both Filename and image cannot be null");
+      return hipErrorInvalidValue;
+    }
+
+    if(!amd::Os::FindFileNameFromAddress(image_, &fname_, &foffset_)) {
+      fname_ = std::string("");
+      foffset_ = 0;
+    }
+  }
+
+  // If file name & path are available (or it is passed to you), then get the file desc to use
+  // COMGR file slice APIs.
+  if (fname_.size() > 0) {
+    // Get File Handle & size of the file.
+    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_))
+      return hipErrorFileNotFound;
+
+    // If the file name exists but the file size is 0, then something is wrong with the file
+    // or its path.
+    if (fsize_ == 0)
+      return hipErrorInvalidValue;
+
+    // If image_ is nullptr, then file path is passed via hipMod* APIs, so map the file.
+    if (image_ == nullptr) {
+      if (!amd::Os::MemoryMapFileDesc(fdesc_, fsize_, foffset_, &image_)) {
+        LogError("Cannot map the file descriptor");
+        amd::Os::CloseFileHandle(fdesc_);
+        // Forget the closed handle so the destructor does not close it a second time.
+        fdesc_ = amd::Os::FDescInit();
+        fsize_ = 0;
+        return hipErrorInvalidValue;
+      }
+      // Remember that this object owns the mapping so it is unmapped later.
+      image_mapped_ = true;
+    }
+  }
+
+  // At this line, image should be a valid ptr.
+  guarantee(image_ != nullptr, "Image cannot be nullptr, file did not map for some reason");
+
+  do {
+
+    // If the image ptr is not clang offload bundle then just directly point the image.
+    if (!CodeObject::IsClangOffloadMagicBundle(image_)) {
+      for (size_t dev_idx=0; dev_idx < devices.size(); ++dev_idx) {
+        fatbin_dev_info_[devices[dev_idx]->deviceId()]
+          = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
+        fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
+          = new amd::Program(*devices[dev_idx]->asContext());
+        if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == nullptr) {
+          hip_status = hipErrorOutOfMemory;
+          break;
+        }
+      }
+      break;
+    }
+
+    // Create a data object, if it fails return error
+    if ((comgr_status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_FATBIN, &data_object))
+                        != AMD_COMGR_STATUS_SUCCESS) {
+      LogPrintfError("Creating data object failed with status %d ", comgr_status);
+      hip_status = hipErrorInvalidValue;
+      break;
+    }
+    data_object_created = true;
+
+#if !defined(_WIN32)
+    // Using the file descriptor and file size, map the data object.
+    if (fdesc_ > 0) {
+      guarantee(fsize_ > 0, "Cannot have a file size of 0");
+      if ((comgr_status = amd_comgr_set_data_from_file_slice(data_object, fdesc_, foffset_,
+                          fsize_)) != AMD_COMGR_STATUS_SUCCESS) {
+        LogPrintfError("Setting data from file slice failed with status %d ", comgr_status);
+        hip_status = hipErrorInvalidValue;
+        break;
+      }
+    } else
+#endif
+    if (image_ != nullptr) {
+      // Using the image ptr, map the data object.
+      // NOTE(review): only the first 4096 bytes are handed to COMGR — presumably enough to
+      // cover the bundle header; confirm.
+      if ((comgr_status = amd_comgr_set_data(data_object, 4096,
+                          reinterpret_cast<const char*>(image_))) != AMD_COMGR_STATUS_SUCCESS) {
+        LogPrintfError("Setting data from image failed with status %d ", comgr_status);
+        hip_status = hipErrorInvalidValue;
+        break;
+      }
+    } else {
+      guarantee(false, "Cannot have both fname_ and image_ as nullptr");
+    }
+
+    // Find the unique number of ISAs needed for this COMGR query.
+    // Maps ISA name -> (code object size, code object offset).
+    std::unordered_map<std::string, std::pair<size_t, size_t>> unique_isa_names;
+    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+      std::string device_name = devices[dev_idx]->devices()[0]->isa().isaName();
+      if (unique_isa_names.cend() == unique_isa_names.find(device_name)) {
+        unique_isa_names.insert({device_name, std::make_pair<size_t, size_t>(0,0)});
+      }
+    }
+
+    // Create a query list using COMGR info for unique ISAs: exactly one entry per map element.
+    query_list_array = new amd_comgr_code_object_info_t[unique_isa_names.size()];
+    size_t query_idx = 0;
+    for (const auto& isa_entry : unique_isa_names) {
+      query_list_array[query_idx].isa = isa_entry.first.c_str();
+      query_list_array[query_idx].size = 0;
+      query_list_array[query_idx].offset = 0;
+      ++query_idx;
+    }
+
+    // Look up the code object info passing the query list.
+    if ((comgr_status = amd_comgr_lookup_code_object(data_object, query_list_array,
+                        unique_isa_names.size())) != AMD_COMGR_STATUS_SUCCESS) {
+      LogPrintfError("Looking up code objects failed with status %d ", comgr_status);
+      hip_status = hipErrorInvalidValue;
+      break;
+    }
+
+    // Copy the sizes/offsets reported by COMGR back into the map.
+    for (size_t isa_idx = 0; isa_idx < unique_isa_names.size(); ++isa_idx) {
+      auto unique_it = unique_isa_names.find(query_list_array[isa_idx].isa);
+      guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa");
+      unique_it->second = std::pair<size_t, size_t>
+                            (static_cast<size_t>(query_list_array[isa_idx].size),
+                             static_cast<size_t>(query_list_array[isa_idx].offset));
+    }
+
+    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+      std::string device_name = devices[dev_idx]->devices()[0]->isa().isaName();
+      auto dev_it = unique_isa_names.find(device_name);
+      guarantee(unique_isa_names.cend() != dev_it,
+                "Cannot find the device name in the unique device name");
+      fatbin_dev_info_[devices[dev_idx]->deviceId()]
+        = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
+                                  + dev_it->second.second, dev_it->second.first,
+                                                           dev_it->second.second);
+      fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
+        = new amd::Program(*devices[dev_idx]->asContext());
+    }
+
+  } while(0);
+
+  // delete[] on a null pointer is a no-op.
+  delete[] query_list_array;
+
+  // Clean up file and memory resources if hip_status failed for some reason.
+  if (hip_status != hipSuccess && hip_status != hipErrorInvalidKernelFile) {
+    if (image_mapped_) {
+      if (!amd::Os::MemoryUnmapFile(image_, fsize_))
+        guarantee(false, "Cannot unmap the file");
+
+      image_ = nullptr;
+      image_mapped_ = false;
+    }
+
+    if (fdesc_ > 0) {
+      guarantee(fsize_ > 0, "Size has to greater than 0 too");
+      if (!amd::Os::CloseFileHandle(fdesc_))
+        guarantee(false, "Cannot close the file handle");
+
+      fdesc_ = 0;
+      fsize_ = 0;
+    }
+  }
+
+  // Release the COMGR handle whenever one was created. Only sizes and offsets were taken
+  // from it, so it is not needed after the lookup; it is never touched if creation failed
+  // or was skipped.
+  if (data_object_created) {
+    if ((comgr_status = amd_comgr_release_data(data_object)) != AMD_COMGR_STATUS_SUCCESS) {
+      LogPrintfError("Releasing COMGR data failed with status %d ", comgr_status);
+    }
+  }
+
+  return hip_status;
+}
+
+// Extracts the per-device code objects from this fat binary and creates an amd::Program for
+// each device in `devices`.
+//
+// Delegates to ExtractFatBinaryUsingCOMGR() unless HIP_USE_RUNTIME_UNBUNDLER is set. With the
+// runtime unbundler the code objects come from the file (fname_) or from memory (image_).
+// A hipErrorInvalidKernelFile result from extraction means the image is not an offload
+// bundle and is used directly for every device.
+//
+// Returns hipSuccess, or hipErrorFileNotFound / hipErrorInvalidImage / hipErrorInvalidValue /
+// hipErrorNoBinaryForGpu / hipErrorOutOfMemory / any other extraction error.
+hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devices) {
+  if (!HIP_USE_RUNTIME_UNBUNDLER) {
+    return ExtractFatBinaryUsingCOMGR(devices);
+  }
+
+  hipError_t hip_error = hipSuccess;
+  std::vector<std::pair<const void*, size_t>> code_objs;
+
+  // Copy device names for Extract Code object File
+  std::vector<std::string> device_names;
+  device_names.reserve(devices.size());
+  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+    device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName());
+  }
+
+  // We are given file name, get the file desc and file size
+  if (fname_.size() > 0) {
+    // Get File Handle & size of the file.
+    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
+      return hipErrorFileNotFound;
+    }
+    if (fsize_ == 0) {
+      return hipErrorInvalidImage;
+    }
+
+    // Extract the code object from file
+    hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_,
+                device_names, code_objs);
+
+  } else if (image_ != nullptr) {
+    // We are directly given image pointer directly, try to extract file desc & file Size
+    hip_error = CodeObject::ExtractCodeObjectFromMemory(image_,
+                device_names, code_objs, uri_);
+  } else {
+    return hipErrorInvalidValue;
+  }
+
+  if (hip_error == hipErrorNoBinaryForGpu) {
+    LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for current devices! - %d",hip_error);
+    return hip_error;
+  }
+
+  if (hip_error == hipErrorInvalidKernelFile) {
+    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+      // the image type is no CLANG_OFFLOAD_BUNDLER, image for current device directly passed
+      fatbin_dev_info_[devices[dev_idx]->deviceId()]
+        = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
+    }
+  } else if(hip_error == hipSuccess) {
+    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+      // Calculate the offset wrt binary_image and the original image
+      size_t offset_l
+        = (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
+            - reinterpret_cast<address>(const_cast<void*>(image_)));
+
+      fatbin_dev_info_[devices[dev_idx]->deviceId()]
+        = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
+    }
+  } else {
+    // Any other extraction failure leaves the per-device entries unset; report it instead of
+    // falling through to the loop below, which dereferences those entries.
+    return hip_error;
+  }
+
+  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
+    fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
+       = new amd::Program(*devices[dev_idx]->asContext());
+    if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
+      return hipErrorOutOfMemory;
+    }
+  }
+
+  return hipSuccess;
+}
+
+// Adds the device program for `device_id` to its amd::Program, once.
+//
+// Returns hipErrorInvalidKernelFile when no per-device entry exists or addDeviceProgram()
+// fails; hipSuccess when the program was added now or on an earlier call.
+hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
+  // Device Id bounds Check
+  DeviceIdCheck(device_id);
+
+  FatBinaryDeviceInfo* const dev_info = fatbin_dev_info_[device_id];
+  if (dev_info == nullptr) {
+    return hipErrorInvalidKernelFile;
+  }
+
+  // Already added on an earlier call: nothing left to do.
+  if (dev_info->add_dev_prog_ != false) {
+    return hipSuccess;
+  }
+
+  amd::Context* ctx = g_devices[device_id]->asContext();
+  const auto add_status = dev_info->program_->addDeviceProgram(*ctx->devices()[0],
+                                                               dev_info->binary_image_,
+                                                               dev_info->binary_size_, false,
+                                                               nullptr, nullptr, fdesc_,
+                                                               dev_info->binary_offset_, uri_);
+  if (CL_SUCCESS != add_status) {
+    return hipErrorInvalidKernelFile;
+  }
+
+  dev_info->add_dev_prog_ = true;
+  return hipSuccess;
+}
+
+// Makes sure the program for `device_id` is added, built (once) and loaded.
+//
+// Returns the AddDevProgram() failure if any, hipErrorSharedObjectInitFailed when the build
+// or the load fails, and hipSuccess otherwise. load() is invoked on every call.
+hipError_t FatBinaryInfo::BuildProgram(const int device_id) {
+
+  // Device Id Check and Add DeviceProgram if not added so far
+  DeviceIdCheck(device_id);
+  IHIP_RETURN_ONFAIL(AddDevProgram(device_id));
+
+  FatBinaryDeviceInfo* const dev_info = fatbin_dev_info_[device_id];
+
+  // Build only the first time through.
+  if (!dev_info->prog_built_) {
+    const auto build_status = dev_info->program_->build(g_devices[device_id]->devices(),
+                                                        nullptr, nullptr, nullptr,
+                                                        kOptionChangeable, kNewDevProg);
+    if (CL_SUCCESS != build_status) {
+      return hipErrorSharedObjectInitFailed;
+    }
+    dev_info->prog_built_ = true;
+  }
+
+  return dev_info->program_->load() ? hipSuccess : hipErrorSharedObjectInitFailed;
+}
+
+} //namespace : hip
@@ -0,0 +1,90 @@
+#ifndef HIP_FAT_BINARY_HPP
+#define HIP_FAT_BINARY_HPP
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include "hip_internal.hpp"
+#include "platform/program.hpp"
+
+namespace hip {
+
+// Fat Binary Per Device info
+//
+// Records where one device's binary lives (pointer, size, offset) together
+// with the amd::Program created for it and two flags that track progress.
+// All state is private; FatBinaryInfo reaches it through friendship.
+class FatBinaryDeviceInfo {
+public:
+  // Stores the binary location. The program pointer and both control flags
+  // start out as nullptr / false.
+  FatBinaryDeviceInfo(const void* binary_image, size_t binary_size, size_t binary_offset)
+      : binary_image_(binary_image),
+        binary_size_(binary_size),
+        binary_offset_(binary_offset) {}
+
+  ~FatBinaryDeviceInfo();
+
+private:
+  const void* binary_image_;  // binary image ptr
+  size_t binary_size_;        // binary image size
+  size_t binary_offset_;      // image offset from original
+
+  amd::Program* program_ = nullptr;  // reinterpreted as hipModule_t
+  friend class FatBinaryInfo;
+
+  // Control Variables
+  bool add_dev_prog_ = false;  // set once the binary was added to program_
+  bool prog_built_ = false;    // set once program_ was built
+};
+
+
+// Fat Binary Info
+//
+// Describes one fat binary, given either by file name or as an image pointer,
+// and keeps one FatBinaryDeviceInfo pointer per device in fatbin_dev_info_.
+class FatBinaryInfo {
+public:
+  FatBinaryInfo(const char* fname, const void* image);
+  ~FatBinaryInfo();
+
+  // Loads Fat binary from file or image, unbundles COs for devices.
+  hipError_t ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices);
+  hipError_t ExtractFatBinary(const std::vector<hip::Device*>& devices);
+  hipError_t AddDevProgram(const int device_id);
+  hipError_t BuildProgram(const int device_id);
+
+
+  // Device Id bounds check: device_id must be a valid index into
+  // fatbin_dev_info_, otherwise guarantee() fires.
+  inline void DeviceIdCheck(const int device_id) const {
+    guarantee(device_id >= 0, "Invalid DeviceId less than 0");
+    guarantee(static_cast<size_t>(device_id) < fatbin_dev_info_.size(),
+              "Invalid DeviceId, greater than no of fatbin device info!");
+  }
+
+  // Getter Methods
+
+  // Returns the amd::Program stored for device_id.
+  amd::Program* GetProgram(int device_id) {
+    DeviceIdCheck(device_id);
+    FatBinaryDeviceInfo* dev_info = fatbin_dev_info_[device_id];
+    return dev_info->program_;
+  }
+
+  // Returns the program stored for device_id, viewed as a hipModule_t.
+  hipModule_t Module(int device_id) const {
+    DeviceIdCheck(device_id);
+    amd::Program* program = fatbin_dev_info_[device_id]->program_;
+    return reinterpret_cast<hipModule_t>(as_cl(program));
+  }
+
+  // Same value as Module(), written through hmod; always returns hipSuccess.
+  // hmod is dereferenced without a null check.
+  hipError_t GetModule(int device_id, hipModule_t* hmod) const {
+    *hmod = Module(device_id);
+    return hipSuccess;
+  }
+
+private:
+  std::string fname_;        // File name
+  amd::Os::FileDesc fdesc_;  // File descriptor
+  size_t fsize_;             // Total file size
+  size_t foffset_;           // File Offset where the fat binary is present.
+
+  // Even when file is passed image will be mmapped till ~destructor.
+  const void* image_;        // Image
+  bool image_mapped_;        // flag to detect if image is mapped
+
+  // Only used for FBs where image is directly passed
+  std::string uri_;          // Uniform resource identifier
+
+  // Per Device Info, like corresponding binary ptr, size.
+  std::vector<FatBinaryDeviceInfo*> fatbin_dev_info_;
+};
+
+}; /* namespace hip */
+
+#endif /* HIP_FAT_BINARY_HPP */
@@ -0,0 +1,877 @@
+/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+#include <hip/hip_runtime_api.h>
+#include <hip/hiprtc.h>
+
+// Writes the enumerator name of a hipTextureFilterMode to os.
+// Unrecognized values are written as "hipFilterModePoint".
+inline std::ostream& operator<<(std::ostream& os, const hipTextureFilterMode& s) {
+  switch (s) {
+    case hipFilterModePoint:  return os << "hipFilterModePoint";
+    case hipFilterModeLinear: return os << "hipFilterModeLinear";
+    default:                  return os << "hipFilterModePoint";
+  }
+}
+
+// Writes the enumerator name of a hipTextureReadMode to os.
+// Unrecognized values are written as "hipReadModeElementType".
+inline std::ostream& operator<<(std::ostream& os, const hipTextureReadMode& s) {
+  switch (s) {
+    case hipReadModeElementType:     return os << "hipReadModeElementType";
+    case hipReadModeNormalizedFloat: return os << "hipReadModeNormalizedFloat";
+    default:                         return os << "hipReadModeElementType";
+  }
+}
+
+// Writes the enumerator name of a hipTextureAddressMode to os.
+// Unrecognized values are written as "hipAddressModeWrap".
+inline std::ostream& operator<<(std::ostream& os, const hipTextureAddressMode& s) {
+  switch (s) {
+    case hipAddressModeWrap:   return os << "hipAddressModeWrap";
+    case hipAddressModeClamp:  return os << "hipAddressModeClamp";
+    case hipAddressModeMirror: return os << "hipAddressModeMirror";
+    case hipAddressModeBorder: return os << "hipAddressModeBorder";
+    default:                   return os << "hipAddressModeWrap";
+  }
+}
+
+
+// Writes the enumerator name of a hipMemcpyKind to os.
+// Unrecognized values are written as "hipMemcpyDefault".
+inline std::ostream& operator<<(std::ostream& os, const hipMemcpyKind& s) {
+  switch (s) {
+    case hipMemcpyHostToHost:     return os << "hipMemcpyHostToHost";
+    case hipMemcpyHostToDevice:   return os << "hipMemcpyHostToDevice";
+    case hipMemcpyDeviceToHost:   return os << "hipMemcpyDeviceToHost";
+    case hipMemcpyDeviceToDevice: return os << "hipMemcpyDeviceToDevice";
+    case hipMemcpyDefault:        return os << "hipMemcpyDefault";
+    default:                      return os << "hipMemcpyDefault";
+  }
+}
+
+// Writes the enumerator name of a hipChannelFormatKind to os.
+// Unrecognized values are written as "hipChannelFormatKindNone".
+inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatKind& s) {
+  switch (s) {
+    case hipChannelFormatKindSigned:
+      return os << "hipChannelFormatKindSigned";
+    case hipChannelFormatKindUnsigned:
+      // Fixed: this case used to print "hipMemcpyHostToDevice".
+      return os << "hipChannelFormatKindUnsigned";
+    case hipChannelFormatKindFloat:
+      return os << "hipChannelFormatKindFloat";
+    case hipChannelFormatKindNone:
+      return os << "hipChannelFormatKindNone";
+    default:
+      return os << "hipChannelFormatKindNone";
+  }
+}
+
+// Writes the enumerator name of a hipArray_Format to os.
+// Unrecognized values are written as "HIP_AD_FORMAT_FLOAT".
+inline std::ostream& operator<<(std::ostream& os, const hipArray_Format& s) {
+  switch (s) {
+    case HIP_AD_FORMAT_UNSIGNED_INT8:  return os << "HIP_AD_FORMAT_UNSIGNED_INT8";
+    case HIP_AD_FORMAT_UNSIGNED_INT16: return os << "HIP_AD_FORMAT_UNSIGNED_INT16";
+    case HIP_AD_FORMAT_UNSIGNED_INT32: return os << "HIP_AD_FORMAT_UNSIGNED_INT32";
+    case HIP_AD_FORMAT_SIGNED_INT8:    return os << "HIP_AD_FORMAT_SIGNED_INT8";
+    case HIP_AD_FORMAT_SIGNED_INT16:   return os << "HIP_AD_FORMAT_SIGNED_INT16";
+    case HIP_AD_FORMAT_SIGNED_INT32:   return os << "HIP_AD_FORMAT_SIGNED_INT32";
+    case HIP_AD_FORMAT_HALF:           return os << "HIP_AD_FORMAT_HALF";
+    case HIP_AD_FORMAT_FLOAT:          return os << "HIP_AD_FORMAT_FLOAT";
+    default:                           return os << "HIP_AD_FORMAT_FLOAT";
+  }
+}
+
+// Writes the enumerator name of a hipResourceViewFormat to os.
+// Unrecognized values are written as "hipResViewFormatNone".
+inline std::ostream& operator<<(std::ostream& os, const hipResourceViewFormat& s) {
+  switch (s) {
+    case hipResViewFormatNone:
+      return os << "hipResViewFormatNone";
+
+    // 8-bit channels
+    case hipResViewFormatUnsignedChar1:
+      return os << "hipResViewFormatUnsignedChar1";
+    case hipResViewFormatUnsignedChar2:
+      return os << "hipResViewFormatUnsignedChar2";
+    case hipResViewFormatUnsignedChar4:
+      return os << "hipResViewFormatUnsignedChar4";
+    case hipResViewFormatSignedChar1:
+      return os << "hipResViewFormatSignedChar1";
+    case hipResViewFormatSignedChar2:
+      return os << "hipResViewFormatSignedChar2";
+    case hipResViewFormatSignedChar4:
+      return os << "hipResViewFormatSignedChar4";
+
+    // 16-bit channels
+    case hipResViewFormatUnsignedShort1:
+      return os << "hipResViewFormatUnsignedShort1";
+    case hipResViewFormatUnsignedShort2:
+      return os << "hipResViewFormatUnsignedShort2";
+    case hipResViewFormatUnsignedShort4:
+      return os << "hipResViewFormatUnsignedShort4";
+    case hipResViewFormatSignedShort1:
+      return os << "hipResViewFormatSignedShort1";
+    case hipResViewFormatSignedShort2:
+      return os << "hipResViewFormatSignedShort2";
+    case hipResViewFormatSignedShort4:
+      return os << "hipResViewFormatSignedShort4";
+
+    // 32-bit integer channels
+    case hipResViewFormatUnsignedInt1:
+      return os << "hipResViewFormatUnsignedInt1";
+    case hipResViewFormatUnsignedInt2:
+      return os << "hipResViewFormatUnsignedInt2";
+    case hipResViewFormatUnsignedInt4:
+      return os << "hipResViewFormatUnsignedInt4";
+    case hipResViewFormatSignedInt1:
+      return os << "hipResViewFormatSignedInt1";
+    case hipResViewFormatSignedInt2:
+      return os << "hipResViewFormatSignedInt2";
+    case hipResViewFormatSignedInt4:
+      return os << "hipResViewFormatSignedInt4";
+
+    // Half and float channels
+    case hipResViewFormatHalf1:
+      return os << "hipResViewFormatHalf1";
+    case hipResViewFormatHalf2:
+      return os << "hipResViewFormatHalf2";
+    case hipResViewFormatHalf4:
+      return os << "hipResViewFormatHalf4";
+    case hipResViewFormatFloat1:
+      return os << "hipResViewFormatFloat1";
+    case hipResViewFormatFloat2:
+      return os << "hipResViewFormatFloat2";
+    case hipResViewFormatFloat4:
+      return os << "hipResViewFormatFloat4";
+
+    // Block-compressed formats
+    case hipResViewFormatUnsignedBlockCompressed1:
+      return os << "hipResViewFormatUnsignedBlockCompressed1";
+    case hipResViewFormatUnsignedBlockCompressed2:
+      return os << "hipResViewFormatUnsignedBlockCompressed2";
+    case hipResViewFormatUnsignedBlockCompressed3:
+      return os << "hipResViewFormatUnsignedBlockCompressed3";
+    case hipResViewFormatUnsignedBlockCompressed4:
+      return os << "hipResViewFormatUnsignedBlockCompressed4";
+    case hipResViewFormatSignedBlockCompressed4:
+      return os << "hipResViewFormatSignedBlockCompressed4";
+    case hipResViewFormatUnsignedBlockCompressed5:
+      return os << "hipResViewFormatUnsignedBlockCompressed5";
+    case hipResViewFormatSignedBlockCompressed5:
+      return os << "hipResViewFormatSignedBlockCompressed5";
+    case hipResViewFormatUnsignedBlockCompressed6H:
+      return os << "hipResViewFormatUnsignedBlockCompressed6H";
+    case hipResViewFormatSignedBlockCompressed6H:
+      return os << "hipResViewFormatSignedBlockCompressed6H";
+    case hipResViewFormatUnsignedBlockCompressed7:
+      return os << "hipResViewFormatUnsignedBlockCompressed7";
+
+    default:
+      return os << "hipResViewFormatNone";
+  }
+}
+
+// Writes the enumerator name of a hipFunction_attribute to os.
+// Unrecognized values are written as "HIP_FUNC_ATTRIBUTE_MAX".
+inline std::ostream& operator<<(std::ostream& os, const hipFunction_attribute& s) {
+  switch (s) {
+    case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
+      return os << "HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK";
+    case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
+      return os << "HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES";
+    case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
+      return os << "HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES";
+    case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
+      return os << "HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES";
+    case HIP_FUNC_ATTRIBUTE_NUM_REGS:
+      return os << "HIP_FUNC_ATTRIBUTE_NUM_REGS";
+    case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
+      return os << "HIP_FUNC_ATTRIBUTE_PTX_VERSION";
+    case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
+      return os << "HIP_FUNC_ATTRIBUTE_BINARY_VERSION";
+    case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
+      return os << "HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA";
+    case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
+      return os << "HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES";
+    case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
+      return os << "HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT";
+    case HIP_FUNC_ATTRIBUTE_MAX:
+      return os << "HIP_FUNC_ATTRIBUTE_MAX";
+    default:
+      return os << "HIP_FUNC_ATTRIBUTE_MAX";
+  }
+}
+
+// Writes the enumerator name of a hiprtcResult to os.
+// Unrecognized values are written as "HIPRTC_ERROR_INTERNAL_ERROR".
+inline std::ostream& operator<<(std::ostream& os, const hiprtcResult& s) {
+  switch (s) {
+    case HIPRTC_SUCCESS:
+      return os << "HIPRTC_SUCCESS";
+    case HIPRTC_ERROR_OUT_OF_MEMORY:
+      return os << "HIPRTC_ERROR_OUT_OF_MEMORY";
+    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return os << "HIPRTC_ERROR_PROGRAM_CREATION_FAILURE";
+    case HIPRTC_ERROR_INVALID_INPUT:
+      return os << "HIPRTC_ERROR_INVALID_INPUT";
+    case HIPRTC_ERROR_INVALID_PROGRAM:
+      return os << "HIPRTC_ERROR_INVALID_PROGRAM";
+    case HIPRTC_ERROR_INVALID_OPTION:
+      return os << "HIPRTC_ERROR_INVALID_OPTION";
+    case HIPRTC_ERROR_COMPILATION:
+      return os << "HIPRTC_ERROR_COMPILATION";
+    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return os << "HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE";
+    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return os << "HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION";
+    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      // Fixed: the printed name was missing its leading 'H'.
+      return os << "HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION";
+    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return os << "HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID";
+    case HIPRTC_ERROR_INTERNAL_ERROR:
+      return os << "HIPRTC_ERROR_INTERNAL_ERROR";
+    default:
+      return os << "HIPRTC_ERROR_INTERNAL_ERROR";
+  }
+}
+
+// Writes the enumerator name of a hipJitOption to os.
+// Unrecognized values are written as "HIPRTC_JIT_MAX_REGISTERS".
+inline std::ostream& operator<<(std::ostream& os, const hipJitOption& s) {
+  switch (s) {
+    case HIPRTC_JIT_MAX_REGISTERS:
+      return os << "HIPRTC_JIT_MAX_REGISTERS";
+    case HIPRTC_JIT_THREADS_PER_BLOCK:
+      return os << "HIPRTC_JIT_THREADS_PER_BLOCK";
+    case HIPRTC_JIT_WALL_TIME:
+      return os << "HIPRTC_JIT_WALL_TIME";
+    case HIPRTC_JIT_INFO_LOG_BUFFER:
+      return os << "HIPRTC_JIT_INFO_LOG_BUFFER";
+    case HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES:
+      // Fixed: this case used to print the ERROR_LOG variant's name, making
+      // the two options indistinguishable in the output.
+      return os << "HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES";
+    case HIPRTC_JIT_ERROR_LOG_BUFFER:
+      return os << "HIPRTC_JIT_ERROR_LOG_BUFFER";
+    case HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES:
+      return os << "HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES";
+    case HIPRTC_JIT_OPTIMIZATION_LEVEL:
+      return os << "HIPRTC_JIT_OPTIMIZATION_LEVEL";
+    case HIPRTC_JIT_TARGET_FROM_HIPCONTEXT:
+      return os << "HIPRTC_JIT_TARGET_FROM_HIPCONTEXT";
+    case HIPRTC_JIT_TARGET:
+      return os << "HIPRTC_JIT_TARGET";
+    case HIPRTC_JIT_FALLBACK_STRATEGY:
+      return os << "HIPRTC_JIT_FALLBACK_STRATEGY";
+    case HIPRTC_JIT_GENERATE_DEBUG_INFO:
+      return os << "HIPRTC_JIT_GENERATE_DEBUG_INFO";
+    case HIPRTC_JIT_CACHE_MODE:
+      return os << "HIPRTC_JIT_CACHE_MODE";
+    case HIPRTC_JIT_NEW_SM3X_OPT:
+      return os << "HIPRTC_JIT_NEW_SM3X_OPT";
+    case HIPRTC_JIT_FAST_COMPILE:
+      return os << "HIPRTC_JIT_FAST_COMPILE";
+    case HIPRTC_JIT_GLOBAL_SYMBOL_NAMES:
+      return os << "HIPRTC_JIT_GLOBAL_SYMBOL_NAMES";
+    case HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS:
+      return os << "HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS";
+    case HIPRTC_JIT_GLOBAL_SYMBOL_COUNT:
+      return os << "HIPRTC_JIT_GLOBAL_SYMBOL_COUNT";
+    case HIPRTC_JIT_LTO:
+      return os << "HIPRTC_JIT_LTO";
+    case HIPRTC_JIT_FTZ:
+      return os << "HIPRTC_JIT_FTZ";
+    case HIPRTC_JIT_PREC_DIV:
+      return os << "HIPRTC_JIT_PREC_DIV";
+    case HIPRTC_JIT_PREC_SQRT:
+      return os << "HIPRTC_JIT_PREC_SQRT";
+    case HIPRTC_JIT_FMA:
+      return os << "HIPRTC_JIT_FMA";
+    case HIPRTC_JIT_NUM_OPTIONS:
+      return os << "HIPRTC_JIT_NUM_OPTIONS";
+    default:
+      return os << "HIPRTC_JIT_MAX_REGISTERS";
+  }
+}
+
+// Writes the enumerator name of a hipFuncCache_t to os.
+// Unrecognized values are written as "hipFuncCachePreferNone".
+inline std::ostream& operator<<(std::ostream& os, const hipFuncCache_t& s) {
+  switch (s) {
+    case hipFuncCachePreferNone:   return os << "hipFuncCachePreferNone";
+    case hipFuncCachePreferShared: return os << "hipFuncCachePreferShared";
+    case hipFuncCachePreferL1:     return os << "hipFuncCachePreferL1";
+    case hipFuncCachePreferEqual:  return os << "hipFuncCachePreferEqual";
+    default:                       return os << "hipFuncCachePreferNone";
+  }
+}
+
+// Writes the enumerator name of a hipSharedMemConfig to os.
+// Unrecognized values are written as "hipSharedMemBankSizeDefault".
+inline std::ostream& operator<<(std::ostream& os, const hipSharedMemConfig& s) {
+  switch (s) {
+    case hipSharedMemBankSizeDefault:   return os << "hipSharedMemBankSizeDefault";
+    case hipSharedMemBankSizeFourByte:  return os << "hipSharedMemBankSizeFourByte";
+    case hipSharedMemBankSizeEightByte: return os << "hipSharedMemBankSizeEightByte";
+    default:                            return os << "hipSharedMemBankSizeDefault";
+  }
+}
+
+// Writes the enumerator name of a hipDataType to os.
+// Unrecognized values are written as "HIP_R_16F".
+inline std::ostream& operator<<(std::ostream& os, const hipDataType& s) {
+  switch (s) {
+    case HIP_R_16F: return os << "HIP_R_16F";
+    case HIP_R_32F: return os << "HIP_R_32F";
+    case HIP_R_64F: return os << "HIP_R_64F";
+    case HIP_C_16F: return os << "HIP_C_16F";
+    case HIP_C_32F: return os << "HIP_C_32F";
+    case HIP_C_64F: return os << "HIP_C_64F";
+    default:        return os << "HIP_R_16F";
+  }
+}
+
+// Writes the enumerator name of a hipLibraryPropertyType to os.
+// Unrecognized values are written as "HIP_LIBRARY_MAJOR_VERSION".
+inline std::ostream& operator<<(std::ostream& os, const hipLibraryPropertyType& s) {
+  switch (s) {
+    case HIP_LIBRARY_MAJOR_VERSION: return os << "HIP_LIBRARY_MAJOR_VERSION";
+    case HIP_LIBRARY_MINOR_VERSION: return os << "HIP_LIBRARY_MINOR_VERSION";
+    case HIP_LIBRARY_PATCH_LEVEL:   return os << "HIP_LIBRARY_PATCH_LEVEL";
+    default:                        return os << "HIP_LIBRARY_MAJOR_VERSION";
+  }
+}
+
+// Writes whatever hip_api_name() returns for the given API id.
+inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t& s) {
+  return os << hip_api_name(s);
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to id.
+inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a hipTextureDesc as a brace-enclosed, comma-separated field list:
+// {{addressMode[0..2]},filterMode,readMode,sRGB,{borderColor[0..3]},
+//  normalizedCoords,mipmapFilterMode,mipmapLevelBias,minMipmapLevelClamp,
+//  maxMipmapLevelClamp}
+inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc& s) {
+  os << '{';
+
+  // Address modes as a nested group.
+  os << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
+     << ',';
+
+  os << s.filterMode << ',' << s.readMode << ',' << s.sRGB << ',';
+
+  // Border color as a nested group.
+  os << '{' << s.borderColor[0] << ',' << s.borderColor[1] << ',' << s.borderColor[2] << ','
+     << s.borderColor[3] << '}' << ',';
+
+  os << s.normalizedCoords << ',' << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ','
+     << s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp;
+
+  return os << '}';
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+
+// Writes a dim3 as {x,y,z}.
+inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
+  return os << '{' << s.x << ',' << s.y << ',' << s.z << '}';
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to dim3.
+inline std::ostream& operator<<(std::ostream& os, const dim3* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a hipChannelFormatDesc as {x,y,z,w,f}.
+inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc& s) {
+  return os << '{' << s.x << ',' << s.y << ',' << s.z << ',' << s.w << ',' << s.f << '}';
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a hipMipmappedArray as {data,desc,width,height,depth}.
+inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray& s) {
+  os << '{' << s.data << ',' << s.desc << ',';
+  os << s.width << ',' << s.height << ',' << s.depth << '}';
+  return os;
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to array.
+inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+
+// Writes a hipResourceDesc as {resType,{<fields of the active union member>}}.
+//
+// The inner group depends on s.resType:
+//   hipResourceTypeLinear         -> devPtr,desc,sizeInBytes
+//   hipResourceTypePitch2D        -> devPtr,desc,width,height,pitchInBytes
+//   hipResourceTypeArray          -> array
+//   hipResourceTypeMipmappedArray -> mipmap
+// Any other resType leaves the inner group empty.
+inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc& s) {
+  os << '{' << s.resType << ',' << '{';
+
+  switch (s.resType) {
+    case hipResourceTypeLinear:
+      os << s.res.linear.devPtr << ',' << s.res.linear.desc << ','
+         << s.res.linear.sizeInBytes;
+      break;
+    case hipResourceTypePitch2D:
+      os << s.res.pitch2D.devPtr << ',' << s.res.pitch2D.desc << ','
+         << s.res.pitch2D.width << ',' << s.res.pitch2D.height << ','
+         << s.res.pitch2D.pitchInBytes;
+      break;
+    case hipResourceTypeArray:
+      os << s.res.array.array;
+      break;
+    case hipResourceTypeMipmappedArray:
+      os << s.res.mipmap.mipmap;
+      break;
+    default:
+      break;
+  }
+
+  // Fixed: two braces are opened above, but only one used to be closed,
+  // which left the printed structure unbalanced.
+  os << '}' << '}';
+
+  return os;
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+
+// Writes a hipArray as
+// {data,desc,type,width,height,depth,Format,NumChannels,isDrv,textureType}.
+inline std::ostream& operator<<(std::ostream& os, const hipArray& s) {
+  os << '{' << s.data << ',' << s.desc << ',' << s.type << ',';
+  os << s.width << ',' << s.height << ',' << s.depth << ',';
+  os << s.Format << ',' << s.NumChannels << ',' << s.isDrv << ',' << s.textureType << '}';
+  return os;
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to array.
+inline std::ostream& operator<<(std::ostream& os, const hipArray* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a textureReference as a brace-enclosed, comma-separated field list:
+// {normalized,readMode,filterMode,{addressMode[0..2]},channelDesc,sRGB,
+//  maxAnisotropy,mipmapFilterMode,mipmapLevelBias,minMipmapLevelClamp,
+//  maxMipmapLevelClamp,textureObject}
+inline std::ostream& operator<<(std::ostream& os, const textureReference& s) {
+  os << '{' << s.normalized << ',' << s.readMode << ',' << s.filterMode << ',';
+
+  // Address modes as a nested group.
+  os << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
+     << ',';
+
+  os << s.channelDesc << ',' << s.sRGB << ',' << s.maxAnisotropy << ',';
+  os << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ',' << s.minMipmapLevelClamp << ','
+     << s.maxMipmapLevelClamp << ',';
+
+  return os << s.textureObject << '}';
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to reference.
+inline std::ostream& operator<<(std::ostream& os, const textureReference* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+
+inline std::ostream& operator<<(std::ostream& os, const hipError_t& s) {
+  os << hipGetErrorName(s);
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const hipError_t* s) {
+  if (s) {
+    os << *s;
+  } else {
+    os << "nullptr";
+  }
+  return os;
+}
+
+// Writes a hipResourceViewDesc as
+// {format,width,height,depth,firstMipmapLevel,lastMipmapLevel,firstLayer,lastLayer}.
+inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc& s) {
+  os << '{' << s.format << ',';
+  os << s.width << ',' << s.height << ',' << s.depth << ',';
+  os << s.firstMipmapLevel << ',' << s.lastMipmapLevel << ',';
+  os << s.firstLayer << ',' << s.lastLayer << '}';
+  return os;
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a HIP_ARRAY_DESCRIPTOR as {Width,Height,Format,NumChannels}.
+inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR& s) {
+  return os << '{' << s.Width << ',' << s.Height << ',' << s.Format << ',' << s.NumChannels
+            << '}';
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a HIP_ARRAY3D_DESCRIPTOR as {Width,Height,Depth,Format,NumChannels,Flags}.
+inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR& s) {
+  os << '{' << s.Width << ',' << s.Height << ',' << s.Depth << ',';
+  os << s.Format << ',' << s.NumChannels << ',' << s.Flags << '}';
+  return os;
+}
+
+// Pointer form: writes "nullptr" for a null pointer, else the pointed-to descriptor.
+inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR* s) {
+  if (s == nullptr) {
+    return os << "nullptr";
+  }
+  return os << *s;
+}
+
+// Writes a hipExtent as {width,height,depth}.
+inline std::ostream& operator<<(std::ostream& os, const hipExtent& s) {
+  return os << '{' << s.width << ',' << s.height << ',' << s.depth << '}';
+}
+
+inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t& s) {
+  // TODO: fill in later. Placeholder: nothing is written for the handle and os is returned as-is.
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t* s) {
+  // TODO: fill in later. Placeholder: nothing is written (not even "nullptr") and os is returned as-is.
+  return os;
+}
@@ -0,0 +1,758 @@
+/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+
+#include "top.hpp"
+#include "hip/hip_runtime.h"
+#include "hip_internal.hpp"
+#include "cl_gl_amd.hpp"
+#include "cl_common.hpp"
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "hip_conversions.hpp"
+
+namespace amd {
+static std::once_flag interopOnce;  // passed to std::call_once in hipGLGetDevices to run setupGLInteropOnce
+}
+// Sets up GL context association with amd context.
+// NOTE: Refer to Context setup code in OCLTestImp.cpp
+//
+// Builds a property list naming the AMD platform plus GL-context and
+// display/HDC keys (all with null values), validates it with
+// amd::Context::checkProperties(), stores the resulting Info on the current
+// device's context and calls create() on that context. Failures are logged;
+// nothing is returned to the caller.
+void setupGLInteropOnce() {
+  amd::Context* context = hip::getCurrentDevice()->asContext();
+
+  // The current GL context is read inside context->create().
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (cl_context_properties)AMD_PLATFORM,
+                                        ROCCLR_HIP_GL_CONTEXT_KHR,
+                                        (cl_context_properties) nullptr,
+#ifdef _WIN32
+                                        ROCCLR_HIP_WGL_HDC_KHR,
+                                        (cl_context_properties) nullptr,
+#else
+                                        ROCCLR_HIP_GLX_DISPLAY_KHR,
+                                        (cl_context_properties) nullptr,
+#endif
+                                        0};
+
+  amd::Context::Info contextInfo;
+  if (amd::Context::checkProperties(properties, &contextInfo) == CL_SUCCESS) {
+    context->setInfo(contextInfo);
+    if (context->create(properties) != CL_SUCCESS) {
+      LogError("Context setup failed \n");
+    }
+  } else {
+    LogError("Context setup failed \n");
+  }
+}
+
+// Validates an array of interop handles and appends them to interopObjects.
+//
+// num_objects / mem_objects must either both be "empty" (0 and nullptr) or
+// both be set; a mismatch returns hipErrorUnknown. Each entry is treated as an
+// amd::Memory*; a null entry, or one whose getInteropObj() is null, returns
+// hipErrorInvalidResourceHandle. Entries accepted before a failure remain in
+// interopObjects.
+static inline hipError_t hipSetInteropObjects(int num_objects, void** mem_objects,
+                                              std::vector<amd::Memory*>& interopObjects) {
+  const bool countIsZero = (num_objects == 0);
+  const bool arrayIsNull = (mem_objects == nullptr);
+  if (countIsZero != arrayIsNull) {
+    return hipErrorUnknown;
+  }
+
+  for (int idx = 0; idx < num_objects; ++idx) {
+    void* handle = mem_objects[idx];
+    if (handle == nullptr) {
+      return hipErrorInvalidResourceHandle;
+    }
+
+    amd::Memory* memory = reinterpret_cast<amd::Memory*>(handle);
+    if (memory->getInteropObj() == nullptr) {
+      return hipErrorInvalidResourceHandle;
+    }
+
+    interopObjects.push_back(memory);
+  }
+  return hipSuccess;
+}
+
+// NOTE: This method corresponds to OpenCL functionality in clGetGLContextInfoKHR()
+//
+// Reports which HIP devices accept the current GL context, i.e. the devices
+// whose first amd::Device returns true from bindExternalDevice() when called
+// with VALIDATE_ONLY.
+//
+//   pHipDeviceCount  [out] number of device ordinals written to pHipDevices
+//   pHipDevices      [out] array receiving HIP device ordinals
+//   hipDeviceCount   [in]  capacity of pHipDevices; clamped to the number of
+//                          HIP devices, and only that many leading devices
+//                          are examined
+//   deviceList       [in]  hipGLDeviceListCurrentFrame reports at most one
+//                          device; hipGLDeviceListAll reports every match;
+//                          hipGLDeviceListNextFrame is not supported
+//
+// Returns hipErrorNotSupported, hipErrorInvalidValue (bad arguments, context
+// without the GLDeviceKhr flag, unknown deviceList), hipErrorNoDevice when no
+// device matched, or hipSuccess.
+hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices,
+                           unsigned int hipDeviceCount, hipGLDeviceList deviceList) {
+  HIP_INIT_API(hipGLGetDevices, pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList);
+
+  std::call_once(amd::interopOnce, setupGLInteropOnce);
+
+  static const bool VALIDATE_ONLY = true;
+  if (deviceList == hipGLDeviceListNextFrame) {
+    LogError(" hipGLDeviceListNextFrame not supported yet.\n");
+    HIP_RETURN(hipErrorNotSupported);
+  }
+  if (pHipDeviceCount == nullptr || pHipDevices == nullptr || hipDeviceCount == 0) {
+    LogError(" Invalid Argument \n");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipDeviceCount = std::min(hipDeviceCount, static_cast<unsigned int>(g_devices.size()));
+
+  amd::Context::Info info = hip::getCurrentDevice()->asContext()->info();
+  if (!(info.flags_ & amd::Context::GLDeviceKhr)) {
+    LogError("Failed : Invalid Shared Group Reference \n");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Refresh the stored GL context handle from the calling thread's current
+  // GL context before validating devices against it.
+  amd::GLFunctions* glenv = hip::getCurrentDevice()->asContext()->glenv();
+  if (glenv != nullptr) {
+#ifdef _WIN32
+    info.hCtx_ = glenv->wglGetCurrentContext_();
+#else
+    info.hCtx_ = glenv->glXGetCurrentContext_();
+#endif
+    hip::getCurrentDevice()->asContext()->setInfo(info);
+    glenv->update(reinterpret_cast<intptr_t>(info.hCtx_));
+  }
+
+  *pHipDeviceCount = 0;
+  switch (deviceList) {
+    case hipGLDeviceListCurrentFrame:
+      // Only the first matching device is reported.
+      for (unsigned int i = 0; i < hipDeviceCount; ++i) {
+        const std::vector<amd::Device*>& devices = g_devices[i]->devices();
+        if (devices.size() > 0 &&
+            devices[0]->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, VALIDATE_ONLY)) {
+          pHipDevices[0] = static_cast<int>(i);
+          *pHipDeviceCount = 1;
+          break;
+        }
+      }
+      break;
+
+    case hipGLDeviceListAll: {
+      // Fixed: the loop used to break after the first match, so "All" could
+      // never report more than one device. foundDeviceCount cannot exceed
+      // hipDeviceCount, so pHipDevices is never written past its capacity.
+      unsigned int foundDeviceCount = 0;
+      for (unsigned int i = 0; i < hipDeviceCount; ++i) {
+        const std::vector<amd::Device*>& devices = g_devices[i]->devices();
+        if (devices.size() > 0 &&
+            devices[0]->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, VALIDATE_ONLY)) {
+          pHipDevices[foundDeviceCount++] = static_cast<int>(i);
+        }
+      }
+
+      *pHipDeviceCount = foundDeviceCount;
+    } break;
+
+    default:
+      LogWarning("Invalid deviceList value");
+      HIP_RETURN(hipErrorInvalidValue);
+  }
+  HIP_RETURN(*pHipDeviceCount > 0 ? hipSuccess : hipErrorNoDevice);
+}
+
+// Drains pending GL errors by polling glGetError_() through the context's GL
+// environment. Stops when GL_NO_ERROR is reported or when the same error code
+// is seen twice in a row; every other error logs a warning.
+static inline void clearGLErrors(const amd::Context& amdContext) {
+  GLenum previousError = GL_NO_ERROR;
+  for (;;) {
+    const GLenum currentError = amdContext.glenv()->glGetError_();
+    if (currentError == GL_NO_ERROR || currentError == previousError) {
+      return;
+    }
+    previousError = currentError;
+    LogWarning("GL error");
+  }
+}
+
+// Polls glGetError_() until it reports GL_NO_ERROR, logging a warning per
+// error. Returns the last error seen, or GL_NO_ERROR if there was none.
+static inline GLenum checkForGLError(const amd::Context& amdContext) {
+  GLenum lastError = GL_NO_ERROR;
+  for (GLenum err = amdContext.glenv()->glGetError_(); err != GL_NO_ERROR;
+       err = amdContext.glenv()->glGetError_()) {
+    lastError = err;  // Just return the last GL error
+    LogWarning("Check GL error");
+  }
+  return lastError;
+}
+
+// Creates a hipArray that wraps a view of the image behind a graphics resource.
+//
+//   array       [out] receives the newly allocated hipArray
+//   resource    [in]  graphics resource; interpreted as an amd::Memory* that
+//                     must be an image
+//   arrayIndex  [in]  must be 0 (other values are only caught by assert)
+//   mipLevel    [in]  forwarded to amd::Image::createView()
+//
+// The new array is also recorded in hip::hipArraySet. Returns
+// hipErrorInvalidValue for null arguments, a non-image resource, or a view
+// that could not be created; hipSuccess otherwise.
+hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource,
+                                      unsigned int arrayIndex, unsigned int mipLevel) {
+  HIP_INIT_API(hipGraphicsSubResourceGetMappedArray, array, resource, arrayIndex, mipLevel);
+
+  amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
+  if (array == nullptr || resource == nullptr) {
+    LogError("invalid array/resource");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  amd::Image* image = (reinterpret_cast<amd::Memory*>(resource))->asImage();
+  if (image == nullptr) {
+    LogError("invalid resource/image");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  // arrayIndex higher than zero not implemented
+  assert(arrayIndex == 0) ;
+  amd::Image * view = image->createView(amdContext, image->getImageFormat(), nullptr, mipLevel, 0);
+  // Fixed: the view pointer used to be dereferenced unconditionally. Treat a
+  // null view as a failure instead of crashing below.
+  if (view == nullptr) {
+    LogError("failed to create image view");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  hipArray* myarray = new hipArray();
+
+  myarray->data = as_cl<amd::Memory> (view);
+
+  myarray->width = view->getWidth();
+  myarray->height = view->getHeight();
+  myarray->depth = view->getDepth();
+
+  const cl_mem_object_type image_type = hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
+  myarray->type = image_type;
+  // Format information comes from the parent image, not from the view.
+  amd::Image::Format f = image->getImageFormat();
+  myarray->Format = hip::getCL2hipArrayFormat(f.image_channel_data_type);
+  myarray->desc = hip::getChannelFormatDesc(f.getNumChannels(), myarray->Format);
+  myarray->NumChannels = hip::getNumChannels(myarray->desc);
+  myarray->isDrv = 0;
+  myarray->textureType = 0;
+  *array = myarray;
+  {
+    // hipArraySet is shared state; insert under its lock.
+    amd::ScopedLock lock(hip::hipArraySetLock);
+    hip::hipArraySet.insert(*array);
+  }
+  HIP_RETURN(hipSuccess);
+}
+
+// Registers an OpenGL texture (or texture buffer) for access from HIP.
+//
+// resource - [out] receives the created amd::ImageGL, cast to hipGraphicsResource*.
+// image    - GL texture name; must be accepted by glIsTexture_.
+// target   - GL texture target; the switch below lists the accepted values.
+// flags    - hipGraphicsRegisterFlags*; None or at least one recognised bit is required.
+//
+// Returns hipErrorInvalidValue for bad arguments, a missing GL environment, a failed GL query
+// or unsupported dimensions/format; hipErrorUnknown when amd::GLFunctions::SetIntEnv is not
+// valid, the name is not a texture, or the ImageGL / interop object / device memory cannot be
+// created; hipSuccess otherwise. Only mip level 0 is inspected and textures reporting more
+// than one sample are rejected.
+hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target,
+                                      unsigned int flags) {
+  HIP_INIT_API(hipGraphicsGLRegisterImage, resource, image, target, flags);
+
+  if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
+        (flags & hipGraphicsRegisterFlagsWriteDiscard) ||
+        (flags & hipGraphicsRegisterFlagsSurfaceLoadStore) ||
+        (flags & hipGraphicsRegisterFlagsTextureGather))) {
+    LogError("invalid parameter \"flags\"");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (resource == nullptr) {
+    LogError("invalid resource");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  GLint miplevel = 0;
+  amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
+
+  if (amdContext.glenv() == nullptr) {
+    LogError("invalid context, gl interop not initialized");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  amd::GLFunctions::SetIntEnv ie(amdContext.glenv());
+  if (!ie.isValid()) {
+    LogWarning("\"amdContext\" is not created from GL context or share list \n");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  amd::ImageGL* pImageGL = NULL;
+  GLenum glErr;
+  GLenum glTarget = 0;
+  GLenum glInternalFormat;
+  cl_image_format clImageFormat;
+  uint dim = 1;
+  cl_mem_object_type clType;
+  cl_gl_object_type clGLType;
+  GLsizei numSamples = 1;
+
+  // Dimensions default to 1 so that unused axes stay valid for lower-dimensional targets.
+  GLint gliTexWidth = 1;
+  GLint gliTexHeight = 1;
+  GLint gliTexDepth = 1;
+
+  // Verify GL texture object
+  clearGLErrors(amdContext);
+  if ((GL_FALSE == amdContext.glenv()->glIsTexture_(image)) ||
+      (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) {
+    LogWarning("\"texture\" is not a GL texture object");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  bool isImage = true;
+
+  // Check target value validity and map it to the binding target, dimensionality and CL types
+  switch (target) {
+    case GL_TEXTURE_BUFFER:
+      glTarget = GL_TEXTURE_BUFFER;
+      dim = 1;
+      clType = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+      clGLType = CL_GL_OBJECT_TEXTURE_BUFFER;
+      isImage = false;
+      break;
+
+    case GL_TEXTURE_1D:
+      glTarget = GL_TEXTURE_1D;
+      dim = 1;
+      clType = CL_MEM_OBJECT_IMAGE1D;
+      clGLType = CL_GL_OBJECT_TEXTURE1D;
+      break;
+
+    // Cube map faces bind through GL_TEXTURE_CUBE_MAP but are queried per face via "target".
+    case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+    case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+    case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+    case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+      glTarget = GL_TEXTURE_CUBE_MAP;
+      dim = 2;
+      clType = CL_MEM_OBJECT_IMAGE2D;
+      clGLType = CL_GL_OBJECT_TEXTURE2D;
+      break;
+
+    case GL_TEXTURE_1D_ARRAY:
+      glTarget = GL_TEXTURE_1D_ARRAY;
+      dim = 2;
+      clType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+      clGLType = CL_GL_OBJECT_TEXTURE1D_ARRAY;
+      break;
+
+    case GL_TEXTURE_2D:
+      glTarget = GL_TEXTURE_2D;
+      dim = 2;
+      clType = CL_MEM_OBJECT_IMAGE2D;
+      clGLType = CL_GL_OBJECT_TEXTURE2D;
+      break;
+
+    case GL_TEXTURE_2D_MULTISAMPLE:
+      glTarget = GL_TEXTURE_2D_MULTISAMPLE;
+      dim = 2;
+      clType = CL_MEM_OBJECT_IMAGE2D;
+      clGLType = CL_GL_OBJECT_TEXTURE2D;
+      break;
+
+    case GL_TEXTURE_RECTANGLE_ARB:
+      glTarget = GL_TEXTURE_RECTANGLE_ARB;
+      dim = 2;
+      clType = CL_MEM_OBJECT_IMAGE2D;
+      clGLType = CL_GL_OBJECT_TEXTURE2D;
+      break;
+
+    case GL_TEXTURE_2D_ARRAY:
+      glTarget = GL_TEXTURE_2D_ARRAY;
+      dim = 3;
+      clType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+      clGLType = CL_GL_OBJECT_TEXTURE2D_ARRAY;
+      break;
+
+    case GL_TEXTURE_3D:
+      glTarget = GL_TEXTURE_3D;
+      dim = 3;
+      clType = CL_MEM_OBJECT_IMAGE3D;
+      clGLType = CL_GL_OBJECT_TEXTURE3D;
+      break;
+
+    default:
+      // wrong value
+      LogWarning("invalid \"target\" value");
+      HIP_RETURN(hipErrorInvalidValue);
+      break;
+  }
+  amdContext.glenv()->glBindTexture_(glTarget, image);
+
+  // Check if size is available - data store is created
+  if (isImage) {
+    // Check mipmap level for "texture" name
+    GLint gliTexBaseLevel;
+    GLint gliTexMaxLevel;
+
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_BASE_LEVEL, &gliTexBaseLevel);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get base mipmap level of a GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_MAX_LEVEL, &gliTexMaxLevel);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get max mipmap level of a GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    if ((gliTexBaseLevel > miplevel) || (miplevel > gliTexMaxLevel)) {
+      LogWarning("\"miplevel\" is not a valid mipmap level of the GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    // Get GL texture format and check if it's compatible with CL format
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_INTERNAL_FORMAT,
+                                                  (GLint*)&glInternalFormat);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_SAMPLES,
+                                                  (GLint*)&numSamples);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get  numbers of samples of GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    if (numSamples > 1) {
+      LogWarning("MSAA \"texture\" object is not suppoerted for the device");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    // Now get CL format from GL format and bytes per pixel
+    int iBytesPerPixel = 0;
+    if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
+                           0)) {
+      LogWarning("\"texture\" format does not map to an appropriate CL image format");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    // Query depth, height and width as needed; higher dimensions fall through to lower ones.
+    switch (dim) {
+      case 3:
+        clearGLErrors(amdContext);
+        amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_DEPTH,
+                                                      &gliTexDepth);
+        if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+          LogWarning("Cannot get the depth of \"miplevel\" of GL \"texure\"");
+          HIP_RETURN(hipErrorInvalidValue);
+        }
+      // Fall through to process other dimensions...
+      case 2:
+        clearGLErrors(amdContext);
+        amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_HEIGHT,
+                                                      &gliTexHeight);
+        if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+          LogWarning("Cannot get the height of \"miplevel\" of GL \"texure\"");
+          HIP_RETURN(hipErrorInvalidValue);
+        }
+      // Fall through to process other dimensions...
+      case 1:
+        clearGLErrors(amdContext);
+        amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_WIDTH,
+                                                      &gliTexWidth);
+        if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+          LogWarning("Cannot get the width of \"miplevel\" of GL \"texure\"");
+          HIP_RETURN(hipErrorInvalidValue);
+        }
+        break;
+      default:
+        LogWarning("invalid \"target\" value");
+        HIP_RETURN(hipErrorInvalidValue);
+    }
+
+  } else {
+    GLint size;
+
+    // In case target is GL_TEXTURE_BUFFER
+    GLint backingBuffer;
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetTexLevelParameteriv_(
+        glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING, &backingBuffer);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get backing buffer for GL \"texture buffer\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    amdContext.glenv()->glBindBuffer_(glTarget, backingBuffer);
+
+    // Get GL texture format and check if it's compatible with CL format
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetIntegerv_(GL_TEXTURE_BUFFER_FORMAT_EXT,
+                                        reinterpret_cast<GLint*>(&glInternalFormat));
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    // Now get CL format from GL format and bytes per pixel
+    int iBytesPerPixel = 0;
+    if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
+                            flags)) {
+      LogWarning("\"texture\" format does not map to an appropriate CL image format");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+    // The width computation below divides by the pixel size, so it must be positive.
+    if (iBytesPerPixel <= 0) {
+      LogWarning("\"texture\" format does not map to an appropriate CL image format");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetBufferParameteriv_(glTarget, GL_BUFFER_SIZE, &size);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("Cannot get the size of the backing buffer of GL \"texture buffer\" object");
+      HIP_RETURN(hipErrorInvalidValue);
+    }
+
+    // A texture buffer is treated as a 1D image whose width is its element count.
+    gliTexWidth = size / iBytesPerPixel;
+  }
+  // The last validateDimensions() argument is the height for 1D arrays, the depth otherwise.
+  size_t imageSize = (clType == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? static_cast<size_t>(gliTexHeight)
+                                                              : static_cast<size_t>(gliTexDepth);
+
+  if (!amd::Image::validateDimensions(
+          amdContext.devices(), clType, static_cast<size_t>(gliTexWidth),
+          static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), imageSize)) {
+    LogWarning("The GL \"texture\" data store is not created or out of supported dimensions");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+  // Only cube maps pass the face target on to ImageGL; every other target passes 0.
+  target = (glTarget == GL_TEXTURE_CUBE_MAP) ? target : 0;
+
+  pImageGL = new (amdContext)
+      amd::ImageGL(amdContext, clType, flags, clImageFormat, static_cast<size_t>(gliTexWidth),
+              static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
+              image, 0, glInternalFormat, clGLType, numSamples, target);
+
+  if (!pImageGL) {
+    LogWarning("Cannot create class ImageGL - out of memory?");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  if (!pImageGL->create()) {
+    pImageGL->release();
+    HIP_RETURN(hipErrorUnknown);
+  }
+  // Create interop object
+  if (pImageGL->getInteropObj() == nullptr) {
+    LogWarning("cannot create interop object for class ImageGL");
+    pImageGL->release();
+    HIP_RETURN(hipErrorUnknown);
+  }
+  // Fixme: If more than one device is present in the context, we choose the first device.
+  // We should come up with a more elegant solution to handle this.
+  assert(amdContext.devices().size() == 1);
+
+  const amd::Device& dev = *(amdContext.devices()[0]);
+
+  device::Memory* mem = pImageGL->getDeviceMemory(dev);
+  if (nullptr == mem) {
+    LogPrintfError("Can't allocate memory size - 0x%08zX bytes!", pImageGL->getSize());
+    pImageGL->release();
+    HIP_RETURN(hipErrorUnknown);
+  }
+  mem->processGLResource(device::Memory::GLDecompressResource);
+
+  *resource = reinterpret_cast<hipGraphicsResource*>(pImageGL);
+  HIP_RETURN(hipSuccess);
+
+}
+
+// Registers an OpenGL buffer object for access from HIP.
+//
+// resource - [out] receives the created amd::BufferGL, cast to hipGraphicsResource*.
+// buffer   - GL buffer name; must be accepted by glIsBuffer_ and have a non-empty data store.
+// flags    - hipGraphicsRegisterFlagsNone, or a value containing ReadOnly / WriteDiscard.
+//
+// Returns hipErrorInvalidValue for bad arguments or a missing GL environment,
+// hipErrorInvalidResourceHandle when the GL buffer cannot be validated or sized,
+// hipErrorUnknown when the BufferGL, its interop object or its device memory cannot be
+// created, and hipSuccess otherwise. The BufferGL is released on every failure after creation.
+hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer,
+                                       unsigned int flags) {
+  HIP_INIT_API(hipGraphicsGLRegisterBuffer, resource, buffer, flags);
+
+  if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
+        (flags & hipGraphicsRegisterFlagsWriteDiscard))) {
+    LogError("invalid parameter \"flags\"");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (resource == nullptr) {
+    LogError("invalid resource");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  amd::BufferGL* pBufferGL = nullptr;
+  GLenum glErr;
+  GLenum glTarget = GL_ARRAY_BUFFER;
+  GLint gliSize = 0;
+
+  amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
+
+  if (amdContext.glenv() == nullptr) {
+    LogError("invalid context, gl interop not initialized");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Add this scope to bound the scoped lock
+  {
+    amd::GLFunctions::SetIntEnv ie(amdContext.glenv());
+    if (!ie.isValid()) {
+      LogWarning("\"amdContext\" is not created from GL context or share list \n");
+      HIP_RETURN(hipErrorUnknown);
+    }
+
+    // Verify GL buffer object
+    clearGLErrors(amdContext);
+    if ((GL_FALSE == amdContext.glenv()->glIsBuffer_(buffer)) ||
+        (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) {
+      LogWarning("\"buffer\" is not a GL buffer object \n");
+      HIP_RETURN(hipErrorInvalidResourceHandle);
+    }
+
+    // Check if size is available - data store is created
+    amdContext.glenv()->glBindBuffer_(glTarget, buffer);
+    clearGLErrors(amdContext);
+    amdContext.glenv()->glGetBufferParameteriv_(glTarget, GL_BUFFER_SIZE, &gliSize);
+    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
+      LogWarning("cannot get the GL buffer size \n");
+      HIP_RETURN(hipErrorInvalidResourceHandle);
+    }
+    if (gliSize == 0) {
+      LogWarning("the GL buffer's data store is not created \n");
+      HIP_RETURN(hipErrorInvalidResourceHandle);
+    }
+
+  }  // Release scoped lock
+
+  // Now create BufferGL object
+  pBufferGL = new (amdContext) amd::BufferGL(amdContext, flags, gliSize, 0, buffer);
+
+  if (!pBufferGL) {
+    LogWarning("cannot create object of class BufferGL");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  if (!pBufferGL->create()) {
+    pBufferGL->release();
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Create interop object
+  if (pBufferGL->getInteropObj() == nullptr) {
+    LogWarning("cannot create object of class BufferGL");
+    pBufferGL->release();  // do not leak the buffer object on failure
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Fixme: If more than one device is present in the context, we choose the first device.
+  // We should come up with a more elegant solution to handle this.
+  assert(amdContext.devices().size() == 1);
+
+  const auto it = amdContext.devices().cbegin();
+  const amd::Device& dev = *(*it);
+
+  device::Memory* mem = pBufferGL->getDeviceMemory(dev);
+  if (nullptr == mem) {
+    LogPrintfError("Can't allocate memory size - 0x%08zX bytes!", pBufferGL->getSize());
+    pBufferGL->release();  // do not leak the buffer object on failure
+    HIP_RETURN(hipErrorUnknown);
+  }
+  mem->processGLResource(device::Memory::GLDecompressResource);
+
+  *resource = reinterpret_cast<hipGraphicsResource*>(pBufferGL);
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Maps registered graphics resources for use on the given stream.
+//
+// count     - number of entries in resources.
+// resources - graphics resource handles; converted to amd::Memory objects by
+//             hipSetInteropObjects() (its validation is not visible from here).
+// stream    - stream on which the acquire command is enqueued.
+//
+// glFinish_() is issued first so that outstanding GL work completes before the acquire.
+// Returns hipErrorUnknown when the GL environment, the stream or the command is unusable,
+// the error from hipSetInteropObjects() if it fails, and hipSuccess otherwise.
+hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,
+                                   hipStream_t stream) {
+  HIP_INIT_API(hipGraphicsMapResources, count, resources, stream);
+  amd::Context* amdContext = hip::getCurrentDevice()->asContext();
+  if (!amdContext || !amdContext->glenv()) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+  clearGLErrors(*amdContext);
+  amdContext->glenv()->glFinish_();
+  if (checkForGLError(*amdContext) != GL_NO_ERROR) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  if (!hip_stream->context().glenv() || !hip_stream->context().glenv()->isAssociated()) {
+    LogWarning("\"amdContext\" is not created from GL context or share list");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  std::vector<amd::Memory*> memObjects;
+  hipError_t err = hipSetInteropObjects(count, reinterpret_cast<void**>(resources), memObjects);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+
+  amd::Command::EventWaitList nullWaitList;
+
+  //! Now create command and enqueue
+  amd::AcquireExtObjectsCommand* command = new amd::AcquireExtObjectsCommand(
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS);
+  if (command == nullptr) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Make sure we have memory for the command execution
+  if (!command->validateMemory()) {
+    delete command;
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  command->enqueue();
+
+  // No event is handed back to the caller, so the reference taken at creation is dropped here.
+  // The previous condition tested the address of command->event(), which is never null, so
+  // the command was never released.
+  // NOTE(review): assumes the queue holds its own reference after enqueue() - confirm.
+  command->release();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Returns the device address and size of a registered graphics resource.
+//
+// devPtr   - [out] receives the virtual address of the resource's device memory.
+// size     - [out] receives amd::Memory::getSize() of the resource.
+// resource - graphics resource handle; reinterpreted as amd::Memory.
+//
+// Returns hipErrorUnknown when there is no GL-enabled context or the resource has no device
+// memory on the current device, hipErrorInvalidValue for null arguments, hipSuccess otherwise.
+hipError_t hipGraphicsResourceGetMappedPointer(void** devPtr, size_t* size,
+                                               hipGraphicsResource_t resource) {
+  HIP_INIT_API(hipGraphicsResourceGetMappedPointer, devPtr, size, resource);
+  amd::Context* amdContext = hip::getCurrentDevice()->asContext();
+  if (!amdContext || !amdContext->glenv()) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // All three arguments are dereferenced below; reject nulls instead of crashing.
+  if (devPtr == nullptr || size == nullptr || resource == nullptr) {
+    LogError("invalid devPtr/size/resource");
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  // Fixme: If more than one device is present in the context, we choose the first device.
+  // We should come up with a more elegant solution to handle this.
+  assert(amdContext->devices().size() == 1);
+
+  const auto it = amdContext->devices().cbegin();
+
+  amd::Device* curDev = *it;
+  amd::Memory* amdMem = reinterpret_cast<amd::Memory*>(resource);
+
+  // Interop resources don't have svm allocations they are added to
+  // amd::MemObjMap using device virtual address during creation.
+  device::Memory* mem = reinterpret_cast<device::Memory*>(amdMem->getDeviceMemory(*curDev));
+  if (mem == nullptr) {
+    LogError("resource has no device memory on the current device");
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Outputs are written only once both values are known to be available.
+  *size = amdMem->getSize();
+  *devPtr = reinterpret_cast<void*>(static_cast<uintptr_t>(mem->virtualAddress()));
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Unmaps graphics resources previously mapped on the given stream.
+//
+// count     - number of entries in resources.
+// resources - graphics resource handles; converted to amd::Memory objects by
+//             hipSetInteropObjects() (its validation is not visible from here).
+// stream    - stream that is drained and on which the release command is enqueued.
+//
+// Returns hipErrorContextIsDestroyed for an invalid stream, hipErrorUnknown when the stream
+// or the command is unusable, the error from hipSetInteropObjects() if it fails, and
+// hipSuccess otherwise.
+hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources,
+                                     hipStream_t stream) {
+  HIP_INIT_API(hipGraphicsUnmapResources, count, resources, stream);
+  if (!hip::isValid(stream)) {
+    HIP_RETURN(hipErrorContextIsDestroyed);
+  }
+
+  // Resolve the stream once and check it before it is dereferenced.
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Wait for the current host queue
+  hip_stream->finish();
+
+  std::vector<amd::Memory*> memObjects;
+  hipError_t err = hipSetInteropObjects(count, reinterpret_cast<void**>(resources), memObjects);
+  if (err != hipSuccess) {
+    HIP_RETURN(err);
+  }
+
+  amd::Command::EventWaitList nullWaitList;
+
+  // Now create command and enqueue
+  amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand(
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
+  if (command == nullptr) {
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  // Make sure we have memory for the command execution
+  if (!command->validateMemory()) {
+    delete command;
+    HIP_RETURN(hipErrorUnknown);
+  }
+
+  command->enqueue();
+
+  // No event is handed back to the caller, so the reference taken at creation is dropped here.
+  // The previous condition tested the address of command->event(), which is never null, so
+  // the command was never released.
+  // NOTE(review): assumes the queue holds its own reference after enqueue() - confirm.
+  command->release();
+
+  HIP_RETURN(hipSuccess);
+}
+
+// Destroys the interop memory object behind a registered graphics resource by
+// reinterpreting the handle as amd::BufferGL and deleting it. Always reports hipSuccess.
+// NOTE(review): handles produced by hipGraphicsGLRegisterImage hold an amd::ImageGL, and the
+// object is deleted directly rather than release()d - confirm both are intended.
+hipError_t hipGraphicsUnregisterResource(hipGraphicsResource_t resource) {
+  HIP_INIT_API(hipGraphicsUnregisterResource, resource);
+
+  delete reinterpret_cast<amd::BufferGL*>(resource);
+
+  HIP_RETURN(hipSuccess);
+}
@@ -0,0 +1,235 @@
+#include "hip_global.hpp"
+
+#include "hip/hip_runtime.h"
+#include "hip_internal.hpp"
+#include "hip_code_object.hpp"
+#include "platform/program.hpp"
+#include <hip/hip_version.h>
+
+// Returns the build name string baked in at compile time (HIP_VERSION_BUILD_NAME).
+// NOTE(review): the amd_dbgapi_ prefix suggests this is queried by debugger tooling - confirm.
+const char* amd_dbgapi_get_build_name(void) {
+  return HIP_VERSION_BUILD_NAME;
+}
+
+// Returns the git hash string baked in at compile time (HIP_VERSION_GITHASH).
+const char* amd_dbgapi_get_git_hash() {
+  return HIP_VERSION_GITHASH;
+}
+
+// Returns the numeric build id baked in at compile time (HIP_VERSION_BUILD_ID).
+size_t amd_dbgapi_get_build_id() {
+  return HIP_VERSION_BUILD_ID;
+}
+
+#ifdef __HIP_ENABLE_PCH
+extern const char __hip_pch_wave32[];
+extern const char __hip_pch_wave64[];
+extern unsigned __hip_pch_wave32_size;
+extern unsigned __hip_pch_wave64_size;
+// Selects the embedded precompiled header that matches the current device's warp size.
+//
+// pch  - [out] receives the address of the wave32 or wave64 PCH blob.
+// size - [out] receives the size of the selected blob.
+//
+// The wave32 blob is chosen only when the device query succeeds and reports warpSize == 32.
+// In every other case, including a failed query, the wave64 blob is returned, so that
+// uninitialized device properties are never read.
+void __hipGetPCH(const char** pch, unsigned int *size) {
+  hipDeviceProp_t deviceProp;
+  int deviceId = 0;
+  hipError_t error = hipGetDevice(&deviceId);
+  if (error == hipSuccess) {
+    error = hipGetDeviceProperties(&deviceProp, deviceId);
+  }
+  if (error != hipSuccess) {
+    LogPrintfError("Cannot query device properties for PCH selection, error: %d \n",
+                   static_cast<int>(error));
+  }
+
+  // deviceProp is only meaningful when both queries above succeeded.
+  if (error == hipSuccess && deviceProp.warpSize == 32) {
+    *pch = __hip_pch_wave32;
+    *size = __hip_pch_wave32_size;
+  } else {
+    *pch = __hip_pch_wave64;
+    *size = __hip_pch_wave64_size;
+  }
+}
+#endif
+namespace hip {
+
+//Device Vars
+// Creates the per-device object for the global variable called name in module hmod.
+//
+// name     - symbol name looked up in the device program.
+// hmod     - module handle; reinterpreted as a cl_program and converted with as_amd().
+// deviceId - index into g_devices; the first amd::Device of that entry is used.
+//
+// createGlobalVarObj() fills amd_mem_obj_, device_ptr_ and size_. A non-empty variable is
+// registered in amd::MemObjMap under its device pointer. Every failure is reported through
+// guarantee(false, ...).
+DeviceVar::DeviceVar(std::string name,
+                     hipModule_t hmod,
+                     int deviceId) :
+                     shadowVptr(nullptr), name_(name),
+                     amd_mem_obj_(nullptr), device_ptr_(nullptr),
+                     size_(0) {
+  amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
+  device::Program* dev_program =
+                   program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);
+
+  if (dev_program == nullptr) {
+    // The module handle is a pointer, so it is printed with %p rather than %x.
+    LogPrintfError("Cannot get Device Program for module: %p \n",
+                   reinterpret_cast<void*>(hmod));
+    guarantee(false, "Cannot get Device Program");
+  }
+
+  if(!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) {
+    LogPrintfError("Cannot create Global Var obj for symbol: %s \n", name.c_str());
+    guarantee(false, "Cannot create GlobalVar Obj");
+  }
+
+  // Handle size 0 symbols: they are left without a memory object and are not registered.
+  if (size_ != 0) {
+    if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) {
+      LogPrintfError("Cannot get memory for creating device Var: %s", name.c_str());
+      guarantee(false, "Cannot get memory for creating device var");
+    }
+    amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_);
+  }
+}
+
+// Unregisters and releases the variable's memory object, then unbinds and frees the
+// texture reference stored in shadowVptr, if any. The unbind result is ignored.
+DeviceVar::~DeviceVar() {
+  if (amd_mem_obj_ != nullptr) {
+    amd::MemObjMap::RemoveMemObj(device_ptr_);
+    amd_mem_obj_->release();
+  }
+
+  if (shadowVptr != nullptr) {
+    auto* boundTexture = static_cast<textureReference*>(shadowVptr);
+    static_cast<void>(ihipUnbindTexture(boundTexture));
+    delete boundTexture;
+    shadowVptr = nullptr;
+  }
+
+  size_ = 0;
+  device_ptr_ = nullptr;
+}
+
+//Device Functions
+// Looks up the symbol called name in module hmod and wraps it in a new amd::Kernel.
+// A missing symbol or a failed kernel allocation is reported through guarantee(false, ...).
+DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod) : dflock_("function lock"),
+                       name_(name), kernel_(nullptr) {
+  amd::Program* const owner = as_amd(reinterpret_cast<cl_program>(hmod));
+
+  const amd::Symbol* const entry = owner->findSymbol(name.c_str());
+  if (nullptr == entry) {
+    LogPrintfError("Cannot find Symbol with name: %s \n", name.c_str());
+    guarantee(false, "Cannot find Symbol");
+  }
+
+  kernel_ = new amd::Kernel(*owner, *entry, name);
+  if (nullptr == kernel_) {
+    LogPrintfError("Cannot create kernel with name: %s \n", name.c_str());
+    guarantee(false, "Cannot Create kernel");
+  }
+}
+
+// Drops this object's reference on the kernel, if one was created.
+DeviceFunc::~DeviceFunc() {
+  if (kernel_ == nullptr) {
+    return;
+  }
+  kernel_->release();
+}
+
+//Abstract functions
+// Records the function name and owning fat-binary slot, and sizes the per-device table to
+// one entry per element of g_devices.
+Function::Function(const std::string& name, FatBinaryInfo** modules)
+    : name_(name),
+      modules_(modules) {
+  dFunc_.resize(g_devices.size());
+}
+
+// Deletes every per-device DeviceFunc that was created and clears the remaining fields.
+Function::~Function() {
+  for (auto& perDevice : dFunc_) {
+    delete perDevice;
+  }
+  modules_ = nullptr;
+  name_ = "";
+}
+
+// Returns the hipFunction_t for this function in a dynamically loaded module.
+//
+// hfunc - [out] receives the handle of the current device's DeviceFunc.
+// hmod  - module used to create the DeviceFunc on first use.
+//
+// The current device is queried once through ihipGetDevice(). A result outside the
+// per-device table yields hipErrorInvalidDevice instead of an out-of-bounds access.
+hipError_t Function::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod) {
+  guarantee((dFunc_.size() == g_devices.size()), "dFunc Size mismatch");
+
+  const int deviceId = ihipGetDevice();
+  if (deviceId < 0 || static_cast<size_t>(deviceId) >= dFunc_.size()) {
+    return hipErrorInvalidDevice;
+  }
+
+  // Create the per-device function lazily and cache it for later calls.
+  if (dFunc_[deviceId] == nullptr) {
+    dFunc_[deviceId] = new DeviceFunc(name_, hmod);
+  }
+  *hfunc = dFunc_[deviceId]->asHipFunction();
+
+  return hipSuccess;
+}
+
+// Returns the hipFunction_t for this function on deviceId, building the program and
+// creating the cached DeviceFunc on first use. Errors from BuildProgram() or GetModule()
+// are returned through IHIP_RETURN_ONFAIL.
+hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
+  guarantee(modules_ != nullptr, "Module not initialized");
+
+  hipModule_t builtModule = nullptr;
+  IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
+  IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &builtModule));
+
+  auto& cached = dFunc_[deviceId];
+  if (cached == nullptr) {
+    cached = new DeviceFunc(name_, builtModule);
+  }
+
+  *hfunc = cached->asHipFunction();
+  return hipSuccess;
+}
+
+// Fills func_attr with the attributes of this function on deviceId, building the program
+// and creating the cached DeviceFunc on first use. Values come from the device kernel's
+// work-group info and the kernel signature; cacheModeCA, constSizeBytes and
+// preferredShmemCarveout are reported as 0 and ptxVersion as 30.
+hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) {
+  guarantee((modules_ != nullptr), "Module not initialized");
+
+  hipModule_t builtModule = nullptr;
+  IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
+  IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &builtModule));
+
+  if (dFunc_[deviceId] == nullptr) {
+    dFunc_[deviceId] = new DeviceFunc(name_, builtModule);
+  }
+
+  const std::vector<amd::Device*>& gpus = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
+
+  amd::Kernel* kern = dFunc_[deviceId]->kernel();
+  const device::Kernel::WorkGroupInfo* info =
+      kern->getDeviceKernel(*gpus[deviceId])->workGroupInfo();
+
+  // Memory footprint
+  func_attr->sharedSizeBytes = static_cast<int>(info->localMemSize_);
+  func_attr->binaryVersion = static_cast<int>(kern->signature().version());
+  func_attr->cacheModeCA = 0;
+  func_attr->constSizeBytes = 0;
+  func_attr->localSizeBytes = info->privateMemSize_;
+  // Dynamic shared memory is whatever LDS remains after the static allocation.
+  func_attr->maxDynamicSharedSizeBytes =
+      static_cast<int>(info->availableLDSSize_ - info->localMemSize_);
+
+  // Launch limits and register usage
+  func_attr->maxThreadsPerBlock = static_cast<int>(info->size_);
+  func_attr->numRegs = static_cast<int>(info->usedVGPRs_);
+  func_attr->preferredShmemCarveout = 0;
+  func_attr->ptxVersion = 30;
+
+  return hipSuccess;
+}
+
+//Abstract Vars
+// Constructs a variable described by size, type and norm; managedVarPtr_ is null and
+// align_ is 0. The per-device table gets one entry per element of g_devices.
+Var::Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
+         FatBinaryInfo** modules)
+    : name_(name), dVarKind_(dVarKind), size_(size),
+      type_(type), norm_(norm), modules_(modules),
+      managedVarPtr_(nullptr), align_(0) {
+  dVar_.resize(g_devices.size());
+}
+
+// Constructs a variable backed by an existing pointer with the given size and alignment;
+// type_ and norm_ are 0. The per-device table gets one entry per element of g_devices.
+Var::Var(const std::string& name, DeviceVarKind dVarKind, void *pointer, size_t size,
+         unsigned align, FatBinaryInfo** modules)
+    : name_(name), dVarKind_(dVarKind),
+      size_(size), modules_(modules),
+      managedVarPtr_(pointer), align_(align),
+      type_(0), norm_(0) {
+  dVar_.resize(g_devices.size());
+}
+
+// Deletes every per-device DeviceVar that was created and clears the module pointer.
+Var::~Var() {
+  for (auto& perDevice : dVar_) {
+    delete perDevice;
+  }
+  modules_ = nullptr;
+}
+
+// Returns the DeviceVar for deviceId, creating it from hmod on first use.
+// deviceId must index g_devices and the per-device table must already be sized; both are
+// enforced with guarantee().
+hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
+  guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
+  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
+            "Invalid DeviceId, greater than no of code objects");
+  guarantee((dVar_.size() == g_devices.size()),
+             "Device Var not initialized to size");
+
+  auto& cached = dVar_[deviceId];
+  if (cached == nullptr) {
+    cached = new DeviceVar(name_, hmod, deviceId);
+  }
+
+  *dvar = cached;
+  return hipSuccess;
+}
+
+// Returns the DeviceVar for deviceId, building the program and creating the DeviceVar on
+// first use.
+//
+// dvar     - [out] receives the cached DeviceVar.
+// deviceId - must index g_devices (enforced with guarantee()).
+//
+// Errors from BuildProgram() or GetModule() are returned through IHIP_RETURN_ONFAIL. The
+// per-device table size and the module pointer are checked with guarantee() before use,
+// matching the checks in Var::getDeviceVar and Function::getStatFunc.
+hipError_t Var::getStatDeviceVar(DeviceVar** dvar, int deviceId) {
+  guarantee((deviceId >= 0) , "Invalid DeviceId, less than zero");
+  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
+            "Invalid DeviceId, greater than no of code objects");
+  guarantee((dVar_.size() == g_devices.size()),
+             "Device Var not initialized to size");
+
+  if (dVar_[deviceId] == nullptr) {
+    // The module pointer is only dereferenced when the variable still has to be created.
+    guarantee((modules_ != nullptr), "Module not initialized");
+
+    hipModule_t hmod = nullptr;
+    IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
+    IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod));
+    dVar_[deviceId] = new DeviceVar(name_, hmod, deviceId);
+  }
+  *dvar = dVar_[deviceId];
+  return hipSuccess;
+}
+
+}; //namespace: hip
@@ -0,0 +1,128 @@
+#ifndef HIP_GLOBAL_HPP
+#define HIP_GLOBAL_HPP
+
+#include <vector>
+#include <string>
+
+#include "hip/hip_runtime_api.h"
+#include "hip/hip_runtime.h"
+#include "hip_internal.hpp"
+#include "hip_fatbin.hpp"
+#include "platform/program.hpp"
+
+namespace hip {
+
+//Forward Declaration
+class CodeObject;
+
+//Device Structures
+// Per-device instance of a variable. Exposes the variable's name, device
+// pointer and size through read-only accessors; the constructor and destructor
+// are defined outside this header.
+class DeviceVar {
+public:
+  // name: variable name; hmod: module handle the variable is looked up in;
+  // deviceId: index of the device this instance belongs to.
+  DeviceVar(std::string name, hipModule_t hmod, int deviceId);
+  ~DeviceVar();
+
+  //Accessors for device ptr and size, populated during constructor.
+  hipDeviceptr_t device_ptr() const { return device_ptr_; }
+  size_t size() const { return size_; }
+  // Returns the name by value (a copy is made on every call).
+  std::string name() const { return name_; }
+  // Public raw pointer that callers read and write directly.
+  // NOTE(review): no in-class initializer here; presumably set by the
+  // constructor or by the code that registers the variable - confirm.
+  void* shadowVptr;
+
+private:
+  std::string name_;           //Name of the var
+  amd::Memory* amd_mem_obj_;   //amd_mem_obj abstraction
+  hipDeviceptr_t device_ptr_;  //Device Pointer
+  size_t size_;                //Size of the var
+};
+
+// Per-device instance of a function: pairs the function name with the
+// amd::Kernel it refers to. The constructor and destructor are defined outside
+// this header.
+class DeviceFunc {
+public:
+  // name: function name; hmod: module handle the function is looked up in.
+  DeviceFunc(std::string name, hipModule_t hmod);
+  ~DeviceFunc();
+
+  // Public monitor object.
+  // NOTE(review): presumably serializes use of this function's kernel -
+  // confirm against the call sites.
+  amd::Monitor dflock_;
+
+  //Converts DeviceFunc to hipFunction_t(used by app) and vice versa.
+  // Both directions are plain reinterpret_casts: the hipFunction_t handle is
+  // the address of the DeviceFunc object itself.
+  hipFunction_t asHipFunction() { return reinterpret_cast<hipFunction_t>(this); }
+  static DeviceFunc* asFunction(hipFunction_t f) { return reinterpret_cast<DeviceFunc*>(f); }
+
+  //Accessor for kernel_ and name_ populated during constructor.
+  // name() returns a copy of the stored string.
+  std::string name() const { return name_; }
+  amd::Kernel* kernel() const { return kernel_; }
+
+private:
+  std::string name_;        //name of the func(not unique identifier)
+  amd::Kernel* kernel_;     //Kernel ptr referencing to ROCclr Symbol
+};
+
+//Abstract Structures
+// Host-side description of a function. Holds one DeviceFunc slot per device
+// (dFunc_) plus the module info the function is referenced from. Member
+// functions other than the inline accessors are defined outside this header.
+class Function {
+public:
+  // name: function name; modules: module info of the static module that
+  // references the function (nullptr when none is supplied).
+  Function(const std::string& name, FatBinaryInfo** modules=nullptr);
+  ~Function();
+
+  //Return DeviceFunc for this dynamically loaded module
+  hipError_t getDynFunc(hipFunction_t* hfunc, hipModule_t hmod);
+
+  //Return Device Func & attr . Generate/build if not already done so.
+  hipError_t getStatFunc(hipFunction_t *hfunc, int deviceId);
+  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId);
+
+  // Resizes the per-device slot vector to `size` entries.
+  void resize_dFunc(size_t size) { dFunc_.resize(size); }
+
+  // Read-only accessors; const-qualified so they are usable through a
+  // const Function as well, matching name().
+  FatBinaryInfo** moduleInfo() const { return modules_; }
+  const std::string& name() const { return name_; }
+
+private:
+  std::vector<DeviceFunc*> dFunc_;  //DeviceFuncObj per Device
+  std::string name_;                //name of the func(not unique identifier)
+  FatBinaryInfo** modules_;      // static module where it is referenced
+};
+
+// Host-side description of a device variable. Holds one DeviceVar slot per
+// device (dVar_), filled on demand by getDeviceVar()/getStatDeviceVar(), plus
+// the attributes supplied at construction.
+//
+// NOTE(review): ~Var() deletes the DeviceVar objects stored in dVar_, while
+// copy construction/assignment are still implicitly generated. Copying a Var
+// would leave two objects deleting the same pointers; keep instances
+// non-copied (e.g. hold them by pointer) or consider deleting the copy
+// operations once all call sites are confirmed not to copy.
+class Var {
+public:
+  //Types of variable
+  enum DeviceVarKind {
+    DVK_Variable = 0,
+    DVK_Surface,
+    DVK_Texture,
+    DVK_Managed
+  };
+
+  // Variable described by its size and type/norm attributes; no managed
+  // pointer is attached.
+  Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
+      FatBinaryInfo** modules = nullptr);
+
+  // Variable backed by an existing managed-memory pointer with the given size
+  // and alignment.
+  Var(const std::string& name, DeviceVarKind dVarKind, void *pointer, size_t size, unsigned align,
+      FatBinaryInfo** modules = nullptr);
+
+  ~Var();
+
+  //Return DeviceVar for this dynamically loaded module
+  hipError_t getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod);
+
+  //Return DeviceVar for module Generate/build if not already done so.
+  hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
+
+  // Resizes the per-device slot vector to `size` entries.
+  void resize_dVar(size_t size) { dVar_.resize(size); }
+
+  // Read-only accessors; all const-qualified so they can be used through a
+  // const Var.
+  FatBinaryInfo** moduleInfo() const { return modules_; }
+  DeviceVarKind getVarKind() const { return dVarKind_; }
+  size_t getSize() const { return size_; }
+  void* getManagedVarPtr() const { return managedVarPtr_; }
+
+  // Attaches a managed-memory pointer and size, and switches the variable kind
+  // to DVK_Managed. align_ is left unchanged.
+  void setManagedVarInfo(void* pointer, size_t size) {
+    managedVarPtr_ = pointer;
+    size_ = size;
+    dVarKind_ = DVK_Managed;
+  }
+private:
+  // Declaration order below is also the initialization order used by the
+  // constructors; keep constructor initializer lists in the same order.
+  std::vector<DeviceVar*> dVar_;   // DeviceVarObj per Device
+  std::string name_;               // Variable name (not unique identifier)
+  DeviceVarKind dVarKind_;         // Variable kind
+  size_t size_;                    // Size of the variable
+  int type_;                       // Type(Textures/Surfaces only)
+  int norm_;                       // Norm value (Textures/Surfaces only)
+  FatBinaryInfo** modules_;        // static module where it is referenced
+
+  void *managedVarPtr_;            // Managed memory pointer with size_ & align_
+  unsigned int align_;             // Managed memory alignment
+};
+
+}; //namespace: hip
+#endif /* HIP_GLOBAL_HPP */
--- a/显示更多
+++ b/显示更多