diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists.txt b/projects/rocr-runtime/libhsakmt/CMakeLists.txt
index 25b3af4af8..be539e2553 100644
--- a/projects/rocr-runtime/libhsakmt/CMakeLists.txt
+++ b/projects/rocr-runtime/libhsakmt/CMakeLists.txt
@@ -25,6 +25,9 @@
 
 cmake_minimum_required ( VERSION 3.6.3 )
 
+if (WIN_SDK)
+  include(${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists_wsl.txt)
+else ()
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 set ( HSAKMT "hsakmt" )
@@ -319,3 +322,4 @@ endif()
 ###########################
 # Use component packaging
 set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
+endif()
diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt b/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt
new file mode 100644
index 0000000000..e07f9f9932
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt
@@ -0,0 +1,309 @@
+################################################################################
+##
+## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required ( VERSION 3.15 )
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+set ( ROCDXG "rocdxg" )
+set ( ROCDXG_PACKAGE "rocdxg-roct" )
+set ( ROCDXG_COMPONENT "lib${ROCDXG}" )
+set ( ROCDXG_TARGET "${ROCDXG}" )
+set ( ROCDXG_VERSION "1.1.0")
+
+project ( ${ROCDXG_TARGET} VERSION ${ROCDXG_VERSION} )
+# Project/version initialized; expose version to code via target defs below
+
+# Optionally, build ROCDXG with ccache.
+set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build")
+if (ROCM_CCACHE_BUILD)
+  find_program(CCACHE_PROGRAM ccache)
+  if (CCACHE_PROGRAM)
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM})
+  else()
+    message(WARNING "Unable to find ccache. Falling back to real compiler")
+  endif() # if (CCACHE_PROGRAM)
+endif() # if (ROCM_CCACHE_BUILD)
+
+list( PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" )
+
+## Include common cmake modules
+include ( utils )
+include ( GNUInstallDirs )
+
+## Setup the package version.
+get_version ( "${ROCDXG_VERSION}" )
+# NOTE(review): get_version is presumably provided by the included utils module and sets VERSION_MAJOR/MINOR/PATCH (optionally VERSION_BUILD) - confirm.
+set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
+set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
+set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
+
+set ( LIB_VERSION_MAJOR ${VERSION_MAJOR})
+set ( LIB_VERSION_MINOR ${VERSION_MINOR})
+set ( LIB_VERSION_PATCH ${VERSION_PATCH})
+
+set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" )
+# Quote the expansion: a defined-but-empty VERSION_BUILD must compare as "", not vanish from the condition.
+if ( DEFINED VERSION_BUILD AND NOT "${VERSION_BUILD}" STREQUAL "" )
+    message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
+    set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
+endif ()
+set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
+
+## Compiler flags (kept as a list; applied to the target with target_compile_options).
+set ( ROCDXG_CXX_FLAGS -fPIC -include "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.h" )
+# -Wlogical-op is only added when the GNU compiler is detected.
+if ( CMAKE_COMPILER_IS_GNUCC )
+    list ( APPEND ROCDXG_CXX_FLAGS -Wlogical-op )
+endif ()
+if ( ROCDXG_WERROR )  # test the variable itself so its value is not dereferenced a second time
+    list ( APPEND ROCDXG_CXX_FLAGS -Werror )
+endif ()
+if ( "${CMAKE_BUILD_TYPE}" STREQUAL "Release" )
+    list ( APPEND ROCDXG_CXX_FLAGS -O2 )
+else ()
+    list ( APPEND ROCDXG_CXX_FLAGS -g )
+endif ()
+
+set ( ROCDXG_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.ver" )
+
+## Linker flags: one space-separated string, extended in place.
+## --enable-new-dtags is added to generate DT_RUNPATH.
+string ( APPEND ROCDXG_LINK_FLAGS " -Wl,--enable-new-dtags -Wl,--version-script=${ROCDXG_LINKER_SCRIPT} -Wl,-soname=${ROCDXG_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete" )
+
+## Undefined-symbol handling; the flag spelling differs between GNU and other compilers.
+if ( NOT CMAKE_COMPILER_IS_GNUCC )
+    string ( APPEND ROCDXG_LINK_FLAGS " -Wl,-undefined,error" )
+else ()
+    string ( APPEND ROCDXG_LINK_FLAGS " -Wl,-no-undefined" )
+endif ()
+
+## Source files
+set ( ROCDXG_SRC "src/dxg/debug.cpp"
+                 "src/dxg/events.cpp"
+                 "src/dxg/memory.cpp"
+                 "src/dxg/libdrm.cpp"
+                 "src/dxg/hsa.cpp"
+                 "src/dxg/openclose.cpp"
+                 "src/dxg/perfctr.cpp"
+                 "src/dxg/queues.cpp"
+                 "src/dxg/time.cpp"
+                 "src/dxg/topology.cpp"
+                 "src/dxg/spm.cpp"
+                 "src/dxg/version.cpp"
+                 "src/dxg/svm.cpp"
+                 "src/dxg/pc_sampling.cpp"
+                 "src/dxg/hsakmtmodel.cpp"
+                 "src/dxg/dxcore_loader.cpp"
+                 "src/dxg/ais.cpp"
+                 "src/dxg/wddm/device.cpp"
+                 "src/dxg/wddm/gpu_memory.cpp"
+                 "src/dxg/wddm/va_mgr.cpp"
+                 "src/dxg/wddm/queue.cpp"
+                 "src/dxg/wddm/cmd_util.cpp" )
+
+## Declare the library target name
+add_library (${ROCDXG_TARGET} SHARED "")
+
+## Add sources
+target_sources ( ${ROCDXG_TARGET} PRIVATE ${ROCDXG_SRC} )
+
+## Add headers.  The public headers need to point at their location in both build and install
+## directory layouts.  This declaration allows publishing library use data to downstream clients.
+target_include_directories( ${ROCDXG_TARGET}
+  PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PRIVATE
+  ${WIN_SDK}
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg )
+
+# Platform macros, scoped to this target rather than to every target in the directory.
+target_compile_definitions(${ROCDXG_TARGET} PRIVATE LINUX __AMD64__ LITTLEENDIAN_CPU HSA_LARGE_MODEL)
+# Ensure version macro is defined for this target
+target_compile_definitions(${ROCDXG_TARGET} PRIVATE ROCDXG_VERSION="${ROCDXG_VERSION}")
+
+target_link_directories(${ROCDXG_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/thunk_proxy)
+target_link_libraries(${ROCDXG_TARGET} PRIVATE thunk_proxy)
+
+set_property(TARGET ${ROCDXG_TARGET} PROPERTY LINK_FLAGS ${ROCDXG_LINK_FLAGS})
+
+## Set the VERSION and SOVERSION values
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" )
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" )
+
+find_package(PkgConfig)
+# get OS-info for OS-specific build dependencies
+get_os_info()
+# Check for libraries required for building. find_library(REQUIRED) needs CMake >= 3.18 (file minimum is 3.15), so fail explicitly.
+find_library(LIBC NAMES c)
+if(NOT LIBC)
+  message(FATAL_ERROR "Could not find the C library (libc)")
+endif()
+message(STATUS "LIBC:" ${LIBC})
+target_link_libraries ( ${ROCDXG_TARGET} PRIVATE pthread rt c ${CMAKE_DL_LIBS} )
+
+target_compile_options(${ROCDXG_TARGET} PRIVATE ${ROCDXG_CXX_FLAGS})
+
+## Define default paths and packages.
+if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
+  set ( CMAKE_INSTALL_PREFIX "/opt/rocm" )
+endif()
+set ( CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} CACHE STRING "Default installation directory." FORCE )
+set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"  CACHE STRING "Default packaging prefix." )
+set ( CPACK_GENERATOR "DEB"  CACHE STRING "Default packaging generators." )
+
+# Installs binaries and exports the library usage data to ${ROCDXG_TARGET}Targets
+install ( TARGETS ${ROCDXG_TARGET} EXPORT ${ROCDXG_TARGET}Targets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary )
+
+# Install public headers
+#install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${ROCDXG_TARGET} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+#  COMPONENT dev PATTERN "*drm*" EXCLUDE )
+
+# Record our usage data for clients find_package calls.
+install ( EXPORT ${ROCDXG_TARGET}Targets
+  FILE ${ROCDXG_TARGET}Targets.cmake
+  NAMESPACE ${ROCDXG_TARGET}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+  COMPONENT dev )
+
+# Adds the target alias rocdxg::rocdxg to the current build.
+# This isn't necessary today.  It's harmless preparation for some
+# hypothetical future in which we might be included by add_subdirectory()
+# in some other project's cmake file.  It allows uniform use of find_package()
+# and target_link_libraries() without regard to whether a target is external or
+# a subdirectory of the current build.
+add_library( ${ROCDXG_TARGET}::${ROCDXG_TARGET} ALIAS ${ROCDXG_TARGET} )
+
+# Create cmake configuration files
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(${ROCDXG_TARGET}-config.cmake.in
+                            ${ROCDXG_TARGET}-config.cmake
+                            INSTALL_DESTINATION
+                            ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET} )
+
+write_basic_package_version_file(${ROCDXG_TARGET}-config-version.cmake
+                 VERSION ${BUILD_VERSION_STRING}
+                 COMPATIBILITY
+                 AnyNewerVersion)
+
+install(FILES
+        ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config-version.cmake
+        DESTINATION
+        ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+        COMPONENT dev )
+
+# Optionally record the package's find module in the user's package cache.
+if ( NOT DEFINED EXPORT_TO_USER_PACKAGE_REGISTRY )
+  set ( EXPORT_TO_USER_PACKAGE_REGISTRY "off" )
+endif()
+set ( EXPORT_TO_USER_PACKAGE_REGISTRY ${EXPORT_TO_USER_PACKAGE_REGISTRY}
+             CACHE BOOL "Add cmake package config location to the user's cmake package registry.")
+if(${EXPORT_TO_USER_PACKAGE_REGISTRY})
+  # Enable writing to the registry
+  set(CMAKE_EXPORT_PACKAGE_REGISTRY ON)
+  # Generate a target file for the build
+  export(TARGETS ${ROCDXG_TARGET} NAMESPACE ${ROCDXG_TARGET}:: FILE ${ROCDXG_TARGET}Targets.cmake)
+  # Record the package in the user's cache.
+  export(PACKAGE ${ROCDXG_TARGET})
+endif()
+
+# Since librocdxg.pc and libhsakmt.pc are installed to the same pkgconfig directory,
+# we can directly use libhsakmt's header file path in the includedir.
+# This allows librocdxg to reference the same header files as libhsakmt without
+# duplicating header installation.
+configure_file ( librocdxg.pc.in librocdxg.pc @ONLY )
+
+install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/librocdxg.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig COMPONENT dev )
+
+install(CODE "execute_process(COMMAND ldconfig)" COMPONENT binary)
+
+###########################
+# Packaging directives
+###########################
+# Use component packaging
+set(CPACK_COMPONENTS_GROUPING IGNORE)
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
+set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH})
+set(CPACK_PACKAGE_CONTACT "AMD GFX mailing list <amd-gfx@lists.freedesktop.org>")
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
+set(CPACK_COMPONENT_DESCRIPTION "ROCDXG development package.\n This package includes the user-mode API interfaces\nused to interact with the ROCm driver.\n This package contains the libraries and cmake files for the ROCDXG package.")
+set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
+
+# Install License file
+install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )
+
+# Prepare final version for the CPACK use
+set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
+set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}")
+
+# Debian package specific variables
+set(CPACK_DEBIAN_PACKAGE_NAME "rocdxg-roct")
+
+# Debian binary package specific variables (runtime package)
+set(CPACK_DEBIAN_BINARY_PACKAGE_NAME "rocdxg-roct")
+set(CPACK_DEBIAN_BINARY_PACKAGE_DESCRIPTION "ROCDXG runtime package containing libraries")
+
+# Debian dev package specific variables
+set(CPACK_DEBIAN_DEV_PACKAGE_NAME "rocdxg-roct-dev")
+set(CPACK_DEBIAN_DEV_PACKAGE_DESCRIPTION "ROCDXG development package containing pkgconfig and cmake files")
+
+## Process the Debian install/remove scripts to update the CPACK variables
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY )
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY )
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "DEBIAN/postinst;DEBIAN/prerm")
+
+# Setting package dependencies
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
+set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "rocm-core")
+set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "rocdxg-roct (= ${PACKAGE_VERSION_STR}), rocm-core")
+
+# Set the names now using CPACK utility
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+
+# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake (inputs quoted so each is passed as a single string)
+if(NOT ROCM_DEP_ROCMCORE)
+    string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}")
+    string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS}")
+    string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${CPACK_DEBIAN_DEV_PACKAGE_DEPENDS}")
+endif()
+
+include(CPack)
+
+# Add component descriptions
+cpack_add_component(binary
+  DISPLAY_NAME "Runtime"
+  DESCRIPTION "ROCDXG runtime libraries")
+
+cpack_add_component(dev
+  DISPLAY_NAME "Development"
+  DESCRIPTION "ROCDXG development files (pkgconfig and cmake)")
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h
new file mode 100644
index 0000000000..2b56bd3aac
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h
@@ -0,0 +1,2171 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file amdgpu.h
+ *
+ * Declare public libdrm_amdgpu API
+ *
+ * This file defines the API exposed by the libdrm_amdgpu library.
+ * Users who want to use libdrm_amdgpu functionality must include
+ * this file.
+ *
+ */
+#ifndef _AMDGPU_H_
+#define _AMDGPU_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct drm_amdgpu_info_hw_ip;
+struct drm_amdgpu_bo_list_entry;
+struct drm_amdgpu_capability;
+
+/*--------------------------------------------------------------------------*/
+/* --------------------------- Defines ------------------------------------ */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Define max. number of Command Buffers (IB) which could be sent to the single
+ * hardware IP to accommodate CE/DE requirements
+ *
+ * \sa amdgpu_cs_ib_info
+*/
+#define AMDGPU_CS_MAX_IBS_PER_SUBMIT		4
+
+/**
+ * Special timeout value meaning that the timeout is infinite.
+ */
+#define AMDGPU_TIMEOUT_INFINITE			0xffffffffffffffffull
+
+/**
+ * Used in amdgpu_cs_query_fence_status(), meaning that the given timeout
+ * is absolute.
+ */
+#define AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE     (1 << 0)
+
+/*--------------------------------------------------------------------------*/
+/* ----------------------------- Enums ------------------------------------ */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Enum describing possible handle types
+ *
+ * \sa amdgpu_bo_import, amdgpu_bo_export
+ *
+*/
+enum amdgpu_bo_handle_type {
+	/** GEM flink name (needs DRM authentication, used by DRI2) */
+	amdgpu_bo_handle_type_gem_flink_name = 0,
+
+	/** KMS handle which is used by all driver ioctls */
+	amdgpu_bo_handle_type_kms = 1,
+
+	/** DMA-buf fd handle */
+	amdgpu_bo_handle_type_dma_buf_fd = 2,
+
+	/** Deprecated in favour of and same behaviour as
+	 * amdgpu_bo_handle_type_kms, use that instead of this
+	 */
+	amdgpu_bo_handle_type_kms_noimport = 3,
+};
+
+/** Define known types of GPU VM VA ranges */
+enum amdgpu_gpu_va_range
+{
+	/** Allocate from "normal"/general range */
+	amdgpu_gpu_va_range_general = 0
+};
+
+enum amdgpu_sw_info {
+	amdgpu_sw_info_address32_hi = 0,
+};
+
+/*--------------------------------------------------------------------------*/
+/* -------------------------- Datatypes ----------------------------------- */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Define opaque pointer to context associated with fd.
+ * This context will be returned as the result of the
+ * "initialize" function and should be passed as the first
+ * parameter to any API call.
+ */
+typedef struct amdgpu_device *amdgpu_device_handle;
+
+/**
+ * Define GPU Context type as pointer to opaque structure
+ * Example of GPU Context is the "rendering" context associated
+ * with OpenGL context (glCreateContext)
+ */
+typedef struct amdgpu_context *amdgpu_context_handle;
+
+/**
+ * Define handle for amdgpu resources: buffer, GDS, etc.
+ */
+typedef struct amdgpu_bo *amdgpu_bo_handle;
+
+/**
+ * Define handle for list of BOs
+ */
+typedef struct amdgpu_bo_list *amdgpu_bo_list_handle;
+
+/**
+ * Define handle to be used to work with VA allocated ranges
+ */
+typedef struct amdgpu_va *amdgpu_va_handle;
+
+/**
+ * Define handle dealing with VA allocation. An amdgpu_device
+ * owns one of these, but they can also be used without a device.
+ */
+typedef struct amdgpu_va_manager *amdgpu_va_manager_handle;
+
+/**
+ * Define handle for semaphore
+ */
+typedef struct amdgpu_semaphore *amdgpu_semaphore_handle;
+
+/**
+ * Define handle for sem file
+ */
+typedef uint32_t amdgpu_sem_handle;
+
+
+/*--------------------------------------------------------------------------*/
+/* -------------------------- Structures ---------------------------------- */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Structure describing memory allocation request
+ *
+ * \sa amdgpu_bo_alloc()
+ *
+*/
+struct amdgpu_bo_alloc_request {
+	/** Allocation request. It must be aligned correctly. */
+	uint64_t alloc_size;
+
+	/**
+	 * It may be required to have some specific alignment requirements
+	 * for physical back-up storage (e.g. for displayable surface).
+	 * If 0 there is no special alignment requirement
+	 */
+	uint64_t phys_alignment;
+
+	/**
+	 * UMD should specify where to allocate memory and how it
+	 * will be accessed by the CPU.
+	 */
+	uint32_t preferred_heap;
+
+	/** Additional flags passed on allocation */
+	uint64_t flags;
+};
+
+/**
+ * Special UMD specific information associated with buffer.
+ *
+ * It may be necessary to pass some buffer characteristics as part
+ * of buffer sharing. Such information is defined by the UMD and is
+ * opaque to libdrm_amdgpu as well as to the kernel driver.
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_query_info,
+ *     amdgpu_bo_import(), amdgpu_bo_export
+ *
+*/
+struct amdgpu_bo_metadata {
+	/** Special flag associated with surface */
+	uint64_t flags;
+
+	union {
+		/**
+		 * ASIC-specific tiling information (also used by DCE).
+		 * The encoding is defined by the AMDGPU_TILING_* definitions.
+		 */
+		uint64_t tiling_info;
+		/**
+		 * ASIC-specific swizzle information.
+		 * The encoding is defined by the AMDGPU_SWIZZLE_* definitions.
+		 */
+		uint64_t swizzle_info;
+	};
+
+	/** Size of metadata associated with the buffer, in bytes. */
+	uint32_t size_metadata;
+
+	/** UMD specific metadata. Opaque for kernel */
+	uint32_t umd_metadata[64];
+};
+
+/**
+ * Structure describing allocated buffer. Client may need
+ * to query such information as part of 'sharing' buffers mechanism
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_query_info(),
+ *     amdgpu_bo_import(), amdgpu_bo_export()
+*/
+struct amdgpu_bo_info {
+	/** Allocated memory size */
+	uint64_t alloc_size;
+
+	/**
+	 * It may be required to have some specific alignment requirements
+	 * for physical back-up storage.
+	 */
+	uint64_t phys_alignment;
+
+	/** Heap where to allocate memory. */
+	uint32_t preferred_heap;
+
+	/** Additional allocation flags. */
+	uint64_t alloc_flags;
+
+	/** Metadata associated with buffer if any. */
+	struct amdgpu_bo_metadata metadata;
+};
+
+/**
+ * Structure with information about "imported" buffer
+ *
+ * \sa amdgpu_bo_import()
+ *
+ */
+struct amdgpu_bo_import_result {
+	/** Handle of memory/buffer to use */
+	amdgpu_bo_handle buf_handle;
+
+	 /** Buffer size */
+	uint64_t alloc_size;
+};
+
+/**
+ *
+ * Structure to describe GDS partitioning information.
+ * \note OA and GWS resources are associated with the GDS partition
+ *
+ * \sa amdgpu_gpu_resource_query_gds_info
+ *
+*/
+struct amdgpu_gds_resource_info {
+	uint32_t gds_gfx_partition_size;
+	uint32_t compute_partition_size;
+	uint32_t gds_total_size;
+	uint32_t gws_per_gfx_partition;
+	uint32_t gws_per_compute_partition;
+	uint32_t oa_per_gfx_partition;
+	uint32_t oa_per_compute_partition;
+};
+
+/**
+ * Structure describing CS fence
+ *
+ * \sa amdgpu_cs_query_fence_status(), amdgpu_cs_request, amdgpu_cs_submit()
+ *
+*/
+struct amdgpu_cs_fence {
+
+	/** In which context IB was sent to execution */
+	amdgpu_context_handle context;
+
+	/** To which HW IP type the fence belongs */
+	uint32_t ip_type;
+
+	/** IP instance index if there are several IPs of the same type. */
+	uint32_t ip_instance;
+
+	/** Ring index of the HW IP */
+	uint32_t ring;
+
+	/** Specify fence for which we need to check submission status.*/
+	uint64_t fence;
+};
+
+/**
+ * Structure describing IB
+ *
+ * \sa amdgpu_cs_request, amdgpu_cs_submit()
+ *
+*/
+struct amdgpu_cs_ib_info {
+	/** Special flags */
+	uint64_t flags;
+
+	/** Virtual MC address of the command buffer */
+	uint64_t ib_mc_address;
+
+	/**
+	 * Size of Command Buffer to be submitted.
+	 *   - The size is in units of dwords (4 bytes).
+	 *   - Could be 0
+	 */
+	uint32_t size;
+};
+
+/**
+ * Structure describing fence information
+ *
+ * \sa amdgpu_cs_request, amdgpu_cs_query_fence,
+ *     amdgpu_cs_submit(), amdgpu_cs_query_fence_status()
+*/
+struct amdgpu_cs_fence_info {
+	/** buffer object for the fence */
+	amdgpu_bo_handle handle;
+
+	/** fence offset in the unit of sizeof(uint64_t) */
+	uint64_t offset;
+};
+
+/**
+ * Structure describing a submission request
+ *
+ * \note A request may carry several IBs as one packet, e.g. the CE, CE, DE case for gfx
+ *
+ * \sa amdgpu_cs_submit()
+*/
+struct amdgpu_cs_request {
+	/** Specify flags with additional information */
+	uint64_t flags;
+
+	/** Specify HW IP block type to which to send the IB. */
+	unsigned ip_type;
+
+	/** IP instance index if there are several IPs of the same type. */
+	unsigned ip_instance;
+
+	/**
+	 * Specify ring index of the IP. We could have several rings
+	 * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
+	 */
+	uint32_t ring;
+
+	/**
+	 * List handle with resources used by this request.
+	 */
+	amdgpu_bo_list_handle resources;
+
+	/**
+	 * Number of dependencies this command submission needs to
+	 * wait for before starting execution.
+	 */
+	uint32_t number_of_dependencies;
+
+	/**
+	 * Array of dependencies which need to be met before
+	 * execution can start.
+	 */
+	struct amdgpu_cs_fence *dependencies;
+
+	/** Number of IBs to submit in the field ibs. */
+	uint32_t number_of_ibs;
+
+	/**
+	 * IBs to submit. These IBs will be submitted together as a single entity
+	 */
+	struct amdgpu_cs_ib_info *ibs;
+
+	/**
+	 * The returned sequence number for the command submission
+	 */
+	uint64_t seq_no;
+
+	/**
+	 * The fence information
+	 */
+	struct amdgpu_cs_fence_info fence_info;
+};
+
+/**
+ * Structure which provides information about GPU VM MC address space
+ * alignment requirements
+ *
+ * \sa amdgpu_query_buffer_size_alignment
+ */
+struct amdgpu_buffer_size_alignments {
+	/** Size alignment requirement for allocation in
+	 * local memory */
+	uint64_t size_local;
+
+	/**
+	 * Size alignment requirement for allocation in remote memory
+	 */
+	uint64_t size_remote;
+};
+
+/**
+ * Structure which provides information about a heap
+ *
+ * \sa amdgpu_query_heap_info()
+ *
+ */
+struct amdgpu_heap_info {
+	/** Theoretical max. available memory in the given heap */
+	uint64_t heap_size;
+
+	/**
+	 * Number of bytes allocated in the heap. This includes all processes
+	 * and private allocations in the kernel. It changes when new buffers
+	 * are allocated, freed, and moved. It cannot be larger than
+	 * heap_size.
+	 */
+	uint64_t heap_usage;
+
+	/**
+	 * Theoretical possible max. size of a buffer which
+	 * could be allocated in the given heap
+	 */
+	uint64_t max_allocation;
+};
+
+/**
+ * Describe GPU h/w info needed for correct UMD initialization
+ *
+ * \sa amdgpu_query_gpu_info()
+*/
+struct amdgpu_gpu_info {
+	/** Asic id */
+	uint32_t asic_id;
+	/** Chip revision */
+	uint32_t chip_rev;
+	/** Chip external revision */
+	uint32_t chip_external_rev;
+	/** Family ID */
+	uint32_t family_id;
+	/** Special flags */
+	uint64_t ids_flags;
+	/** Max engine clock */
+	uint64_t max_engine_clk;
+	/** Max memory clock */
+	uint64_t max_memory_clk;
+	/** Number of shader engines */
+	uint32_t num_shader_engines;
+	/** Number of shader arrays per engine */
+	uint32_t num_shader_arrays_per_engine;
+	/** Number of available good shader pipes */
+	uint32_t avail_quad_shader_pipes;
+	/** Max. number of shader pipes (including good and bad pipes) */
+	uint32_t max_quad_shader_pipes;
+	/** Number of parameter cache entries per shader quad pipe */
+	uint32_t cache_entries_per_quad_pipe;
+	/** Number of available graphics contexts */
+	uint32_t num_hw_gfx_contexts;
+	/** Number of render backend pipes */
+	uint32_t rb_pipes;
+	/** Enabled render backend pipe mask */
+	uint32_t enabled_rb_pipes_mask;
+	/** Frequency of GPU Counter */
+	uint32_t gpu_counter_freq;
+	/** CC_RB_BACKEND_DISABLE.BACKEND_DISABLE per SE */
+	uint32_t backend_disable[4];
+	/** Value of MC_ARB_RAMCFG register */
+	uint32_t mc_arb_ramcfg;
+	/** Value of GB_ADDR_CONFIG */
+	uint32_t gb_addr_cfg;
+	/** Values of the GB_TILE_MODE0..31 registers */
+	uint32_t gb_tile_mode[32];
+	/** Values of GB_MACROTILE_MODE0..15 registers */
+	uint32_t gb_macro_tile_mode[16];
+	/** Value of PA_SC_RASTER_CONFIG register per SE */
+	uint32_t pa_sc_raster_cfg[4];
+	/** Value of PA_SC_RASTER_CONFIG_1 register per SE */
+	uint32_t pa_sc_raster_cfg1[4];
+	/* CU info (applies to the three cu_* fields below) */
+	uint32_t cu_active_number;
+	uint32_t cu_ao_mask;
+	uint32_t cu_bitmap[4][4];
+	/** Video memory type info */
+	uint32_t vram_type;
+	/** Video memory bit width */
+	uint32_t vram_bit_width;
+	/** Constant engine ram size */
+	uint32_t ce_ram_size;
+	/** VCE harvesting instance */
+	uint32_t vce_harvest_config;
+	/** PCI revision ID */
+	uint32_t pci_rev_id;
+};
+
+
+/*--------------------------------------------------------------------------*/
+/*------------------------- Functions --------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+/*
+ * Initialization / Cleanup
+ *
+*/
+
+/**
+ *
+ * \param   fd            - \c [in]  File descriptor for AMD GPU device
+ *                                   received previously as the result of
+ *                                   e.g. drmOpen() call.
+ *                                   For legacy fd type, the DRI2/DRI3
+ *                                   authentication should be done before
+ *                                   calling this function.
+ * \param   major_version - \c [out] Major version of library. It is assumed
+ *                                   that adding new functionality will cause
+ *                                   increase in major version
+ * \param   minor_version - \c [out] Minor version of library
+ * \param   device_handle - \c [out] Pointer to opaque context which should
+ *                                   be passed as the first parameter on each
+ *                                   API call
+ *
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ *
+ * \sa amdgpu_device_deinitialize()
+*/
+int amdgpu_device_initialize(int fd,
+			     uint32_t *major_version,
+			     uint32_t *minor_version,
+			     amdgpu_device_handle *device_handle);
+
+/**
+ * Same as amdgpu_device_initialize() except when deduplicate_device
+ * is false *and* fd points to a device that was already initialized.
+ * In this case, amdgpu_device_initialize would return the same
+ * amdgpu_device_handle while here amdgpu_device_initialize2 would
+ * return a new handle.
+ * amdgpu_device_initialize() should be preferred in most situations;
+ * the only use-case where not-deduplicating devices make sense is
+ * when one wants to have isolated device handles in the same process.
+ */
+int amdgpu_device_initialize2(int fd, bool deduplicate_device,
+			      uint32_t *major_version,
+			      uint32_t *minor_version,
+			      amdgpu_device_handle *device_handle);
+/**
+ *
+ * When access to the library is no longer needed, this
+ * function must be called to give it the opportunity to clean up any
+ * resources if needed.
+ *
+ * \param   device_handle - \c [in]  Context associated with file
+ *                                   descriptor for AMD GPU device
+ *                                   received previously as the
+ *                                   result e.g. of drmOpen() call.
+ *
+ * \return  0 on success\n
+ *         <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_device_initialize()
+ *
+*/
+int amdgpu_device_deinitialize(amdgpu_device_handle device_handle);
+
+/**
+ * Get the drm fd used for operations on a device.
+ * \param device_handle - \c [in] Device handle.
+ *                           See #amdgpu_device_initialize()
+ *
+ * \return Returns the drm fd used for operations on this
+ *         device. This is still owned by the library and hence
+ *         should not be closed. Guaranteed to be valid until
+ *         #amdgpu_device_deinitialize gets called.
+ *
+*/
+int amdgpu_device_get_fd(amdgpu_device_handle device_handle);
+
+/*
+ * Memory Management
+ *
+*/
+
+/**
+ * Allocate memory to be used by UMD for GPU related operations
+ *
+ * \param   dev		 - \c [in] Device handle.
+ *				   See #amdgpu_device_initialize()
+ * \param   alloc_buffer - \c [in] Pointer to the structure describing an
+ *				   allocation request
+ * \param   buf_handle	- \c [out] Allocated buffer handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_free()
+*/
+int amdgpu_bo_alloc(amdgpu_device_handle dev,
+		    struct amdgpu_bo_alloc_request *alloc_buffer,
+		    amdgpu_bo_handle *buf_handle);
+
+/**
+ * Associate opaque data with buffer to be queried by another UMD
+ *
+ * \param   buf_handle - \c [in] Buffer handle
+ * \param   info       - \c [in] Metadata to associate with the buffer
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+*/
+int amdgpu_bo_set_metadata(amdgpu_bo_handle buf_handle,
+			   struct amdgpu_bo_metadata *info);
+
+/**
+ * Query buffer information including metadata previously associated with
+ * buffer.
+ *
+ * \param   buf_handle - \c [in]   Buffer handle
+ * \param   info       - \c [out]  Structure describing buffer
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_alloc()
+*/
+int amdgpu_bo_query_info(amdgpu_bo_handle buf_handle,
+			 struct amdgpu_bo_info *info);
+
+/**
+ * Allow others to get access to buffer
+ *
+ * \param   buf_handle    - \c [in] Buffer handle
+ * \param   type          - \c [in] Type of handle requested
+ * \param   shared_handle - \c [out] Special "shared" handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_import()
+ *
+*/
+int amdgpu_bo_export(amdgpu_bo_handle buf_handle,
+		     enum amdgpu_bo_handle_type type,
+		     uint32_t *shared_handle);
+
+/**
+ * Request access to "shared" buffer
+ *
+ * \param   dev		  - \c [in] Device handle.
+ *				    See #amdgpu_device_initialize()
+ * \param   type	  - \c [in] Type of handle requested
+ * \param   shared_handle - \c [in] Shared handle received as the result of
+ *				    an "export" operation.
+ *				    See #amdgpu_bo_export()
+ * \param   output        - \c [out] Pointer to structure with information
+ *				     about imported buffer
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note  Buffer must be "imported" only using new "fd" (different from
+ *	  one used by "exporter").
+ *
+ * \sa amdgpu_bo_export()
+ *
+*/
+int amdgpu_bo_import(amdgpu_device_handle dev,
+		     enum amdgpu_bo_handle_type type,
+		     uint32_t shared_handle,
+		     struct amdgpu_bo_import_result *output);
+
+/**
+ * Allow others to get access to crtc's framebuffer
+ *
+ * \param   dev   - \c [in] Device handle.
+ *				   See #amdgpu_device_initialize()
+ * \param   fb_id - \c [out] the first crtc's framebuffer's buffer_id
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_get_bo_from_fb_id()
+ *
+*/
+int amdgpu_get_fb_id(amdgpu_device_handle dev, unsigned int *fb_id);
+
+/**
+ * Get the framebuffer's bo by fb_id
+ *
+ * \param   dev    - \c [in] Device handle.
+ *				    See #amdgpu_device_initialize()
+ * \param   fb_id  - \c [in] the framebuffer's buffer_id
+ * \param   output - \c [out] the bo of fb_id
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_get_fb_id()
+ *
+*/
+int amdgpu_get_bo_from_fb_id(amdgpu_device_handle dev, unsigned int fb_id, struct amdgpu_bo_import_result *output);
+
+/**
+ * Request GPU access to user allocated memory e.g. via "malloc"
+ *
+ * \param dev - [in] Device handle. See #amdgpu_device_initialize()
+ * \param cpu - [in] CPU address of user allocated memory which we
+ * want to map to GPU address space (make GPU accessible)
+ * (This address must be correctly aligned).
+ * \param size - [in] Size of allocation (must be correctly aligned)
+ * \param buf_handle - [out] Buffer handle for the userptr memory
+ * resource on submission and be used in other operations.
+ *
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note
+ * This call doesn't guarantee that such memory will be persistently
+ * "locked" / make non-pageable. The purpose of this call is to provide
+ * opportunity for GPU get access to this resource during submission.
+ *
+ * The maximum amount of memory which could be mapped in this call depends
+ * if overcommit is disabled or not. If overcommit is disabled than the max.
+ * amount of memory to be pinned will be limited by left "free" size in total
+ * amount of memory which could be locked simultaneously ("GART" size).
+ *
+ * Supported (theoretical) max. size of mapping is restricted only by
+ * "GART" size.
+ *
+ * It is responsibility of caller to correctly specify access rights
+ * on VA assignment.
+*/
+int amdgpu_create_bo_from_user_mem(amdgpu_device_handle dev,
+				    void *cpu, uint64_t size,
+				    amdgpu_bo_handle *buf_handle);
+
+/**
+ * Validate if the user memory comes from BO
+ *
+ * \param dev - [in] Device handle. See #amdgpu_device_initialize()
+ * \param cpu - [in] CPU address of user allocated memory which we
+ * want to map to GPU address space (make GPU accessible)
+ * (This address must be correctly aligned).
+ * \param size - [in] Size of allocation (must be correctly aligned)
+ * \param buf_handle - [out] Buffer handle for the userptr memory
+ * if the user memory is not from BO, the buf_handle will be NULL.
+ * \param offset_in_bo - [out] offset in this BO for this user memory
+ *
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle dev,
+				  void *cpu,
+				  uint64_t size,
+				  amdgpu_bo_handle *buf_handle,
+				  uint64_t *offset_in_bo);
+
+/**
+ * Request GPU access to physical memory from 3rd party device.
+ *
+ * \param dev - [in] Device handle. See #amdgpu_device_initialize()
+ * \param phys_address - [in] Physical address from 3rd party device which
+ * we want to map to GPU address space (make GPU accessible)
+ * (This address must be correctly aligned).
+ * \param size - [in] Size of allocation (must be correctly aligned)
+ * \param buf_handle - [out] Buffer handle for the userptr memory
+ * resource on submission and be used in other operations.
+ *
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note
+ * This call should guarantee that such memory will be persistently
+ * "locked" / make non-pageable. The purpose of this call is to provide
+ * opportunity for GPU get access to this resource during submission.
+ *
+ *
+ * Supported (theoretical) max. size of mapping is restricted only by
+ * capability.direct_gma_size. See #amdgpu_query_capability()
+ *
+ * It is responsibility of caller to correctly specify physical_address
+*/
+int amdgpu_create_bo_from_phys_mem(amdgpu_device_handle dev,
+				uint64_t phys_address, uint64_t size,
+				amdgpu_bo_handle *buf_handle);
+
+/**
+ * Get physical address from BO
+ *
+ * \param buf_handle - [in] Buffer handle for the physical address.
+ * \param phys_address - [out] Physical address of this BO.
+ *
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_bo_get_phys_address(amdgpu_bo_handle buf_handle,
+					uint64_t *phys_address);
+
+/**
+ * Free previously allocated memory
+ *
+ * \param   buf_handle - \c [in]  Buffer handle to free
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note In the case of memory shared between different applications all
+ *	 resources will be "physically" freed only once all such applications
+ *	 have terminated
+ * \note It is the UMD's responsibility to 'free' a buffer only when there
+ *	 is no more GPU access
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_alloc()
+ *
+*/
+int amdgpu_bo_free(amdgpu_bo_handle buf_handle);
+
+/**
+ * Increase the reference count of a buffer object
+ *
+ * \param   bo - \c [in]  Buffer object handle to increase the reference count
+ *
+ * \sa amdgpu_bo_alloc(), amdgpu_bo_free()
+ *
+*/
+void amdgpu_bo_inc_ref(amdgpu_bo_handle bo);
+
+/**
+ * Request CPU access to GPU accessible memory
+ *
+ * \param   buf_handle - \c [in] Buffer handle
+ * \param   cpu        - \c [out] CPU address to be used for access
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_cpu_unmap()
+ *
+*/
+int amdgpu_bo_cpu_map(amdgpu_bo_handle buf_handle, void **cpu);
+
+/**
+ * Release CPU access to GPU memory
+ *
+ * \param   buf_handle  - \c [in] Buffer handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_cpu_map()
+ *
+*/
+int amdgpu_bo_cpu_unmap(amdgpu_bo_handle buf_handle);
+
+/**
+ * Wait until a buffer is not used by the device.
+ *
+ * \param   buf_handle    - \c [in] Buffer handle.
+ * \param   timeout_ns    - \c [in] Timeout in nanoseconds.
+ * \param   buffer_busy   - \c [out] 0 if buffer is idle, all GPU access was
+ *                            completed and no GPU access is scheduled.
+ *                          1 GPU access is in flight or scheduled
+ *
+ * \return   0 - on success
+ *          <0 - Negative POSIX Error code
+ */
+int amdgpu_bo_wait_for_idle(amdgpu_bo_handle buf_handle,
+			    uint64_t timeout_ns,
+			    bool *buffer_busy);
+
+/**
+ * Creates a BO list handle for command submission.
+ *
+ * \param   dev			- \c [in] Device handle.
+ *				   See #amdgpu_device_initialize()
+ * \param   number_of_buffers	- \c [in] Number of BOs in the list
+ * \param   buffers		- \c [in] List of BO handles
+ * \param   result		- \c [out] Created BO list handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_destroy_raw(), amdgpu_cs_submit_raw2()
+*/
+int amdgpu_bo_list_create_raw(amdgpu_device_handle dev,
+			      uint32_t number_of_buffers,
+			      struct drm_amdgpu_bo_list_entry *buffers,
+			      uint32_t *result);
+
+/**
+ * Destroys a BO list handle.
+ *
+ * \param   dev		- \c [in] Device handle.
+ *				  See #amdgpu_device_initialize()
+ * \param   bo_list	- \c [in] BO list handle.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_create_raw(), amdgpu_cs_submit_raw2()
+*/
+int amdgpu_bo_list_destroy_raw(amdgpu_device_handle dev, uint32_t bo_list);
+
+/**
+ * Creates a BO list handle for command submission.
+ *
+ * \param   dev			- \c [in] Device handle.
+ *				   See #amdgpu_device_initialize()
+ * \param   number_of_resources	- \c [in] Number of BOs in the list
+ * \param   resources		- \c [in] List of BO handles
+ * \param   resource_prios	- \c [in] Optional priority for each handle
+ * \param   result		- \c [out] Created BO list handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_destroy()
+*/
+int amdgpu_bo_list_create(amdgpu_device_handle dev,
+			  uint32_t number_of_resources,
+			  amdgpu_bo_handle *resources,
+			  uint8_t *resource_prios,
+			  amdgpu_bo_list_handle *result);
+
+/**
+ * Destroys a BO list handle.
+ *
+ * \param   handle	- \c [in] BO list handle.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_create()
+*/
+int amdgpu_bo_list_destroy(amdgpu_bo_list_handle handle);
+
+/**
+ * Update resources for existing BO list
+ *
+ * \param   handle              - \c [in] BO list handle
+ * \param   number_of_resources - \c [in] Number of BOs in the list
+ * \param   resources           - \c [in] List of BO handles
+ * \param   resource_prios      - \c [in] Optional priority for each handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_create(), amdgpu_bo_list_destroy()
+*/
+int amdgpu_bo_list_update(amdgpu_bo_list_handle handle,
+			  uint32_t number_of_resources,
+			  amdgpu_bo_handle *resources,
+			  uint8_t *resource_prios);
+
+/*
+ * GPU Execution context
+ *
+*/
+
+/**
+ * Create GPU execution Context
+ *
+ * For the purpose of GPU Scheduler and GPU Robustness extensions it is
+ * necessary to have information/identify rendering/compute contexts.
+ * It also may be needed to associate some specific requirements with such
+ * contexts.  Kernel driver will guarantee that submission from the same
+ * context will always be executed in order (first come, first serve).
+ *
+ *
+ * \param   dev      - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   priority - \c [in] Context creation flags. See AMDGPU_CTX_PRIORITY_*
+ * \param   context  - \c [out] GPU Context handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_cs_ctx_free()
+ *
+*/
+int amdgpu_cs_ctx_create2(amdgpu_device_handle dev,
+			 uint32_t priority,
+			 amdgpu_context_handle *context);
+/**
+ * Create GPU execution Context
+ *
+ * Refer to amdgpu_cs_ctx_create2 for full documentation. This call
+ * is missing the priority parameter.
+ *
+ * \sa amdgpu_cs_ctx_create2()
+ *
+*/
+int amdgpu_cs_ctx_create(amdgpu_device_handle dev,
+			 amdgpu_context_handle *context);
+
+/**
+ *
+ * Destroy GPU execution context when not needed any more
+ *
+ * \param   context - \c [in] GPU Context handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_cs_ctx_create()
+ *
+*/
+int amdgpu_cs_ctx_free(amdgpu_context_handle context);
+
+/**
+ * Override the submission priority for the given context using a master fd.
+ *
+ * \param   dev        - \c [in] device handle
+ * \param   context    - \c [in] context handle for context id
+ * \param   master_fd  - \c [in] The master fd to authorize the override.
+ * \param   priority   - \c [in] The priority to assign to the context.
+ *
+ * \return 0 on success or a negative POSIX error code on failure.
+ */
+int amdgpu_cs_ctx_override_priority(amdgpu_device_handle dev,
+                                    amdgpu_context_handle context,
+                                    int master_fd,
+                                    unsigned priority);
+
+/**
+ * Set or query the stable power state for GPU profiling.
+ *
+ * \param   context    - \c [in] GPU Context handle
+ * \param   op         - \c [in] AMDGPU_CTX_OP_{GET,SET}_STABLE_PSTATE
+ * \param   flags      - \c [in] AMDGPU_CTX_STABLE_PSTATE_*
+ * \param   out_flags  - \c [out] output current stable pstate
+ *
+ * \return  0 on success otherwise POSIX Error code.
+ */
+int amdgpu_cs_ctx_stable_pstate(amdgpu_context_handle context,
+			        uint32_t op,
+			        uint32_t flags,
+			        uint32_t *out_flags);
+
+/**
+ * Query reset state for the specific GPU Context
+ *
+ * \param   context - \c [in]  GPU Context handle
+ * \param   state   - \c [out] One of AMDGPU_CTX_*_RESET
+ * \param   hangs   - \c [out] Number of hangs caused by the context.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_cs_ctx_create()
+ *
+*/
+int amdgpu_cs_query_reset_state(amdgpu_context_handle context,
+				uint32_t *state, uint32_t *hangs);
+
+/**
+ * Query reset state for the specific GPU Context.
+ *
+ * \param   context - \c [in]  GPU Context handle
+ * \param   flags   - \c [out] A combination of AMDGPU_CTX_QUERY2_FLAGS_*
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_cs_ctx_create()
+ *
+*/
+int amdgpu_cs_query_reset_state2(amdgpu_context_handle context,
+				 uint64_t *flags);
+
+/*
+ * Command Buffers Management
+ *
+*/
+
+/**
+ * Send request to submit command buffers to hardware.
+ *
+ * Kernel driver could use GPU Scheduler to decide when to physically
+ * send this request to the hardware. Accordingly this request could be put
+ * in queue and sent for execution later. The only guarantee is that requests
+ * from the same GPU context to the same ip:ip_instance:ring will be executed in
+ * order.
+ *
+ * The caller can specify the user fence buffer/location with the fence_info in the
+ * cs_request. The sequence number is returned via the 'seq_no' parameter
+ * in ibs_request structure.
+ *
+ *
+ * \param   context            - \c [in]  GPU Context
+ * \param   flags              - \c [in]  Global submission flags
+ * \param   ibs_request        - \c [in/out] Pointer to submission requests.
+ *					  We could submit to several
+ *					  engines/rings simultaneously as
+ *					  'atomic' operation
+ * \param   number_of_requests - \c [in]  Number of submission requests
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note It is required to pass correct resource list with buffer handles
+ *	 which will be accessible by command buffers from submission
+ *	 This will allow kernel driver to correctly implement "paging".
+ *	 Failure to do so will have unpredictable results.
+ *
+ * \sa amdgpu_command_buffer_alloc(), amdgpu_command_buffer_free(),
+ *     amdgpu_cs_query_fence_status()
+ *
+*/
+int amdgpu_cs_submit(amdgpu_context_handle context,
+		     uint64_t flags,
+		     struct amdgpu_cs_request *ibs_request,
+		     uint32_t number_of_requests);
+
+/**
+ *  Query status of Command Buffer Submission
+ *
+ * \param   fence   - \c [in] Structure describing fence to query
+ * \param   timeout_ns - \c [in] Timeout value to wait
+ * \param   flags   - \c [in] Flags for the query
+ * \param   expired - \c [out] If fence expired or not.\n
+ *				0  – if fence is not expired\n
+ *				!0 - otherwise
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \note If UMD wants only to check operation status and returned immediately
+ *	 then timeout value as 0 must be passed. In this case success will be
+ *	 returned in the case if submission was completed or timeout error
+ *	 code.
+ *
+ * \sa amdgpu_cs_submit()
+*/
+int amdgpu_cs_query_fence_status(struct amdgpu_cs_fence *fence,
+				 uint64_t timeout_ns,
+				 uint64_t flags,
+				 uint32_t *expired);
+
+/**
+ *  Wait for multiple fences
+ *
+ * \param   fences      - \c [in] The fence array to wait
+ * \param   fence_count - \c [in] The fence count
+ * \param   wait_all    - \c [in] If true, wait all fences to be signaled,
+ *                                otherwise, wait at least one fence
+ * \param   timeout_ns  - \c [in] The timeout to wait, in nanoseconds
+ * \param   status      - \c [out] '1' for signaled, '0' for timeout
+ * \param   first       - \c [out] the index of the first signaled fence from @fences
+ *
+ * \return  0 on success
+ *          <0 - Negative POSIX Error code
+ *
+ * \note    Currently it supports only one amdgpu_device. All fences come from
+ *          the same amdgpu_device with the same fd.
+*/
+int amdgpu_cs_wait_fences(struct amdgpu_cs_fence *fences,
+			  uint32_t fence_count,
+			  bool wait_all,
+			  uint64_t timeout_ns,
+			  uint32_t *status, uint32_t *first);
+
+/*
+ * Query / Info API
+ *
+*/
+
+/**
+ * Query allocation size alignments
+ *
+ * UMD should query information about GPU VM MC size alignments requirements
+ * to be able correctly choose required allocation size and implement
+ * internal optimization if needed.
+ *
+ * \param   dev  - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   info - \c [out] Pointer to structure to get size alignment
+ *			  requirements
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_buffer_size_alignment(amdgpu_device_handle dev,
+				       struct amdgpu_buffer_size_alignments
+						*info);
+
+/**
+ * Query firmware versions
+ *
+ * \param   dev	        - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   fw_type     - \c [in] AMDGPU_INFO_FW_*
+ * \param   ip_instance - \c [in] Index of the IP block of the same type.
+ * \param   index       - \c [in] Index of the engine. (for SDMA and MEC)
+ * \param   version     - \c [out] Pointer to the "version" return value
+ * \param   feature     - \c [out] Pointer to the "feature" return value
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_firmware_version(amdgpu_device_handle dev, unsigned fw_type,
+				  unsigned ip_instance, unsigned index,
+				  uint32_t *version, uint32_t *feature);
+
+/**
+ * Query the number of HW IP instances of a certain type.
+ *
+ * \param   dev      - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   type     - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   count    - \c [out] Pointer to structure to get information
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+*/
+int amdgpu_query_hw_ip_count(amdgpu_device_handle dev, unsigned type,
+			     uint32_t *count);
+
+/**
+ * Query engine information
+ *
+ * This query allows UMD to query information different engines and their
+ * capabilities.
+ *
+ * \param   dev         - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance - \c [in] Index of the IP block of the same type.
+ * \param   info        - \c [out] Pointer to structure to get information
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+*/
+int amdgpu_query_hw_ip_info(amdgpu_device_handle dev, unsigned type,
+			    unsigned ip_instance,
+			    struct drm_amdgpu_info_hw_ip *info);
+
+/**
+ * Query heap information
+ *
+ * This query allows UMD to query potentially available memory resources and
+ * adjust their logic if necessary.
+ *
+ * \param   dev   - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   heap  - \c [in] Heap type
+ * \param   flags - \c [in] Additional flags qualifying the heap query
+ *			    (accepted values are not documented here;
+ *			    TODO confirm against the implementation)
+ * \param   info  - \c [out] Pointer to structure to get needed information
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_heap_info(amdgpu_device_handle dev, uint32_t heap,
+			   uint32_t flags, struct amdgpu_heap_info *info);
+
+/**
+ * Get the CRTC ID from the mode object ID
+ *
+ * \param   dev    - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   id     - \c [in] Mode object ID
+ * \param   result - \c [out] Pointer to the CRTC ID
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_crtc_from_id(amdgpu_device_handle dev, unsigned id,
+			      int32_t *result);
+
+/**
+ * Query GPU H/w Info
+ *
+ * Query hardware specific information
+ *
+ * \param   dev  - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   info - \c [out] Pointer to structure to get needed information
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_gpu_info(amdgpu_device_handle dev,
+			   struct amdgpu_gpu_info *info);
+
+/**
+ * Query hardware or driver information.
+ *
+ * The return size is query-specific and depends on the "info_id" parameter.
+ * No more than "size" bytes is returned.
+ *
+ * \param   dev     - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   info_id - \c [in] AMDGPU_INFO_*
+ * \param   size    - \c [in] Size of the returned value.
+ * \param   value   - \c [out] Pointer to the return value.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX error code
+ *
+*/
+int amdgpu_query_info(amdgpu_device_handle dev, unsigned info_id,
+		      unsigned size, void *value);
+
+/**
+ * Query hardware or driver capabilities.
+ *
+ *
+ * \param   dev     - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   cap     - \c [out] Pointer to the returned capability structure.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX error code
+ *
+*/
+int amdgpu_query_capability(amdgpu_device_handle dev,
+			     struct drm_amdgpu_capability *cap);
+
+/**
+ * Query software (library/driver) information.
+ *
+ * The size of the returned value is query-specific and depends on the
+ * "info" parameter. This function takes no size argument, so the buffer
+ * behind "value" presumably must be large enough for the requested item
+ * (TODO confirm against the implementation).
+ *
+ * \param   dev     - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   info    - \c [in] amdgpu_sw_info_*
+ * \param   value   - \c [out] Pointer to the return value.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX error code
+ *
+*/
+int amdgpu_query_sw_info(amdgpu_device_handle dev, enum amdgpu_sw_info info,
+			 void *value);
+
+/**
+ * Query information about GDS
+ *
+ * \param   dev	     - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   gds_info - \c [out] Pointer to structure to get GDS information
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_gds_info(amdgpu_device_handle dev,
+			struct amdgpu_gds_resource_info *gds_info);
+
+/**
+ * Query information about sensor.
+ *
+ * The return size is query-specific and depends on the "sensor_type"
+ * parameter. No more than "size" bytes is returned.
+ *
+ * \param   dev         - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   sensor_type - \c [in] AMDGPU_INFO_SENSOR_*
+ * \param   size        - \c [in] Size of the returned value.
+ * \param   value       - \c [out] Pointer to the return value.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_sensor_info(amdgpu_device_handle dev, unsigned sensor_type,
+			     unsigned size, void *value);
+
+/**
+ * Query information about video capabilities
+ *
+ * The returned data is a struct drm_amdgpu_info_video_caps, i.e.
+ * sizeof(struct drm_amdgpu_info_video_caps) bytes.
+ *
+ * \param   dev         - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   cap_type    - \c [in] AMDGPU_INFO_VIDEO_CAPS_DECODE(ENCODE)
+ * \param   size        - \c [in] Size of the returned value.
+ * \param   value       - \c [out] Pointer to the return value.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_video_caps_info(amdgpu_device_handle dev, unsigned cap_type,
+                                 unsigned size, void *value);
+
+/**
+ * Query private aperture range
+ *
+ * \param dev    - [in] Device handle. See #amdgpu_device_initialize()
+ * \param start - \c [out] Start of private aperture
+ * \param end    - \c [out] End of private aperture
+ *
+ * \return  0 on success\n
+ *         <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_private_aperture(amdgpu_device_handle dev,
+			uint64_t *start,
+			uint64_t *end);
+
+/**
+ * Query shared aperture range
+ *
+ * \param dev    - [in] Device handle. See #amdgpu_device_initialize()
+ * \param start - \c [out] Start of shared aperture
+ * \param end    - \c [out] End of shared aperture
+ *
+ * \return 0 on success\n
+ *    <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_shared_aperture(amdgpu_device_handle dev,
+			uint64_t *start,
+			uint64_t *end);
+/**
+ * Query information about VM faults
+ *
+ * The return sizeof(struct drm_amdgpu_info_gpuvm_fault)
+ *
+ * \param   dev         - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   size        - \c [in] Size of the returned value.
+ * \param   value       - \c [out] Pointer to the return value.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_query_gpuvm_fault_info(amdgpu_device_handle dev, unsigned size,
+				  void *value);
+
+/**
+ * Read a set of consecutive memory-mapped registers.
+ * Not all registers are allowed to be read by userspace.
+ *
+ * \param   dev          - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   dword_offset - \c [in] Register offset in dwords
+ * \param   count        - \c [in] The number of registers to read starting
+ *                                 from the offset
+ * \param   instance     - \c [in] GRBM_GFX_INDEX selector. It may have other
+ *                                 uses. Set it to 0xffffffff if unsure.
+ * \param   flags        - \c [in] Flags with additional information.
+ * \param   values       - \c [out] The pointer to return values.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX error code
+ *
+*/
+int amdgpu_read_mm_registers(amdgpu_device_handle dev, unsigned dword_offset,
+			     unsigned count, uint32_t instance, uint32_t flags,
+			     uint32_t *values);
+
+/**
+ * Flag to request VA address range in the 32bit address space
+*/
+#define AMDGPU_VA_RANGE_32_BIT		0x1
+#define AMDGPU_VA_RANGE_HIGH		0x2
+#define AMDGPU_VA_RANGE_REPLAYABLE	0x4
+
+/**
+ * Allocate virtual address range
+ *
+ * \param dev - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param va_range_type - \c [in] Type of MC va range from which to allocate
+ * \param size - \c [in] Size of range. Size must be correctly aligned.
+ * It is the client's responsibility to correctly align the size based on the
+ * future usage of the allocated range.
+ * \param va_base_alignment - \c [in] Overwrite base address alignment
+ * requirement for GPU VM MC virtual
+ * address assignment. Must be a multiple of the size alignments received as
+ * 'amdgpu_buffer_size_alignments'.
+ * If 0 use the default one.
+ * \param va_base_required - \c [in] Specified required va base address.
+ * If 0 then the library chooses an available one.
+ * If a non-zero value is passed and that address is already "in use" then
+ * the corresponding error status will be returned.
+ * \param va_base_allocated - \c [out] On return: Allocated VA base to be used
+ * by client.
+ * \param va_range_handle - \c [out] On return: Handle assigned to allocation
+ * \param flags - \c [in] flags for special VA range
+ * (see the AMDGPU_VA_RANGE_* defines)
+ *
+ * \return 0 on success\n
+ * >0 - AMD specific error code\n
+ * <0 - Negative POSIX Error code
+ *
+ * \note
+ * It is the client's responsibility to correctly handle VA assignments and
+ * usage. Neither the kernel driver nor libdrm_amdgpu are able to prevent and
+ * detect wrong va assignment.
+ *
+ * It is the client's responsibility to correctly handle multi-GPU cases and
+ * to pass the corresponding arrays of all devices handles where corresponding
+ * VA will be used.
+ *
+*/
+int amdgpu_va_range_alloc(amdgpu_device_handle dev,
+			   enum amdgpu_gpu_va_range va_range_type,
+			   uint64_t size,
+			   uint64_t va_base_alignment,
+			   uint64_t va_base_required,
+			   uint64_t *va_base_allocated,
+			   amdgpu_va_handle *va_range_handle,
+			   uint64_t flags);
+
+/**
+ * Free previously allocated virtual address range
+ *
+ *
+ * \param va_range_handle - \c [in] Handle assigned to VA allocation
+ *
+ * \return 0 on success\n
+ * >0 - AMD specific error code\n
+ * <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_va_range_free(amdgpu_va_handle va_range_handle);
+
+/**
+ * Return the starting address of the allocated virtual address range.
+ */
+uint64_t amdgpu_va_get_start_addr(amdgpu_va_handle va_handle);
+
+/**
+* Query virtual address range
+*
+* UMD can query GPU VM range supported by each device
+* to initialize its own VAM accordingly.
+*
+* \param   dev    - \c [in] Device handle. See #amdgpu_device_initialize()
+* \param   type   - \c [in] Type of virtual address range
+* \param   start  - \c [out] Start of virtual address range
+* \param   end    - \c [out] End of virtual address range
+*
+* \return   0 on success\n
+*          <0 - Negative POSIX Error code
+*
+*/
+
+int amdgpu_va_range_query(amdgpu_device_handle dev,
+			  enum amdgpu_gpu_va_range type,
+			  uint64_t *start,
+			  uint64_t *end);
+
+/**
+ * Allocate an amdgpu_va_manager object.
+ * The returned object has to be initialized with amdgpu_va_manager_init()
+ * before use.
+ * On release, amdgpu_va_manager_deinit() needs to be called, then the memory
+ * can be released using free().
+ */
+amdgpu_va_manager_handle amdgpu_va_manager_alloc(void);
+
+void amdgpu_va_manager_init(amdgpu_va_manager_handle va_mgr,
+			    uint64_t low_va_offset, uint64_t low_va_max,
+			    uint64_t high_va_offset, uint64_t high_va_max,
+			    uint32_t virtual_address_alignment);
+
+void amdgpu_va_manager_deinit(amdgpu_va_manager_handle va_mgr);
+
+/**
+ * Similar to #amdgpu_va_range_alloc() but allocates VA
+ * directly from an amdgpu_va_manager_handle instead of using
+ * the manager from an amdgpu_device.
+ */
+
+int amdgpu_va_range_alloc2(amdgpu_va_manager_handle va_mgr,
+			   enum amdgpu_gpu_va_range va_range_type,
+			   uint64_t size,
+			   uint64_t va_base_alignment,
+			   uint64_t va_base_required,
+			   uint64_t *va_base_allocated,
+			   amdgpu_va_handle *va_range_handle,
+			   uint64_t flags);
+
+/**
+ *  VA mapping/unmapping for the buffer object
+ *
+ * \param  bo		- \c [in] BO handle
+ * \param  offset	- \c [in] Start offset to map
+ * \param  size		- \c [in] Size to map
+ * \param  addr		- \c [in] Start virtual address.
+ * \param  flags	- \c [in] Supported flags for mapping/unmapping
+ * \param  ops		- \c [in] AMDGPU_VA_OP_MAP or AMDGPU_VA_OP_UNMAP
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+
+int amdgpu_bo_va_op(amdgpu_bo_handle bo,
+		    uint64_t offset,
+		    uint64_t size,
+		    uint64_t addr,
+		    uint64_t flags,
+		    uint32_t ops);
+
+/**
+ *  VA mapping/unmapping for a buffer object or PRT region.
+ *
+ * This is not a simple drop-in extension for amdgpu_bo_va_op; instead, all
+ * parameters are treated "raw", i.e. size is not automatically aligned, and
+ * all flags must be specified explicitly.
+ *
+ * \param  dev		- \c [in] device handle
+ * \param  bo		- \c [in] BO handle (may be NULL)
+ * \param  offset	- \c [in] Start offset to map
+ * \param  size		- \c [in] Size to map
+ * \param  addr		- \c [in] Start virtual address.
+ * \param  flags	- \c [in] Supported flags for mapping/unmapping
+ * \param  ops		- \c [in] AMDGPU_VA_OP_MAP or AMDGPU_VA_OP_UNMAP
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+
+int amdgpu_bo_va_op_raw(amdgpu_device_handle dev,
+			amdgpu_bo_handle bo,
+			uint64_t offset,
+			uint64_t size,
+			uint64_t addr,
+			uint64_t flags,
+			uint32_t ops);
+
+/**
+ *  create semaphore
+ *
+ * \param   sem	   - \c [out] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_create_semaphore(amdgpu_semaphore_handle *sem);
+
+/**
+ *  signal semaphore
+ *
+ * \param   ctx            - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem	           - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_signal_semaphore(amdgpu_context_handle ctx,
+			       uint32_t ip_type,
+			       uint32_t ip_instance,
+			       uint32_t ring,
+			       amdgpu_semaphore_handle sem);
+
+/**
+ *  wait semaphore
+ *
+ * \param   ctx            - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem	           - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_wait_semaphore(amdgpu_context_handle ctx,
+			     uint32_t ip_type,
+			     uint32_t ip_instance,
+			     uint32_t ring,
+			     amdgpu_semaphore_handle sem);
+
+/**
+ *  destroy semaphore
+ *
+ * \param   sem	    - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_destroy_semaphore(amdgpu_semaphore_handle sem);
+
+/**
+ *  create sem
+ *
+ * \param   dev    - [in] Device handle. See #amdgpu_device_initialize()
+ * \param   sem	   - \c [out] sem handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_create_sem(amdgpu_device_handle dev,
+			 amdgpu_sem_handle *sem);
+
+/**
+ *  signal sem
+ *
+ * \param   dev            - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   ctx            - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem            - \c [in] sem handle (passed by value)
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_signal_sem(amdgpu_device_handle dev,
+			 amdgpu_context_handle ctx,
+			 uint32_t ip_type,
+			 uint32_t ip_instance,
+			 uint32_t ring,
+			 amdgpu_sem_handle sem);
+
+/**
+ *  wait sem
+ *
+ * \param   dev            - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   ctx            - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem            - \c [in] sem handle (passed by value)
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_wait_sem(amdgpu_device_handle dev,
+		       amdgpu_context_handle ctx,
+		       uint32_t ip_type,
+		       uint32_t ip_instance,
+		       uint32_t ring,
+		       amdgpu_sem_handle sem);
+
+int amdgpu_cs_export_sem(amdgpu_device_handle dev,
+			  amdgpu_sem_handle sem,
+			  int *shared_handle);
+
+int amdgpu_cs_import_sem(amdgpu_device_handle dev,
+			  int shared_handle,
+			  amdgpu_sem_handle *sem);
+
+/**
+ *  destroy sem
+ *
+ * \param   dev    - \c [in] Device handle. See #amdgpu_device_initialize()
+ * \param   sem    - \c [in] sem handle (passed by value)
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_destroy_sem(amdgpu_device_handle dev,
+			  amdgpu_sem_handle sem);
+
+/**
+ *  reserve vmid for this process
+ *
+ * \param   dev    - [in] Device handle. See #amdgpu_device_initialize()
+ */
+int amdgpu_cs_reserved_vmid(amdgpu_device_handle dev);
+
+/**
+ *  unreserve vmid for this process
+ *
+ * \param   dev    - [in] Device handle. See #amdgpu_device_initialize()
+ */
+int amdgpu_cs_unreserved_vmid(amdgpu_device_handle dev);
+
+/**
+ *  Get the ASIC marketing name
+ *
+ * \param   dev         - \c [in] Device handle. See #amdgpu_device_initialize()
+ *
+ * \return  the constant string of the marketing name
+ *          "NULL" means the ASIC is not found
+*/
+const char *amdgpu_get_marketing_name(amdgpu_device_handle dev);
+
+/**
+ *  Create kernel sync object
+ *
+ * \param   dev         - \c [in]  device handle
+ * \param   flags       - \c [in]  flags that affect creation
+ * \param   syncobj     - \c [out] sync object handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_create_syncobj2(amdgpu_device_handle dev,
+			      uint32_t  flags,
+			      uint32_t *syncobj);
+
+/**
+ *  Create kernel sync object
+ *
+ * \param   dev	      - \c [in]  device handle
+ * \param   syncobj   - \c [out] sync object handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_create_syncobj(amdgpu_device_handle dev,
+			     uint32_t *syncobj);
+/**
+ *  Destroy kernel sync object
+ *
+ * \param   dev	    - \c [in] device handle
+ * \param   syncobj - \c [in] sync object handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_destroy_syncobj(amdgpu_device_handle dev,
+			      uint32_t syncobj);
+
+/**
+ * Reset kernel sync objects to unsignalled state.
+ *
+ * \param dev           - \c [in] device handle
+ * \param syncobjs      - \c [in] array of sync object handles
+ * \param syncobj_count - \c [in] number of handles in syncobjs
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_syncobj_reset(amdgpu_device_handle dev,
+			    const uint32_t *syncobjs, uint32_t syncobj_count);
+
+/**
+ * Signal kernel sync objects.
+ *
+ * \param dev           - \c [in] device handle
+ * \param syncobjs      - \c [in] array of sync object handles
+ * \param syncobj_count - \c [in] number of handles in syncobjs
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_syncobj_signal(amdgpu_device_handle dev,
+			     const uint32_t *syncobjs, uint32_t syncobj_count);
+
+/**
+ * Signal kernel timeline sync objects.
+ *
+ * \param dev           - \c [in] device handle
+ * \param syncobjs      - \c [in] array of sync object handles
+ * \param points	- \c [in] array of timeline points
+ * \param syncobj_count - \c [in] number of handles in syncobjs
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_syncobj_timeline_signal(amdgpu_device_handle dev,
+				      const uint32_t *syncobjs,
+				      uint64_t *points,
+				      uint32_t syncobj_count);
+
+/**
+ *  Wait for one or all sync objects to signal.
+ *
+ * \param   dev	    - \c [in] device handle
+ * \param   handles - \c [in] array of sync object handles
+ * \param   num_handles - \c [in] number of entries in handles
+ * \param   timeout_nsec - \c [in] timeout in nanoseconds (per the _nsec suffix)
+ * \param   flags   - \c [in] a bitmask of DRM_SYNCOBJ_WAIT_FLAGS_*
+ * \param   first_signaled - \c [out] presumably index of the first signaled handle (TODO confirm)
+ *
+ * \return   0 on success\n
+ *          -ETIME - Timeout
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_wait(amdgpu_device_handle dev,
+			   uint32_t *handles, unsigned num_handles,
+			   int64_t timeout_nsec, unsigned flags,
+			   uint32_t *first_signaled);
+
+/**
+ *  Wait for one or all sync objects on their points to signal.
+ *
+ * \param   dev	    - \c [in] device handle
+ * \param   handles - \c [in] array of sync object handles
+ * \param   points - \c [in] array of sync points to wait
+ * \param   num_handles - \c [in] number of entries in handles
+ * \param   timeout_nsec - \c [in] timeout in nanoseconds (per the _nsec suffix)
+ * \param   flags   - \c [in] a bitmask of DRM_SYNCOBJ_WAIT_FLAGS_*
+ * \param   first_signaled - \c [out] presumably index of the first signaled handle (TODO confirm)
+ *
+ * \return   0 on success\n
+ *          -ETIME - Timeout
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_timeline_wait(amdgpu_device_handle dev,
+				    uint32_t *handles, uint64_t *points,
+				    unsigned num_handles,
+				    int64_t timeout_nsec, unsigned flags,
+				    uint32_t *first_signaled);
+/**
+ *  Query sync objects payloads.
+ *
+ * \param   dev	    - \c [in] self-explanatory
+ * \param   handles - \c [in] array of sync object handles
+ * \param   points - \c [out] array of sync points returned, which presents
+ * syncobj payload.
+ * \param   num_handles - \c [in] self-explanatory
+ *
+ * \return   0 on success\n
+ *          -ETIME - Timeout
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_query(amdgpu_device_handle dev,
+			    uint32_t *handles, uint64_t *points,
+			    unsigned num_handles);
+/**
+ *  Query sync objects last signaled or submitted point.
+ *
+ * \param   dev	    - \c [in] self-explanatory
+ * \param   handles - \c [in] array of sync object handles
+ * \param   points - \c [out] array of sync points returned, which presents
+ * syncobj payload.
+ * \param   num_handles - \c [in] self-explanatory
+ * \param   flags   - \c [in] a bitmask of DRM_SYNCOBJ_QUERY_FLAGS_*
+ *
+ * \return   0 on success\n
+ *          -ETIME - Timeout
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_query2(amdgpu_device_handle dev,
+			     uint32_t *handles, uint64_t *points,
+			     unsigned num_handles, uint32_t flags);
+
+/**
+ *  Export kernel sync object to shareable fd.
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   syncobj    - \c [in] sync object handle
+ * \param   shared_fd  - \c [out] shared file descriptor.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_export_syncobj(amdgpu_device_handle dev,
+			     uint32_t syncobj,
+			     int *shared_fd);
+/**
+ *  Import kernel sync object from shareable fd.
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   shared_fd  - \c [in] shared file descriptor.
+ * \param   syncobj    - \c [out] sync object handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_import_syncobj(amdgpu_device_handle dev,
+			     int shared_fd,
+			     uint32_t *syncobj);
+
+/**
+ *  Export kernel sync object to a sync_file.
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   syncobj    - \c [in] sync object handle
+ * \param   sync_file_fd - \c [out] sync_file file descriptor.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_export_sync_file(amdgpu_device_handle dev,
+				       uint32_t syncobj,
+				       int *sync_file_fd);
+
+/**
+ *  Import kernel sync object from a sync_file.
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   syncobj    - \c [in] sync object handle
+ * \param   sync_file_fd - \c [in] sync_file file descriptor.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_import_sync_file(amdgpu_device_handle dev,
+				       uint32_t syncobj,
+				       int sync_file_fd);
+/**
+ *  Export kernel timeline sync object to a sync_file.
+ *
+ * \param   dev		- \c [in] device handle
+ * \param   syncobj	- \c [in] sync object handle
+ * \param   point	- \c [in] timeline point
+ * \param   flags	- \c [in] flags
+ * \param   sync_file_fd - \c [out] sync_file file descriptor.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_export_sync_file2(amdgpu_device_handle dev,
+					uint32_t syncobj,
+					uint64_t point,
+					uint32_t flags,
+					int *sync_file_fd);
+
+/**
+ *  Import kernel timeline sync object from a sync_file.
+ *
+ * \param   dev		- \c [in] device handle
+ * \param   syncobj	- \c [in] sync object handle
+ * \param   point	- \c [in] timeline point
+ * \param   sync_file_fd - \c [in] sync_file file descriptor.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_import_sync_file2(amdgpu_device_handle dev,
+					uint32_t syncobj,
+					uint64_t point,
+					int sync_file_fd);
+
+/**
+ *  Transfer between syncobjs.
+ *
+ * \param   dev		- \c [in] device handle
+ * \param   dst_handle	- \c [in] sync object handle
+ * \param   dst_point	- \c [in] timeline point, 0 means dst is binary
+ * \param   src_handle	- \c [in] sync object handle
+ * \param   src_point	- \c [in] timeline point, 0 means src is binary
+ * \param   flags	- \c [in] flags
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+int amdgpu_cs_syncobj_transfer(amdgpu_device_handle dev,
+			       uint32_t dst_handle,
+			       uint64_t dst_point,
+			       uint32_t src_handle,
+			       uint64_t src_point,
+			       uint32_t flags);
+
+/**
+ * Export an amdgpu fence as a handle (syncobj or fd).
+ *
+ * \param what		AMDGPU_FENCE_TO_HANDLE_GET_{SYNCOBJ, FD}
+ * \param out_handle	returned handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ */
+int amdgpu_cs_fence_to_handle(amdgpu_device_handle dev,
+			      struct amdgpu_cs_fence *fence,
+			      uint32_t what,
+			      uint32_t *out_handle);
+
+/**
+ *  Submit raw command submission to kernel
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   context    - \c [in] context handle for context id
+ * \param   bo_list_handle - \c [in] request bo list handle (0 for none)
+ * \param   num_chunks - \c [in] number of CS chunks to submit
+ * \param   chunks     - \c [in] array of CS chunks
+ * \param   seq_no     - \c [out] output sequence number for submission.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ */
+struct drm_amdgpu_cs_chunk;
+struct drm_amdgpu_cs_chunk_dep;
+struct drm_amdgpu_cs_chunk_data;
+
+int amdgpu_cs_submit_raw(amdgpu_device_handle dev,
+			 amdgpu_context_handle context,
+			 amdgpu_bo_list_handle bo_list_handle,
+			 int num_chunks,
+			 struct drm_amdgpu_cs_chunk *chunks,
+			 uint64_t *seq_no);
+
+/**
+ * Submit raw command submission to the kernel with a raw BO list handle.
+ *
+ * \param   dev	       - \c [in] device handle
+ * \param   context    - \c [in] context handle for context id
+ * \param   bo_list_handle - \c [in] raw bo list handle (0 for none)
+ * \param   num_chunks - \c [in] number of CS chunks to submit
+ * \param   chunks     - \c [in] array of CS chunks
+ * \param   seq_no     - \c [out] output sequence number for submission.
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+ * \sa amdgpu_bo_list_create_raw(), amdgpu_bo_list_destroy_raw()
+ */
+int amdgpu_cs_submit_raw2(amdgpu_device_handle dev,
+			  amdgpu_context_handle context,
+			  uint32_t bo_list_handle,
+			  int num_chunks,
+			  struct drm_amdgpu_cs_chunk *chunks,
+			  uint64_t *seq_no);
+
+void amdgpu_cs_chunk_fence_to_dep(struct amdgpu_cs_fence *fence,
+				  struct drm_amdgpu_cs_chunk_dep *dep);
+void amdgpu_cs_chunk_fence_info_to_data(struct amdgpu_cs_fence_info *fence_info,
+					struct drm_amdgpu_cs_chunk_data *data);
+
+/**
+ * Reserve VMID
+ * \param   dev   - \c [in]  Device handle
+ * \param   flags - \c [in]  TBD
+ *
+ * \return  0 on success otherwise POSIX Error code
+*/
+int amdgpu_vm_reserve_vmid(amdgpu_device_handle dev, uint32_t flags);
+
+/**
+ * Free reserved VMID
+ * \param   dev   - \c [in]  Device handle
+ * \param   flags - \c [in]  TBD
+ *
+ * \return  0 on success otherwise POSIX Error code
+*/
+int amdgpu_vm_unreserve_vmid(amdgpu_device_handle dev, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* #ifdef _AMDGPU_H_ */
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h
new file mode 100644
index 0000000000..9c595f392a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h
@@ -0,0 +1,1382 @@
+/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*-
+ *
+ * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Fremont, California.
+ * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Kevin E. Martin <martin@valinux.com>
+ *    Gareth Hughes <gareth@valinux.com>
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __AMDGPU_DRM_H__
+#define __AMDGPU_DRM_H__
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_AMDGPU_GEM_CREATE		0x00
+#define DRM_AMDGPU_GEM_MMAP		0x01
+#define DRM_AMDGPU_CTX			0x02
+#define DRM_AMDGPU_BO_LIST		0x03
+#define DRM_AMDGPU_CS			0x04
+#define DRM_AMDGPU_INFO			0x05
+#define DRM_AMDGPU_GEM_METADATA		0x06
+#define DRM_AMDGPU_GEM_WAIT_IDLE	0x07
+#define DRM_AMDGPU_GEM_VA		0x08
+#define DRM_AMDGPU_WAIT_CS		0x09
+#define DRM_AMDGPU_GEM_OP		0x10
+#define DRM_AMDGPU_GEM_USERPTR		0x11
+#define DRM_AMDGPU_WAIT_FENCES		0x12
+#define DRM_AMDGPU_VM			0x13
+#define DRM_AMDGPU_FENCE_TO_HANDLE	0x14
+#define DRM_AMDGPU_SCHED		0x15
+
+/* hybrid specific ioctls */
+#define DRM_AMDGPU_SEM			0x5b
+#define DRM_AMDGPU_GEM_DGMA		0x5c
+
+#define DRM_IOCTL_AMDGPU_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
+#define DRM_IOCTL_AMDGPU_GEM_MMAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
+#define DRM_IOCTL_AMDGPU_CTX		DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx)
+#define DRM_IOCTL_AMDGPU_BO_LIST	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list)
+#define DRM_IOCTL_AMDGPU_CS		DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs)
+#define DRM_IOCTL_AMDGPU_INFO		DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info)
+#define DRM_IOCTL_AMDGPU_GEM_METADATA	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata)
+#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle)
+#define DRM_IOCTL_AMDGPU_GEM_VA		DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va)
+#define DRM_IOCTL_AMDGPU_WAIT_CS	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs)
+#define DRM_IOCTL_AMDGPU_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op)
+#define DRM_IOCTL_AMDGPU_GEM_USERPTR	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr)
+#define DRM_IOCTL_AMDGPU_WAIT_FENCES	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences)
+#define DRM_IOCTL_AMDGPU_VM		DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm)
+#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle)
+#define DRM_IOCTL_AMDGPU_SCHED		DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
+/* hybrid specific ioctls */
+#define DRM_IOCTL_AMDGPU_SEM		DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem)
+#define DRM_IOCTL_AMDGPU_GEM_DGMA	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma)
+
+/**
+ * DOC: memory domains
+ *
+ * %AMDGPU_GEM_DOMAIN_CPU	System memory that is not GPU accessible.
+ * Memory in this pool could be swapped out to disk if there is pressure.
+ *
+ * %AMDGPU_GEM_DOMAIN_GTT	GPU accessible system memory, mapped into the
+ * GPU's virtual address space via gart. Gart memory linearizes non-contiguous
+ * pages of system memory, allows GPU access system memory in a linearized
+ * fashion.
+ *
+ * %AMDGPU_GEM_DOMAIN_VRAM	Local video memory. For APUs, it is memory
+ * carved out by the BIOS.
+ *
+ * %AMDGPU_GEM_DOMAIN_GDS	Global on-chip data storage used to share data
+ * across shader threads.
+ *
+ * %AMDGPU_GEM_DOMAIN_GWS	Global wave sync, used to synchronize the
+ * execution of all the waves on a device.
+ *
+ * %AMDGPU_GEM_DOMAIN_OA	Ordered append, used by 3D or Compute engines
+ * for appending data.
+ *
+ * %AMDGPU_GEM_DOMAIN_DOORBELL	Doorbell. It is an MMIO region for
+ * signalling user mode queues.
+ */
+#define AMDGPU_GEM_DOMAIN_CPU		0x1
+#define AMDGPU_GEM_DOMAIN_GTT		0x2
+#define AMDGPU_GEM_DOMAIN_VRAM		0x4
+#define AMDGPU_GEM_DOMAIN_GDS		0x8
+#define AMDGPU_GEM_DOMAIN_GWS		0x10
+#define AMDGPU_GEM_DOMAIN_OA		0x20
+#define AMDGPU_GEM_DOMAIN_DOORBELL	0x40
+#define AMDGPU_GEM_DOMAIN_DGMA		0x400
+#define AMDGPU_GEM_DOMAIN_MASK		(AMDGPU_GEM_DOMAIN_CPU | \
+					 AMDGPU_GEM_DOMAIN_GTT | \
+					 AMDGPU_GEM_DOMAIN_VRAM | \
+					 AMDGPU_GEM_DOMAIN_GDS | \
+					 AMDGPU_GEM_DOMAIN_GWS | \
+					 AMDGPU_GEM_DOMAIN_OA | \
+					 AMDGPU_GEM_DOMAIN_DOORBELL | \
+					 AMDGPU_GEM_DOMAIN_DGMA)
+
+/* Flag that CPU access will be required for the case of VRAM domain */
+#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED	(1 << 0)
+/* Flag that CPU access will not work, this VRAM domain is invisible */
+#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS		(1 << 1)
+/* Flag that USWC attributes should be used for GTT */
+#define AMDGPU_GEM_CREATE_CPU_GTT_USWC		(1 << 2)
+/* Flag that the memory should be in VRAM and cleared */
+#define AMDGPU_GEM_CREATE_VRAM_CLEARED		(1 << 3)
+/* Flag that allocating the BO should use linear VRAM */
+#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS	(1 << 5)
+/* Flag that BO is always valid in this VM */
+#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID	(1 << 6)
+/* Flag that BO sharing will be explicitly synchronized */
+#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC		(1 << 7)
+/* Flag that indicates allocating MQD gart on GFX9, where the mtype
+ * for the second page onward should be set to NC. It should never
+ * be used by user space applications.
+ */
+#define AMDGPU_GEM_CREATE_CP_MQD_GFX9		(1 << 8)
+/* Flag that BO may contain sensitive data that must be wiped before
+ * releasing the memory
+ */
+#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE	(1 << 9)
+/* Flag that BO will be encrypted and that the TMZ bit should be
+ * set in the PTEs when mapping this buffer via GPUVM or
+ * accessing it with various hw blocks
+ */
+#define AMDGPU_GEM_CREATE_ENCRYPTED		(1 << 10)
+/* Flag that BO will be used only in preemptible context, which does
+ * not require GTT memory accounting
+ */
+#define AMDGPU_GEM_CREATE_PREEMPTIBLE		(1 << 11)
+/* Flag that BO can be discarded under memory pressure without keeping the
+ * content.
+ */
+#define AMDGPU_GEM_CREATE_DISCARDABLE		(1 << 12)
+/* Flag that BO is shared coherently between multiple devices or CPU threads.
+ * May depend on GPU instructions to flush caches to system scope explicitly.
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_COHERENT		(1 << 13)
+/* Flag that BO should not be cached by GPU. Coherent without having to flush
+ * GPU caches explicitly
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_UNCACHED		(1 << 14)
+/* Flag that BO should be coherent across devices when using device-level
+ * atomics. May depend on GPU instructions to flush caches to device scope
+ * explicitly, promoting them to system scope automatically.
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_EXT_COHERENT		(1 << 15)
+
+/* Hybrid specific */
+/* Flag that the memory allocation should be from top of domain */
+#define AMDGPU_GEM_CREATE_TOP_DOWN		(1ULL << 30)
+/* Flag that the memory allocation should be pinned */
+#define AMDGPU_GEM_CREATE_NO_EVICT		(1ULL << 31)
+
+struct drm_amdgpu_gem_create_in  {
+	/** the requested memory size */
+	__u64 bo_size;
+	/** physical start_addr alignment in bytes for some HW requirements */
+	__u64 alignment;
+	/** the requested memory domains */
+	__u64 domains;
+	/** allocation flags */
+	__u64 domain_flags;
+};
+
+struct drm_amdgpu_gem_create_out  {
+	/** returned GEM object handle */
+	__u32 handle;
+	__u32 _pad;
+};
+
+union drm_amdgpu_gem_create {
+	struct drm_amdgpu_gem_create_in		in;
+	struct drm_amdgpu_gem_create_out	out;
+};
+
+/** Opcode to create new residency list.  */
+#define AMDGPU_BO_LIST_OP_CREATE	0
+/** Opcode to destroy previously created residency list */
+#define AMDGPU_BO_LIST_OP_DESTROY	1
+/** Opcode to update resource information in the list */
+#define AMDGPU_BO_LIST_OP_UPDATE	2
+
+struct drm_amdgpu_bo_list_in {
+	/** Type of operation */
+	__u32 operation;
+	/** Handle of list or 0 if we want to create one */
+	__u32 list_handle;
+	/** Number of BOs in list  */
+	__u32 bo_number;
+	/** Size of each element describing BO */
+	__u32 bo_info_size;
+	/** Pointer to array describing BOs */
+	__u64 bo_info_ptr;
+};
+
+struct drm_amdgpu_bo_list_entry {
+	/** Handle of BO */
+	__u32 bo_handle;
+	/** New (if specified) BO priority to be used during migration */
+	__u32 bo_priority;
+};
+
+struct drm_amdgpu_bo_list_out {
+	/** Handle of resource list  */
+	__u32 list_handle;
+	__u32 _pad;
+};
+
+union drm_amdgpu_bo_list {
+	struct drm_amdgpu_bo_list_in in;
+	struct drm_amdgpu_bo_list_out out;
+};
+
+/* context related */
+#define AMDGPU_CTX_OP_ALLOC_CTX	1
+#define AMDGPU_CTX_OP_FREE_CTX	2
+#define AMDGPU_CTX_OP_QUERY_STATE	3
+#define AMDGPU_CTX_OP_QUERY_STATE2	4
+#define AMDGPU_CTX_OP_GET_STABLE_PSTATE	5
+#define AMDGPU_CTX_OP_SET_STABLE_PSTATE	6
+
+/* GPU reset status */
+#define AMDGPU_CTX_NO_RESET		0
+/* this context caused the reset */
+#define AMDGPU_CTX_GUILTY_RESET		1
+/* some other context caused the reset */
+#define AMDGPU_CTX_INNOCENT_RESET	2
+/* unknown cause */
+#define AMDGPU_CTX_UNKNOWN_RESET	3
+
+/* indicates a gpu reset occurred after the ctx was created */
+#define AMDGPU_CTX_QUERY2_FLAGS_RESET    (1<<0)
+/* indicates vram was lost after the ctx was created */
+#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1)
+/* indicates some job from this context once caused a gpu hang */
+#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY   (1<<2)
+/* indicates some errors were detected by RAS */
+#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE   (1<<3)
+#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE   (1<<4)
+/* indicates that the reset hasn't completed yet */
+#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5)
+
+/* Context priority level */
+#define AMDGPU_CTX_PRIORITY_UNSET       -2048
+#define AMDGPU_CTX_PRIORITY_VERY_LOW    -1023
+#define AMDGPU_CTX_PRIORITY_LOW         -512
+#define AMDGPU_CTX_PRIORITY_NORMAL      0
+/*
+ * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires
+ * CAP_SYS_NICE or DRM_MASTER
+*/
+#define AMDGPU_CTX_PRIORITY_HIGH        512
+#define AMDGPU_CTX_PRIORITY_VERY_HIGH   1023
+
+/* select a stable profiling pstate for perfmon tools */
+#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK  0xf
+#define AMDGPU_CTX_STABLE_PSTATE_NONE  0
+#define AMDGPU_CTX_STABLE_PSTATE_STANDARD  1
+#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK  2
+#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK  3
+#define AMDGPU_CTX_STABLE_PSTATE_PEAK  4
+
+struct drm_amdgpu_ctx_in {
+	/** AMDGPU_CTX_OP_* */
+	__u32	op;
+	/** Flags */
+	__u32	flags;
+	__u32	ctx_id;
+	/** AMDGPU_CTX_PRIORITY_* */
+	__s32	priority;
+};
+
+union drm_amdgpu_ctx_out {
+		struct {
+			__u32	ctx_id;
+			__u32	_pad;
+		} alloc;
+
+		struct {
+			/** For future use, no flags defined so far */
+			__u64	flags;
+			/** Number of resets caused by this context so far. */
+			__u32	hangs;
+			/** Reset status since the last call of the ioctl. */
+			__u32	reset_status;
+		} state;
+
+		struct {
+			__u32	flags;
+			__u32	_pad;
+		} pstate;
+};
+
+union drm_amdgpu_ctx {
+	struct drm_amdgpu_ctx_in in;
+	union drm_amdgpu_ctx_out out;
+};
+
+/* Semaphore operation codes for drm_amdgpu_sem_in.op */
+#define AMDGPU_SEM_OP_CREATE_SEM       1
+#define AMDGPU_SEM_OP_WAIT_SEM         2
+#define AMDGPU_SEM_OP_SIGNAL_SEM       3
+#define AMDGPU_SEM_OP_DESTROY_SEM      4
+#define AMDGPU_SEM_OP_IMPORT_SEM       5
+#define AMDGPU_SEM_OP_EXPORT_SEM       6
+
+struct drm_amdgpu_sem_in {	/* NOTE(review): spelled with uint32_t/uint64_t where the structs above use __u32/__u64 */
+	/** AMDGPU_SEM_OP_* */
+	uint32_t        op;
+	uint32_t        handle;
+	uint32_t        ctx_id;
+	uint32_t        ip_type;
+	uint32_t        ip_instance;
+	uint32_t        ring;
+	uint64_t        seq;
+};
+
+union drm_amdgpu_sem_out {	/* fd and handle overlap */
+	int            fd;
+	uint32_t        handle;
+};
+
+union drm_amdgpu_sem {	/* in and out occupy the same storage */
+	struct drm_amdgpu_sem_in in;
+	union drm_amdgpu_sem_out out;
+};
+
+/* vm ioctl: operation codes for drm_amdgpu_vm_in.op */
+#define AMDGPU_VM_OP_RESERVE_VMID	1
+#define AMDGPU_VM_OP_UNRESERVE_VMID	2
+
+struct drm_amdgpu_vm_in {	/* "in" view of union drm_amdgpu_vm */
+	/** AMDGPU_VM_OP_* */
+	__u32	op;
+	__u32	flags;
+};
+
+struct drm_amdgpu_vm_out {	/* "out" view of union drm_amdgpu_vm */
+	/** For future use, no flags defined so far */
+	__u64	flags;
+};
+
+union drm_amdgpu_vm {	/* in and out occupy the same 8 bytes */
+	struct drm_amdgpu_vm_in in;
+	struct drm_amdgpu_vm_out out;
+};
+
+/* sched ioctl: operation codes for drm_amdgpu_sched_in.op */
+#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE	1
+#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE	2
+
+struct drm_amdgpu_sched_in {	/* sole member of union drm_amdgpu_sched */
+	/** AMDGPU_SCHED_OP_* */
+	__u32	op;
+	__u32	fd;
+	/** AMDGPU_CTX_PRIORITY_* */
+	__s32	priority;
+	__u32   ctx_id;
+};
+
+union drm_amdgpu_sched {	/* only an "in" view is defined; there is no "out" member */
+	struct drm_amdgpu_sched_in in;
+};
+
+/*
+ * This is not a reliable API and you should expect it to fail for any
+ * number of reasons and have a fallback path that does not use userptr to
+ * perform any operation.
+ */
+#define AMDGPU_GEM_USERPTR_READONLY	(1 << 0)
+#define AMDGPU_GEM_USERPTR_ANONONLY	(1 << 1)
+#define AMDGPU_GEM_USERPTR_VALIDATE	(1 << 2)
+#define AMDGPU_GEM_USERPTR_REGISTER	(1 << 3)
+
+struct drm_amdgpu_gem_userptr {	/* 24 bytes: two __u64 followed by two __u32 */
+	__u64		addr;
+	__u64		size;
+	/* AMDGPU_GEM_USERPTR_* (bit flags, may be OR-ed together) */
+	__u32		flags;
+	/* Resulting GEM handle */
+	__u32		handle;
+};
+
+#define AMDGPU_GEM_DGMA_IMPORT			0
+#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR	1
+struct drm_amdgpu_gem_dgma {	/* NOTE(review): spelled with uint64_t/uint32_t rather than __u64/__u32 */
+	uint64_t		addr;
+	uint64_t		size;
+	uint32_t		op;	/* NOTE(review): presumably one of AMDGPU_GEM_DGMA_* above - confirm */
+	uint32_t		handle;
+};
+
+/* SI-CI-VI: */
+/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */
+#define AMDGPU_TILING_ARRAY_MODE_SHIFT			0
+#define AMDGPU_TILING_ARRAY_MODE_MASK			0xf
+#define AMDGPU_TILING_PIPE_CONFIG_SHIFT			4
+#define AMDGPU_TILING_PIPE_CONFIG_MASK			0x1f
+#define AMDGPU_TILING_TILE_SPLIT_SHIFT			9
+#define AMDGPU_TILING_TILE_SPLIT_MASK			0x7
+#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT		12
+#define AMDGPU_TILING_MICRO_TILE_MODE_MASK		0x7
+#define AMDGPU_TILING_BANK_WIDTH_SHIFT			15
+#define AMDGPU_TILING_BANK_WIDTH_MASK			0x3
+#define AMDGPU_TILING_BANK_HEIGHT_SHIFT			17
+#define AMDGPU_TILING_BANK_HEIGHT_MASK			0x3
+#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT		19
+#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK		0x3
+#define AMDGPU_TILING_NUM_BANKS_SHIFT			21
+#define AMDGPU_TILING_NUM_BANKS_MASK			0x3
+
+/* GFX9 - GFX11 (these fields reuse the low bits used by the SI-CI-VI layout above): */
+#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT		0
+#define AMDGPU_TILING_SWIZZLE_MODE_MASK			0x1f
+#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT		5
+#define AMDGPU_TILING_DCC_OFFSET_256B_MASK		0xFFFFFF
+#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT		29
+#define AMDGPU_TILING_DCC_PITCH_MAX_MASK		0x3FFF
+#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT		43
+#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK		0x1
+#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT	44
+#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK		0x1
+#define AMDGPU_TILING_SCANOUT_SHIFT			63
+#define AMDGPU_TILING_SCANOUT_MASK			0x1
+
+/* GFX12 and later: */
+#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT			0
+#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK			0x7
+/* These are DCC recompression settings for memory management: */
+#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT	3
+#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK	0x3 /* 0:64B, 1:128B, 2:256B */
+#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT		5
+#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK		0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */
+#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT		8
+#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK		0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */
+
+/* Set/Get helpers for tiling flags; 'field' is the name between AMDGPU_TILING_ and _SHIFT/_MASK, e.g. AMDGPU_TILING_GET(flags, SWIZZLE_MODE). */
+#define AMDGPU_TILING_SET(field, value) \
+	(((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT)
+#define AMDGPU_TILING_GET(value, field) \
+	(((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK)
+
+#define AMDGPU_GEM_METADATA_OP_SET_METADATA                  1
+#define AMDGPU_GEM_METADATA_OP_GET_METADATA                  2
+
+/** The same structure is shared for input/output */
+struct drm_amdgpu_gem_metadata {
+	/** GEM Object handle */
+	__u32	handle;
+	/** Operation: AMDGPU_GEM_METADATA_OP_SET_METADATA or AMDGPU_GEM_METADATA_OP_GET_METADATA */
+	__u32	op;
+	struct {
+		/** For future use, no flags defined so far */
+		__u64	flags;
+		/** family specific tiling info */
+		__u64	tiling_info;
+		__u32	data_size_bytes;	/* NOTE(review): presumably the number of valid bytes in data[] - confirm */
+		__u32	data[64];	/* 64 dwords = 256 bytes of storage */
+	} data;
+};
+
+struct drm_amdgpu_gem_mmap_in {	/* "in" view of union drm_amdgpu_gem_mmap */
+	/** the GEM object handle */
+	__u32 handle;
+	__u32 _pad;	/* explicit padding: rounds the struct up to 8 bytes */
+};
+
+struct drm_amdgpu_gem_mmap_out {	/* "out" view of union drm_amdgpu_gem_mmap */
+	/** mmap offset from the vma offset manager */
+	__u64 addr_ptr;
+};
+
+union drm_amdgpu_gem_mmap {	/* in and out occupy the same 8 bytes */
+	struct drm_amdgpu_gem_mmap_in   in;
+	struct drm_amdgpu_gem_mmap_out out;
+};
+
+struct drm_amdgpu_gem_wait_idle_in {	/* "in" view of union drm_amdgpu_gem_wait_idle */
+	/** GEM object handle */
+	__u32 handle;
+	/** For future use, no flags defined so far */
+	__u32 flags;
+	/** Absolute timeout to wait */
+	__u64 timeout;
+};
+
+struct drm_amdgpu_gem_wait_idle_out {	/* "out" view of union drm_amdgpu_gem_wait_idle */
+	/** BO status:  0 - BO is idle, 1 - BO is busy */
+	__u32 status;
+	/** Returned current memory domain */
+	__u32 domain;
+};
+
+union drm_amdgpu_gem_wait_idle {	/* in and out occupy the same storage */
+	struct drm_amdgpu_gem_wait_idle_in  in;
+	struct drm_amdgpu_gem_wait_idle_out out;
+};
+
+struct drm_amdgpu_wait_cs_in {	/* "in" view of union drm_amdgpu_wait_cs */
+	/* Command submission handle:
+	 * handle equals 0 means none to wait for
+	 * handle equals ~0ull means wait for the latest sequence number
+	 */
+	__u64 handle;
+	/** Absolute timeout to wait */
+	__u64 timeout;
+	__u32 ip_type;
+	__u32 ip_instance;
+	__u32 ring;
+	__u32 ctx_id;
+};
+
+struct drm_amdgpu_wait_cs_out {	/* "out" view of union drm_amdgpu_wait_cs */
+	/** CS status:  0 - CS completed, 1 - CS still busy */
+	__u64 status;
+};
+
+union drm_amdgpu_wait_cs {	/* in and out occupy the same storage */
+	struct drm_amdgpu_wait_cs_in in;
+	struct drm_amdgpu_wait_cs_out out;
+};
+
+struct drm_amdgpu_fence {	/* 24 bytes; embedded in union drm_amdgpu_fence_to_handle */
+	__u32 ctx_id;
+	__u32 ip_type;
+	__u32 ip_instance;
+	__u32 ring;
+	__u64 seq_no;
+};
+
+struct drm_amdgpu_wait_fences_in {	/* "in" view of union drm_amdgpu_wait_fences */
+	/** This points to uint64_t * which points to fences */
+	__u64 fences;
+	__u32 fence_count;	/* NOTE(review): presumably the number of entries reachable through 'fences' - confirm */
+	__u32 wait_all;
+	__u64 timeout_ns;
+};
+
+struct drm_amdgpu_wait_fences_out {	/* "out" view of union drm_amdgpu_wait_fences */
+	__u32 status;
+	__u32 first_signaled;
+};
+
+union drm_amdgpu_wait_fences {	/* in and out occupy the same storage */
+	struct drm_amdgpu_wait_fences_in in;
+	struct drm_amdgpu_wait_fences_out out;
+};
+
+#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO	0
+#define AMDGPU_GEM_OP_SET_PLACEMENT		1
+
+/* Sets or returns a value associated with a buffer; 'op' selects which (AMDGPU_GEM_OP_*). */
+struct drm_amdgpu_gem_op {
+	/** GEM object handle */
+	__u32	handle;
+	/** AMDGPU_GEM_OP_* */
+	__u32	op;
+	/** Input or return value */
+	__u64	value;
+};
+
+#define AMDGPU_VA_OP_MAP			1
+#define AMDGPU_VA_OP_UNMAP			2
+#define AMDGPU_VA_OP_CLEAR			3
+#define AMDGPU_VA_OP_REPLACE			4
+
+/* Delay the page table update till the next CS */
+#define AMDGPU_VM_DELAY_UPDATE		(1 << 0)
+
+/* Mapping flags */
+/* readable mapping */
+#define AMDGPU_VM_PAGE_READABLE		(1 << 1)
+/* writable mapping */
+#define AMDGPU_VM_PAGE_WRITEABLE	(1 << 2)
+/* executable mapping, new for VI */
+#define AMDGPU_VM_PAGE_EXECUTABLE	(1 << 3)
+/* partially resident texture */
+#define AMDGPU_VM_PAGE_PRT		(1 << 4)
+/* MTYPE flags use bits 5 to 8; the MTYPE_* values below are an enumeration within that field, not independent bits */
+#define AMDGPU_VM_MTYPE_MASK		(0xf << 5)
+/* Default MTYPE. Pre-AI must use this.  Recommended for newer ASICs. */
+#define AMDGPU_VM_MTYPE_DEFAULT		(0 << 5)
+/* Use Non Coherent MTYPE instead of default MTYPE */
+#define AMDGPU_VM_MTYPE_NC		(1 << 5)
+/* Use Write Combine MTYPE instead of default MTYPE */
+#define AMDGPU_VM_MTYPE_WC		(2 << 5)
+/* Use Cache Coherent MTYPE instead of default MTYPE */
+#define AMDGPU_VM_MTYPE_CC		(3 << 5)
+/* Use UnCached MTYPE instead of default MTYPE */
+#define AMDGPU_VM_MTYPE_UC		(4 << 5)
+/* Use Read Write MTYPE instead of default MTYPE */
+#define AMDGPU_VM_MTYPE_RW		(5 << 5)
+/* don't allocate MALL */
+#define AMDGPU_VM_PAGE_NOALLOC		(1 << 9)
+
+struct drm_amdgpu_gem_va {
+	/** GEM object handle */
+	__u32 handle;
+	__u32 _pad;
+	/** AMDGPU_VA_OP_* */
+	__u32 operation;
+	/** AMDGPU_VM_PAGE_* */
+	__u32 flags;
+	/** VA address to assign. Must be correctly aligned. */
+	__u64 va_address;
+	/** Specify offset inside of BO to assign. Must be correctly aligned. */
+	__u64 offset_in_bo;
+	/** Specify mapping size. Must be correctly aligned. */
+	__u64 map_size;
+};
+
+#define AMDGPU_HW_IP_GFX          0
+#define AMDGPU_HW_IP_COMPUTE      1
+#define AMDGPU_HW_IP_DMA          2
+#define AMDGPU_HW_IP_UVD          3
+#define AMDGPU_HW_IP_VCE          4
+#define AMDGPU_HW_IP_UVD_ENC      5
+#define AMDGPU_HW_IP_VCN_DEC      6
+/*
+ * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support
+ * both encoding and decoding jobs.
+ */
+#define AMDGPU_HW_IP_VCN_ENC      7
+#define AMDGPU_HW_IP_VCN_JPEG     8
+#define AMDGPU_HW_IP_VPE          9
+#define AMDGPU_HW_IP_NUM          10	/* one past the highest AMDGPU_HW_IP_* value above */
+
+#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1
+
+#define AMDGPU_CHUNK_ID_IB		0x01
+#define AMDGPU_CHUNK_ID_FENCE		0x02
+#define AMDGPU_CHUNK_ID_DEPENDENCIES	0x03
+#define AMDGPU_CHUNK_ID_SYNCOBJ_IN      0x04
+#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT     0x05
+#define AMDGPU_CHUNK_ID_BO_HANDLES      0x06
+#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
+#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT    0x08
+#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL  0x09
+#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW   0x0a
+
+struct drm_amdgpu_cs_chunk {
+	__u32		chunk_id;	/* NOTE(review): presumably one of AMDGPU_CHUNK_ID_* above - confirm */
+	__u32		length_dw;	/* NOTE(review): name suggests a length in dwords - confirm */
+	__u64		chunk_data;	/* NOTE(review): presumably a user pointer carried as a 64-bit integer - confirm */
+};
+
+struct drm_amdgpu_cs_in {	/* "in" view of union drm_amdgpu_cs */
+	/** Rendering context id */
+	__u32		ctx_id;
+	/** Handle of resource list associated with CS */
+	__u32		bo_list_handle;
+	__u32		num_chunks;
+	__u32		flags;
+	/** this points to __u64 * which points to cs chunks */
+	__u64		chunks;
+};
+
+struct drm_amdgpu_cs_out {	/* "out" view of union drm_amdgpu_cs */
+	__u64 handle;
+};
+
+union drm_amdgpu_cs {	/* in and out occupy the same storage */
+	struct drm_amdgpu_cs_in in;
+	struct drm_amdgpu_cs_out out;
+};
+
+/* Specify flags to be used for IB */
+
+/* This IB should be submitted to CE */
+#define AMDGPU_IB_FLAG_CE	(1<<0)
+
+/* Preamble flag, which means the IB could be dropped if no context switch */
+#define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
+
+/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
+#define AMDGPU_IB_FLAG_PREEMPT (1<<2)
+
+/* The IB fence should do the L2 writeback but not invalidate any shader
+ * caches (L2/vL1/sL1/I$). */
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
+/* Flag the IB as secure (TMZ). Note the name is AMDGPU_IB_FLAGS_* (plural), unlike its siblings.
+ */
+#define AMDGPU_IB_FLAGS_SECURE  (1 << 5)
+
+/* Tell KMD to flush and invalidate caches
+ */
+#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC  (1 << 6)
+
+struct drm_amdgpu_cs_chunk_ib {	/* ib_data member of struct drm_amdgpu_cs_chunk_data */
+	__u32 _pad;
+	/** AMDGPU_IB_FLAG_* */
+	__u32 flags;
+	/** Virtual address to begin IB execution */
+	__u64 va_start;
+	/** Size of submission */
+	__u32 ib_bytes;
+	/** HW IP to submit to */
+	__u32 ip_type;
+	/** HW IP index of the same type to submit to */
+	__u32 ip_instance;
+	/** Ring index to submit to */
+	__u32 ring;
+};
+
+struct drm_amdgpu_cs_chunk_dep {	/* 24 bytes: four __u32 then one __u64 */
+	__u32 ip_type;
+	__u32 ip_instance;
+	__u32 ring;
+	__u32 ctx_id;
+	__u64 handle;
+};
+
+struct drm_amdgpu_cs_chunk_fence {	/* fence_data member of struct drm_amdgpu_cs_chunk_data */
+	__u32 handle;
+	__u32 offset;
+};
+
+struct drm_amdgpu_cs_chunk_sem {	/* a single 32-bit handle */
+	__u32 handle;
+};
+
+struct drm_amdgpu_cs_chunk_syncobj {	/* 16 bytes: two __u32 then one __u64 */
+       __u32 handle;
+       __u32 flags;
+       __u64 point;
+};
+
+#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ	0
+#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD	1
+#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD	2
+
+union drm_amdgpu_fence_to_handle {	/* in and out occupy the same storage */
+	struct {
+		struct drm_amdgpu_fence fence;
+		__u32 what;	/* NOTE(review): presumably one of AMDGPU_FENCE_TO_HANDLE_GET_* above - confirm */
+		__u32 pad;	/* explicit padding after 'what' */
+	} in;
+	struct {
+		__u32 handle;
+	} out;
+};
+
+struct drm_amdgpu_cs_chunk_data {	/* wraps an anonymous union; ib_data and fence_data overlap */
+	union {
+		struct drm_amdgpu_cs_chunk_ib		ib_data;
+		struct drm_amdgpu_cs_chunk_fence	fence_data;
+	};
+};
+
+#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW         0x1
+
+struct drm_amdgpu_cs_chunk_cp_gfx_shadow {	/* four __u64 fields, 32 bytes */
+	__u64 shadow_va;
+	__u64 csa_va;
+	__u64 gds_va;
+	__u64 flags;	/* NOTE(review): presumably AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_* - confirm */
+};
+
+/*
+ *  Query h/w info: Flag that this is integrated (a.k.a. fusion) GPU
+ *
+ */
+#define AMDGPU_IDS_FLAGS_FUSION         0x1
+#define AMDGPU_IDS_FLAGS_PREEMPTION     0x2
+#define AMDGPU_IDS_FLAGS_TMZ            0x4
+#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8
+
+/*
+ *  Query h/w info: Flag identifying VF/PF/PT mode
+ *  (a 2-bit field at bits 8-9: MODE_MASK 0x300, MODE_SHIFT 8; MODE_PF/VF/PT are the unshifted values)
+ */
+#define AMDGPU_IDS_FLAGS_MODE_MASK      0x300
+#define AMDGPU_IDS_FLAGS_MODE_SHIFT     0x8
+#define AMDGPU_IDS_FLAGS_MODE_PF        0x0
+#define AMDGPU_IDS_FLAGS_MODE_VF        0x1
+#define AMDGPU_IDS_FLAGS_MODE_PT        0x2
+
+/* indicate if acceleration can be working */
+#define AMDGPU_INFO_ACCEL_WORKING		0x00
+/* get the crtc_id from the mode object id? */
+#define AMDGPU_INFO_CRTC_FROM_ID		0x01
+/* query hw IP info */
+#define AMDGPU_INFO_HW_IP_INFO			0x02
+/* query hw IP instance count for the specified type */
+#define AMDGPU_INFO_HW_IP_COUNT			0x03
+/* timestamp for GL_ARB_timer_query */
+#define AMDGPU_INFO_TIMESTAMP			0x05
+/* Query the firmware version; the indented AMDGPU_INFO_FW_* values below are its subquery ids */
+#define AMDGPU_INFO_FW_VERSION			0x0e
+	/* Subquery id: Query VCE firmware version */
+	#define AMDGPU_INFO_FW_VCE		0x1
+	/* Subquery id: Query UVD firmware version */
+	#define AMDGPU_INFO_FW_UVD		0x2
+	/* Subquery id: Query GMC firmware version */
+	#define AMDGPU_INFO_FW_GMC		0x03
+	/* Subquery id: Query GFX ME firmware version */
+	#define AMDGPU_INFO_FW_GFX_ME		0x04
+	/* Subquery id: Query GFX PFP firmware version */
+	#define AMDGPU_INFO_FW_GFX_PFP		0x05
+	/* Subquery id: Query GFX CE firmware version */
+	#define AMDGPU_INFO_FW_GFX_CE		0x06
+	/* Subquery id: Query GFX RLC firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLC		0x07
+	/* Subquery id: Query GFX MEC firmware version */
+	#define AMDGPU_INFO_FW_GFX_MEC		0x08
+	/* Subquery id: Query SMC firmware version */
+	#define AMDGPU_INFO_FW_SMC		0x0a
+	/* Subquery id: Query SDMA firmware version */
+	#define AMDGPU_INFO_FW_SDMA		0x0b
+	/* Subquery id: Query PSP SOS firmware version */
+	#define AMDGPU_INFO_FW_SOS		0x0c
+	/* Subquery id: Query PSP ASD firmware version */
+	#define AMDGPU_INFO_FW_ASD		0x0d
+	/* Subquery id: Query VCN firmware version */
+	#define AMDGPU_INFO_FW_VCN		0x0e
+	/* Subquery id: Query GFX RLC SRLC firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f
+	/* Subquery id: Query GFX RLC SRLG firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10
+	/* Subquery id: Query GFX RLC SRLS firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11
+	/* Subquery id: Query DMCU firmware version */
+	#define AMDGPU_INFO_FW_DMCU		0x12
+	#define AMDGPU_INFO_FW_TA		0x13	/* Subquery id: Query TA firmware version */
+	/* Subquery id: Query DMCUB firmware version */
+	#define AMDGPU_INFO_FW_DMCUB		0x14
+	/* Subquery id: Query TOC firmware version */
+	#define AMDGPU_INFO_FW_TOC		0x15
+	/* Subquery id: Query CAP firmware version */
+	#define AMDGPU_INFO_FW_CAP		0x16
+	/* Subquery id: Query GFX RLCP firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLCP		0x17
+	/* Subquery id: Query GFX RLCV firmware version */
+	#define AMDGPU_INFO_FW_GFX_RLCV		0x18
+	/* Subquery id: Query MES_KIQ firmware version */
+	#define AMDGPU_INFO_FW_MES_KIQ		0x19
+	/* Subquery id: Query MES firmware version */
+	#define AMDGPU_INFO_FW_MES		0x1a
+	/* Subquery id: Query IMU firmware version */
+	#define AMDGPU_INFO_FW_IMU		0x1b
+	/* Subquery id: Query VPE firmware version */
+	#define AMDGPU_INFO_FW_VPE		0x1c
+
+/* number of bytes moved for TTM migration */
+#define AMDGPU_INFO_NUM_BYTES_MOVED		0x0f
+/* the used VRAM size */
+#define AMDGPU_INFO_VRAM_USAGE			0x10
+/* the used GTT size */
+#define AMDGPU_INFO_GTT_USAGE			0x11
+/* Information about GDS, etc. resource configuration */
+#define AMDGPU_INFO_GDS_CONFIG			0x13
+/* Query information about VRAM and GTT domains */
+#define AMDGPU_INFO_VRAM_GTT			0x14
+/* Query information about register in MMR address space */
+#define AMDGPU_INFO_READ_MMR_REG		0x15
+/* Query information about device: rev id, family, etc. */
+#define AMDGPU_INFO_DEV_INFO			0x16
+/* visible vram usage */
+#define AMDGPU_INFO_VIS_VRAM_USAGE		0x17
+/* number of TTM buffer evictions */
+#define AMDGPU_INFO_NUM_EVICTIONS		0x18
+/* Query memory about VRAM and GTT domains */
+#define AMDGPU_INFO_MEMORY			0x19
+/* Query vce clock table */
+#define AMDGPU_INFO_VCE_CLOCK_TABLE		0x1A
+/* Query vbios related information */
+#define AMDGPU_INFO_VBIOS			0x1B
+	/* Subquery id: Query vbios size */
+	#define AMDGPU_INFO_VBIOS_SIZE		0x1
+	/* Subquery id: Query vbios image */
+	#define AMDGPU_INFO_VBIOS_IMAGE		0x2
+	/* Subquery id: Query vbios info */
+	#define AMDGPU_INFO_VBIOS_INFO		0x3
+/* Query UVD handles */
+#define AMDGPU_INFO_NUM_HANDLES			0x1C
+/* Query sensor related information */
+#define AMDGPU_INFO_SENSOR			0x1D
+	/* Subquery id: Query GPU shader clock */
+	#define AMDGPU_INFO_SENSOR_GFX_SCLK		0x1
+	/* Subquery id: Query GPU memory clock */
+	#define AMDGPU_INFO_SENSOR_GFX_MCLK		0x2
+	/* Subquery id: Query GPU temperature */
+	#define AMDGPU_INFO_SENSOR_GPU_TEMP		0x3
+	/* Subquery id: Query GPU load */
+	#define AMDGPU_INFO_SENSOR_GPU_LOAD		0x4
+	/* Subquery id: Query average GPU power */
+	#define AMDGPU_INFO_SENSOR_GPU_AVG_POWER	0x5
+	/* Subquery id: Query northbridge voltage */
+	#define AMDGPU_INFO_SENSOR_VDDNB		0x6
+	/* Subquery id: Query graphics voltage */
+	#define AMDGPU_INFO_SENSOR_VDDGFX		0x7
+	/* Subquery id: Query GPU stable pstate shader clock */
+	#define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK		0x8
+	/* Subquery id: Query GPU stable pstate memory clock */
+	#define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK		0x9
+	/* Subquery id: Query GPU peak pstate shader clock */
+	#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK			0xa
+	/* Subquery id: Query GPU peak pstate memory clock */
+	#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK			0xb
+	/* Subquery id: Query input GPU power */
+	#define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER	0xc
+/* Number of VRAM page faults on CPU access. */
+#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS	0x1E
+#define AMDGPU_INFO_VRAM_LOST_COUNTER		0x1F
+/* query ras mask of enabled features; the AMDGPU_INFO_RAS_ENABLED_* bits below name the individual mask bits */
+#define AMDGPU_INFO_RAS_ENABLED_FEATURES	0x20
+/* RAS MASK: UMC (VRAM) */
+#define AMDGPU_INFO_RAS_ENABLED_UMC			(1 << 0)
+/* RAS MASK: SDMA */
+#define AMDGPU_INFO_RAS_ENABLED_SDMA			(1 << 1)
+/* RAS MASK: GFX */
+#define AMDGPU_INFO_RAS_ENABLED_GFX			(1 << 2)
+/* RAS MASK: MMHUB */
+#define AMDGPU_INFO_RAS_ENABLED_MMHUB			(1 << 3)
+/* RAS MASK: ATHUB */
+#define AMDGPU_INFO_RAS_ENABLED_ATHUB			(1 << 4)
+/* RAS MASK: PCIE */
+#define AMDGPU_INFO_RAS_ENABLED_PCIE			(1 << 5)
+/* RAS MASK: HDP */
+#define AMDGPU_INFO_RAS_ENABLED_HDP			(1 << 6)
+/* RAS MASK: XGMI */
+#define AMDGPU_INFO_RAS_ENABLED_XGMI			(1 << 7)
+/* RAS MASK: DF */
+#define AMDGPU_INFO_RAS_ENABLED_DF			(1 << 8)
+/* RAS MASK: SMN */
+#define AMDGPU_INFO_RAS_ENABLED_SMN			(1 << 9)
+/* RAS MASK: SEM */
+#define AMDGPU_INFO_RAS_ENABLED_SEM			(1 << 10)
+/* RAS MASK: MP0 */
+#define AMDGPU_INFO_RAS_ENABLED_MP0			(1 << 11)
+/* RAS MASK: MP1 */
+#define AMDGPU_INFO_RAS_ENABLED_MP1			(1 << 12)
+/* RAS MASK: FUSE */
+#define AMDGPU_INFO_RAS_ENABLED_FUSE			(1 << 13)
+/* query video encode/decode caps */
+#define AMDGPU_INFO_VIDEO_CAPS			0x21
+	/* Subquery id: Decode */
+	#define AMDGPU_INFO_VIDEO_CAPS_DECODE		0
+	/* Subquery id: Encode */
+	#define AMDGPU_INFO_VIDEO_CAPS_ENCODE		1
+/* Query the max number of IBs per gang per submission */
+#define AMDGPU_INFO_MAX_IBS			0x22
+/* query last page fault info */
+#define AMDGPU_INFO_GPUVM_FAULT			0x23
+
+/* gpu capability */
+#define AMDGPU_INFO_CAPABILITY			0x50
+/* virtual range */
+#define AMDGPU_INFO_VIRTUAL_RANGE               0x51
+
+#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT	0
+#define AMDGPU_INFO_MMR_SE_INDEX_MASK	0xff
+#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT	8
+#define AMDGPU_INFO_MMR_SH_INDEX_MASK	0xff
+
+struct drm_amdgpu_query_fw {	/* query_fw member of the union inside struct drm_amdgpu_info */
+	/** AMDGPU_INFO_FW_* */
+	__u32 fw_type;
+	/**
+	 * Index of the IP if there are more IPs of
+	 * the same type.
+	 */
+	__u32 ip_instance;
+	/**
+	 * Index of the engine. Whether this is used depends
+	 * on the firmware type. (e.g. MEC, SDMA)
+	 */
+	__u32 index;
+	__u32 _pad;	/* explicit padding: rounds the struct up to 16 bytes */
+};
+
+/* Input structure for the INFO ioctl */
+struct drm_amdgpu_info {
+	/* Where the return value will be stored */
+	__u64 return_pointer;
+	/* The size of the return value. Just like "size" in "snprintf",
+	 * it limits how many bytes the kernel can write. */
+	__u32 return_size;
+	/* The query request id. */
+	__u32 query;
+
+	union {	/* anonymous union of per-query arguments; all members overlap */
+		struct {
+			__u32 id;
+			__u32 _pad;
+		} mode_crtc;
+
+		struct {
+			/** AMDGPU_HW_IP_* */
+			__u32 type;
+			/**
+			 * Index of the IP if there are more IPs of the same
+			 * type. Ignored by AMDGPU_INFO_HW_IP_COUNT.
+			 */
+			__u32 ip_instance;
+		} query_hw_ip;
+
+		struct {
+			__u32 dword_offset;
+			/** number of registers to read */
+			__u32 count;
+			__u32 instance;
+			/** For future use, no flags defined so far */
+			__u32 flags;
+		} read_mmr_reg;
+
+		struct {
+			uint32_t aperture;	/* NOTE(review): presumably AMDGPU_SUA_APERTURE_* - confirm */
+			uint32_t _pad;
+		} virtual_range;
+
+		struct drm_amdgpu_query_fw query_fw;
+
+		struct {
+			__u32 type;
+			__u32 offset;
+		} vbios_info;
+
+		struct {
+			__u32 type;
+		} sensor_info;
+
+		struct {
+			__u32 type;
+		} video_cap;
+	};
+};
+
+struct drm_amdgpu_info_gds {	/* eight __u32 fields, 32 bytes */
+	/** GDS GFX partition size */
+	__u32 gds_gfx_partition_size;
+	/** GDS compute partition size */
+	__u32 compute_partition_size;
+	/** total GDS memory size */
+	__u32 gds_total_size;
+	/** GWS size per GFX partition */
+	__u32 gws_per_gfx_partition;
+	/** GWS size per compute partition */
+	__u32 gws_per_compute_partition;
+	/** OA size per GFX partition */
+	__u32 oa_per_gfx_partition;
+	/** OA size per compute partition */
+	__u32 oa_per_compute_partition;
+	__u32 _pad;
+};
+
+struct drm_amdgpu_info_vram_gtt {	/* three __u64 sizes; NOTE(review): units presumably bytes - confirm */
+	__u64 vram_size;
+	__u64 vram_cpu_accessible_size;
+	__u64 gtt_size;
+};
+
+struct drm_amdgpu_heap_info {	/* element type of struct drm_amdgpu_memory_info */
+	/** max. physical memory */
+	__u64 total_heap_size;
+
+	/** Theoretical max. available memory in the given heap */
+	__u64 usable_heap_size;
+
+	/**
+	 * Number of bytes allocated in the heap. This includes all processes
+	 * and private allocations in the kernel. It changes when new buffers
+	 * are allocated, freed, and moved. It cannot be larger than
+	 * heap_size (NOTE(review): no field has that name; presumably total_heap_size - confirm).
+	 */
+	__u64 heap_usage;
+
+	/**
+	 * Theoretical possible max. size of buffer which
+	 * could be allocated in the given heap
+	 */
+	__u64 max_allocation;
+};
+
+struct drm_amdgpu_memory_info {	/* one drm_amdgpu_heap_info per heap */
+	struct drm_amdgpu_heap_info vram;
+	struct drm_amdgpu_heap_info cpu_accessible_vram;
+	struct drm_amdgpu_heap_info gtt;
+};
+
+struct drm_amdgpu_info_firmware {	/* two __u32 fields, 8 bytes */
+	__u32 ver;
+	__u32 feature;
+};
+
+struct drm_amdgpu_info_vbios {	/* fixed-size byte arrays plus a 32-bit version */
+	__u8 name[64];
+	__u8 vbios_pn[64];
+	__u32 version;
+	__u32 pad;	/* explicit padding after 'version' */
+	__u8 vbios_ver_str[32];
+	__u8 date[32];
+};
+
+#define AMDGPU_VRAM_TYPE_UNKNOWN 0	/* NOTE(review): these are presumably the values of drm_amdgpu_info_device.vram_type - confirm */
+#define AMDGPU_VRAM_TYPE_GDDR1 1
+#define AMDGPU_VRAM_TYPE_DDR2  2
+#define AMDGPU_VRAM_TYPE_GDDR3 3
+#define AMDGPU_VRAM_TYPE_GDDR4 4
+#define AMDGPU_VRAM_TYPE_GDDR5 5
+#define AMDGPU_VRAM_TYPE_HBM   6
+#define AMDGPU_VRAM_TYPE_DDR3  7
+#define AMDGPU_VRAM_TYPE_DDR4  8
+#define AMDGPU_VRAM_TYPE_GDDR6 9
+#define AMDGPU_VRAM_TYPE_DDR5  10
+#define AMDGPU_VRAM_TYPE_LPDDR4 11
+#define AMDGPU_VRAM_TYPE_LPDDR5 12
+
+struct drm_amdgpu_info_device {
+	/** PCI Device ID */
+	__u32 device_id;
+	/** Internal chip revision (A0, A1, etc.) */
+	__u32 chip_rev;
+	__u32 external_rev;
+	/** Revision id in PCI Config space */
+	__u32 pci_rev;
+	__u32 family;
+	__u32 num_shader_engines;
+	__u32 num_shader_arrays_per_engine;
+	/* in KHz */
+	__u32 gpu_counter_freq;
+	__u64 max_engine_clock;
+	__u64 max_memory_clock;
+	/* cu information */
+	__u32 cu_active_number;
+	/* NOTE: cu_ao_mask is INVALID, DON'T use it */
+	__u32 cu_ao_mask;
+	__u32 cu_bitmap[4][4];
+	/** Render backend pipe mask. One render backend is CB+DB. */
+	__u32 enabled_rb_pipes_mask;
+	__u32 num_rb_pipes;
+	__u32 num_hw_gfx_contexts;
+	/* PCIe version (the smaller of the GPU and the CPU/motherboard) */
+	__u32 pcie_gen;
+	__u64 ids_flags;
+	/** Starting virtual address for UMDs. */
+	__u64 virtual_address_offset;
+	/** The maximum virtual address */
+	__u64 virtual_address_max;
+	/** Required alignment of virtual addresses. */
+	__u32 virtual_address_alignment;
+	/** Page table entry - fragment size */
+	__u32 pte_fragment_size;
+	__u32 gart_page_size;
+	/** constant engine ram size */
+	__u32 ce_ram_size;
+	/** video memory type info */
+	__u32 vram_type;
+	/** video memory bit width */
+	__u32 vram_bit_width;
+	/* vce harvesting instance */
+	__u32 vce_harvest_config;
+	/* gfx double offchip LDS buffers */
+	__u32 gc_double_offchip_lds_buf;
+	/* NGG Primitive Buffer */
+	__u64 prim_buf_gpu_addr;
+	/* NGG Position Buffer */
+	__u64 pos_buf_gpu_addr;
+	/* NGG Control Sideband */
+	__u64 cntl_sb_buf_gpu_addr;
+	/* NGG Parameter Cache */
+	__u64 param_buf_gpu_addr;
+	__u32 prim_buf_size;
+	__u32 pos_buf_size;
+	__u32 cntl_sb_buf_size;
+	__u32 param_buf_size;
+	/* wavefront size */
+	__u32 wave_front_size;
+	/* shader visible vgprs */
+	__u32 num_shader_visible_vgprs;
+	/* CU per shader array */
+	__u32 num_cu_per_sh;
+	/* number of tcc blocks */
+	__u32 num_tcc_blocks;
+	/* gs vgt table depth */
+	__u32 gs_vgt_table_depth;
+	/* gs primitive buffer depth */
+	__u32 gs_prim_buffer_depth;
+	/* max gs wavefront per vgt */
+	__u32 max_gs_waves_per_vgt;
+	/* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */
+	__u32 pcie_num_lanes;
+	/* always on cu bitmap */
+	__u32 cu_ao_bitmap[4][4];
+	/** Starting high virtual address for UMDs. */
+	__u64 high_va_offset;
+	/** The maximum high virtual address */
+	__u64 high_va_max;
+	/* gfx10 pa_sc_tile_steering_override */
+	__u32 pa_sc_tile_steering_override;	/* NOTE(review): lone __u32 between two __u64 fields; ABIs that align __u64 to 8 bytes insert 4 bytes of implicit padding after it */
+	/* disabled TCCs */
+	__u64 tcc_disabled_mask;
+	__u64 min_engine_clock;
+	__u64 min_memory_clock;
+	/* The following fields are only set on gfx11+, older chips set 0. */
+	__u32 tcp_cache_size;       /* AKA GL0, VMEM cache */
+	__u32 num_sqc_per_wgp;
+	__u32 sqc_data_cache_size;  /* AKA SMEM cache */
+	__u32 sqc_inst_cache_size;
+	__u32 gl1c_cache_size;
+	__u32 gl2c_cache_size;
+	__u64 mall_size;            /* AKA infinity cache */
+	/* high 32 bits of the rb pipes mask */
+	__u32 enabled_rb_pipes_mask_hi;
+	/* shadow area size for gfx11 */
+	__u32 shadow_size;
+	/* shadow area base virtual alignment for gfx11 */
+	__u32 shadow_alignment;
+	/* context save area size for gfx11 */
+	__u32 csa_size;
+	/* context save area base virtual alignment for gfx11 */
+	__u32 csa_alignment;
+};
+
+struct drm_amdgpu_info_hw_ip {	/* 32 bytes */
+	/** Version of h/w IP */
+	__u32  hw_ip_version_major;
+	__u32  hw_ip_version_minor;
+	/** Capabilities */
+	__u64  capabilities_flags;
+	/** command buffer address start alignment */
+	__u32  ib_start_alignment;
+	/** command buffer size alignment */
+	__u32  ib_size_alignment;
+	/** Bitmask of available rings. Bit 0 means ring 0, etc. */
+	__u32  available_rings;
+	/** version info: bits 23:16 major, 15:8 minor, 7:0 revision */
+	__u32  ip_discovery_version;
+};
+
+struct drm_amdgpu_info_num_handles {	/* two __u32 counters, 8 bytes */
+	/** Max handles as supported by firmware for UVD */
+	__u32  uvd_max_handles;
+	/** Handles currently in use for UVD */
+	__u32  uvd_used_handles;
+};
+
+#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES		6	/* capacity of drm_amdgpu_info_vce_clock_table.entries[] */
+
+struct drm_amdgpu_info_vce_clock_table_entry {	/* four __u32 fields, 16 bytes */
+	/** System clock */
+	__u32 sclk;
+	/** Memory clock */
+	__u32 mclk;
+	/** VCE clock */
+	__u32 eclk;
+	__u32 pad;
+};
+
+struct drm_amdgpu_info_vce_clock_table {
+	struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES];
+	__u32 num_valid_entries;	/* NOTE(review): presumably how many leading entries[] are populated - confirm */
+	__u32 pad;
+};
+
+/* query video encode/decode caps: indices into drm_amdgpu_info_video_caps.codec_info[] */
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2			0
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4			1
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1			2
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC		3
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC			4
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG			5
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9			6
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1			7
+#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT			8	/* number of indices above; sizes codec_info[] */
+
+struct drm_amdgpu_info_video_codec_info {	/* six __u32 fields, 24 bytes */
+	__u32 valid;
+	__u32 max_width;
+	__u32 max_height;
+	__u32 max_pixels_per_frame;
+	__u32 max_level;
+	__u32 pad;
+};
+
+struct drm_amdgpu_info_video_caps {	/* one codec_info entry per AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_* */
+	struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT];
+};
+
+#define AMDGPU_VMHUB_TYPE_MASK			0xff	/* type field: bits 0-7 */
+#define AMDGPU_VMHUB_TYPE_SHIFT			0
+#define AMDGPU_VMHUB_TYPE_GFX			0
+#define AMDGPU_VMHUB_TYPE_MM0			1
+#define AMDGPU_VMHUB_TYPE_MM1			2
+#define AMDGPU_VMHUB_IDX_MASK			0xff00	/* index field: bits 8-15 */
+#define AMDGPU_VMHUB_IDX_SHIFT			8
+
+struct drm_amdgpu_info_gpuvm_fault {	/* 16 bytes */
+	__u64 addr;
+	__u32 status;
+	__u32 vmhub;	/* NOTE(review): presumably encoded with the AMDGPU_VMHUB_* masks above - confirm */
+};
+
+/*
+ * Supported GPU families (values are not contiguous, and 151 is listed before 150)
+ */
+#define AMDGPU_FAMILY_UNKNOWN			0
+#define AMDGPU_FAMILY_SI			110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */
+#define AMDGPU_FAMILY_CI			120 /* Bonaire, Hawaii */
+#define AMDGPU_FAMILY_KV			125 /* Kaveri, Kabini, Mullins */
+#define AMDGPU_FAMILY_VI			130 /* Iceland, Tonga */
+#define AMDGPU_FAMILY_CZ			135 /* Carrizo, Stoney */
+#define AMDGPU_FAMILY_AI			141 /* Vega10 */
+#define AMDGPU_FAMILY_RV			142 /* Raven */
+#define AMDGPU_FAMILY_NV			143 /* Navi10 */
+#define AMDGPU_FAMILY_VGH			144 /* Van Gogh */
+#define AMDGPU_FAMILY_GC_11_0_0			145 /* GC 11.0.0 */
+#define AMDGPU_FAMILY_YC			146 /* Yellow Carp */
+#define AMDGPU_FAMILY_GC_11_0_1			148 /* GC 11.0.1 */
+#define AMDGPU_FAMILY_GC_10_3_6			149 /* GC 10.3.6 */
+#define AMDGPU_FAMILY_GC_10_3_7			151 /* GC 10.3.7 */
+#define AMDGPU_FAMILY_GC_11_5_0			150 /* GC 11.5.0 */
+#define AMDGPU_FAMILY_GC_12_0_0			152 /* GC 12.0.0 */
+
+/**
+ *  Definition of System Unified Address (SUA) apertures
+ */
+#define AMDGPU_SUA_APERTURE_PRIVATE	1
+#define AMDGPU_SUA_APERTURE_SHARED	2
+struct drm_amdgpu_virtual_range {	/* NOTE(review): spelled with uint64_t rather than __u64; presumably the result of AMDGPU_INFO_VIRTUAL_RANGE - confirm */
+	uint64_t start;
+	uint64_t end;
+};
+
+/* query pin memory capability */
+#define AMDGPU_CAPABILITY_PIN_MEM_FLAG		(1 << 0)
+/* query direct gma capability */
+#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG	(1 << 1)
+
+struct drm_amdgpu_capability {	/* NOTE(review): spelled with uint32_t rather than __u32 */
+	uint32_t flag;	/* NOTE(review): presumably a mask of AMDGPU_CAPABILITY_*_FLAG bits - confirm */
+	uint32_t direct_gma_size;
+};
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h
new file mode 100644
index 0000000000..4e4f7c2c39
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h
@@ -0,0 +1,1408 @@
+/*
+ * Header for the Direct Rendering Manager
+ *
+ * Author: Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_H_
+#define _DRM_H_
+
+#if   defined(__linux__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#else /* One of the BSDs */
+
+#include <stdint.h>
+#include <sys/ioccom.h>
+#include <sys/types.h>
+typedef int8_t   __s8;
+typedef uint8_t  __u8;
+typedef int16_t  __s16;
+typedef uint16_t __u16;
+typedef int32_t  __s32;
+typedef uint32_t __u32;
+typedef int64_t  __s64;
+typedef uint64_t __u64;
+typedef size_t   __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_NAME	"drm"	  /**< Name in kernel, /dev, and /proc */
+#define DRM_MIN_ORDER	5	  /**< At least 2^5 bytes = 32 bytes */
+#define DRM_MAX_ORDER	22	  /**< Up to 2^22 bytes = 4MB */
+#define DRM_RAM_PERCENT 10	  /**< How much system ram can we lock? */
+
+#define _DRM_LOCK_HELD	0x80000000U /**< Hardware lock is held */
+#define _DRM_LOCK_CONT	0x40000000U /**< Hardware lock is contended */
+#define _DRM_LOCK_IS_HELD(lock)	   ((lock) & _DRM_LOCK_HELD)
+#define _DRM_LOCK_IS_CONT(lock)	   ((lock) & _DRM_LOCK_CONT)
+#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT))
+
+typedef unsigned int drm_context_t;
+typedef unsigned int drm_drawable_t;
+typedef unsigned int drm_magic_t;
+
+/*
+ * Cliprect.
+ *
+ * \warning: If you change this structure, make sure you change
+ * XF86DRIClipRectRec in the server as well
+ *
+ * \note KW: Actually it's illegal to change either for
+ * backwards-compatibility reasons.
+ */
+struct drm_clip_rect {
+	unsigned short x1;
+	unsigned short y1;
+	unsigned short x2;
+	unsigned short y2;
+};
+
+/*
+ * Drawable information.
+ */
+struct drm_drawable_info {
+	unsigned int num_rects;
+	struct drm_clip_rect *rects;
+};
+
+/*
+ * Texture region.
+ */
+struct drm_tex_region {
+	unsigned char next;
+	unsigned char prev;
+	unsigned char in_use;
+	unsigned char padding;
+	unsigned int age;
+};
+
+/*
+ * Hardware lock.
+ *
+ * The lock structure is a simple cache-line aligned integer.  To avoid
+ * processor bus contention on a multiprocessor system, there should not be any
+ * other data stored in the same cache line.
+ */
+struct drm_hw_lock {
+	__volatile__ unsigned int lock;		/**< lock variable */
+	char padding[60];			/**< Pad to cache line */
+};
+
+/*
+ * DRM_IOCTL_VERSION ioctl argument type.
+ *
+ * \sa drmGetVersion().
+ */
+struct drm_version {
+	int version_major;	  /**< Major version */
+	int version_minor;	  /**< Minor version */
+	int version_patchlevel;	  /**< Patch level */
+	__kernel_size_t name_len;	  /**< Length of name buffer */
+	char *name;	  /**< Name of driver */
+	__kernel_size_t date_len;	  /**< Length of date buffer */
+	char *date;	  /**< User-space buffer to hold date */
+	__kernel_size_t desc_len;	  /**< Length of desc buffer */
+	char *desc;	  /**< User-space buffer to hold desc */
+};
+
+/*
+ * DRM_IOCTL_GET_UNIQUE ioctl argument type.
+ *
+ * \sa drmGetBusid() and drmSetBusId().
+ */
+struct drm_unique {
+	__kernel_size_t unique_len;	  /**< Length of unique */
+	char *unique;	  /**< Unique name for driver instantiation */
+};
+
+struct drm_list {
+	int count;		  /**< Length of user-space structures */
+	struct drm_version *version;
+};
+
+struct drm_block {
+	int unused;
+};
+
+/*
+ * DRM_IOCTL_CONTROL ioctl argument type.
+ *
+ * \sa drmCtlInstHandler() and drmCtlUninstHandler().
+ */
+struct drm_control {
+	enum {
+		DRM_ADD_COMMAND,
+		DRM_RM_COMMAND,
+		DRM_INST_HANDLER,
+		DRM_UNINST_HANDLER
+	} func;
+	int irq;
+};
+
+/*
+ * Type of memory to map.
+ */
+enum drm_map_type {
+	_DRM_FRAME_BUFFER = 0,	  /**< WC (no caching), no core dump */
+	_DRM_REGISTERS = 1,	  /**< no caching, no core dump */
+	_DRM_SHM = 2,		  /**< shared, cached */
+	_DRM_AGP = 3,		  /**< AGP/GART */
+	_DRM_SCATTER_GATHER = 4,  /**< Scatter/gather memory for PCI DMA */
+	_DRM_CONSISTENT = 5	  /**< Consistent memory for PCI DMA */
+};
+
+/*
+ * Memory mapping flags.
+ */
+enum drm_map_flags {
+	_DRM_RESTRICTED = 0x01,	     /**< Cannot be mapped to user-virtual */
+	_DRM_READ_ONLY = 0x02,
+	_DRM_LOCKED = 0x04,	     /**< shared, cached, locked */
+	_DRM_KERNEL = 0x08,	     /**< kernel requires access */
+	_DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */
+	_DRM_CONTAINS_LOCK = 0x20,   /**< SHM page that contains lock */
+	_DRM_REMOVABLE = 0x40,	     /**< Removable mapping */
+	_DRM_DRIVER = 0x80	     /**< Managed by driver */
+};
+
+struct drm_ctx_priv_map {
+	unsigned int ctx_id;	 /**< Context requesting private mapping */
+	void *handle;		 /**< Handle of map */
+};
+
+/*
+ * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls
+ * argument type.
+ *
+ * \sa drmAddMap().
+ */
+struct drm_map {
+	unsigned long offset;	 /**< Requested physical address (0 for SAREA)*/
+	unsigned long size;	 /**< Requested physical size (bytes) */
+	enum drm_map_type type;	 /**< Type of memory to map */
+	enum drm_map_flags flags;	 /**< Flags */
+	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
+				 /**< Kernel-space: kernel-virtual address */
+	int mtrr;		 /**< MTRR slot used */
+	/*   Private data */
+};
+
+/*
+ * DRM_IOCTL_GET_CLIENT ioctl argument type.
+ */
+struct drm_client {
+	int idx;		/**< Which client desired? */
+	int auth;		/**< Is client authenticated? */
+	unsigned long pid;	/**< Process ID */
+	unsigned long uid;	/**< User ID */
+	unsigned long magic;	/**< Magic */
+	unsigned long iocs;	/**< Ioctl count */
+};
+
+enum drm_stat_type {
+	_DRM_STAT_LOCK,
+	_DRM_STAT_OPENS,
+	_DRM_STAT_CLOSES,
+	_DRM_STAT_IOCTLS,
+	_DRM_STAT_LOCKS,
+	_DRM_STAT_UNLOCKS,
+	_DRM_STAT_VALUE,	/**< Generic value */
+	_DRM_STAT_BYTE,		/**< Generic byte counter (1024bytes/K) */
+	_DRM_STAT_COUNT,	/**< Generic non-byte counter (1000/k) */
+
+	_DRM_STAT_IRQ,		/**< IRQ */
+	_DRM_STAT_PRIMARY,	/**< Primary DMA bytes */
+	_DRM_STAT_SECONDARY,	/**< Secondary DMA bytes */
+	_DRM_STAT_DMA,		/**< DMA */
+	_DRM_STAT_SPECIAL,	/**< Special DMA (e.g., priority or polled) */
+	_DRM_STAT_MISSED	/**< Missed DMA opportunity */
+	    /* Add to the *END* of the list */
+};
+
+/*
+ * DRM_IOCTL_GET_STATS ioctl argument type.
+ */
+struct drm_stats {
+	unsigned long count;	/* presumably the number of valid data[] entries — confirm */
+	struct {
+		unsigned long value;
+		enum drm_stat_type type;
+	} data[15];	/* 15 == number of enum drm_stat_type enumerators */
+};
+
+/*
+ * Hardware locking flags.
+ */
+enum drm_lock_flags {
+	_DRM_LOCK_READY = 0x01,	     /**< Wait until hardware is ready for DMA */
+	_DRM_LOCK_QUIESCENT = 0x02,  /**< Wait until hardware quiescent */
+	_DRM_LOCK_FLUSH = 0x04,	     /**< Flush this context's DMA queue first */
+	_DRM_LOCK_FLUSH_ALL = 0x08,  /**< Flush all DMA queues first */
+	/* These *HALT* flags aren't supported yet
+	   -- they will be used to support the
+	   full-screen DGA-like mode. */
+	_DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+	_DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+};
+
+/*
+ * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type.
+ *
+ * \sa drmGetLock() and drmUnlock().
+ */
+struct drm_lock {
+	int context;
+	enum drm_lock_flags flags;
+};
+
+/*
+ * DMA flags
+ *
+ * \warning
+ * These values \e must match xf86drm.h.
+ *
+ * \sa drm_dma.
+ */
+enum drm_dma_flags {
+	/* Flags for DMA buffer dispatch */
+	_DRM_DMA_BLOCK = 0x01,	      /**<
+				       * Block until buffer dispatched.
+				       *
+				       * \note The buffer may not yet have
+				       * been processed by the hardware --
+				       * getting a hardware lock with the
+				       * hardware quiescent will ensure
+				       * that the buffer has been
+				       * processed.
+				       */
+	_DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+	_DRM_DMA_PRIORITY = 0x04,     /**< High priority dispatch */
+
+	/* Flags for DMA buffer request */
+	_DRM_DMA_WAIT = 0x10,	      /**< Wait for free buffers */
+	_DRM_DMA_SMALLER_OK = 0x20,   /**< Smaller-than-requested buffers OK */
+	_DRM_DMA_LARGER_OK = 0x40     /**< Larger-than-requested buffers OK */
+};
+
+/*
+ * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type.
+ *
+ * \sa drmAddBufs().
+ */
+struct drm_buf_desc {
+	int count;		 /**< Number of buffers of this size */
+	int size;		 /**< Size in bytes */
+	int low_mark;		 /**< Low water mark */
+	int high_mark;		 /**< High water mark */
+	enum {
+		_DRM_PAGE_ALIGN = 0x01,	/**< Align on page boundaries for DMA */
+		_DRM_AGP_BUFFER = 0x02,	/**< Buffer is in AGP space */
+		_DRM_SG_BUFFER = 0x04,	/**< Scatter/gather memory buffer */
+		_DRM_FB_BUFFER = 0x08,	/**< Buffer is in frame buffer */
+		_DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */
+	} flags;
+	unsigned long agp_start; /**<
+				  * Start address of where the AGP buffers are
+				  * in the AGP aperture
+				  */
+};
+
+/*
+ * DRM_IOCTL_INFO_BUFS ioctl argument type.
+ */
+struct drm_buf_info {
+	int count;		/**< Entries in list */
+	struct drm_buf_desc *list;
+};
+
+/*
+ * DRM_IOCTL_FREE_BUFS ioctl argument type.
+ */
+struct drm_buf_free {
+	int count;
+	int *list;
+};
+
+/*
+ * Buffer information
+ *
+ * \sa drm_buf_map.
+ */
+struct drm_buf_pub {
+	int idx;		       /**< Index into the master buffer list */
+	int total;		       /**< Buffer size */
+	int used;		       /**< Amount of buffer in use (for DMA) */
+	void *address;	       /**< Address of buffer */
+};
+
+/*
+ * DRM_IOCTL_MAP_BUFS ioctl argument type.
+ */
+struct drm_buf_map {
+	int count;		/**< Length of the buffer list */
+#ifdef __cplusplus
+	void *virt;		/**< Same member as in the C branch; renamed because "virtual" is a C++ keyword */
+#else
+	void *virtual;		/**< Mmap'd area in user-virtual */
+#endif
+	struct drm_buf_pub *list;	/**< Buffer information */
+};
+
+/*
+ * DRM_IOCTL_DMA ioctl argument type.
+ *
+ * Indices here refer to the offset into the buffer list in drm_buf_get.
+ *
+ * \sa drmDMA().
+ */
+struct drm_dma {
+	int context;			  /**< Context handle */
+	int send_count;			  /**< Number of buffers to send */
+	int *send_indices;	  /**< List of handles to buffers */
+	int *send_sizes;		  /**< Lengths of data to send */
+	enum drm_dma_flags flags;	  /**< Flags */
+	int request_count;		  /**< Number of buffers requested */
+	int request_size;		  /**< Desired size for buffers */
+	int *request_indices;	  /**< Buffer information */
+	int *request_sizes;
+	int granted_count;		  /**< Number of buffers granted */
+};
+
+enum drm_ctx_flags {
+	_DRM_CONTEXT_PRESERVED = 0x01,
+	_DRM_CONTEXT_2DONLY = 0x02
+};
+
+/*
+ * DRM_IOCTL_ADD_CTX ioctl argument type.
+ *
+ * \sa drmCreateContext() and drmDestroyContext().
+ */
+struct drm_ctx {
+	drm_context_t handle;
+	enum drm_ctx_flags flags;
+};
+
+/*
+ * DRM_IOCTL_RES_CTX ioctl argument type.
+ */
+struct drm_ctx_res {
+	int count;
+	struct drm_ctx *contexts;
+};
+
+/*
+ * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type.
+ */
+struct drm_draw {
+	drm_drawable_t handle;
+};
+
+/*
+ * DRM_IOCTL_UPDATE_DRAW ioctl argument type.
+ */
+typedef enum {
+	DRM_DRAWABLE_CLIPRECTS
+} drm_drawable_info_type_t;
+
+struct drm_update_draw {
+	drm_drawable_t handle;
+	unsigned int type;
+	unsigned int num;
+	unsigned long long data;
+};
+
+/*
+ * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type.
+ */
+struct drm_auth {
+	drm_magic_t magic;
+};
+
+/*
+ * DRM_IOCTL_IRQ_BUSID ioctl argument type.
+ *
+ * \sa drmGetInterruptFromBusID().
+ */
+struct drm_irq_busid {
+	int irq;	/**< IRQ number */
+	int busnum;	/**< bus number */
+	int devnum;	/**< device number */
+	int funcnum;	/**< function number */
+};
+
+enum drm_vblank_seq_type {
+	_DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+	_DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+	/* bits 1-5 (mask 0x3e) are reserved for high crtcs */
+	_DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+	_DRM_VBLANK_EVENT = 0x4000000,   /**< Send event instead of blocking */
+	_DRM_VBLANK_FLIP = 0x8000000,   /**< Scheduled buffer swap should flip */
+	_DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+	_DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+	_DRM_VBLANK_SIGNAL = 0x40000000	/**< Send signal instead of blocking, unsupported */
+};
+#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1
+
+#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE)
+#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \
+				_DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS)
+
+struct drm_wait_vblank_request {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	unsigned long signal;
+};
+
+struct drm_wait_vblank_reply {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	long tval_sec;
+	long tval_usec;
+};
+
+/*
+ * DRM_IOCTL_WAIT_VBLANK ioctl argument type.
+ *
+ * \sa drmWaitVBlank().
+ */
+union drm_wait_vblank {
+	struct drm_wait_vblank_request request;
+	struct drm_wait_vblank_reply reply;
+};
+
+#define _DRM_PRE_MODESET 1
+#define _DRM_POST_MODESET 2
+
+/*
+ * DRM_IOCTL_MODESET_CTL ioctl argument type
+ *
+ * \sa drmModesetCtl().
+ */
+struct drm_modeset_ctl {
+	__u32 crtc;
+	__u32 cmd;
+};
+
+/*
+ * DRM_IOCTL_AGP_ENABLE ioctl argument type.
+ *
+ * \sa drmAgpEnable().
+ */
+struct drm_agp_mode {
+	unsigned long mode;	/**< AGP mode */
+};
+
+/*
+ * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type.
+ *
+ * \sa drmAgpAlloc() and drmAgpFree().
+ */
+struct drm_agp_buffer {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for binding / unbinding */
+	unsigned long type;	/**< Type of memory to allocate */
+	unsigned long physical;	/**< Physical used by i810 */
+};
+
+/*
+ * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type.
+ *
+ * \sa drmAgpBind() and drmAgpUnbind().
+ */
+struct drm_agp_binding {
+	unsigned long handle;	/**< From drm_agp_buffer */
+	unsigned long offset;	/**< In bytes -- will round to page boundary */
+};
+
+/*
+ * DRM_IOCTL_AGP_INFO ioctl argument type.
+ *
+ * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(),
+ * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(),
+ * drmAgpVendorId() and drmAgpDeviceId().
+ */
+struct drm_agp_info {
+	int agp_version_major;
+	int agp_version_minor;
+	unsigned long mode;
+	unsigned long aperture_base;	/* physical address */
+	unsigned long aperture_size;	/* bytes */
+	unsigned long memory_allowed;	/* bytes */
+	unsigned long memory_used;
+
+	/* PCI information */
+	unsigned short id_vendor;
+	unsigned short id_device;
+};
+
+/*
+ * DRM_IOCTL_SG_ALLOC ioctl argument type.
+ */
+struct drm_scatter_gather {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for mapping / unmapping */
+};
+
+/*
+ * DRM_IOCTL_SET_VERSION ioctl argument type.
+ */
+struct drm_set_version {
+	int drm_di_major;
+	int drm_di_minor;
+	int drm_dd_major;
+	int drm_dd_minor;
+};
+
+/* DRM_IOCTL_GEM_CLOSE ioctl argument type */
+struct drm_gem_close {
+	/** Handle of the object to be closed. */
+	__u32 handle;
+	__u32 pad;
+};
+
+/* DRM_IOCTL_GEM_FLINK ioctl argument type */
+struct drm_gem_flink {
+	/** Handle for the object being named */
+	__u32 handle;
+
+	/** Returned global name */
+	__u32 name;
+};
+
+/* DRM_IOCTL_GEM_OPEN ioctl argument type */
+struct drm_gem_open {
+	/** Name of object being opened */
+	__u32 name;
+
+	/** Returned handle for the object */
+	__u32 handle;
+
+	/** Returned size of the object */
+	__u64 size;
+};
+
+/**
+ * DRM_CAP_DUMB_BUFFER
+ *
+ * If set to 1, the driver supports creating dumb buffers via the
+ * &DRM_IOCTL_MODE_CREATE_DUMB ioctl.
+ */
+#define DRM_CAP_DUMB_BUFFER		0x1
+/**
+ * DRM_CAP_VBLANK_HIGH_CRTC
+ *
+ * If set to 1, the kernel supports specifying a :ref:`CRTC index<crtc_index>`
+ * in the high bits of &drm_wait_vblank_request.type.
+ *
+ * Starting kernel version 2.6.39, this capability is always set to 1.
+ */
+#define DRM_CAP_VBLANK_HIGH_CRTC	0x2
+/**
+ * DRM_CAP_DUMB_PREFERRED_DEPTH
+ *
+ * The preferred bit depth for dumb buffers.
+ *
+ * The bit depth is the number of bits used to indicate the color of a single
+ * pixel excluding any padding. This is different from the number of bits per
+ * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per
+ * pixel.
+ *
+ * Note that this preference only applies to dumb buffers, it's irrelevant for
+ * other types of buffers.
+ */
+#define DRM_CAP_DUMB_PREFERRED_DEPTH	0x3
+/**
+ * DRM_CAP_DUMB_PREFER_SHADOW
+ *
+ * If set to 1, the driver prefers userspace to render to a shadow buffer
+ * instead of directly rendering to a dumb buffer. For best speed, userspace
+ * should do streaming ordered memory copies into the dumb buffer and never
+ * read from it.
+ *
+ * Note that this preference only applies to dumb buffers, it's irrelevant for
+ * other types of buffers.
+ */
+#define DRM_CAP_DUMB_PREFER_SHADOW	0x4
+/**
+ * DRM_CAP_PRIME
+ *
+ * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT
+ * and &DRM_PRIME_CAP_EXPORT.
+ *
+ * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and
+ * &DRM_PRIME_CAP_EXPORT are always advertised.
+ *
+ * PRIME buffers are exposed as dma-buf file descriptors.
+ * See :ref:`prime_buffer_sharing`.
+ */
+#define DRM_CAP_PRIME			0x5
+/**
+ * DRM_PRIME_CAP_IMPORT
+ *
+ * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME
+ * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl.
+ *
+ * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME.
+ */
+#define  DRM_PRIME_CAP_IMPORT		0x1
+/**
+ * DRM_PRIME_CAP_EXPORT
+ *
+ * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME
+ * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl.
+ *
+ * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME.
+ */
+#define  DRM_PRIME_CAP_EXPORT		0x2
+/**
+ * DRM_CAP_TIMESTAMP_MONOTONIC
+ *
+ * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in
+ * struct drm_event_vblank. If set to 1, the kernel will report timestamps with
+ * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these
+ * clocks.
+ *
+ * Starting from kernel version 2.6.39, the default value for this capability
+ * is 1. Starting kernel version 4.15, this capability is always set to 1.
+ */
+#define DRM_CAP_TIMESTAMP_MONOTONIC	0x6
+/**
+ * DRM_CAP_ASYNC_PAGE_FLIP
+ *
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy
+ * page-flips.
+ */
+#define DRM_CAP_ASYNC_PAGE_FLIP		0x7
+/**
+ * DRM_CAP_CURSOR_WIDTH
+ *
+ * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid
+ * width x height combination for the hardware cursor. The intention is that a
+ * hardware agnostic userspace can query a cursor plane size to use.
+ *
+ * Note that the cross-driver contract is to merely return a valid size;
+ * drivers are free to attach another meaning on top, eg. i915 returns the
+ * maximum plane size.
+ */
+#define DRM_CAP_CURSOR_WIDTH		0x8
+/**
+ * DRM_CAP_CURSOR_HEIGHT
+ *
+ * See &DRM_CAP_CURSOR_WIDTH.
+ */
+#define DRM_CAP_CURSOR_HEIGHT		0x9
+/**
+ * DRM_CAP_ADDFB2_MODIFIERS
+ *
+ * If set to 1, the driver supports supplying modifiers in the
+ * &DRM_IOCTL_MODE_ADDFB2 ioctl.
+ */
+#define DRM_CAP_ADDFB2_MODIFIERS	0x10
+/**
+ * DRM_CAP_PAGE_FLIP_TARGET
+ *
+ * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and
+ * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in
+ * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP
+ * ioctl.
+ */
+#define DRM_CAP_PAGE_FLIP_TARGET	0x11
+/**
+ * DRM_CAP_CRTC_IN_VBLANK_EVENT
+ *
+ * If set to 1, the kernel supports reporting the CRTC ID in
+ * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and
+ * &DRM_EVENT_FLIP_COMPLETE events.
+ *
+ * Starting kernel version 4.12, this capability is always set to 1.
+ */
+#define DRM_CAP_CRTC_IN_VBLANK_EVENT	0x12
+/**
+ * DRM_CAP_SYNCOBJ
+ *
+ * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`.
+ */
+#define DRM_CAP_SYNCOBJ		0x13
+/**
+ * DRM_CAP_SYNCOBJ_TIMELINE
+ *
+ * If set to 1, the driver supports timeline operations on sync objects. See
+ * :ref:`drm_sync_objects`.
+ */
+#define DRM_CAP_SYNCOBJ_TIMELINE	0x14
+/**
+ * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP
+ *
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic
+ * commits.
+ */
+#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP	0x15
+
+/* DRM_IOCTL_GET_CAP ioctl argument type */
+struct drm_get_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+/**
+ * DRM_CLIENT_CAP_STEREO_3D
+ *
+ * If set to 1, the DRM core will expose the stereo 3D capabilities of the
+ * monitor by advertising the supported 3D layouts in the flags of struct
+ * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``.
+ *
+ * This capability is always supported for all drivers starting from kernel
+ * version 3.13.
+ */
+#define DRM_CLIENT_CAP_STEREO_3D	1
+
+/**
+ * DRM_CLIENT_CAP_UNIVERSAL_PLANES
+ *
+ * If set to 1, the DRM core will expose all planes (overlay, primary, and
+ * cursor) to userspace.
+ *
+ * This capability has been introduced in kernel version 3.15. Starting from
+ * kernel version 3.17, this capability is always supported for all drivers.
+ */
+#define DRM_CLIENT_CAP_UNIVERSAL_PLANES  2
+
+/**
+ * DRM_CLIENT_CAP_ATOMIC
+ *
+ * If set to 1, the DRM core will expose atomic properties to userspace. This
+ * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and
+ * &DRM_CLIENT_CAP_ASPECT_RATIO.
+ *
+ * If the driver doesn't support atomic mode-setting, enabling this capability
+ * will fail with -EOPNOTSUPP.
+ *
+ * This capability has been introduced in kernel version 4.0. Starting from
+ * kernel version 4.2, this capability is always supported for atomic-capable
+ * drivers.
+ */
+#define DRM_CLIENT_CAP_ATOMIC	3
+
+/**
+ * DRM_CLIENT_CAP_ASPECT_RATIO
+ *
+ * If set to 1, the DRM core will provide aspect ratio information in modes.
+ * See ``DRM_MODE_FLAG_PIC_AR_*``.
+ *
+ * This capability is always supported for all drivers starting from kernel
+ * version 4.18.
+ */
+#define DRM_CLIENT_CAP_ASPECT_RATIO    4
+
+/**
+ * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS
+ *
+ * If set to 1, the DRM core will expose special connectors to be used for
+ * writing back to memory the scene setup in the commit. The client must enable
+ * &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * This capability is always supported for atomic-capable drivers starting from
+ * kernel version 4.19.
+ */
+#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS	5
+
+/**
+ * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT
+ *
+ * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and
+ * virtualbox) have additional restrictions for cursor planes (thus
+ * making cursor planes on those drivers not truly universal,) e.g.
+ * they need cursor planes to act like one would expect from a mouse
+ * cursor and have correctly set hotspot properties.
+ * If this client cap is not set the DRM core will hide cursor plane on
+ * those virtualized drivers because not setting it implies that the
+ * client is not capable of dealing with those extra restrictions.
+ * Clients which do set cursor hotspot and treat the cursor plane
+ * like a mouse cursor should set this property.
+ * The client must enable &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * Setting this property on drivers which do not special case
+ * cursor planes (i.e. non-virtualized drivers) will return
+ * EOPNOTSUPP, which can be used by userspace to gauge
+ * requirements of the hardware/drivers they're running on.
+ *
+ * This capability is always supported for atomic-capable virtualized
+ * drivers starting from kernel version 6.6.
+ */
+#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT	6
+
+/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */
+struct drm_set_client_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+#define DRM_RDWR O_RDWR	/* NOTE(review): O_RDWR is not provided by the includes visible in this header; presumably needs <fcntl.h> */
+#define DRM_CLOEXEC O_CLOEXEC	/* NOTE(review): same dependency as DRM_RDWR */
+struct drm_prime_handle {
+	__u32 handle;
+
+	/** Flags; only applicable for the handle->fd direction */
+	__u32 flags;
+
+	/** Returned dmabuf file descriptor */
+	__s32 fd;
+};
+
+struct drm_syncobj_create {
+	__u32 handle;
+#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0)
+	__u32 flags;
+};
+
+struct drm_syncobj_destroy {
+	__u32 handle;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0)
+#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0)
+struct drm_syncobj_handle {
+	__u32 handle;
+	__u32 flags;
+
+	__s32 fd;
+	__u32 pad;
+};
+
+struct drm_syncobj_transfer {
+	__u32 src_handle;
+	__u32 dst_handle;
+	__u64 src_point;
+	__u64 dst_point;
+	__u32 flags;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */
+struct drm_syncobj_wait {
+	__u64 handles;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+struct drm_syncobj_timeline_wait {
+	__u64 handles;
+	/* wait on a specific timeline point for every handle */
+	__u64 points;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+/**
+ * struct drm_syncobj_eventfd
+ * @handle: syncobj handle.
+ * @flags: Zero to wait for the point to be signalled, or
+ *         &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be
+ *         available for the point.
+ * @point: syncobj timeline point (set to zero for binary syncobjs).
+ * @fd: Existing eventfd to send events to.
+ * @pad: Must be zero.
+ *
+ * Register an eventfd to be signalled by a syncobj. The eventfd counter will
+ * be incremented by one.
+ */
+struct drm_syncobj_eventfd {
+	__u32 handle;
+	__u32 flags;
+	__u64 point;
+	__s32 fd;
+	__u32 pad;
+};
+
+
+struct drm_syncobj_array {
+	__u64 handles;
+	__u32 count_handles;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */
+struct drm_syncobj_timeline_array {
+	__u64 handles;
+	__u64 points;
+	__u32 count_handles;
+	__u32 flags;
+};
+
+
+/* Query current scanout sequence number */
+struct drm_crtc_get_sequence {
+	__u32 crtc_id;		/* requested crtc_id */
+	__u32 active;		/* return: crtc output is active */
+	__u64 sequence;		/* return: most recent vblank sequence */
+	__s64 sequence_ns;	/* return: most recent time of first pixel out */
+};
+
+/* Queue event to be delivered at specified sequence. Time stamp marks
+ * when the first pixel of the refresh cycle leaves the display engine
+ * for the display
+ */
+#define DRM_CRTC_SEQUENCE_RELATIVE		0x00000001	/* sequence is relative to current */
+#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS		0x00000002	/* Use next sequence if we've missed */
+
+struct drm_crtc_queue_sequence {
+	__u32 crtc_id;
+	__u32 flags;
+	__u64 sequence;		/* on input, target sequence. on output, actual sequence */
+	__u64 user_data;	/* user data passed to event */
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "drm_mode.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_IOCTL_BASE			'd'
+#define DRM_IO(nr)			_IO(DRM_IOCTL_BASE,nr)
+#define DRM_IOR(nr,type)		_IOR(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOW(nr,type)		_IOW(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOWR(nr,type)		_IOWR(DRM_IOCTL_BASE,nr,type)
+
+#define DRM_IOCTL_VERSION		DRM_IOWR(0x00, struct drm_version)
+#define DRM_IOCTL_GET_UNIQUE		DRM_IOWR(0x01, struct drm_unique)
+#define DRM_IOCTL_GET_MAGIC		DRM_IOR( 0x02, struct drm_auth)
+#define DRM_IOCTL_IRQ_BUSID		DRM_IOWR(0x03, struct drm_irq_busid)
+#define DRM_IOCTL_GET_MAP               DRM_IOWR(0x04, struct drm_map)
+#define DRM_IOCTL_GET_CLIENT            DRM_IOWR(0x05, struct drm_client)
+#define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
+#define DRM_IOCTL_SET_VERSION		DRM_IOWR(0x07, struct drm_set_version)
+#define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+/**
+ * DRM_IOCTL_GEM_CLOSE - Close a GEM handle.
+ *
+ * GEM handles are not reference-counted by the kernel. User-space is
+ * responsible for managing their lifetime. For example, if user-space imports
+ * the same memory object twice on the same DRM file description, the same GEM
+ * handle is returned by both imports, and user-space needs to ensure
+ * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen
+ * when a memory object is allocated, then exported and imported again on the
+ * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception
+ * and always returns fresh new GEM handles even if an existing GEM handle
+ * already refers to the same memory object before the IOCTL is performed.
+ */
+#define DRM_IOCTL_GEM_CLOSE		DRM_IOW (0x09, struct drm_gem_close)
+#define DRM_IOCTL_GEM_FLINK		DRM_IOWR(0x0a, struct drm_gem_flink)
+#define DRM_IOCTL_GEM_OPEN		DRM_IOWR(0x0b, struct drm_gem_open)
+#define DRM_IOCTL_GET_CAP		DRM_IOWR(0x0c, struct drm_get_cap)
+#define DRM_IOCTL_SET_CLIENT_CAP	DRM_IOW( 0x0d, struct drm_set_client_cap)
+
+#define DRM_IOCTL_SET_UNIQUE		DRM_IOW( 0x10, struct drm_unique)
+#define DRM_IOCTL_AUTH_MAGIC		DRM_IOW( 0x11, struct drm_auth)
+#define DRM_IOCTL_BLOCK			DRM_IOWR(0x12, struct drm_block)
+#define DRM_IOCTL_UNBLOCK		DRM_IOWR(0x13, struct drm_block)
+#define DRM_IOCTL_CONTROL		DRM_IOW( 0x14, struct drm_control)
+#define DRM_IOCTL_ADD_MAP		DRM_IOWR(0x15, struct drm_map)
+#define DRM_IOCTL_ADD_BUFS		DRM_IOWR(0x16, struct drm_buf_desc)
+#define DRM_IOCTL_MARK_BUFS		DRM_IOW( 0x17, struct drm_buf_desc)
+#define DRM_IOCTL_INFO_BUFS		DRM_IOWR(0x18, struct drm_buf_info)
+#define DRM_IOCTL_MAP_BUFS		DRM_IOWR(0x19, struct drm_buf_map)
+#define DRM_IOCTL_FREE_BUFS		DRM_IOW( 0x1a, struct drm_buf_free)
+
+#define DRM_IOCTL_RM_MAP		DRM_IOW( 0x1b, struct drm_map)
+
+#define DRM_IOCTL_SET_SAREA_CTX		DRM_IOW( 0x1c, struct drm_ctx_priv_map)
+#define DRM_IOCTL_GET_SAREA_CTX 	DRM_IOWR(0x1d, struct drm_ctx_priv_map)
+
+#define DRM_IOCTL_SET_MASTER            DRM_IO(0x1e)
+#define DRM_IOCTL_DROP_MASTER           DRM_IO(0x1f)
+
+#define DRM_IOCTL_ADD_CTX		DRM_IOWR(0x20, struct drm_ctx)
+#define DRM_IOCTL_RM_CTX		DRM_IOWR(0x21, struct drm_ctx)
+#define DRM_IOCTL_MOD_CTX		DRM_IOW( 0x22, struct drm_ctx)
+#define DRM_IOCTL_GET_CTX		DRM_IOWR(0x23, struct drm_ctx)
+#define DRM_IOCTL_SWITCH_CTX		DRM_IOW( 0x24, struct drm_ctx)
+#define DRM_IOCTL_NEW_CTX		DRM_IOW( 0x25, struct drm_ctx)
+#define DRM_IOCTL_RES_CTX		DRM_IOWR(0x26, struct drm_ctx_res)
+#define DRM_IOCTL_ADD_DRAW		DRM_IOWR(0x27, struct drm_draw)
+#define DRM_IOCTL_RM_DRAW		DRM_IOWR(0x28, struct drm_draw)
+#define DRM_IOCTL_DMA			DRM_IOWR(0x29, struct drm_dma)
+#define DRM_IOCTL_LOCK			DRM_IOW( 0x2a, struct drm_lock)
+#define DRM_IOCTL_UNLOCK		DRM_IOW( 0x2b, struct drm_lock)
+#define DRM_IOCTL_FINISH		DRM_IOW( 0x2c, struct drm_lock)
+
+/**
+ * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD.
+ *
+ * User-space sets &drm_prime_handle.handle with the GEM handle to export and
+ * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in
+ * &drm_prime_handle.fd.
+ *
+ * The export can fail for any driver-specific reason, e.g. because export is
+ * not supported for this specific GEM handle (but might be for others).
+ *
+ * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT.
+ */
+#define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+/**
+ * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle.
+ *
+ * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to
+ * import, and gets back a GEM handle in &drm_prime_handle.handle.
+ * &drm_prime_handle.flags is unused.
+ *
+ * If an existing GEM handle refers to the memory object backing the DMA-BUF,
+ * that GEM handle is returned. Therefore user-space which needs to handle
+ * arbitrary DMA-BUFs must have a user-space lookup data structure to manually
+ * reference-count duplicated GEM handles. For more information see
+ * &DRM_IOCTL_GEM_CLOSE.
+ *
+ * The import can fail for any driver-specific reason, e.g. because import is
+ * only supported for DMA-BUFs allocated on this DRM device.
+ *
+ * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT.
+ */
+#define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
+
+#define DRM_IOCTL_AGP_ACQUIRE		DRM_IO(  0x30)
+#define DRM_IOCTL_AGP_RELEASE		DRM_IO(  0x31)
+#define DRM_IOCTL_AGP_ENABLE		DRM_IOW( 0x32, struct drm_agp_mode)
+#define DRM_IOCTL_AGP_INFO		DRM_IOR( 0x33, struct drm_agp_info)
+#define DRM_IOCTL_AGP_ALLOC		DRM_IOWR(0x34, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_FREE		DRM_IOW( 0x35, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_BIND		DRM_IOW( 0x36, struct drm_agp_binding)
+#define DRM_IOCTL_AGP_UNBIND		DRM_IOW( 0x37, struct drm_agp_binding)
+
+#define DRM_IOCTL_SG_ALLOC		DRM_IOWR(0x38, struct drm_scatter_gather)
+#define DRM_IOCTL_SG_FREE		DRM_IOW( 0x39, struct drm_scatter_gather)
+
+#define DRM_IOCTL_WAIT_VBLANK		DRM_IOWR(0x3a, union drm_wait_vblank)
+
+#define DRM_IOCTL_CRTC_GET_SEQUENCE	DRM_IOWR(0x3b, struct drm_crtc_get_sequence)
+#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE	DRM_IOWR(0x3c, struct drm_crtc_queue_sequence)
+
+#define DRM_IOCTL_UPDATE_DRAW		DRM_IOW(0x3f, struct drm_update_draw)
+
+#define DRM_IOCTL_MODE_GETRESOURCES	DRM_IOWR(0xA0, struct drm_mode_card_res)
+#define DRM_IOCTL_MODE_GETCRTC		DRM_IOWR(0xA1, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_SETCRTC		DRM_IOWR(0xA2, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_CURSOR		DRM_IOWR(0xA3, struct drm_mode_cursor)
+#define DRM_IOCTL_MODE_GETGAMMA		DRM_IOWR(0xA4, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_SETGAMMA		DRM_IOWR(0xA5, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_GETENCODER	DRM_IOWR(0xA6, struct drm_mode_get_encoder)
+#define DRM_IOCTL_MODE_GETCONNECTOR	DRM_IOWR(0xA7, struct drm_mode_get_connector)
+#define DRM_IOCTL_MODE_ATTACHMODE	DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+#define DRM_IOCTL_MODE_DETACHMODE	DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+
+#define DRM_IOCTL_MODE_GETPROPERTY	DRM_IOWR(0xAA, struct drm_mode_get_property)
+#define DRM_IOCTL_MODE_SETPROPERTY	DRM_IOWR(0xAB, struct drm_mode_connector_set_property)
+#define DRM_IOCTL_MODE_GETPROPBLOB	DRM_IOWR(0xAC, struct drm_mode_get_blob)
+#define DRM_IOCTL_MODE_GETFB		DRM_IOWR(0xAD, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_ADDFB		DRM_IOWR(0xAE, struct drm_mode_fb_cmd)
+/**
+ * DRM_IOCTL_MODE_RMFB - Remove a framebuffer.
+ *
+ * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL
+ * argument is a framebuffer object ID.
+ *
+ * Warning: removing a framebuffer currently in-use on an enabled plane will
+ * disable that plane. The CRTC the plane is linked to may also be disabled
+ * (depending on driver capabilities).
+ */
+#define DRM_IOCTL_MODE_RMFB		DRM_IOWR(0xAF, unsigned int)
+#define DRM_IOCTL_MODE_PAGE_FLIP	DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip)
+#define DRM_IOCTL_MODE_DIRTYFB		DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd)
+
+/**
+ * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object.
+ *
+ * KMS dumb buffers provide a very primitive way to allocate a buffer object
+ * suitable for scanout and map it for software rendering. KMS dumb buffers are
+ * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb
+ * buffers are not suitable to be displayed on any other device than the KMS
+ * device where they were allocated from. Also see
+ * :ref:`kms_dumb_buffer_objects`.
+ *
+ * The IOCTL argument is a struct drm_mode_create_dumb.
+ *
+ * User-space is expected to create a KMS dumb buffer via this IOCTL, then add
+ * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via
+ * &DRM_IOCTL_MODE_MAP_DUMB.
+ *
+ * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported.
+ * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate
+ * driver preferences for dumb buffers.
+ */
+#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb)
+#define DRM_IOCTL_MODE_MAP_DUMB    DRM_IOWR(0xB3, struct drm_mode_map_dumb)
+#define DRM_IOCTL_MODE_DESTROY_DUMB    DRM_IOWR(0xB4, struct drm_mode_destroy_dumb)
+#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res)
+#define DRM_IOCTL_MODE_GETPLANE	DRM_IOWR(0xB6, struct drm_mode_get_plane)
+#define DRM_IOCTL_MODE_SETPLANE	DRM_IOWR(0xB7, struct drm_mode_set_plane)
+#define DRM_IOCTL_MODE_ADDFB2		DRM_IOWR(0xB8, struct drm_mode_fb_cmd2)
+#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES	DRM_IOWR(0xB9, struct drm_mode_obj_get_properties)
+#define DRM_IOCTL_MODE_OBJ_SETPROPERTY	DRM_IOWR(0xBA, struct drm_mode_obj_set_property)
+#define DRM_IOCTL_MODE_CURSOR2		DRM_IOWR(0xBB, struct drm_mode_cursor2)
+#define DRM_IOCTL_MODE_ATOMIC		DRM_IOWR(0xBC, struct drm_mode_atomic)
+#define DRM_IOCTL_MODE_CREATEPROPBLOB	DRM_IOWR(0xBD, struct drm_mode_create_blob)
+#define DRM_IOCTL_MODE_DESTROYPROPBLOB	DRM_IOWR(0xBE, struct drm_mode_destroy_blob)
+
+#define DRM_IOCTL_SYNCOBJ_CREATE	DRM_IOWR(0xBF, struct drm_syncobj_create)
+#define DRM_IOCTL_SYNCOBJ_DESTROY	DRM_IOWR(0xC0, struct drm_syncobj_destroy)
+#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD	DRM_IOWR(0xC1, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE	DRM_IOWR(0xC2, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_WAIT		DRM_IOWR(0xC3, struct drm_syncobj_wait)
+#define DRM_IOCTL_SYNCOBJ_RESET		DRM_IOWR(0xC4, struct drm_syncobj_array)
+#define DRM_IOCTL_SYNCOBJ_SIGNAL	DRM_IOWR(0xC5, struct drm_syncobj_array)
+
+#define DRM_IOCTL_MODE_CREATE_LEASE	DRM_IOWR(0xC6, struct drm_mode_create_lease)
+#define DRM_IOCTL_MODE_LIST_LESSEES	DRM_IOWR(0xC7, struct drm_mode_list_lessees)
+#define DRM_IOCTL_MODE_GET_LEASE	DRM_IOWR(0xC8, struct drm_mode_get_lease)
+#define DRM_IOCTL_MODE_REVOKE_LEASE	DRM_IOWR(0xC9, struct drm_mode_revoke_lease)
+
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT	DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait)
+#define DRM_IOCTL_SYNCOBJ_QUERY		DRM_IOWR(0xCB, struct drm_syncobj_timeline_array)
+#define DRM_IOCTL_SYNCOBJ_TRANSFER	DRM_IOWR(0xCC, struct drm_syncobj_transfer)
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL	DRM_IOWR(0xCD, struct drm_syncobj_timeline_array)
+
+/**
+ * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata.
+ *
+ * This queries metadata about a framebuffer. User-space fills
+ * &drm_mode_fb_cmd2.fb_id as the input, and the kernel fills the rest of the
+ * struct as the output.
+ *
+ * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
+ * will be filled with GEM buffer handles. Fresh new GEM handles are always
+ * returned, even if another GEM handle referring to the same memory object
+ * already exists on the DRM file description. The caller is responsible for
+ * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
+ * new handle will be returned for multiple planes in case they use the same
+ * memory object. Planes are valid until one has a zero handle -- this can be
+ * used to compute the number of planes.
+ *
+ * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
+ * until one has a zero &drm_mode_fb_cmd2.pitches.
+ *
+ * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
+ * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
+ * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
+ *
+ * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space
+ * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately
+ * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not
+ * double-close handles which are specified multiple times in the array.
+ */
+#define DRM_IOCTL_MODE_GETFB2		DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
+
+#define DRM_IOCTL_SYNCOBJ_EVENTFD	DRM_IOWR(0xCF, struct drm_syncobj_eventfd)
+
+/**
+ * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer.
+ *
+ * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL
+ * argument is a framebuffer object ID.
+ *
+ * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable
+ * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept
+ * alive. When the plane no longer uses the framebuffer (because the
+ * framebuffer is replaced with another one, or the plane is disabled), the
+ * framebuffer is cleaned up.
+ *
+ * This is useful to implement flicker-free transitions between two processes.
+ *
+ * Depending on the threat model, user-space may want to ensure that the
+ * framebuffer doesn't expose any sensitive user information: closed
+ * framebuffers attached to a plane can be read back by the next DRM master.
+ */
+#define DRM_IOCTL_MODE_CLOSEFB		DRM_IOWR(0xD0, struct drm_mode_closefb)
+
+/*
+ * Device specific ioctls should only be in their respective headers
+ * The device specific ioctl range is from 0x40 to 0x9f.
+ * Generic IOCTLS restart at 0xA0.
+ *
+ * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and
+ * drmCommandReadWrite().
+ */
+#define DRM_COMMAND_BASE                0x40
+#define DRM_COMMAND_END			0xA0
+
+/**
+ * struct drm_event - Header for DRM events
+ * @type: event type.
+ * @length: total number of payload bytes (including header).
+ *
+ * This struct is a header for events written back to user-space on the DRM FD.
+ * A read on the DRM FD will always only return complete events: e.g. if the
+ * read buffer is 100 bytes large and there are two 64 byte events pending,
+ * only one will be returned.
+ *
+ * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and
+ * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK,
+ * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE.
+ */
+struct drm_event {
+	__u32 type; /* event type; 0 - 0x7fffffff generic, 0x80000000 and up chipset specific */
+	__u32 length; /* total size in bytes, header included */
+};
+
+/**
+ * DRM_EVENT_VBLANK - vertical blanking event
+ *
+ * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the
+ * &_DRM_VBLANK_EVENT flag set.
+ *
+ * The event payload is a struct drm_event_vblank.
+ */
+#define DRM_EVENT_VBLANK 0x01
+/**
+ * DRM_EVENT_FLIP_COMPLETE - page-flip completion event
+ *
+ * This event is sent in response to an atomic commit or legacy page-flip with
+ * the &DRM_MODE_PAGE_FLIP_EVENT flag set.
+ *
+ * The event payload is a struct drm_event_vblank.
+ */
+#define DRM_EVENT_FLIP_COMPLETE 0x02
+/**
+ * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event
+ *
+ * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE.
+ *
+ * The event payload is a struct drm_event_crtc_sequence.
+ */
+#define DRM_EVENT_CRTC_SEQUENCE	0x03
+
+struct drm_event_vblank { /* payload of DRM_EVENT_VBLANK and DRM_EVENT_FLIP_COMPLETE */
+	struct drm_event base; /* common event header (type, length) */
+	__u64 user_data; /* NOTE(review): presumably an opaque value echoed back from the request -- confirm */
+	__u32 tv_sec; /* NOTE(review): presumably the seconds part of the event time stamp -- confirm */
+	__u32 tv_usec; /* NOTE(review): presumably the microseconds part of the event time stamp -- confirm */
+	__u32 sequence; /* NOTE(review): presumably the vblank sequence counter -- confirm */
+	__u32 crtc_id; /* 0 on older kernels that do not support this */
+};
+
+/* Event delivered at sequence. Time stamp marks when the first pixel
+ * of the refresh cycle leaves the display engine for the display
+ */
+struct drm_event_crtc_sequence { /* payload of DRM_EVENT_CRTC_SEQUENCE */
+	struct drm_event	base; /* common event header (type, length) */
+	__u64			user_data; /* NOTE(review): presumably an opaque value echoed back from the request -- confirm */
+	__s64			time_ns; /* NOTE(review): presumably the time stamp in nanoseconds -- confirm */
+	__u64			sequence; /* NOTE(review): presumably the sequence the event was delivered at -- confirm */
+};
+
+/* typedef area: short *_t aliases for the struct, enum and union tags below */
+typedef struct drm_clip_rect drm_clip_rect_t;
+typedef struct drm_drawable_info drm_drawable_info_t;
+typedef struct drm_tex_region drm_tex_region_t;
+typedef struct drm_hw_lock drm_hw_lock_t;
+typedef struct drm_version drm_version_t;
+typedef struct drm_unique drm_unique_t;
+typedef struct drm_list drm_list_t;
+typedef struct drm_block drm_block_t;
+typedef struct drm_control drm_control_t;
+typedef enum drm_map_type drm_map_type_t;
+typedef enum drm_map_flags drm_map_flags_t;
+typedef struct drm_ctx_priv_map drm_ctx_priv_map_t;
+typedef struct drm_map drm_map_t;
+typedef struct drm_client drm_client_t;
+typedef enum drm_stat_type drm_stat_type_t;
+typedef struct drm_stats drm_stats_t;
+typedef enum drm_lock_flags drm_lock_flags_t;
+typedef struct drm_lock drm_lock_t;
+typedef enum drm_dma_flags drm_dma_flags_t;
+typedef struct drm_buf_desc drm_buf_desc_t;
+typedef struct drm_buf_info drm_buf_info_t;
+typedef struct drm_buf_free drm_buf_free_t;
+typedef struct drm_buf_pub drm_buf_pub_t;
+typedef struct drm_buf_map drm_buf_map_t;
+typedef struct drm_dma drm_dma_t;
+typedef union drm_wait_vblank drm_wait_vblank_t;
+typedef struct drm_agp_mode drm_agp_mode_t;
+typedef enum drm_ctx_flags drm_ctx_flags_t;
+typedef struct drm_ctx drm_ctx_t;
+typedef struct drm_ctx_res drm_ctx_res_t;
+typedef struct drm_draw drm_draw_t;
+typedef struct drm_update_draw drm_update_draw_t;
+typedef struct drm_auth drm_auth_t;
+typedef struct drm_irq_busid drm_irq_busid_t;
+typedef enum drm_vblank_seq_type drm_vblank_seq_type_t;
+
+typedef struct drm_agp_buffer drm_agp_buffer_t;
+typedef struct drm_agp_binding drm_agp_binding_t;
+typedef struct drm_agp_info drm_agp_info_t;
+typedef struct drm_scatter_gather drm_scatter_gather_t;
+typedef struct drm_set_version drm_set_version_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h
new file mode 100644
index 0000000000..d390011b89
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h
@@ -0,0 +1,1360 @@
+/*
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ * Copyright (c) 2007 Jakob Bornecrantz <wallbraker@gmail.com>
+ * Copyright (c) 2008 Red Hat Inc.
+ * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA
+ * Copyright (c) 2007-2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_MODE_H
+#define _DRM_MODE_H
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * DOC: overview
+ *
+ * DRM exposes many UAPI and structure definitions to have a consistent
+ * and standardized interface with users.
+ * Userspace can refer to these structure definitions and UAPI formats
+ * to communicate to drivers.
+ */
+
+#define DRM_CONNECTOR_NAME_LEN	32
+#define DRM_DISPLAY_MODE_LEN	32
+#define DRM_PROP_NAME_LEN	32
+
+#define DRM_MODE_TYPE_BUILTIN	(1<<0) /* deprecated */
+#define DRM_MODE_TYPE_CLOCK_C	((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */
+#define DRM_MODE_TYPE_CRTC_C	((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */
+#define DRM_MODE_TYPE_PREFERRED	(1<<3)
+#define DRM_MODE_TYPE_DEFAULT	(1<<4) /* deprecated */
+#define DRM_MODE_TYPE_USERDEF	(1<<5)
+#define DRM_MODE_TYPE_DRIVER	(1<<6)
+
+#define DRM_MODE_TYPE_ALL	(DRM_MODE_TYPE_PREFERRED |	\
+				 DRM_MODE_TYPE_USERDEF |	\
+				 DRM_MODE_TYPE_DRIVER)
+
+/* Video mode flags */
+/* bit compatible with the xrandr RR_ definitions (bits 0-13)
+ *
+ * ABI warning: Existing userspace really expects
+ * the mode flags to match the xrandr definitions. Any
+ * changes that don't match the xrandr definitions will
+ * likely need a new client cap or some other mechanism
+ * to avoid breaking existing userspace. This includes
+ * allocating new flags in the previously unused bits!
+ */
+#define DRM_MODE_FLAG_PHSYNC			(1<<0)
+#define DRM_MODE_FLAG_NHSYNC			(1<<1)
+#define DRM_MODE_FLAG_PVSYNC			(1<<2)
+#define DRM_MODE_FLAG_NVSYNC			(1<<3)
+#define DRM_MODE_FLAG_INTERLACE			(1<<4)
+#define DRM_MODE_FLAG_DBLSCAN			(1<<5)
+#define DRM_MODE_FLAG_CSYNC			(1<<6)
+#define DRM_MODE_FLAG_PCSYNC			(1<<7)
+#define DRM_MODE_FLAG_NCSYNC			(1<<8)
+#define DRM_MODE_FLAG_HSKEW			(1<<9) /* hskew provided */
+#define DRM_MODE_FLAG_BCAST			(1<<10) /* deprecated */
+#define DRM_MODE_FLAG_PIXMUX			(1<<11) /* deprecated */
+#define DRM_MODE_FLAG_DBLCLK			(1<<12)
+#define DRM_MODE_FLAG_CLKDIV2			(1<<13)
+ /*
+  * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX
+  * (define not exposed to user space).
+  */
+#define DRM_MODE_FLAG_3D_MASK			(0x1f<<14)
+#define  DRM_MODE_FLAG_3D_NONE		(0<<14)
+#define  DRM_MODE_FLAG_3D_FRAME_PACKING		(1<<14)
+#define  DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE	(2<<14)
+#define  DRM_MODE_FLAG_3D_LINE_ALTERNATIVE	(3<<14)
+#define  DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL	(4<<14)
+#define  DRM_MODE_FLAG_3D_L_DEPTH		(5<<14)
+#define  DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH	(6<<14)
+#define  DRM_MODE_FLAG_3D_TOP_AND_BOTTOM	(7<<14)
+#define  DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF	(8<<14)
+
+/* Picture aspect ratio options */
+#define DRM_MODE_PICTURE_ASPECT_NONE		0
+#define DRM_MODE_PICTURE_ASPECT_4_3		1
+#define DRM_MODE_PICTURE_ASPECT_16_9		2
+#define DRM_MODE_PICTURE_ASPECT_64_27		3
+#define DRM_MODE_PICTURE_ASPECT_256_135		4
+
+/* Content type options */
+#define DRM_MODE_CONTENT_TYPE_NO_DATA		0
+#define DRM_MODE_CONTENT_TYPE_GRAPHICS		1
+#define DRM_MODE_CONTENT_TYPE_PHOTO		2
+#define DRM_MODE_CONTENT_TYPE_CINEMA		3
+#define DRM_MODE_CONTENT_TYPE_GAME		4
+
+/* Aspect ratio flag bitmask (4 bits 22:19) */
+#define DRM_MODE_FLAG_PIC_AR_MASK		(0x0F<<19)
+#define  DRM_MODE_FLAG_PIC_AR_NONE \
+			(DRM_MODE_PICTURE_ASPECT_NONE<<19)
+#define  DRM_MODE_FLAG_PIC_AR_4_3 \
+			(DRM_MODE_PICTURE_ASPECT_4_3<<19)
+#define  DRM_MODE_FLAG_PIC_AR_16_9 \
+			(DRM_MODE_PICTURE_ASPECT_16_9<<19)
+#define  DRM_MODE_FLAG_PIC_AR_64_27 \
+			(DRM_MODE_PICTURE_ASPECT_64_27<<19)
+#define  DRM_MODE_FLAG_PIC_AR_256_135 \
+			(DRM_MODE_PICTURE_ASPECT_256_135<<19)
+
+#define  DRM_MODE_FLAG_ALL	(DRM_MODE_FLAG_PHSYNC |		\
+				 DRM_MODE_FLAG_NHSYNC |		\
+				 DRM_MODE_FLAG_PVSYNC |		\
+				 DRM_MODE_FLAG_NVSYNC |		\
+				 DRM_MODE_FLAG_INTERLACE |	\
+				 DRM_MODE_FLAG_DBLSCAN |	\
+				 DRM_MODE_FLAG_CSYNC |		\
+				 DRM_MODE_FLAG_PCSYNC |		\
+				 DRM_MODE_FLAG_NCSYNC |		\
+				 DRM_MODE_FLAG_HSKEW |		\
+				 DRM_MODE_FLAG_DBLCLK |		\
+				 DRM_MODE_FLAG_CLKDIV2 |	\
+				 DRM_MODE_FLAG_3D_MASK)
+
+/* DPMS flags */
+/* bit compatible with the xorg definitions. */
+#define DRM_MODE_DPMS_ON	0
+#define DRM_MODE_DPMS_STANDBY	1
+#define DRM_MODE_DPMS_SUSPEND	2
+#define DRM_MODE_DPMS_OFF	3
+
+/* Scaling mode options */
+#define DRM_MODE_SCALE_NONE		0 /* Unmodified timing (display or
+					     software can still scale) */
+#define DRM_MODE_SCALE_FULLSCREEN	1 /* Full screen, ignore aspect */
+#define DRM_MODE_SCALE_CENTER		2 /* Centered, no scaling */
+#define DRM_MODE_SCALE_ASPECT		3 /* Full screen, preserve aspect */
+
+/* Dithering mode options */
+#define DRM_MODE_DITHERING_OFF	0
+#define DRM_MODE_DITHERING_ON	1
+#define DRM_MODE_DITHERING_AUTO 2
+
+/* Dirty info options */
+#define DRM_MODE_DIRTY_OFF      0
+#define DRM_MODE_DIRTY_ON       1
+#define DRM_MODE_DIRTY_ANNOTATE 2
+
+/* Link Status options */
+#define DRM_MODE_LINK_STATUS_GOOD	0
+#define DRM_MODE_LINK_STATUS_BAD	1
+
+/*
+ * DRM_MODE_ROTATE_<degrees>
+ *
+ * Signals that a drm plane has been rotated <degrees> degrees in counter
+ * clockwise direction.
+ *
+ * This define is provided as a convenience, looking up the property id
+ * using the name->prop id lookup is the preferred method.
+ */
+#define DRM_MODE_ROTATE_0       (1<<0)
+#define DRM_MODE_ROTATE_90      (1<<1)
+#define DRM_MODE_ROTATE_180     (1<<2)
+#define DRM_MODE_ROTATE_270     (1<<3)
+
+/*
+ * DRM_MODE_ROTATE_MASK
+ *
+ * Bitmask used to look for drm plane rotations.
+ */
+#define DRM_MODE_ROTATE_MASK (\
+		DRM_MODE_ROTATE_0  | \
+		DRM_MODE_ROTATE_90  | \
+		DRM_MODE_ROTATE_180 | \
+		DRM_MODE_ROTATE_270)
+
+/*
+ * DRM_MODE_REFLECT_<axis>
+ *
+ * Signals that the contents of a drm plane are reflected along the <axis> axis,
+ * in the same way as mirroring.
+ * See kerneldoc chapter "Plane Composition Properties" for more details.
+ *
+ * This define is provided as a convenience, looking up the property id
+ * using the name->prop id lookup is the preferred method.
+ */
+#define DRM_MODE_REFLECT_X      (1<<4)
+#define DRM_MODE_REFLECT_Y      (1<<5)
+
+/*
+ * DRM_MODE_REFLECT_MASK
+ *
+ * Bitmask used to look for drm plane reflections.
+ */
+#define DRM_MODE_REFLECT_MASK (\
+		DRM_MODE_REFLECT_X | \
+		DRM_MODE_REFLECT_Y)
+
+/* Content Protection Flags */
+#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED	0
+#define DRM_MODE_CONTENT_PROTECTION_DESIRED     1
+#define DRM_MODE_CONTENT_PROTECTION_ENABLED     2
+
+/**
+ * struct drm_mode_modeinfo - Display mode information.
+ * @clock: pixel clock in kHz
+ * @hdisplay: horizontal display size
+ * @hsync_start: horizontal sync start
+ * @hsync_end: horizontal sync end
+ * @htotal: horizontal total size
+ * @hskew: horizontal skew
+ * @vdisplay: vertical display size
+ * @vsync_start: vertical sync start
+ * @vsync_end: vertical sync end
+ * @vtotal: vertical total size
+ * @vscan: vertical scan
+ * @vrefresh: approximate vertical refresh rate in Hz
+ * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines
+ * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines
+ * @name: string describing the mode resolution
+ *
+ * This is the user-space API display mode information structure. For the
+ * kernel version see struct drm_display_mode.
+ */
+struct drm_mode_modeinfo {
+	__u32 clock; /* pixel clock in kHz */
+	__u16 hdisplay; /* horizontal display size */
+	__u16 hsync_start; /* horizontal sync start */
+	__u16 hsync_end; /* horizontal sync end */
+	__u16 htotal; /* horizontal total size */
+	__u16 hskew; /* horizontal skew */
+	__u16 vdisplay; /* vertical display size */
+	__u16 vsync_start; /* vertical sync start */
+	__u16 vsync_end; /* vertical sync end */
+	__u16 vtotal; /* vertical total size */
+	__u16 vscan; /* vertical scan */
+
+	__u32 vrefresh; /* approximate vertical refresh rate in Hz */
+
+	__u32 flags; /* bitmask of DRM_MODE_FLAG_* */
+	__u32 type; /* bitmask of DRM_MODE_TYPE_* */
+	char name[DRM_DISPLAY_MODE_LEN]; /* string describing the mode resolution */
+};
+
+struct drm_mode_card_res { /* argument of DRM_IOCTL_MODE_GETRESOURCES */
+	__u64 fb_id_ptr; /* NOTE(review): presumably a user pointer to an array of framebuffer IDs -- confirm */
+	__u64 crtc_id_ptr; /* NOTE(review): presumably a user pointer to an array of CRTC IDs -- confirm */
+	__u64 connector_id_ptr; /* NOTE(review): presumably a user pointer to an array of connector IDs -- confirm */
+	__u64 encoder_id_ptr; /* NOTE(review): presumably a user pointer to an array of encoder IDs -- confirm */
+	__u32 count_fbs; /* NOTE(review): presumably the element count for fb_id_ptr -- confirm */
+	__u32 count_crtcs; /* NOTE(review): presumably the element count for crtc_id_ptr -- confirm */
+	__u32 count_connectors; /* NOTE(review): presumably the element count for connector_id_ptr -- confirm */
+	__u32 count_encoders; /* NOTE(review): presumably the element count for encoder_id_ptr -- confirm */
+	__u32 min_width; /* NOTE(review): the min/max fields presumably bound framebuffer dimensions -- confirm */
+	__u32 max_width;
+	__u32 min_height;
+	__u32 max_height;
+};
+
+struct drm_mode_crtc { /* argument of DRM_IOCTL_MODE_GETCRTC and DRM_IOCTL_MODE_SETCRTC */
+	__u64 set_connectors_ptr; /* NOTE(review): presumably a user pointer to an array of connector IDs -- confirm */
+	__u32 count_connectors; /* NOTE(review): presumably the element count for set_connectors_ptr -- confirm */
+
+	__u32 crtc_id; /**< Id */
+	__u32 fb_id; /**< Id of framebuffer */
+
+	__u32 x; /**< x Position on the framebuffer */
+	__u32 y; /**< y Position on the framebuffer */
+
+	__u32 gamma_size; /* NOTE(review): presumably the gamma LUT size -- confirm */
+	__u32 mode_valid; /* NOTE(review): presumably non-zero when @mode holds a valid mode -- confirm */
+	struct drm_mode_modeinfo mode; /* display mode, see struct drm_mode_modeinfo */
+};
+
+#define DRM_MODE_PRESENT_TOP_FIELD	(1<<0)
+#define DRM_MODE_PRESENT_BOTTOM_FIELD	(1<<1)
+
+/* Planes blend with or override other bits on the CRTC; argument of DRM_IOCTL_MODE_SETPLANE */
+struct drm_mode_set_plane {
+	__u32 plane_id;
+	__u32 crtc_id;
+	__u32 fb_id; /* fb object contains surface format type */
+	__u32 flags; /* see the DRM_MODE_PRESENT_* flags defined just above this struct */
+
+	/* Signed dest location allows it to be partially off screen */
+	__s32 crtc_x;
+	__s32 crtc_y;
+	__u32 crtc_w;
+	__u32 crtc_h;
+
+	/* Source values are 16.16 fixed point */
+	__u32 src_x;
+	__u32 src_y;
+	__u32 src_h; /* note: height is declared before width here */
+	__u32 src_w;
+};
+
+/**
+ * struct drm_mode_get_plane - Get plane metadata.
+ *
+ * Userspace can perform a GETPLANE ioctl to retrieve information about a
+ * plane.
+ *
+ * To retrieve the number of formats supported, set @count_format_types to zero
+ * and call the ioctl. @count_format_types will be updated with the value.
+ *
+ * To retrieve these formats, allocate an array with the memory needed to store
+ * @count_format_types formats. Point @format_type_ptr to this array and call
+ * the ioctl again (with @count_format_types still set to the value returned in
+ * the first ioctl call).
+ */
+struct drm_mode_get_plane {
+	/**
+	 * @plane_id: Object ID of the plane whose information should be
+	 * retrieved. Set by caller.
+	 */
+	__u32 plane_id;
+
+	/** @crtc_id: Object ID of the current CRTC. */
+	__u32 crtc_id;
+	/** @fb_id: Object ID of the current fb. */
+	__u32 fb_id;
+
+	/**
+	 * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's
+	 * are created and they receive an index, which corresponds to their
+	 * position in the bitmask. Bit N corresponds to
+	 * :ref:`CRTC index<crtc_index>` N.
+	 */
+	__u32 possible_crtcs;
+	/** @gamma_size: Never used. */
+	__u32 gamma_size;
+
+	/** @count_format_types: Number of formats. */
+	__u32 count_format_types;
+	/**
+	 * @format_type_ptr: Pointer to ``__u32`` array of formats that are
+	 * supported by the plane. These formats do not require modifiers.
+	 */
+	__u64 format_type_ptr;
+};
+
+struct drm_mode_get_plane_res { /* argument of DRM_IOCTL_MODE_GETPLANERESOURCES */
+	__u64 plane_id_ptr; /* NOTE(review): presumably a user pointer to an array of plane IDs -- confirm */
+	__u32 count_planes; /* NOTE(review): presumably the element count for plane_id_ptr -- confirm */
+};
+
+#define DRM_MODE_ENCODER_NONE	0
+#define DRM_MODE_ENCODER_DAC	1
+#define DRM_MODE_ENCODER_TMDS	2
+#define DRM_MODE_ENCODER_LVDS	3
+#define DRM_MODE_ENCODER_TVDAC	4
+#define DRM_MODE_ENCODER_VIRTUAL 5
+#define DRM_MODE_ENCODER_DSI	6
+#define DRM_MODE_ENCODER_DPMST	7
+#define DRM_MODE_ENCODER_DPI	8
+
+struct drm_mode_get_encoder { /* argument of DRM_IOCTL_MODE_GETENCODER */
+	__u32 encoder_id;
+	__u32 encoder_type; /* NOTE(review): presumably one of the DRM_MODE_ENCODER_* values -- confirm */
+
+	__u32 crtc_id; /**< Id of crtc */
+
+	__u32 possible_crtcs; /* NOTE(review): presumably a bitmask indexed by CRTC index -- confirm */
+	__u32 possible_clones; /* NOTE(review): presumably a bitmask of encoders that can be cloned -- confirm */
+};
+
+/* This is for connectors with multiple signal types. */
+/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. */
+enum drm_mode_subconnector { /* Automatic and Unknown are aliases: both are 0 */
+	DRM_MODE_SUBCONNECTOR_Automatic   = 0,  /* DVI-I, TV     */
+	DRM_MODE_SUBCONNECTOR_Unknown     = 0,  /* DVI-I, TV, DP */
+	DRM_MODE_SUBCONNECTOR_VGA	  = 1,  /*            DP */
+	DRM_MODE_SUBCONNECTOR_DVID	  = 3,  /* DVI-I      DP */
+	DRM_MODE_SUBCONNECTOR_DVIA	  = 4,  /* DVI-I         */
+	DRM_MODE_SUBCONNECTOR_Composite   = 5,  /*        TV     */
+	DRM_MODE_SUBCONNECTOR_SVIDEO	  = 6,  /*        TV     */
+	DRM_MODE_SUBCONNECTOR_Component   = 8,  /*        TV     */
+	DRM_MODE_SUBCONNECTOR_SCART	  = 9,  /*        TV     */
+	DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /*            DP */
+	DRM_MODE_SUBCONNECTOR_HDMIA       = 11, /*            DP */
+	DRM_MODE_SUBCONNECTOR_Native      = 15, /*            DP */
+	DRM_MODE_SUBCONNECTOR_Wireless    = 18, /*            DP */
+};
+
+#define DRM_MODE_CONNECTOR_Unknown	0
+#define DRM_MODE_CONNECTOR_VGA		1
+#define DRM_MODE_CONNECTOR_DVII		2
+#define DRM_MODE_CONNECTOR_DVID		3
+#define DRM_MODE_CONNECTOR_DVIA		4
+#define DRM_MODE_CONNECTOR_Composite	5
+#define DRM_MODE_CONNECTOR_SVIDEO	6
+#define DRM_MODE_CONNECTOR_LVDS		7
+#define DRM_MODE_CONNECTOR_Component	8
+#define DRM_MODE_CONNECTOR_9PinDIN	9
+#define DRM_MODE_CONNECTOR_DisplayPort	10
+#define DRM_MODE_CONNECTOR_HDMIA	11
+#define DRM_MODE_CONNECTOR_HDMIB	12
+#define DRM_MODE_CONNECTOR_TV		13
+#define DRM_MODE_CONNECTOR_eDP		14
+#define DRM_MODE_CONNECTOR_VIRTUAL      15
+#define DRM_MODE_CONNECTOR_DSI		16
+#define DRM_MODE_CONNECTOR_DPI		17
+#define DRM_MODE_CONNECTOR_WRITEBACK	18
+#define DRM_MODE_CONNECTOR_SPI		19
+#define DRM_MODE_CONNECTOR_USB		20
+
+/**
+ * struct drm_mode_get_connector - Get connector metadata.
+ *
+ * User-space can perform a GETCONNECTOR ioctl to retrieve information about a
+ * connector. User-space is expected to retrieve encoders, modes and properties
+ * by performing this ioctl at least twice: the first time to retrieve the
+ * number of elements, the second time to retrieve the elements themselves.
+ *
+ * To retrieve the number of elements, set @count_props and @count_encoders to
+ * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct
+ * drm_mode_modeinfo element.
+ *
+ * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr,
+ * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and
+ * @count_encoders to their capacity.
+ *
+ * Performing the ioctl only twice may be racy: the number of elements may have
+ * changed with a hotplug event in-between the two ioctls. User-space is
+ * expected to retry the last ioctl until the number of elements stabilizes.
+ * The kernel won't fill any array which doesn't have the expected length.
+ *
+ * **Force-probing a connector**
+ *
+ * If the @count_modes field is set to zero and the DRM client is the current
+ * DRM master, the kernel will perform a forced probe on the connector to
+ * refresh the connector status, modes and EDID. A forced-probe can be slow,
+ * might cause flickering and the ioctl will block.
+ *
+ * User-space needs to force-probe connectors to ensure their metadata is
+ * up-to-date at startup and after receiving a hot-plug event. User-space
+ * may perform a forced-probe when the user explicitly requests it. User-space
+ * shouldn't perform a forced-probe in other situations.
+ */
+struct drm_mode_get_connector {
+	/** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */
+	__u64 encoders_ptr;
+	/** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */
+	__u64 modes_ptr;
+	/** @props_ptr: Pointer to ``__u32`` array of property IDs. */
+	__u64 props_ptr;
+	/** @prop_values_ptr: Pointer to ``__u64`` array of property values. */
+	__u64 prop_values_ptr;
+
+	/** @count_modes: Number of modes. */
+	__u32 count_modes;
+	/** @count_props: Number of properties. */
+	__u32 count_props;
+	/** @count_encoders: Number of encoders. */
+	__u32 count_encoders;
+
+	/** @encoder_id: Object ID of the current encoder. */
+	__u32 encoder_id;
+	/** @connector_id: Object ID of the connector. */
+	__u32 connector_id;
+	/**
+	 * @connector_type: Type of the connector.
+	 *
+	 * See DRM_MODE_CONNECTOR_* defines.
+	 */
+	__u32 connector_type;
+	/**
+	 * @connector_type_id: Type-specific connector number.
+	 *
+	 * This is not an object ID. This is a per-type connector number. Each
+	 * (type, type_id) combination is unique across all connectors of a DRM
+	 * device.
+	 *
+	 * The (type, type_id) combination is not a stable identifier: the
+	 * type_id can change depending on the driver probe order.
+	 */
+	__u32 connector_type_id;
+
+	/**
+	 * @connection: Status of the connector.
+	 *
+	 * See enum drm_connector_status.
+	 */
+	__u32 connection;
+	/** @mm_width: Width of the connected sink in millimeters. */
+	__u32 mm_width;
+	/** @mm_height: Height of the connected sink in millimeters. */
+	__u32 mm_height;
+	/**
+	 * @subpixel: Subpixel order of the connected sink.
+	 *
+	 * See enum subpixel_order.
+	 */
+	__u32 subpixel;
+
+	/** @pad: Padding, must be zero. */
+	__u32 pad;
+};
+
+#define DRM_MODE_PROP_PENDING	(1<<0) /* deprecated, do not use */
+#define DRM_MODE_PROP_RANGE	(1<<1)
+#define DRM_MODE_PROP_IMMUTABLE	(1<<2)
+#define DRM_MODE_PROP_ENUM	(1<<3) /* enumerated type with text strings */
+#define DRM_MODE_PROP_BLOB	(1<<4)
+#define DRM_MODE_PROP_BITMASK	(1<<5) /* bitmask of enumerated types */
+
+/* non-extended types: legacy bitmask, one bit per type: */
+#define DRM_MODE_PROP_LEGACY_TYPE  ( \
+		DRM_MODE_PROP_RANGE | \
+		DRM_MODE_PROP_ENUM | \
+		DRM_MODE_PROP_BLOB | \
+		DRM_MODE_PROP_BITMASK)
+
+/* extended-types: rather than continue to consume a bit per type,
+ * grab a chunk of the bits to use as integer type id.
+ */
+#define DRM_MODE_PROP_EXTENDED_TYPE	0x0000ffc0
+#define DRM_MODE_PROP_TYPE(n)		((n) << 6)
+#define DRM_MODE_PROP_OBJECT		DRM_MODE_PROP_TYPE(1)
+#define DRM_MODE_PROP_SIGNED_RANGE	DRM_MODE_PROP_TYPE(2)
+
+/* the PROP_ATOMIC flag is used to hide properties from userspace that
+ * is not aware of atomic properties.  This is mostly to work around
+ * older userspace (DDX drivers) that read/write each prop they find,
+ * without being aware that this could be triggering a lengthy modeset.
+ */
+#define DRM_MODE_PROP_ATOMIC        0x80000000
+
+/**
+ * struct drm_mode_property_enum - Description for an enum/bitfield entry.
+ * @value: numeric value for this enum entry.
+ * @name: symbolic name for this enum entry.
+ *
+ * See struct drm_property_enum for details.
+ */
+struct drm_mode_property_enum {
+	__u64 value;
+	char name[DRM_PROP_NAME_LEN];
+};
+
+/**
+ * struct drm_mode_get_property - Get property metadata.
+ *
+ * User-space can perform a GETPROPERTY ioctl to retrieve information about a
+ * property. The same property may be attached to multiple objects, see
+ * "Modeset Base Object Abstraction".
+ *
+ * The meaning of the @values_ptr field changes depending on the property type.
+ * See &drm_property.flags for more details.
+ *
+ * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the
+ * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For
+ * backwards compatibility, the kernel will always set @count_enum_blobs to
+ * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must
+ * ignore these two fields if the property has a different type.
+ *
+ * User-space is expected to retrieve values and enums by performing this ioctl
+ * at least twice: the first time to retrieve the number of elements, the
+ * second time to retrieve the elements themselves.
+ *
+ * To retrieve the number of elements, set @count_values and @count_enum_blobs
+ * to zero, then call the ioctl. @count_values will be updated with the number
+ * of elements. If the property has the type &DRM_MODE_PROP_ENUM or
+ * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well.
+ *
+ * To retrieve the elements themselves, allocate an array for @values_ptr and
+ * set @count_values to its capacity. If the property has the type
+ * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for
+ * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl
+ * again will fill the arrays.
+ */
+struct drm_mode_get_property {
+	/** @values_ptr: Pointer to a ``__u64`` array. */
+	__u64 values_ptr;
+	/** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */
+	__u64 enum_blob_ptr;
+
+	/**
+	 * @prop_id: Object ID of the property which should be retrieved. Set
+	 * by the caller.
+	 */
+	__u32 prop_id;
+	/**
+	 * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for
+	 * a definition of the flags.
+	 */
+	__u32 flags;
+	/**
+	 * @name: Symbolic property name. User-space should use this field to
+	 * recognize properties.
+	 */
+	char name[DRM_PROP_NAME_LEN];
+
+	/** @count_values: Number of elements in @values_ptr. */
+	__u32 count_values;
+	/** @count_enum_blobs: Number of elements in @enum_blob_ptr. */
+	__u32 count_enum_blobs;
+};
+
+struct drm_mode_connector_set_property { /* sets one property on one connector (judging by the name) */
+	__u64 value; /* value to assign; presumably interpreted per the property type -- confirm */
+	__u32 prop_id; /* presumably the property's object ID, as in struct drm_mode_get_property */
+	__u32 connector_id; /* presumably the connector's object ID, as in struct drm_mode_get_connector */
+};
+
+#define DRM_MODE_OBJECT_CRTC 0xcccccccc
+#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0
+#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0
+#define DRM_MODE_OBJECT_MODE 0xdededede
+#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0
+#define DRM_MODE_OBJECT_FB 0xfbfbfbfb
+#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb
+#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee
+#define DRM_MODE_OBJECT_ANY 0
+
+struct drm_mode_obj_get_properties { /* property query for a mode object identified by obj_id/obj_type */
+	__u64 props_ptr; /* presumably a pointer to a __u32 array of property IDs, as in struct drm_mode_get_connector */
+	__u64 prop_values_ptr; /* presumably a pointer to a __u64 array of property values */
+	__u32 count_props; /* number of properties; TODO confirm in/out semantics */
+	__u32 obj_id; /* ID of the object being queried */
+	__u32 obj_type; /* presumably one of the DRM_MODE_OBJECT_* values -- confirm */
+};
+
+struct drm_mode_obj_set_property { /* same layout as drm_mode_connector_set_property plus an explicit obj_type */
+	__u64 value; /* value to assign; presumably interpreted per the property type -- confirm */
+	__u32 prop_id; /* presumably the property's object ID */
+	__u32 obj_id; /* ID of the object whose property is set */
+	__u32 obj_type; /* presumably one of the DRM_MODE_OBJECT_* values -- confirm */
+};
+
+struct drm_mode_get_blob { /* blob property query (judging by the name) */
+	__u32 blob_id; /* presumably the blob's object ID */
+	__u32 length; /* presumably the size of the blob data in bytes; TODO confirm in/out semantics */
+	__u64 data; /* presumably a user-space pointer carried as __u64, like the *_ptr fields elsewhere */
+};
+
+struct drm_mode_fb_cmd {
+	__u32 fb_id;
+	__u32 width;
+	__u32 height;
+	__u32 pitch;
+	__u32 bpp;
+	__u32 depth;
+	/* driver specific handle */
+	__u32 handle;
+};
+
+#define DRM_MODE_FB_INTERLACED	(1<<0) /* for interlaced framebuffers */
+#define DRM_MODE_FB_MODIFIERS	(1<<1) /* enables ->modifier[] */
+
+/**
+ * struct drm_mode_fb_cmd2 - Frame-buffer metadata.
+ *
+ * This struct holds frame-buffer metadata. There are two ways to use it:
+ *
+ * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2
+ *   ioctl to register a new frame-buffer. The new frame-buffer object ID will
+ *   be set by the kernel in @fb_id.
+ * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to
+ *   fetch metadata about an existing frame-buffer.
+ *
+ * In case of planar formats, this struct allows up to 4 buffer objects with
+ * offsets and pitches per plane. The pitch and offset order are dictated by
+ * the format FourCC as defined by ``drm_fourcc.h``, e.g. NV12 is described as:
+ *
+ *     YUV 4:2:0 image with a plane of 8-bit Y samples followed by an
+ *     interleaved U/V plane containing 8-bit 2x2 subsampled colour difference
+ *     samples.
+ *
+ * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at
+ * ``offsets[1]``.
+ *
+ * To accommodate tiled, compressed, etc formats, a modifier can be specified.
+ * For more information see the "Format Modifiers" section. Note that even
+ * though it looks like we have a modifier per-plane, we in fact do not. The
+ * modifier for each plane must be identical. Thus all combinations of
+ * different data layouts for multi-plane formats must be enumerated as
+ * separate modifiers.
+ *
+ * All of the entries in @handles, @pitches, @offsets and @modifier must be
+ * zero when unused. Warning, for @offsets and @modifier zero can't be used to
+ * figure out whether the entry is used or not since it's a valid value (a zero
+ * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR).
+ */
+struct drm_mode_fb_cmd2 {
+	/** @fb_id: Object ID of the frame-buffer. */
+	__u32 fb_id;
+	/** @width: Width of the frame-buffer. */
+	__u32 width;
+	/** @height: Height of the frame-buffer. */
+	__u32 height;
+	/**
+	 * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in
+	 * ``drm_fourcc.h``.
+	 */
+	__u32 pixel_format;
+	/**
+	 * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and
+	 * &DRM_MODE_FB_MODIFIERS).
+	 */
+	__u32 flags;
+
+	/**
+	 * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is
+	 * unused. The same handle can be used for multiple planes.
+	 */
+	__u32 handles[4];
+	/** @pitches: Pitch (aka. stride) in bytes, one per plane. */
+	__u32 pitches[4];
+	/** @offsets: Offset into the buffer in bytes, one per plane. */
+	__u32 offsets[4];
+	/**
+	 * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*``
+	 * constants in ``drm_fourcc.h``. All planes must use the same
+	 * modifier. Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags.
+	 */
+	__u64 modifier[4];
+};
+
+#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01
+#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02
+#define DRM_MODE_FB_DIRTY_FLAGS         0x03
+
+#define DRM_MODE_FB_DIRTY_MAX_CLIPS     256
+
+/*
+ * Mark a region of a framebuffer as dirty.
+ *
+ * Some hardware does not automatically update display contents
+ * as a hardware or software draw to a framebuffer. This ioctl
+ * allows userspace to tell the kernel and the hardware what
+ * regions of the framebuffer have changed.
+ *
+ * The kernel or hardware is free to update more than just the
+ * region specified by the clip rects. The kernel or hardware
+ * may also delay and/or coalesce several calls to dirty into a
+ * single update.
+ *
+ * Userspace may annotate the updates; the annotations are a
+ * promise made by the caller that the change is either a copy
+ * of pixels or a fill of a single color in the region specified.
+ *
+ * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then
+ * the number of updated regions is half of the num_clips given,
+ * where the clip rects are paired in src and dst. The width and
+ * height of each one of the pairs must match.
+ *
+ * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller
+ * promises that the region specified of the clip rects is filled
+ * completely with a single color as given in the color argument.
+ */
+
+struct drm_mode_fb_dirty_cmd {
+	__u32 fb_id;
+	__u32 flags;
+	__u32 color;
+	__u32 num_clips;
+	__u64 clips_ptr;
+};
+
+struct drm_mode_mode_cmd {
+	__u32 connector_id;
+	struct drm_mode_modeinfo mode;
+};
+
+#define DRM_MODE_CURSOR_BO	0x01
+#define DRM_MODE_CURSOR_MOVE	0x02
+#define DRM_MODE_CURSOR_FLAGS	0x03
+
+/*
+ * depending on the value in flags different members are used.
+ *
+ * CURSOR_BO uses
+ *    crtc_id
+ *    width
+ *    height
+ *    handle - if 0 turns the cursor off
+ *
+ * CURSOR_MOVE uses
+ *    crtc_id
+ *    x
+ *    y
+ */
+struct drm_mode_cursor {
+	__u32 flags;
+	__u32 crtc_id;
+	__s32 x;
+	__s32 y;
+	__u32 width;
+	__u32 height;
+	/* driver specific handle */
+	__u32 handle;
+};
+
+struct drm_mode_cursor2 { /* same fields as struct drm_mode_cursor, followed by hot_x/hot_y */
+	__u32 flags; /* presumably DRM_MODE_CURSOR_* bits, as for struct drm_mode_cursor -- confirm */
+	__u32 crtc_id;
+	__s32 x;
+	__s32 y;
+	__u32 width;
+	__u32 height;
+	/* driver specific handle */
+	__u32 handle;
+	__s32 hot_x; /* presumably the cursor hotspot X coordinate -- confirm */
+	__s32 hot_y; /* presumably the cursor hotspot Y coordinate -- confirm */
+};
+
+struct drm_mode_crtc_lut { /* per-CRTC lookup table with separate red/green/blue arrays */
+	__u32 crtc_id; /* presumably the object ID of the CRTC whose table is accessed */
+	__u32 gamma_size; /* presumably the number of entries in each array below -- confirm */
+
+	/* pointers to arrays */
+	__u64 red;
+	__u64 green;
+	__u64 blue;
+};
+
+struct drm_color_ctm {
+	/*
+	 * Conversion matrix in S31.32 sign-magnitude
+	 * (not two's complement!) format.
+	 *
+	 * out   matrix    in
+	 * |R|   |0 1 2|   |R|
+	 * |G| = |3 4 5| x |G|
+	 * |B|   |6 7 8|   |B|
+	 */
+	__u64 matrix[9];
+};
+
+struct drm_color_lut {
+	/*
+	 * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and
+	 * 0xffff == 1.0.
+	 */
+	__u16 red;
+	__u16 green;
+	__u16 blue;
+	__u16 reserved;
+};
+
+/**
+ * struct drm_plane_size_hint - Plane size hints
+ *
+ * The plane SIZE_HINTS property blob contains an
+ * array of struct drm_plane_size_hint.
+ */
+struct drm_plane_size_hint {
+	__u16 width;
+	__u16 height;
+};
+
+/**
+ * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data.
+ *
+ * HDR Metadata Infoframe as per CTA 861.G spec. This is expected
+ * to match exactly with the spec.
+ *
+ * Userspace is expected to pass the metadata information as per
+ * the format described in this structure.
+ */
+struct hdr_metadata_infoframe {
+	/**
+	 * @eotf: Electro-Optical Transfer Function (EOTF)
+	 * used in the stream.
+	 */
+	__u8 eotf;
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
+	__u8 metadata_type;
+	/**
+	 * @display_primaries: Color Primaries of the Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @display_primaries.x: X coordinate of color primary.
+	 * @display_primaries.y: Y coordinate of color primary.
+	 */
+	struct {
+		__u16 x, y;
+	} display_primaries[3];
+	/**
+	 * @white_point: White Point of Colorspace Data.
+	 * These are coded as unsigned 16-bit values in units of
+	 * 0.00002, where 0x0000 represents zero and 0xC350
+	 * represents 1.0000.
+	 * @white_point.x: X coordinate of whitepoint of color primary.
+	 * @white_point.y: Y coordinate of whitepoint of color primary.
+	 */
+	struct {
+		__u16 x, y;
+	} white_point;
+	/**
+	 * @max_display_mastering_luminance: Max Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
+	__u16 max_display_mastering_luminance;
+	/**
+	 * @min_display_mastering_luminance: Min Mastering Display Luminance.
+	 * This value is coded as an unsigned 16-bit value in units of
+	 * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF
+	 * represents 6.5535 cd/m2.
+	 */
+	__u16 min_display_mastering_luminance;
+	/**
+	 * @max_cll: Max Content Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
+	__u16 max_cll;
+	/**
+	 * @max_fall: Max Frame Average Light Level.
+	 * This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
+	 * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
+	 */
+	__u16 max_fall;
+};
+
+/**
+ * struct hdr_output_metadata - HDR output metadata
+ *
+ * Metadata Information to be passed from userspace
+ */
+struct hdr_output_metadata {
+	/**
+	 * @metadata_type: Static_Metadata_Descriptor_ID.
+	 */
+	__u32 metadata_type;
+	/**
+	 * @hdmi_metadata_type1: HDR Metadata Infoframe.
+	 */
+	union {
+		struct hdr_metadata_infoframe hdmi_metadata_type1;
+	};
+};
+
+/**
+ * DRM_MODE_PAGE_FLIP_EVENT
+ *
+ * Request that the kernel sends back a vblank event (see
+ * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the
+ * page-flip is done.
+ */
+#define DRM_MODE_PAGE_FLIP_EVENT 0x01
+/**
+ * DRM_MODE_PAGE_FLIP_ASYNC
+ *
+ * Request that the page-flip is performed as soon as possible, ie. with no
+ * delay due to waiting for vblank. This may cause tearing to be visible on
+ * the screen.
+ *
+ * When used with atomic uAPI, the driver will return an error if the hardware
+ * doesn't support performing an asynchronous page-flip for this update.
+ * User-space should handle this, e.g. by falling back to a regular page-flip.
+ *
+ * Note, some hardware might need to perform one last synchronous page-flip
+ * before being able to switch to asynchronous page-flips. As an exception,
+ * the driver will return success even though that first page-flip is not
+ * asynchronous.
+ */
+#define DRM_MODE_PAGE_FLIP_ASYNC 0x02
+#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4
+#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8
+#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \
+				   DRM_MODE_PAGE_FLIP_TARGET_RELATIVE)
+/**
+ * DRM_MODE_PAGE_FLIP_FLAGS
+ *
+ * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags.
+ */
+#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \
+				  DRM_MODE_PAGE_FLIP_ASYNC | \
+				  DRM_MODE_PAGE_FLIP_TARGET)
+
+/*
+ * Request a page flip on the specified crtc.
+ *
+ * This ioctl will ask KMS to schedule a page flip for the specified
+ * crtc.  Once any pending rendering targeting the specified fb (as of
+ * ioctl time) has completed, the crtc will be reprogrammed to display
+ * that fb after the next vertical refresh.  The ioctl returns
+ * immediately, but subsequent rendering to the current fb will block
+ * in the execbuffer ioctl until the page flip happens.  If a page
+ * flip is already pending as the ioctl is called, EBUSY will be
+ * returned.
+ *
+ * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank
+ * event (see drm.h: struct drm_event_vblank) when the page flip is
+ * done.  The user_data field passed in with this ioctl will be
+ * returned as the user_data field in the vblank event struct.
+ *
+ * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen
+ * 'as soon as possible', meaning that it will not delay waiting for vblank.
+ * This may cause tearing on the screen.
+ *
+ * The reserved field must be zero.
+ */
+
+struct drm_mode_crtc_page_flip {
+	__u32 crtc_id;
+	__u32 fb_id;
+	__u32 flags;
+	__u32 reserved;
+	__u64 user_data;
+};
+
+/*
+ * Request a page flip on the specified crtc.
+ *
+ * Same as struct drm_mode_crtc_page_flip, but supports new flags and
+ * re-purposes the reserved field:
+ *
+ * The sequence field must be zero unless either of the
+ * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When
+ * the ABSOLUTE flag is specified, the sequence field denotes the absolute
+ * vblank sequence when the flip should take effect. When the RELATIVE
+ * flag is specified, the sequence field denotes the relative (to the
+ * current one when the ioctl is called) vblank sequence when the flip
+ * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to
+ * make sure the vblank sequence before the target one has passed before
+ * calling this ioctl. The purpose of the
+ * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify
+ * the target for when code dealing with a page flip runs during a
+ * vertical blank period.
+ */
+
+struct drm_mode_crtc_page_flip_target {
+	__u32 crtc_id;
+	__u32 fb_id;
+	__u32 flags;
+	__u32 sequence;
+	__u64 user_data;
+};
+
+/**
+ * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout.
+ * @height: buffer height in pixels
+ * @width: buffer width in pixels
+ * @bpp: bits per pixel
+ * @flags: must be zero
+ * @handle: buffer object handle
+ * @pitch: number of bytes between two consecutive lines
+ * @size: size of the whole buffer in bytes
+ *
+ * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds,
+ * the kernel fills @handle, @pitch and @size.
+ */
+struct drm_mode_create_dumb {
+	__u32 height;
+	__u32 width;
+	__u32 bpp;
+	__u32 flags;
+
+	__u32 handle;
+	__u32 pitch;
+	__u64 size;
+};
+
+/* set up for mmap of a dumb scanout buffer */
+struct drm_mode_map_dumb {
+	/** @handle: Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad; /* padding; presumably must be zero like other pad fields -- confirm */
+	/**
+	 * @offset: Fake offset to use for the subsequent mmap call.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 offset;
+};
+
+struct drm_mode_destroy_dumb {
+	__u32 handle;
+};
+
+/**
+ * DRM_MODE_ATOMIC_TEST_ONLY
+ *
+ * Do not apply the atomic commit, instead check whether the hardware supports
+ * this configuration.
+ *
+ * See &drm_mode_config_funcs.atomic_check for more details on test-only
+ * commits.
+ */
+#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100
+/**
+ * DRM_MODE_ATOMIC_NONBLOCK
+ *
+ * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC
+ * IOCTL returns immediately instead of waiting for the changes to be applied
+ * in hardware. Note, the driver will still check that the update can be
+ * applied before returning.
+ */
+#define DRM_MODE_ATOMIC_NONBLOCK  0x0200
+/**
+ * DRM_MODE_ATOMIC_ALLOW_MODESET
+ *
+ * Allow the update to result in temporary or transient visible artifacts while
+ * the update is being applied. Applying the update may also take significantly
+ * more time than a page flip. All visual artifacts will disappear by the time
+ * the update is completed, as signalled through the vblank event's timestamp
+ * (see struct drm_event_vblank).
+ *
+ * This flag must be set when the KMS update might cause visible artifacts.
+ * Without this flag such a KMS update will return an EINVAL error. What kind of
+ * update may cause visible artifacts depends on the driver and the hardware.
+ * User-space that needs to know beforehand if an update might cause visible
+ * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without
+ * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails.
+ *
+ * To the best of the driver's knowledge, visual artifacts are guaranteed to
+ * not appear when this flag is not set. Some sinks might display visual
+ * artifacts outside of the driver's control.
+ */
+#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400
+
+/**
+ * DRM_MODE_ATOMIC_FLAGS
+ *
+ * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in
+ * &drm_mode_atomic.flags.
+ */
+#define DRM_MODE_ATOMIC_FLAGS (\
+		DRM_MODE_PAGE_FLIP_EVENT |\
+		DRM_MODE_PAGE_FLIP_ASYNC |\
+		DRM_MODE_ATOMIC_TEST_ONLY |\
+		DRM_MODE_ATOMIC_NONBLOCK |\
+		DRM_MODE_ATOMIC_ALLOW_MODESET)
+
+struct drm_mode_atomic { /* argument of the DRM_IOCTL_MODE_ATOMIC ioctl (see DRM_MODE_ATOMIC_FLAGS) */
+	__u32 flags; /* bitfield of the flags collected in DRM_MODE_ATOMIC_FLAGS */
+	__u32 count_objs; /* presumably the number of entries behind objs_ptr -- confirm */
+	__u64 objs_ptr; /* presumably a user-space pointer to an array of object IDs */
+	__u64 count_props_ptr; /* presumably an array holding the property count of each object */
+	__u64 props_ptr; /* presumably an array of property IDs */
+	__u64 prop_values_ptr; /* presumably an array of property values */
+	__u64 reserved; /* presumably must be zero -- confirm */
+	__u64 user_data; /* presumably handed back in the completion event -- confirm */
+};
+
+struct drm_format_modifier_blob {
+#define FORMAT_BLOB_CURRENT 1
+	/* Version of this blob format */
+	__u32 version;
+
+	/* Flags */
+	__u32 flags;
+
+	/* Number of fourcc formats supported */
+	__u32 count_formats;
+
+	/* Where in this blob the formats exist (in bytes) */
+	__u32 formats_offset;
+
+	/* Number of drm_format_modifiers */
+	__u32 count_modifiers;
+
+	/* Where in this blob the modifiers exist (in bytes) */
+	__u32 modifiers_offset;
+
+	/* __u32 formats[] */
+	/* struct drm_format_modifier modifiers[] */
+};
+
+struct drm_format_modifier {
+	/* Bitmask of formats in get_plane format list this info applies to. The
+	 * offset selects which sliding window of 64 formats (bits) is covered.
+	 *
+	 * Some examples:
+	 * In today's world with < 65 formats, where formats 0 and 2 are
+	 * supported:
+	 * 0x0000000000000005
+	 *		  ^-offset = 0, formats = 5
+	 *
+	 * If the number of formats grew to 128, and formats 98-102 are
+	 * supported with the modifier:
+	 *
+	 * 0x0000007c00000000 0000000000000000
+	 *		  ^
+	 *		  |__offset = 64, formats = 0x7c00000000
+	 *
+	 */
+	__u64 formats;
+	__u32 offset; /* index of the format that bit 0 of formats refers to (0 and 64 in the examples) */
+	__u32 pad; /* padding; presumably must be zero -- confirm */
+
+	/* The modifier that applies to the get_plane format list bitmask. */
+	__u64 modifier;
+};
+
+/**
+ * struct drm_mode_create_blob - Create New blob property
+ *
+ * Create a new 'blob' data property, copying length bytes from data pointer,
+ * and returning new blob ID.
+ */
+struct drm_mode_create_blob {
+	/** @data: Pointer to data to copy. */
+	__u64 data;
+	/** @length: Length of data to copy. */
+	__u32 length;
+	/** @blob_id: Return: new property ID. */
+	__u32 blob_id;
+};
+
+/**
+ * struct drm_mode_destroy_blob - Destroy user blob
+ * @blob_id: blob_id to destroy
+ *
+ * Destroy a user-created blob property.
+ *
+ * User-space can release blobs as soon as they do not need to refer to them by
+ * their blob object ID.  For instance, if you are using a MODE_ID blob in an
+ * atomic commit and you will not make another commit re-using the same ID, you
+ * can destroy the blob as soon as the commit has been issued, without waiting
+ * for it to complete.
+ */
+struct drm_mode_destroy_blob {
+	__u32 blob_id;
+};
+
+/**
+ * struct drm_mode_create_lease - Create lease
+ *
+ * Lease mode resources, creating another drm_master.
+ *
+ * The @object_ids array must reference at least one CRTC, one connector and
+ * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively,
+ * the lease can be completely empty.
+ */
+struct drm_mode_create_lease {
+	/** @object_ids: Pointer to array of object ids (__u32) */
+	__u64 object_ids;
+	/** @object_count: Number of object ids */
+	__u32 object_count;
+	/** @flags: flags for new FD (O_CLOEXEC, etc) */
+	__u32 flags;
+
+	/** @lessee_id: Return: unique identifier for lessee. */
+	__u32 lessee_id;
+	/** @fd: Return: file descriptor to new drm_master file */
+	__u32 fd;
+};
+
+/**
+ * struct drm_mode_list_lessees - List lessees
+ *
+ * List lessees from a drm_master.
+ */
+struct drm_mode_list_lessees {
+	/**
+	 * @count_lessees: Number of lessees.
+	 *
+	 * On input, provides length of the array.
+	 * On output, provides total number. No
+	 * more than the input number will be written
+	 * back, so two calls can be used to get
+	 * the size and then the data.
+	 */
+	__u32 count_lessees;
+	/** @pad: Padding. */
+	__u32 pad;
+
+	/**
+	 * @lessees_ptr: Pointer to lessees.
+	 *
+	 * Pointer to __u64 array of lessee ids
+	 */
+	__u64 lessees_ptr;
+};
+
+/**
+ * struct drm_mode_get_lease - Get Lease
+ *
+ * Get leased objects.
+ */
+struct drm_mode_get_lease {
+	/**
+	 * @count_objects: Number of leased objects.
+	 *
+	 * On input, provides length of the array.
+	 * On output, provides total number. No
+	 * more than the input number will be written
+	 * back, so two calls can be used to get
+	 * the size and then the data.
+	 */
+	__u32 count_objects;
+	/** @pad: Padding. */
+	__u32 pad;
+
+	/**
+	 * @objects_ptr: Pointer to objects.
+	 *
+	 * Pointer to __u32 array of object ids.
+	 */
+	__u64 objects_ptr;
+};
+
+/**
+ * struct drm_mode_revoke_lease - Revoke lease
+ */
+struct drm_mode_revoke_lease {
+	/** @lessee_id: Unique ID of lessee */
+	__u32 lessee_id;
+};
+
+/**
+ * struct drm_mode_rect - Two dimensional rectangle.
+ * @x1: Horizontal starting coordinate (inclusive).
+ * @y1: Vertical starting coordinate (inclusive).
+ * @x2: Horizontal ending coordinate (exclusive).
+ * @y2: Vertical ending coordinate (exclusive).
+ *
+ * The drm subsystem uses struct drm_rect to manage rectangular areas; this
+ * struct exports that representation to user-space.
+ *
+ * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS.
+ */
+struct drm_mode_rect {
+	__s32 x1;
+	__s32 y1;
+	__s32 x2;
+	__s32 y2;
+};
+
+/**
+ * struct drm_mode_closefb
+ * @fb_id: Framebuffer ID.
+ * @pad: Must be zero.
+ */
+struct drm_mode_closefb {
+	__u32 fb_id;
+	__u32 pad;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h
new file mode 100644
index 0000000000..1bc6e2233e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h
@@ -0,0 +1,983 @@
+/**
+ * \file xf86drm.h 
+ * OS-independent header for DRM user-level library interface.
+ *
+ * \author Rickard E. (Rik) Faith <faith@valinux.com>
+ */
+ 
+/*
+ * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _XF86DRM_H_
+#define _XF86DRM_H_
+
+#include <stdarg.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef DRM_MAX_MINOR
+#define DRM_MAX_MINOR   64 /* deprecated; only defined here if not already set */
+#endif
+
+#if defined(__linux__) /* Linux: DRM_IOC_* are aliases for the _IOC_* macros */
+
+#define DRM_IOCTL_NR(n)		_IOC_NR(n)
+#define DRM_IOC_VOID		_IOC_NONE
+#define DRM_IOC_READ		_IOC_READ
+#define DRM_IOC_WRITE		_IOC_WRITE
+#define DRM_IOC_READWRITE	(_IOC_READ|_IOC_WRITE) /* parenthesized so it is safe inside any expression */
+#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size)
+
+#else /* One of the *BSDs: use the IOC_* names from <sys/ioccom.h> */
+
+#include <sys/ioccom.h>
+#define DRM_IOCTL_NR(n)         ((n) & 0xff)
+#define DRM_IOC_VOID            IOC_VOID
+#define DRM_IOC_READ            IOC_OUT
+#define DRM_IOC_WRITE           IOC_IN
+#define DRM_IOC_READWRITE       IOC_INOUT
+#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size)
+
+#endif
+
+				/* Default device-node owner/group and modes, used if nothing is set in xf86config */
+#define DRM_DEV_UID	 0
+#define DRM_DEV_GID	 0
+/* Default /dev/dri directory permissions 0755. NOTE(review): the S_I* bits need <sys/stat.h>, which is not among the includes visible in this header */
+#define DRM_DEV_DIRMODE	 	\
+	(S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
+#define DRM_DEV_MODE	 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) /* 0660; redefined in the ARM DRM_CAS branch of this header */
+
+#ifdef __OpenBSD__ /* OpenBSD: directory is /dev and the node prefixes are drm* */
+#define DRM_DIR_NAME  "/dev"
+#define DRM_PRIMARY_MINOR_NAME  "drm"
+#define DRM_CONTROL_MINOR_NAME  "drmC" /* deprecated */
+#define DRM_RENDER_MINOR_NAME   "drmR"
+#else /* everything else: directory is /dev/dri */
+#define DRM_DIR_NAME  "/dev/dri"
+#define DRM_PRIMARY_MINOR_NAME  "card"
+#define DRM_CONTROL_MINOR_NAME  "controlD" /* deprecated */
+#define DRM_RENDER_MINOR_NAME   "renderD"
+#define DRM_PROC_NAME "/proc/dri/" /* For backward Linux compatibility; not defined in the OpenBSD branch */
+#endif
+
+#define DRM_DEV_NAME          "%s/" DRM_PRIMARY_MINOR_NAME "%d" /* printf format: a string, then an int */
+#define DRM_CONTROL_DEV_NAME  "%s/" DRM_CONTROL_MINOR_NAME "%d" /* deprecated */
+#define DRM_RENDER_DEV_NAME   "%s/" DRM_RENDER_MINOR_NAME  "%d"
+
+#define DRM_NODE_NAME_MAX \
+    (sizeof(DRM_DIR_NAME) + 1 /* slash */ \
+     + MAX3(sizeof(DRM_PRIMARY_MINOR_NAME), \
+            sizeof(DRM_CONTROL_MINOR_NAME), \
+            sizeof(DRM_RENDER_MINOR_NAME)) \
+     + sizeof("1048575") /* highest possible node number 2^MINORBITS - 1 */ \
+     + 1) /* NUL terminator. NOTE(review): MAX3 is not defined in the visible part of this header; users of this macro must supply it */
+
+#define DRM_ERR_NO_DEVICE  (-1001) /* DRM_ERR_*: negative codes in the range -1001..-1005 */
+#define DRM_ERR_NO_ACCESS  (-1002)
+#define DRM_ERR_NOT_ROOT   (-1003)
+#define DRM_ERR_INVALID    (-1004)
+#define DRM_ERR_NO_FD      (-1005)
+
+#define DRM_AGP_NO_HANDLE 0
+
+typedef unsigned int  drmSize,     *drmSizePtr;	    /**< Size type used for mapped regions */
+typedef void          *drmAddress, **drmAddressPtr; /**< Address type used for mapped regions */
+
+#if (__GNUC__ >= 3) /* GCC >= 3: attach the printf format attribute (f = format index, a = first vararg index) */
+#define DRM_PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+#else /* otherwise the annotation expands to nothing */
+#define DRM_PRINTFLIKE(f, a)
+#endif
+
+typedef struct _drmServerInfo { /* table of callbacks; the type accepted by drmSetServerInfo() */
+  int (*debug_print)(const char *format, va_list ap) DRM_PRINTFLIKE(1,0); /**< vprintf-style message hook */
+  int (*load_module)(const char *name); /**< hook taking a module name; NOTE(review): presumably loads a kernel module - confirm */
+  void (*get_perms)(gid_t *, mode_t *); /**< hook taking gid/mode pointers; presumably fills in device-node permissions - confirm */
+} drmServerInfo, *drmServerInfoPtr;
+
+typedef struct drmHashEntry { /* entry type returned by drmGetEntry(fd) */
+    int      fd;                           /* file descriptor this entry belongs to */
+    void     (*f)(int, void *, void *);    /* callback; NOTE(review): its role is not visible in this header */
+    void     *tagTable;                    /* opaque table pointer; presumably backs the drm*ContextTag() calls - confirm */
+} drmHashEntry;
+
+extern int drmIoctl(int fd, unsigned long request, void *arg); /* ioctl-shaped entry point: fd, request code, argument pointer */
+extern void *drmGetHashTable(void); /* returns an opaque pointer */
+extern drmHashEntry *drmGetEntry(int fd); /* looks up the drmHashEntry for fd */
+
+/**
+ * Driver version information.
+ *
+ * \sa drmGetVersion(), drmGetLibVersion() and drmFreeVersion().
+ */
+typedef struct _drmVersion {
+    int     version_major;        /**< Major version */
+    int     version_minor;        /**< Minor version */
+    int     version_patchlevel;   /**< Patch level */
+    int     name_len; 	          /**< Length of name buffer */
+    char    *name;	          /**< User-space buffer holding the driver name */
+    int     date_len;             /**< Length of date buffer */
+    char    *date;                /**< User-space buffer to hold date */
+    int     desc_len;	          /**< Length of desc buffer */
+    char    *desc;                /**< User-space buffer to hold desc */
+} drmVersion, *drmVersionPtr;
+
+typedef struct _drmStats {
+    unsigned long count;	     /**< Number of data entries (the array holds 15) */
+    struct {
+	unsigned long value;	     /**< Value from kernel */
+	const char    *long_format;  /**< Suggested format for long_name */
+	const char    *long_name;    /**< Long name for value */
+	const char    *rate_format;  /**< Suggested format for rate_name */
+	const char    *rate_name;    /**< Short name for value per second */
+	int           isvalue;       /**< True if value (vs. counter) */
+	const char    *mult_names;   /**< Multiplier names (e.g., "KGM") */
+	int           mult;          /**< Multiplier value (e.g., 1024) */
+	int           verbose;       /**< Suggest only in verbose output */
+    } data[15];
+} drmStatsT; /* filled in through drmGetStats(fd, &stats) */
+
+
+				/* All of these enums *MUST* match the
+                                   kernel implementation -- so do *NOT*
+                                   change them!  (The drmlib implementation
+                                   will just copy the flags instead of
+                                   translating them.) */
+typedef enum {
+    DRM_FRAME_BUFFER    = 0,      /**< WC, no caching, no core dump */
+    DRM_REGISTERS       = 1,      /**< no caching, no core dump */
+    DRM_SHM             = 2,      /**< shared, cached */
+    DRM_AGP             = 3,	  /**< AGP/GART */
+    DRM_SCATTER_GATHER  = 4,	  /**< PCI scatter/gather */
+    DRM_CONSISTENT      = 5	  /**< PCI consistent */
+} drmMapType; /* map kind: the `type` parameter of drmAddMap() and drmGetMap() */
+
+typedef enum { /* single-bit values; the `flags` parameter of drmAddMap() and drmGetMap() */
+    DRM_RESTRICTED      = 0x0001, /**< Cannot be mapped to client-virtual */
+    DRM_READ_ONLY       = 0x0002, /**< Read-only in client-virtual */
+    DRM_LOCKED          = 0x0004, /**< Physical pages locked */
+    DRM_KERNEL          = 0x0008, /**< Kernel requires access */
+    DRM_WRITE_COMBINING = 0x0010, /**< Use write-combining, if available */
+    DRM_CONTAINS_LOCK   = 0x0020, /**< SHM page that contains lock */
+    DRM_REMOVABLE	= 0x0040  /**< Removable mapping */
+} drmMapFlags;
+
+/**
+ * \warning These values *MUST* match drm.h
+ */
+typedef enum {
+    /** \name Flags for DMA buffer dispatch */
+    /*@{*/
+    DRM_DMA_BLOCK        = 0x01, /**<
+				  * Block until buffer dispatched.
+				  *
+				  * \note the buffer may not yet have been
+				  * processed by the hardware -- getting a
+				  * hardware lock with the hardware quiescent
+				  * will ensure that the buffer has been
+				  * processed.
+				  */
+    DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+    DRM_DMA_PRIORITY     = 0x04, /**< High priority dispatch */
+    /*@}*/
+
+    /** \name Flags for DMA buffer request */
+    /*@{*/
+    DRM_DMA_WAIT         = 0x10, /**< Wait for free buffers */
+    DRM_DMA_SMALLER_OK   = 0x20, /**< Smaller-than-requested buffers OK */
+    DRM_DMA_LARGER_OK    = 0x40  /**< Larger-than-requested buffers OK */
+    /*@}*/
+} drmDMAFlags; /* type of the `flags` member of drmDMAReq */
+
+typedef enum { /* single-bit values; the `flags` parameter of drmAddBufs() */
+    DRM_PAGE_ALIGN       = 0x01,
+    DRM_AGP_BUFFER       = 0x02,
+    DRM_SG_BUFFER        = 0x04,
+    DRM_FB_BUFFER        = 0x08,
+    DRM_PCI_BUFFER_RO    = 0x10
+} drmBufDescFlags;
+
+typedef enum { /* single-bit values; the `flags` parameter of drmGetLock() and drmFinish() */
+    DRM_LOCK_READY      = 0x01, /**< Wait until hardware is ready for DMA */
+    DRM_LOCK_QUIESCENT  = 0x02, /**< Wait until hardware quiescent */
+    DRM_LOCK_FLUSH      = 0x04, /**< Flush this context's DMA queue first */
+    DRM_LOCK_FLUSH_ALL  = 0x08, /**< Flush all DMA queues first */
+				/* These *HALT* flags aren't supported yet
+                                   -- they will be used to support the
+                                   full-screen DGA-like mode. */
+    DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+    DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+} drmLockFlags;
+
+typedef enum { /* context flags; the type taken by drmSetContextFlags() / drmGetContextFlags() */
+    DRM_CONTEXT_PRESERVED = 0x01, /**< This context is preserved and
+				     never swapped. */
+    DRM_CONTEXT_2DONLY    = 0x02  /**< This context is for 2D rendering only. */
+} drm_context_tFlags, *drm_context_tFlagsPtr;
+
+typedef struct _drmBufDesc { /* one buffer-size class; element type of drmBufInfo.list */
+    int              count;	  /**< Number of buffers of this size */
+    int              size;	  /**< Size in bytes */
+    int              low_mark;	  /**< Low water mark */
+    int              high_mark;	  /**< High water mark */
+} drmBufDesc, *drmBufDescPtr;
+
+typedef struct _drmBufInfo { /* the type returned (by pointer) from drmGetBufInfo() */
+    int              count;	  /**< Number of buffers described in list */
+    drmBufDescPtr    list;	  /**< List of buffer descriptions */
+} drmBufInfo, *drmBufInfoPtr;
+
+typedef struct _drmBuf { /* one buffer; element type of drmBufMap.list */
+    int              idx;	  /**< Index into the master buffer list */
+    int              total;	  /**< Buffer size */
+    int              used;	  /**< Amount of buffer in use (for DMA) */
+    drmAddress       address;	  /**< Address */
+} drmBuf, *drmBufPtr;
+
+/**
+ * Buffer mapping information.
+ *
+ * Returned by drmMapBufs() and passed to drmUnmapBufs() to describe the
+ * mapped buffers.
+ */
+typedef struct _drmBufMap {
+    int              count;	  /**< Number of buffers mapped */
+    drmBufPtr        list;	  /**< The mapped buffers */
+} drmBufMap, *drmBufMapPtr;
+
+typedef struct _drmLock {
+    volatile unsigned int lock;           /* lock word; the DRM_SPIN* macros read it as ->lock */
+    char                      padding[60];
+    /* The padding is big enough for most current (and future?) architectures
+       (NOTE(review): the sizes below are presumably cache-line sizes - confirm):
+       DEC Alpha:              32 bytes
+       Intel Merced:           ?
+       Intel P5/PPro/PII/PIII: 32 bytes
+       Intel StrongARM:        32 bytes
+       Intel i386/i486:        16 bytes
+       MIPS:                   32 bytes (?)
+       Motorola 68k:           16 bytes
+       Motorola PowerPC:       32 bytes
+       Sun SPARC:              32 bytes */
+} drmLock, *drmLockPtr;
+
+/**
+ * DMA request, the type passed to drmDMA(). Indices here refer to the
+ * offset into list in drmBufInfo
+ */
+typedef struct _drmDMAReq {
+    drm_context_t    context;  	  /**< Context handle */
+    int           send_count;     /**< Number of buffers to send */
+    int           *send_list;     /**< List of handles to buffers */
+    int           *send_sizes;    /**< Lengths of data to send, in bytes */
+    drmDMAFlags   flags;          /**< Flags */
+    int           request_count;  /**< Number of buffers requested */
+    int           request_size;	  /**< Desired size of buffers requested */
+    int           *request_list;  /**< Buffer information */
+    int           *request_sizes; /**< Minimum acceptable sizes */
+    int           granted_count;  /**< Number of buffers granted at this size */
+} drmDMAReq, *drmDMAReqPtr;
+
+typedef struct _drmRegion { /* describes one region: handle, offset, size and mapped address */
+    drm_handle_t     handle;
+    unsigned int  offset;
+    drmSize       size;
+    drmAddress    map;
+} drmRegion, *drmRegionPtr;
+
+typedef struct _drmTextureRegion { /* NOTE(review): next/prev look like byte-sized list links - confirm against users */
+    unsigned char next;
+    unsigned char prev;
+    unsigned char in_use;
+    unsigned char padding;	/**< Explicitly pads the four bytes before age */
+    unsigned int  age;
+} drmTextureRegion, *drmTextureRegionPtr;
+
+
+typedef enum {
+    DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+    DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+    /* bits 1-5 (mask 0x3e, see DRM_VBLANK_HIGH_CRTC_SHIFT) are reserved for high crtcs */
+    DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+    DRM_VBLANK_EVENT = 0x4000000,	/**< Send event instead of blocking */
+    DRM_VBLANK_FLIP = 0x8000000,	/**< Scheduled buffer swap should flip */
+    DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+    DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+    DRM_VBLANK_SIGNAL   = 0x40000000	/**< Send signal instead of blocking */
+} drmVBlankSeqType;
+#define DRM_VBLANK_HIGH_CRTC_SHIFT 1 /* bit position of the lowest bit of DRM_VBLANK_HIGH_CRTC_MASK */
+
+typedef struct _drmVBlankReq { /* request half of the drmVBlank union */
+	drmVBlankSeqType type;
+	unsigned int sequence;
+	unsigned long signal;
+} drmVBlankReq, *drmVBlankReqPtr;
+
+typedef struct _drmVBlankReply { /* reply half of the drmVBlank union */
+	drmVBlankSeqType type;
+	unsigned int sequence;
+	long tval_sec;   /* NOTE(review): presumably the seconds part of a timestamp - confirm */
+	long tval_usec;  /* NOTE(review): presumably the microseconds part - confirm */
+} drmVBlankReply, *drmVBlankReplyPtr;
+
+typedef union _drmVBlank { /* the argument type of drmWaitVBlank(); request and reply share storage */
+	drmVBlankReq request;
+	drmVBlankReply reply;
+} drmVBlank, *drmVBlankPtr;
+
+typedef struct _drmSetVersion { /* the argument type of drmSetInterfaceVersion() */
+	int drm_di_major; /* NOTE(review): "di" is presumably the device-independent interface - confirm */
+	int drm_di_minor;
+	int drm_dd_major; /* NOTE(review): "dd" is presumably the device-dependent interface - confirm */
+	int drm_dd_minor;
+} drmSetVersion, *drmSetVersionPtr;
+
+#define __drm_dummy_lock(lock) (*(__volatile__ unsigned int *)(lock)) /* lvalue view of lock as a volatile unsigned int; argument parenthesized so any pointer expression is cast as a whole */
+
+#define DRM_LOCK_HELD  0x80000000U /**< Hardware lock is held */
+#define DRM_LOCK_CONT  0x40000000U /**< Hardware lock is contended */
+
+#if defined(__GNUC__) && (__GNUC__ >= 2) /* inline-asm DRM_CAS(lock, old, new, ret) per architecture; ret is left non-zero when the swap did not happen */
+# if defined(__i386) || defined(__AMD64__) || defined(__x86_64__) || defined(__amd64__)
+				/* Keep in sync with drmP.h: reflect changes made here over there */
+#define DRM_CAS(lock,old,new,__ret)                                    \
+	do {                                                           \
+                int __dummy;	/* Can't mark eax as clobbered */      \
+		__asm__ __volatile__(                                  \
+			"lock ; cmpxchg %4,%1\n\t"                     \
+                        "setnz %0"                                     \
+			: "=d" (__ret),                                \
+   			  "=m" (__drm_dummy_lock(lock)),               \
+                          "=a" (__dummy)                               \
+			: "2" (old),                                   \
+			  "r" (new));                                  \
+	} while (0)
+
+#elif defined(__alpha__)
+
+#define	DRM_CAS(lock, old, new, ret)		\
+	do {					\
+		int tmp, old32;			\
+		__asm__ __volatile__(		\
+		"	addl	$31, %5, %3\n"	\
+		"1:	ldl_l	%0, %2\n"	\
+		"	cmpeq	%0, %3, %1\n"	\
+		"	beq	%1, 2f\n"	\
+		"	mov	%4, %0\n"	\
+		"	stl_c	%0, %2\n"	\
+		"	beq	%0, 3f\n"	\
+		"	mb\n"			\
+		"2:	cmpeq	%1, 0, %1\n"	\
+		".subsection 2\n"		\
+		"3:	br	1b\n"		\
+		".previous"			\
+		: "=&r"(tmp), "=&r"(ret),	\
+		  "=m"(__drm_dummy_lock(lock)),	\
+		  "=&r"(old32)			\
+		: "r"(new), "r"(old)		\
+		: "memory");			\
+	} while (0)
+
+#elif defined(__sparc__)
+
+#define DRM_CAS(lock,old,new,__ret)				\
+do {	register unsigned int __old __asm("o0");		\
+	register unsigned int __new __asm("o1");		\
+	register volatile unsigned int *__lock __asm("o2");	\
+	__old = old;						\
+	__new = new;						\
+	__lock = (volatile unsigned int *)lock;			\
+	__asm__ __volatile__(					\
+		/*"cas [%2], %3, %0"*/				\
+		".word 0xd3e29008\n\t"				\
+		/*"membar #StoreStore | #StoreLoad"*/		\
+		".word 0x8143e00a"				\
+		: "=&r" (__new)					\
+		: "0" (__new),					\
+		  "r" (__lock),					\
+		  "r" (__old)					\
+		: "memory");					\
+	__ret = (__new != __old);				\
+} while(0)
+
+#elif defined(__ia64__)
+
+#ifdef __INTEL_COMPILER
+/* this currently generates bad code (missing stop bits)... */
+#include <ia64intrin.h>
+
+#define DRM_CAS(lock,old,new,__ret)					      \
+	do {								      \
+		unsigned long __result, __old = (old) & 0xffffffff;		\
+		__mf();							      	\
+		__result = _InterlockedCompareExchange_acq(&__drm_dummy_lock(lock), (new), __old);\
+		__ret = (__result) != (__old);					\
+/*		__ret = (__sync_val_compare_and_swap(&__drm_dummy_lock(lock), \
+						     (old), (new))	      \
+			 != (old));					      */\
+	} while (0)
+
+#else
+#define DRM_CAS(lock,old,new,__ret)					  \
+	do {								  \
+		unsigned int __result, __old = (old);			  \
+		__asm__ __volatile__(					  \
+			"mf\n"						  \
+			"mov ar.ccv=%2\n"				  \
+			";;\n"						  \
+			"cmpxchg4.acq %0=%1,%3,ar.ccv"			  \
+			: "=r" (__result), "=m" (__drm_dummy_lock(lock))  \
+			: "r" ((unsigned long)__old), "r" (new)			  \
+			: "memory");					  \
+		__ret = (__result) != (__old);				  \
+	} while (0)
+
+#endif
+
+#elif defined(__powerpc__)
+
+#define DRM_CAS(lock,old,new,__ret)			\
+	do {						\
+		__asm__ __volatile__(			\
+			"sync;"				\
+			"0:    lwarx %0,0,%1;"		\
+			"      xor. %0,%3,%0;"		\
+			"      bne 1f;"			\
+			"      stwcx. %2,0,%1;"		\
+			"      bne- 0b;"		\
+			"1:    "			\
+			"sync;"				\
+		: "=&r"(__ret)				\
+		: "r"(lock), "r"(new), "r"(old)		\
+		: "cr0", "memory");			\
+	} while (0)
+
+# elif defined (__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+	|| defined (__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+	|| defined (__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+	|| defined (__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+	|| defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+	|| defined(__ARM_ARCH_7EM__)
+       /* excluding ARMv4/ARMv5 and lower (lacking ldrex/strex support); this branch also widens DRM_DEV_MODE with S_IROTH|S_IWOTH */
+       #undef DRM_DEV_MODE
+       #define DRM_DEV_MODE     (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+
+       #define DRM_CAS(lock,old,new,__ret)             \
+       do {                                            \
+               __asm__ __volatile__ (                  \
+                       "1: ldrex %0, [%1]\n"           \
+                       "   teq %0, %2\n"               \
+                       "   ite eq\n"                   \
+                       "   strexeq %0, %3, [%1]\n"     \
+                       "   movne   %0, #1\n"           \
+               : "=&r" (__ret)                         \
+               : "r" (lock), "r" (old), "r" (new)      \
+               : "cc","memory");                       \
+       } while (0)
+
+#endif /* architecture */
+#endif /* __GNUC__ >= 2 */
+
+#ifndef DRM_CAS /* no architecture-specific DRM_CAS was selected above */
+#define DRM_CAS(lock,old,new,ret) do { ret=1; } while (0) /* FAST LOCK FAILS: always reports failure and never touches the lock */
+#endif
+
+#if defined(__alpha__) /* DRM_CAS_RESULT declares the result variable handed to DRM_CAS; its type differs per architecture */
+#define DRM_CAS_RESULT(_result)		long _result
+#elif defined(__powerpc__)
+#define DRM_CAS_RESULT(_result)		int _result
+#else /* every other target uses a char */
+#define DRM_CAS_RESULT(_result)		char _result
+#endif
+
+#define DRM_LIGHT_LOCK(fd,lock,context) /* try the CAS context -> HELD|context; on failure call drmGetLock() */ \
+	do {                                                           \
+                DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,context,DRM_LOCK_HELD|(context),__ret);   \
+                if (__ret) drmGetLock(fd,context,0);                   \
+        } while(0)
+
+				/* Same as DRM_LIGHT_LOCK, but increments count when the CAS
+                                   succeeds (counts fast locks) -- for benchmarking only. */
+#define DRM_LIGHT_LOCK_COUNT(fd,lock,context,count)                    \
+	do {                                                           \
+                DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,context,DRM_LOCK_HELD|(context),__ret);   \
+                if (__ret) drmGetLock(fd,context,0);                   \
+                else       ++(count);                                  \
+        } while(0)
+
+#define DRM_LOCK(fd,lock,context,flags) /* non-zero flags: drmGetLock(); otherwise DRM_LIGHT_LOCK */ \
+	do {                                                           \
+		if (flags) drmGetLock(fd,context,flags);               \
+		else       DRM_LIGHT_LOCK(fd,lock,context);            \
+	} while(0)
+
+#define DRM_UNLOCK(fd,lock,context) /* try the CAS HELD|context -> context; on failure call drmUnlock() */ \
+	do {                                                           \
+                DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,DRM_LOCK_HELD|(context),context,__ret);   \
+                if (__ret) drmUnlock(fd,context);                      \
+        } while(0)
+
+				/* Simple spin locks: retry the CAS 0 -> val, busy-waiting while ->lock is non-zero. NOTE(review): with the fallback DRM_CAS (ret always 1) the outer loop cannot terminate */
+#define DRM_SPINLOCK(spin,val)                                         \
+	do {                                                           \
+            DRM_CAS_RESULT(__ret);                                     \
+	    do {                                                       \
+		DRM_CAS(spin,0,val,__ret);                             \
+		if (__ret) while ((spin)->lock);                       \
+	    } while (__ret);                                           \
+	} while(0)
+
+#define DRM_SPINLOCK_TAKE(spin,val) /* store val whatever the current lock value is: CAS cur -> val until it succeeds */ \
+	do {                                                           \
+            DRM_CAS_RESULT(__ret);                                     \
+            int  cur;                                                  \
+	    do {                                                       \
+                cur = (spin)->lock;                                    \
+		DRM_CAS(spin,cur,val,__ret);                           \
+	    } while (__ret);                                           \
+	} while(0)
+
+#define DRM_SPINLOCK_COUNT(spin,val,count,__ret) /* bounded spin: at most count iterations; __ret stays non-zero if never acquired */ \
+	do {                                                           \
+            int  __i;                                                  \
+            __ret = 1;                                                 \
+            for (__i = 0; __ret && __i < (count); __i++) {             \
+		DRM_CAS(spin,0,val,__ret);                             \
+		if (__ret) for (;__i < (count) && (spin)->lock; __i++);\
+	    }                                                          \
+	} while(0)
+
+#define DRM_SPINUNLOCK(spin,val) /* release only if the lock still holds val: CAS val -> 0 until it succeeds */ \
+	do {                                                           \
+            DRM_CAS_RESULT(__ret);                                     \
+            if ((spin)->lock == (val)) { /* else server stole lock */  \
+	        do {                                                   \
+		    DRM_CAS(spin,val,0,__ret);                         \
+	        } while (__ret);                                       \
+            }                                                          \
+	} while(0)
+
+
+
+/* General user-level programmer's API: unprivileged (no root or authentication requirement is stated for these) */
+extern int           drmAvailable(void);
+extern int           drmOpen(const char *name, const char *busid);
+
+#define DRM_NODE_PRIMARY 0
+#define DRM_NODE_CONTROL 1 /* deprecated: never returned */
+#define DRM_NODE_RENDER  2
+#define DRM_NODE_MAX     3 /* one past the last DRM_NODE_* value above */
+
+extern int           drmOpenWithType(const char *name, const char *busid,
+                                     int type);
+
+extern int           drmOpenControl(int minor); /* deprecated: always fails */
+extern int           drmOpenRender(int minor);
+extern int           drmClose(int fd);
+extern drmVersionPtr drmGetVersion(int fd);
+extern drmVersionPtr drmGetLibVersion(int fd);
+extern int           drmGetCap(int fd, uint64_t capability, uint64_t *value);
+extern void          drmFreeVersion(drmVersionPtr);
+extern int           drmGetMagic(int fd, drm_magic_t * magic);
+extern char          *drmGetBusid(int fd);
+extern int           drmGetInterruptFromBusID(int fd, int busnum, int devnum,
+					      int funcnum);
+extern int           drmGetMap(int fd, int idx, drm_handle_t *offset,
+			       drmSize *size, drmMapType *type,
+			       drmMapFlags *flags, drm_handle_t *handle,
+			       int *mtrr);
+extern int           drmGetClient(int fd, int idx, int *auth, int *pid,
+				  int *uid, unsigned long *magic,
+				  unsigned long *iocs);
+extern int           drmGetStats(int fd, drmStatsT *stats);
+extern int           drmSetInterfaceVersion(int fd, drmSetVersion *version);
+extern int           drmCommandNone(int fd, unsigned long drmCommandIndex);
+extern int           drmCommandRead(int fd, unsigned long drmCommandIndex,
+                                    void *data, unsigned long size);
+extern int           drmCommandWrite(int fd, unsigned long drmCommandIndex,
+                                     void *data, unsigned long size);
+extern int           drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
+                                         void *data, unsigned long size);
+
+/* General user-level programmer's API: X server (root) only */
+extern void          drmFreeBusid(const char *busid);
+extern int           drmSetBusid(int fd, const char *busid);
+extern int           drmAuthMagic(int fd, drm_magic_t magic);
+extern int           drmAddMap(int fd,
+			       drm_handle_t offset,
+			       drmSize size,
+			       drmMapType type,
+			       drmMapFlags flags,
+			       drm_handle_t * handle);
+extern int	     drmRmMap(int fd, drm_handle_t handle);
+extern int	     drmAddContextPrivateMapping(int fd, drm_context_t ctx_id,
+						 drm_handle_t handle);
+
+extern int           drmAddBufs(int fd, int count, int size,
+				drmBufDescFlags flags,
+				int agp_offset);
+extern int           drmMarkBufs(int fd, double low, double high);
+extern int           drmCreateContext(int fd, drm_context_t * handle);
+extern int           drmSetContextFlags(int fd, drm_context_t context,
+					drm_context_tFlags flags);
+extern int           drmGetContextFlags(int fd, drm_context_t context,
+					drm_context_tFlagsPtr flags);
+extern int           drmAddContextTag(int fd, drm_context_t context, void *tag);
+extern int           drmDelContextTag(int fd, drm_context_t context);
+extern void          *drmGetContextTag(int fd, drm_context_t context);
+extern drm_context_t * drmGetReservedContextList(int fd, int *count);
+extern void          drmFreeReservedContextList(drm_context_t *);
+extern int           drmSwitchToContext(int fd, drm_context_t context);
+extern int           drmDestroyContext(int fd, drm_context_t handle);
+extern int           drmCreateDrawable(int fd, drm_drawable_t * handle);
+extern int           drmDestroyDrawable(int fd, drm_drawable_t handle);
+extern int           drmUpdateDrawableInfo(int fd, drm_drawable_t handle,
+					   drm_drawable_info_type_t type,
+					   unsigned int num, void *data);
+extern int           drmCtlInstHandler(int fd, int irq);
+extern int           drmCtlUninstHandler(int fd);
+extern int           drmSetClientCap(int fd, uint64_t capability,
+				     uint64_t value);
+
+extern int           drmCrtcGetSequence(int fd, uint32_t crtcId,
+					uint64_t *sequence, uint64_t *ns);
+extern int           drmCrtcQueueSequence(int fd, uint32_t crtcId,
+					  uint32_t flags, uint64_t sequence,
+					  uint64_t *sequence_queued,
+					  uint64_t user_data);
+/* General user-level programmer's API: authenticated client and/or X server */
+extern int           drmMap(int fd,
+			    drm_handle_t handle,
+			    drmSize size,
+			    drmAddressPtr address);
+extern int           drmUnmap(drmAddress address, drmSize size);
+extern drmBufInfoPtr drmGetBufInfo(int fd);
+extern drmBufMapPtr  drmMapBufs(int fd);
+extern int           drmUnmapBufs(drmBufMapPtr bufs);
+extern int           drmDMA(int fd, drmDMAReqPtr request);
+extern int           drmFreeBufs(int fd, int count, int *list);
+extern int           drmGetLock(int fd,
+			        drm_context_t context,
+			        drmLockFlags flags);
+extern int           drmUnlock(int fd, drm_context_t context);
+extern int           drmFinish(int fd, int context, drmLockFlags flags);
+extern int	     drmGetContextPrivateMapping(int fd, drm_context_t ctx_id, 
+						 drm_handle_t * handle);
+
+/* AGP/GART support: X server (root) only. Handles are drm_handle_t values */
+extern int           drmAgpAcquire(int fd);
+extern int           drmAgpRelease(int fd);
+extern int           drmAgpEnable(int fd, unsigned long mode);
+extern int           drmAgpAlloc(int fd, unsigned long size,
+				 unsigned long type, unsigned long *address,
+				 drm_handle_t *handle);
+extern int           drmAgpFree(int fd, drm_handle_t handle);
+extern int 	     drmAgpBind(int fd, drm_handle_t handle,
+				unsigned long offset);
+extern int           drmAgpUnbind(int fd, drm_handle_t handle);
+
+/* AGP/GART info: authenticated client and/or X server */
+extern int           drmAgpVersionMajor(int fd);
+extern int           drmAgpVersionMinor(int fd);
+extern unsigned long drmAgpGetMode(int fd);
+extern unsigned long drmAgpBase(int fd); /* Physical location */
+extern unsigned long drmAgpSize(int fd); /* Size, in bytes */
+extern unsigned long drmAgpMemoryUsed(int fd);
+extern unsigned long drmAgpMemoryAvail(int fd);
+extern unsigned int  drmAgpVendorId(int fd);
+extern unsigned int  drmAgpDeviceId(int fd);
+
+/* PCI scatter/gather support: X server (root) only. The allocation is identified by a drm_handle_t */
+extern int           drmScatterGatherAlloc(int fd, unsigned long size,
+					   drm_handle_t *handle);
+extern int           drmScatterGatherFree(int fd, drm_handle_t handle);
+
+extern int           drmWaitVBlank(int fd, drmVBlankPtr vbl); /* vbl is the request/reply union drmVBlank */
+
+/* Support routines: server-info hooks, error reporting and allocation helpers */
+extern void          drmSetServerInfo(drmServerInfoPtr info);
+extern int           drmError(int err, const char *label);
+extern void          *drmMalloc(int size);
+extern void          drmFree(void *pt);
+
+/* Hash table routines: t is an opaque table pointer (presumably the one drmHashCreate() returns); keys are unsigned long, values are void pointers */
+extern void *drmHashCreate(void);
+extern int  drmHashDestroy(void *t);
+extern int  drmHashLookup(void *t, unsigned long key, void **value);
+extern int  drmHashInsert(void *t, unsigned long key, void *value);
+extern int  drmHashDelete(void *t, unsigned long key);
+extern int  drmHashFirst(void *t, unsigned long *key, void **value);
+extern int  drmHashNext(void *t, unsigned long *key, void **value);
+
+/* PRNG routines: state is an opaque pointer (presumably the one drmRandomCreate() returns for a seed) */
+extern void          *drmRandomCreate(unsigned long seed);
+extern int           drmRandomDestroy(void *state);
+extern unsigned long drmRandom(void *state);
+extern double        drmRandomDouble(void *state);
+
+/* Skip list routines: l is an opaque list pointer; keys are unsigned long, values are void pointers */
+
+extern void *drmSLCreate(void);
+extern int  drmSLDestroy(void *l);
+extern int  drmSLLookup(void *l, unsigned long key, void **value);
+extern int  drmSLInsert(void *l, unsigned long key, void *value);
+extern int  drmSLDelete(void *l, unsigned long key);
+extern int  drmSLNext(void *l, unsigned long *key, void **value);
+extern int  drmSLFirst(void *l, unsigned long *key, void **value);
+extern void drmSLDump(void *l);
+extern int  drmSLLookupNeighbors(void *l, unsigned long key,
+				 unsigned long *prev_key, void **prev_value,
+				 unsigned long *next_key, void **next_value);
+
+extern int drmOpenOnce(void *unused, const char *BusID, int *newlyopened); /* first parameter is named unused */
+extern int drmOpenOnceWithType(const char *BusID, int *newlyopened, int type);
+extern void drmCloseOnce(int fd);
+extern void drmMsg(const char *format, ...) DRM_PRINTFLIKE(1, 2); /* printf-style: format is argument 1, varargs start at 2 */
+
+extern int drmSetMaster(int fd);
+extern int drmDropMaster(int fd);
+extern int drmIsMaster(int fd);
+
+#define DRM_EVENT_CONTEXT_VERSION 4
+
+typedef struct _drmEventContext {
+
+	/* This struct is versioned so we can add more pointers if we
+	 * add more events. */
+	int version;
+
+	void (*vblank_handler)(int fd,
+			       unsigned int sequence, 
+			       unsigned int tv_sec,
+			       unsigned int tv_usec,
+			       void *user_data);
+
+	void (*page_flip_handler)(int fd,
+				  unsigned int sequence,
+				  unsigned int tv_sec,
+				  unsigned int tv_usec,
+				  void *user_data);
+
+	void (*page_flip_handler2)(int fd,
+				   unsigned int sequence,
+				   unsigned int tv_sec,
+				   unsigned int tv_usec,
+				   unsigned int crtc_id,
+				   void *user_data);
+
+	void (*sequence_handler)(int fd,
+				 uint64_t sequence,
+				 uint64_t ns,
+				 uint64_t user_data);
+} drmEventContext, *drmEventContextPtr;
+
+extern int drmHandleEvent(int fd, drmEventContextPtr evctx); /* evctx carries the handler callbacks declared in drmEventContext */
+
+extern char *drmGetDeviceNameFromFd(int fd); /* returns non-const char *; ownership of the string is not stated here */
+
+/* Improved version of drmGetDeviceNameFromFd which accounts for any type of
+ * device/node - card or renderD.
+ */
+extern char *drmGetDeviceNameFromFd2(int fd);
+extern int drmGetNodeTypeFromFd(int fd);
+
+/* Convert between GEM handles and DMA-BUF file descriptors.
+ *
+ * Warning: since GEM handles are not reference-counted and are unique per
+ * DRM file description, the caller is expected to perform its own reference
+ * counting. drmPrimeFDToHandle is guaranteed to return the same handle for
+ * different FDs if they reference the same underlying buffer object. This
+ * could even be a buffer object originally created on the same DRM FD.
+ *
+ * When sharing a DRM FD with an API such as EGL or GBM, the caller must not
+ * use drmPrimeHandleToFD nor drmPrimeFDToHandle. A single user-space
+ * reference-counting implementation is necessary to avoid double-closing GEM
+ * handles.
+ *
+ * Two processes can't share the same DRM FD and both use it to create or
+ * import GEM handles, even when using a single user-space reference-counting
+ * implementation like GBM, because GBM doesn't share its state between
+ * processes.
+ */
+extern int drmPrimeHandleToFD(int fd, uint32_t handle, uint32_t flags, int *prime_fd); /* prime_fd is a non-const pointer; presumably receives the exported FD */
+extern int drmPrimeFDToHandle(int fd, int prime_fd, uint32_t *handle); /* handle is a non-const pointer; presumably receives the imported GEM handle */
+
+extern int drmCloseBufferHandle(int fd, uint32_t handle);
+
+extern char *drmGetPrimaryDeviceNameFromFd(int fd); /* returns non-const char *; ownership of the string is not stated here */
+extern char *drmGetRenderDeviceNameFromFd(int fd);
+
+#define DRM_BUS_PCI       0 /* the four names match the pci/usb/platform/host1x union members of drmDevice; presumably values for drmDevice.bustype */
+#define DRM_BUS_USB       1
+#define DRM_BUS_PLATFORM  2
+#define DRM_BUS_HOST1X    3
+
+typedef struct _drmPciBusInfo { /* pointed to by drmDevice.businfo.pci */
+    uint16_t domain;
+    uint8_t bus;
+    uint8_t dev;
+    uint8_t func;
+} drmPciBusInfo, *drmPciBusInfoPtr;
+
+typedef struct _drmPciDeviceInfo { /* pointed to by drmDevice.deviceinfo.pci */
+    uint16_t vendor_id;
+    uint16_t device_id;
+    uint16_t subvendor_id;
+    uint16_t subdevice_id;
+    uint8_t revision_id; /* presumably only filled when DRM_DEVICE_GET_PCI_REVISION is requested -- confirm */
+} drmPciDeviceInfo, *drmPciDeviceInfoPtr;
+
+typedef struct _drmUsbBusInfo { /* pointed to by drmDevice.businfo.usb */
+    uint8_t bus;
+    uint8_t dev;
+} drmUsbBusInfo, *drmUsbBusInfoPtr;
+
+typedef struct _drmUsbDeviceInfo { /* pointed to by drmDevice.deviceinfo.usb */
+    uint16_t vendor;
+    uint16_t product;
+} drmUsbDeviceInfo, *drmUsbDeviceInfoPtr;
+
+#define DRM_PLATFORM_DEVICE_NAME_LEN 512 /* size in chars of drmPlatformBusInfo.fullname */
+
+typedef struct _drmPlatformBusInfo { /* pointed to by drmDevice.businfo.platform */
+    char fullname[DRM_PLATFORM_DEVICE_NAME_LEN]; /* fixed-size buffer embedded in the struct */
+} drmPlatformBusInfo, *drmPlatformBusInfoPtr;
+
+typedef struct _drmPlatformDeviceInfo { /* pointed to by drmDevice.deviceinfo.platform */
+    char **compatible; /* NULL terminated list of compatible strings */
+} drmPlatformDeviceInfo, *drmPlatformDeviceInfoPtr;
+
+#define DRM_HOST1X_DEVICE_NAME_LEN 512 /* size in chars of drmHost1xBusInfo.fullname */
+
+typedef struct _drmHost1xBusInfo { /* pointed to by drmDevice.businfo.host1x */
+    char fullname[DRM_HOST1X_DEVICE_NAME_LEN]; /* fixed-size buffer embedded in the struct */
+} drmHost1xBusInfo, *drmHost1xBusInfoPtr;
+
+typedef struct _drmHost1xDeviceInfo { /* pointed to by drmDevice.deviceinfo.host1x */
+    char **compatible; /* NULL terminated list of compatible strings */
+} drmHost1xDeviceInfo, *drmHost1xDeviceInfoPtr;
+
+typedef struct _drmDevice {
+    char **nodes; /* DRM_NODE_MAX sized array */
+    int available_nodes; /* DRM_NODE_* bitmask */
+    int bustype; /* presumably one of the DRM_BUS_* constants; selects which union member below is meaningful -- confirm */
+    union {
+        drmPciBusInfoPtr pci;
+        drmUsbBusInfoPtr usb;
+        drmPlatformBusInfoPtr platform;
+        drmHost1xBusInfoPtr host1x;
+    } businfo; /* all members are pointers, one per bus kind */
+    union {
+        drmPciDeviceInfoPtr pci;
+        drmUsbDeviceInfoPtr usb;
+        drmPlatformDeviceInfoPtr platform;
+        drmHost1xDeviceInfoPtr host1x;
+    } deviceinfo; /* same four member names as businfo */
+} drmDevice, *drmDevicePtr;
+
+extern int drmGetDevice(int fd, drmDevicePtr *device); /* device is a pointer-to-pointer; presumably receives a newly allocated drmDevice */
+extern void drmFreeDevice(drmDevicePtr *device); /* also takes pointer-to-pointer, same shape as the getter */
+
+extern int drmGetDevices(drmDevicePtr devices[], int max_devices);
+extern void drmFreeDevices(drmDevicePtr devices[], int count);
+
+#define DRM_DEVICE_GET_PCI_REVISION (1 << 0) /* bit 0; presumably for the 'flags' argument of the *2 / FromDevId calls below */
+extern int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device); /* drmGetDevice plus a flags argument */
+extern int drmGetDevices2(uint32_t flags, drmDevicePtr devices[], int max_devices); /* drmGetDevices plus a flags argument */
+
+extern int drmGetDeviceFromDevId(dev_t dev_id, uint32_t flags, drmDevicePtr *device); /* keyed by dev_t instead of an fd */
+
+/**
+ * Get the node type (DRM_NODE_PRIMARY or DRM_NODE_RENDER) from a device ID.
+ *
+ * Returns negative errno on error.
+ */
+extern int drmGetNodeTypeFromDevId(dev_t devid);
+
+/**
+ * Check if two drmDevice pointers represent the same DRM device.
+ *
+ * Returns 1 if the devices are equal, 0 otherwise.
+ */
+extern int drmDevicesEqual(drmDevicePtr a, drmDevicePtr b);
+
+extern int drmSyncobjCreate(int fd, uint32_t flags, uint32_t *handle); /* handle is a non-const pointer; presumably receives the new syncobj handle */
+extern int drmSyncobjDestroy(int fd, uint32_t handle);
+extern int drmSyncobjHandleToFD(int fd, uint32_t handle, int *obj_fd);
+extern int drmSyncobjFDToHandle(int fd, int obj_fd, uint32_t *handle);
+
+extern int drmSyncobjImportSyncFile(int fd, uint32_t handle, int sync_file_fd);
+extern int drmSyncobjExportSyncFile(int fd, uint32_t handle, int *sync_file_fd);
+extern int drmSyncobjWait(int fd, uint32_t *handles, unsigned num_handles,
+			  int64_t timeout_nsec, unsigned flags,
+			  uint32_t *first_signaled); /* handles is non-const here, unlike Reset/Signal below; timeout is a signed 64-bit value */
+extern int drmSyncobjReset(int fd, const uint32_t *handles, uint32_t handle_count);
+extern int drmSyncobjSignal(int fd, const uint32_t *handles, uint32_t handle_count);
+extern int drmSyncobjTimelineSignal(int fd, const uint32_t *handles,
+				    uint64_t *points, uint32_t handle_count); /* presumably one 64-bit point per handle -- confirm */
+extern int drmSyncobjTimelineWait(int fd, uint32_t *handles, uint64_t *points,
+				  unsigned num_handles,
+				  int64_t timeout_nsec, unsigned flags,
+				  uint32_t *first_signaled); /* drmSyncobjWait's parameters plus a points array */
+extern int drmSyncobjQuery(int fd, uint32_t *handles, uint64_t *points,
+			   uint32_t handle_count);
+extern int drmSyncobjQuery2(int fd, uint32_t *handles, uint64_t *points,
+			    uint32_t handle_count, uint32_t flags); /* drmSyncobjQuery plus a flags argument */
+extern int drmSyncobjTransfer(int fd,
+			      uint32_t dst_handle, uint64_t dst_point,
+			      uint32_t src_handle, uint64_t src_point,
+			      uint32_t flags); /* destination (handle, point) pair comes before the source pair */
+extern int drmSyncobjEventfd(int fd, uint32_t handle, uint64_t point, int ev_fd,
+                             uint32_t flags);
+
+extern char * /* non-const result; presumably caller-owned -- confirm before freeing */
+drmGetFormatModifierVendor(uint64_t modifier);
+
+extern char * /* non-const result; presumably caller-owned -- confirm before freeing */
+drmGetFormatModifierName(uint64_t modifier);
+
+extern char * /* takes a 32-bit format code, unlike the 64-bit modifier calls above */
+drmGetFormatName(uint32_t format);
+
+#ifndef fourcc_mod_get_vendor /* keep any definition already provided by an earlier header */
+#define fourcc_mod_get_vendor(modifier) \
+       (((modifier) >> 56) & 0xff) /* yields bits 63:56 of the modifier as an 8-bit value */
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
index 88ab70ae93..e19441b5a8 100644
--- a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
@@ -398,6 +398,12 @@ hsaKmtGetQueueInfo(
     HsaQueueInfo *QueueInfo	//IN
 );
 
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueueRingDoorbell(
+    HSA_QUEUEID QueueId	//IN
+);
+
 /**
   Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch. 
 */
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h
new file mode 100644
index 0000000000..af9658cf6d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _HSAKMT_DRM_H_
+#define _HSAKMT_DRM_H_ /* this header declares nothing itself; it only pulls in the three drm/ headers below */
+
+#include "drm/xf86drm.h"
+#include "drm/amdgpu.h"
+#include "drm/amdgpu_drm.h"
+
+#endif /* _HSAKMT_DRM_H_ */
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h
new file mode 100644
index 0000000000..4f34bd1d50
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h
@@ -0,0 +1,1131 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+//     HSA Team
+//
+//     Advanced Micro Devices, Inc
+//
+//     www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+//     * Redistributions of source code must retain the above copyright notice,
+//       this list of conditions and the following disclaimers.
+//
+//     * Redistributions in binary form must reproduce the above copyright notice,
+//       this list of conditions and the following disclaimers in the
+//       documentation and/or other materials provided with the distribution.
+//
+//     * Neither the names of the LLVM Team, University of Illinois at
+//       Urbana-Champaign, nor the names of its contributors may be used to
+//       endorse or promote products derived from this Software without specific
+//       prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stddef.h>   /* size_t */
+#include <stdint.h>   /* uintXX_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+/*========================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+
+typedef uint32_t BrigCodeOffset32_t; /* all three base offset types are plain uint32_t */
+typedef uint32_t BrigOperandOffset32_t;
+typedef uint32_t BrigDataOffset32_t;
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; /* the three aliases below are all BrigDataOffset32_t; the suffix presumably names what the referenced data holds */
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t;
+typedef BrigDataOffset32_t BrigDataOffsetString32_t;
+
+typedef uint32_t BrigVersion32_t; /* 32-bit unsigned type; presumably the storage type for the values below */
+enum BrigVersion { /* major/minor pairs: HSAIL 1.0 and BRIG 1.0 */
+    BRIG_VERSION_HSAIL_MAJOR = 1,
+    BRIG_VERSION_HSAIL_MINOR = 0,
+    BRIG_VERSION_BRIG_MAJOR  = 1,
+    BRIG_VERSION_BRIG_MINOR  = 0
+};
+
+typedef uint16_t BrigKind16_t; /* 16-bit unsigned type; presumably the storage type for enum BrigKind */
+enum BrigKind { /* three groups at 0x1000 / 0x2000 / 0x3000, each bracketed by *_BEGIN and *_END */
+    BRIG_KIND_NONE = 0x0000,
+
+    BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, /* same value as the first directive kind */
+        BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000,
+        BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001,
+        BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+        BRIG_KIND_DIRECTIVE_CONTROL = 0x1003,
+        BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004,
+        BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+        BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006,
+        BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007,
+        BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,
+        BRIG_KIND_DIRECTIVE_LABEL = 0x1009,
+        BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+        BRIG_KIND_DIRECTIVE_MODULE = 0x100b,
+        BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+        BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d,
+        BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+    BRIG_KIND_DIRECTIVE_END = 0x100f, /* one past the last directive kind (0x100e) */
+
+    BRIG_KIND_INST_BEGIN = 0x2000, /* same value as the first instruction kind */
+        BRIG_KIND_INST_ADDR = 0x2000,
+        BRIG_KIND_INST_ATOMIC = 0x2001,
+        BRIG_KIND_INST_BASIC = 0x2002,
+        BRIG_KIND_INST_BR = 0x2003,
+        BRIG_KIND_INST_CMP = 0x2004,
+        BRIG_KIND_INST_CVT = 0x2005,
+        BRIG_KIND_INST_IMAGE = 0x2006,
+        BRIG_KIND_INST_LANE = 0x2007,
+        BRIG_KIND_INST_MEM = 0x2008,
+        BRIG_KIND_INST_MEM_FENCE = 0x2009,
+        BRIG_KIND_INST_MOD = 0x200a,
+        BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+        BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+        BRIG_KIND_INST_QUEUE = 0x200d,
+        BRIG_KIND_INST_SEG = 0x200e,
+        BRIG_KIND_INST_SEG_CVT = 0x200f,
+        BRIG_KIND_INST_SIGNAL = 0x2010,
+        BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+    BRIG_KIND_INST_END = 0x2012, /* one past the last instruction kind (0x2011) */
+
+    BRIG_KIND_OPERAND_BEGIN = 0x3000, /* same value as the first operand kind */
+        BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+        BRIG_KIND_OPERAND_ALIGN = 0x3001,
+        BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+        BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+        BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+        BRIG_KIND_OPERAND_RESERVED = 0x3005,
+        BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+        BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+        BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+        BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+        BRIG_KIND_OPERAND_REGISTER = 0x300a,
+        BRIG_KIND_OPERAND_STRING = 0x300b,
+        BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+    BRIG_KIND_OPERAND_END = 0x300d /* one past the last operand kind (0x300c) */
+};
+
+typedef uint8_t BrigAlignment8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigAlignment */
+enum BrigAlignment { /* the enumerator for N has value log2(N) + 1, not N itself; 0 is NONE */
+    BRIG_ALIGNMENT_NONE = 0,
+    BRIG_ALIGNMENT_1 = 1,
+    BRIG_ALIGNMENT_2 = 2,
+    BRIG_ALIGNMENT_4 = 3,
+    BRIG_ALIGNMENT_8 = 4,
+    BRIG_ALIGNMENT_16 = 5,
+    BRIG_ALIGNMENT_32 = 6,
+    BRIG_ALIGNMENT_64 = 7,
+    BRIG_ALIGNMENT_128 = 8,
+    BRIG_ALIGNMENT_256 = 9,
+    BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256 /* alias of the largest enumerator (value 9) */
+};
+
+typedef uint8_t BrigAllocation8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigAllocation */
+enum BrigAllocation { /* values 0..3, contiguous */
+    BRIG_ALLOCATION_NONE = 0,
+    BRIG_ALLOCATION_PROGRAM = 1,
+    BRIG_ALLOCATION_AGENT = 2,
+    BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+typedef uint8_t BrigAluModifier8_t; /* 8-bit unsigned type; the enum name ends in Mask, so presumably OR-able flag bits */
+enum BrigAluModifierMask {
+    BRIG_ALU_FTZ = 1 /* bit 0; the only flag defined */
+};
+
+typedef uint8_t BrigAtomicOperation8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigAtomicOperation */
+enum BrigAtomicOperation { /* values 0..20, contiguous */
+    BRIG_ATOMIC_ADD = 0,
+    BRIG_ATOMIC_AND = 1,
+    BRIG_ATOMIC_CAS = 2,
+    BRIG_ATOMIC_EXCH = 3,
+    BRIG_ATOMIC_LD = 4,
+    BRIG_ATOMIC_MAX = 5,
+    BRIG_ATOMIC_MIN = 6,
+    BRIG_ATOMIC_OR = 7,
+    BRIG_ATOMIC_ST = 8,
+    BRIG_ATOMIC_SUB = 9,
+    BRIG_ATOMIC_WRAPDEC = 10,
+    BRIG_ATOMIC_WRAPINC = 11,
+    BRIG_ATOMIC_XOR = 12,
+    BRIG_ATOMIC_WAIT_EQ = 13, /* 13..16: WAIT_* variants */
+    BRIG_ATOMIC_WAIT_NE = 14,
+    BRIG_ATOMIC_WAIT_LT = 15,
+    BRIG_ATOMIC_WAIT_GTE = 16,
+    BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, /* 17..20: WAITTIMEOUT_* variants, same EQ/NE/LT/GTE order */
+    BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+    BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+    BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+typedef uint8_t BrigCompareOperation8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigCompareOperation */
+enum BrigCompareOperation { /* values 0..27, each used exactly once */
+    BRIG_COMPARE_EQ = 0,
+    BRIG_COMPARE_NE = 1,
+    BRIG_COMPARE_LT = 2,
+    BRIG_COMPARE_LE = 3,
+    BRIG_COMPARE_GT = 4,
+    BRIG_COMPARE_GE = 5,
+    BRIG_COMPARE_EQU = 6,
+    BRIG_COMPARE_NEU = 7,
+    BRIG_COMPARE_LTU = 8,
+    BRIG_COMPARE_LEU = 9,
+    BRIG_COMPARE_GTU = 10,
+    BRIG_COMPARE_GEU = 11,
+    BRIG_COMPARE_NUM = 12,
+    BRIG_COMPARE_NAN = 13,
+    BRIG_COMPARE_SEQ = 14, /* S-prefixed entries start here and mostly mirror the order of 0..13 */
+    BRIG_COMPARE_SLT = 16,
+    BRIG_COMPARE_SLE = 17,
+    BRIG_COMPARE_SGT = 18,
+    BRIG_COMPARE_SGE = 19,
+    BRIG_COMPARE_SGEU = 20, /* NOTE: breaks the EQU..GEU order used at 6..11; these are fixed numeric codes, do not reorder */
+    BRIG_COMPARE_SEQU = 21,
+    BRIG_COMPARE_SNEU = 22,
+    BRIG_COMPARE_SLTU = 23,
+    BRIG_COMPARE_SLEU = 24,
+    BRIG_COMPARE_SNUM = 25,
+    BRIG_COMPARE_SNAN = 26,
+    BRIG_COMPARE_SGTU = 27 /* NOTE: placed last, after SNUM/SNAN, unlike GTU at 10 */
+};
+
+typedef uint16_t BrigControlDirective16_t; /* 16-bit unsigned type; presumably the storage type for enum BrigControlDirective */
+enum BrigControlDirective { /* values 0..9, contiguous */
+    BRIG_CONTROL_NONE = 0,
+    BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+    BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+    BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+    BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+    BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+    BRIG_CONTROL_REQUIREDDIM = 6,
+    BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+    BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+    BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+typedef uint8_t BrigExecutableModifier8_t; /* 8-bit unsigned type; the enum name ends in Mask, so presumably OR-able flag bits */
+enum BrigExecutableModifierMask {
+    BRIG_EXECUTABLE_DEFINITION = 1 /* bit 0; the only flag defined */
+};
+
+typedef uint8_t BrigImageChannelOrder8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigImageChannelOrder */
+enum BrigImageChannelOrder { /* predefined values 0..19, contiguous */
+    BRIG_CHANNEL_ORDER_A = 0,
+    BRIG_CHANNEL_ORDER_R = 1,
+    BRIG_CHANNEL_ORDER_RX = 2,
+    BRIG_CHANNEL_ORDER_RG = 3,
+    BRIG_CHANNEL_ORDER_RGX = 4,
+    BRIG_CHANNEL_ORDER_RA = 5,
+    BRIG_CHANNEL_ORDER_RGB = 6,
+    BRIG_CHANNEL_ORDER_RGBX = 7,
+    BRIG_CHANNEL_ORDER_RGBA = 8,
+    BRIG_CHANNEL_ORDER_BGRA = 9,
+    BRIG_CHANNEL_ORDER_ARGB = 10,
+    BRIG_CHANNEL_ORDER_ABGR = 11,
+    BRIG_CHANNEL_ORDER_SRGB = 12,
+    BRIG_CHANNEL_ORDER_SRGBX = 13,
+    BRIG_CHANNEL_ORDER_SRGBA = 14,
+    BRIG_CHANNEL_ORDER_SBGRA = 15,
+    BRIG_CHANNEL_ORDER_INTENSITY = 16,
+    BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+    BRIG_CHANNEL_ORDER_DEPTH = 18,
+    BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+    BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 20..127 are unassigned here */
+};
+
+typedef uint8_t BrigImageChannelType8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigImageChannelType */
+enum BrigImageChannelType { /* predefined values 0..15, contiguous */
+    BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+    BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+    BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+    BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+    BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+    BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+    BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+    BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 16..127 are unassigned here */
+};
+
+typedef uint8_t BrigImageGeometry8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigImageGeometry */
+enum BrigImageGeometry { /* predefined values 0..7, contiguous */
+    BRIG_GEOMETRY_1D = 0,
+    BRIG_GEOMETRY_2D = 1,
+    BRIG_GEOMETRY_3D = 2,
+    BRIG_GEOMETRY_1DA = 3,
+    BRIG_GEOMETRY_2DA = 4,
+    BRIG_GEOMETRY_1DB = 5,
+    BRIG_GEOMETRY_2DDEPTH = 6,
+    BRIG_GEOMETRY_2DADEPTH = 7,
+
+    BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 8..127 are unassigned here */
+};
+
+typedef uint8_t BrigImageQuery8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigImageQuery */
+enum BrigImageQuery { /* predefined values 0..5, contiguous */
+    BRIG_IMAGE_QUERY_WIDTH = 0,
+    BRIG_IMAGE_QUERY_HEIGHT = 1,
+    BRIG_IMAGE_QUERY_DEPTH = 2,
+    BRIG_IMAGE_QUERY_ARRAY = 3,
+    BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+    BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+
+    BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6 /* NOTE: starts right after the last predefined value, not at 128 as in the other image enums */
+};
+
+typedef uint8_t BrigLinkage8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigLinkage */
+enum BrigLinkage { /* values 0..4, contiguous */
+    BRIG_LINKAGE_NONE = 0,
+    BRIG_LINKAGE_PROGRAM = 1,
+    BRIG_LINKAGE_MODULE = 2,
+    BRIG_LINKAGE_FUNCTION = 3,
+    BRIG_LINKAGE_ARG = 4
+};
+
+typedef uint8_t BrigMachineModel8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigMachineModel */
+enum BrigMachineModel { /* two values only; what SMALL/LARGE denote is not stated in this header */
+    BRIG_MACHINE_SMALL = 0,
+    BRIG_MACHINE_LARGE = 1,
+};
+
+typedef uint8_t BrigMemoryModifier8_t; /* 8-bit unsigned type; the enum name ends in Mask, so presumably OR-able flag bits */
+enum BrigMemoryModifierMask {
+    BRIG_MEMORY_CONST = 1 /* bit 0; the only flag defined */
+};
+
+typedef uint8_t BrigMemoryOrder8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigMemoryOrder */
+enum BrigMemoryOrder { /* values 0..4, contiguous */
+    BRIG_MEMORY_ORDER_NONE = 0,
+    BRIG_MEMORY_ORDER_RELAXED = 1,
+    BRIG_MEMORY_ORDER_SC_ACQUIRE = 2,
+    BRIG_MEMORY_ORDER_SC_RELEASE = 3,
+    BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4,
+};
+
+typedef uint8_t BrigMemoryScope8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigMemoryScope */
+enum BrigMemoryScope { /* values 0..5, contiguous */
+    BRIG_MEMORY_SCOPE_NONE = 0,
+    BRIG_MEMORY_SCOPE_WORKITEM = 1,
+    BRIG_MEMORY_SCOPE_WAVEFRONT = 2,
+    BRIG_MEMORY_SCOPE_WORKGROUP = 3,
+    BRIG_MEMORY_SCOPE_AGENT = 4,
+    BRIG_MEMORY_SCOPE_SYSTEM = 5,
+};
+
+typedef uint16_t BrigOpcode16_t; /* 16-bit unsigned type; presumably the storage type for enum BrigOpcode */
+enum BrigOpcode { /* predefined opcodes are numbered 0..136 with no gaps */
+    BRIG_OPCODE_NOP = 0,
+    BRIG_OPCODE_ABS = 1,
+    BRIG_OPCODE_ADD = 2,
+    BRIG_OPCODE_BORROW = 3,
+    BRIG_OPCODE_CARRY = 4,
+    BRIG_OPCODE_CEIL = 5,
+    BRIG_OPCODE_COPYSIGN = 6,
+    BRIG_OPCODE_DIV = 7,
+    BRIG_OPCODE_FLOOR = 8,
+    BRIG_OPCODE_FMA = 9,
+    BRIG_OPCODE_FRACT = 10,
+    BRIG_OPCODE_MAD = 11,
+    BRIG_OPCODE_MAX = 12,
+    BRIG_OPCODE_MIN = 13,
+    BRIG_OPCODE_MUL = 14,
+    BRIG_OPCODE_MULHI = 15,
+    BRIG_OPCODE_NEG = 16,
+    BRIG_OPCODE_REM = 17,
+    BRIG_OPCODE_RINT = 18,
+    BRIG_OPCODE_SQRT = 19,
+    BRIG_OPCODE_SUB = 20,
+    BRIG_OPCODE_TRUNC = 21,
+    BRIG_OPCODE_MAD24 = 22,
+    BRIG_OPCODE_MAD24HI = 23,
+    BRIG_OPCODE_MUL24 = 24,
+    BRIG_OPCODE_MUL24HI = 25,
+    BRIG_OPCODE_SHL = 26,
+    BRIG_OPCODE_SHR = 27,
+    BRIG_OPCODE_AND = 28,
+    BRIG_OPCODE_NOT = 29,
+    BRIG_OPCODE_OR = 30,
+    BRIG_OPCODE_POPCOUNT = 31,
+    BRIG_OPCODE_XOR = 32,
+    BRIG_OPCODE_BITEXTRACT = 33,
+    BRIG_OPCODE_BITINSERT = 34,
+    BRIG_OPCODE_BITMASK = 35,
+    BRIG_OPCODE_BITREV = 36,
+    BRIG_OPCODE_BITSELECT = 37,
+    BRIG_OPCODE_FIRSTBIT = 38,
+    BRIG_OPCODE_LASTBIT = 39,
+    BRIG_OPCODE_COMBINE = 40,
+    BRIG_OPCODE_EXPAND = 41,
+    BRIG_OPCODE_LDA = 42,
+    BRIG_OPCODE_MOV = 43,
+    BRIG_OPCODE_SHUFFLE = 44,
+    BRIG_OPCODE_UNPACKHI = 45,
+    BRIG_OPCODE_UNPACKLO = 46,
+    BRIG_OPCODE_PACK = 47,
+    BRIG_OPCODE_UNPACK = 48,
+    BRIG_OPCODE_CMOV = 49,
+    BRIG_OPCODE_CLASS = 50,
+    BRIG_OPCODE_NCOS = 51,
+    BRIG_OPCODE_NEXP2 = 52,
+    BRIG_OPCODE_NFMA = 53,
+    BRIG_OPCODE_NLOG2 = 54,
+    BRIG_OPCODE_NRCP = 55,
+    BRIG_OPCODE_NRSQRT = 56,
+    BRIG_OPCODE_NSIN = 57,
+    BRIG_OPCODE_NSQRT = 58,
+    BRIG_OPCODE_BITALIGN = 59,
+    BRIG_OPCODE_BYTEALIGN = 60,
+    BRIG_OPCODE_PACKCVT = 61,
+    BRIG_OPCODE_UNPACKCVT = 62,
+    BRIG_OPCODE_LERP = 63,
+    BRIG_OPCODE_SAD = 64,
+    BRIG_OPCODE_SADHI = 65,
+    BRIG_OPCODE_SEGMENTP = 66,
+    BRIG_OPCODE_FTOS = 67,
+    BRIG_OPCODE_STOF = 68,
+    BRIG_OPCODE_CMP = 69,
+    BRIG_OPCODE_CVT = 70,
+    BRIG_OPCODE_LD = 71,
+    BRIG_OPCODE_ST = 72,
+    BRIG_OPCODE_ATOMIC = 73,
+    BRIG_OPCODE_ATOMICNORET = 74,
+    BRIG_OPCODE_SIGNAL = 75,
+    BRIG_OPCODE_SIGNALNORET = 76,
+    BRIG_OPCODE_MEMFENCE = 77,
+    BRIG_OPCODE_RDIMAGE = 78,
+    BRIG_OPCODE_LDIMAGE = 79,
+    BRIG_OPCODE_STIMAGE = 80,
+    BRIG_OPCODE_IMAGEFENCE = 81,
+    BRIG_OPCODE_QUERYIMAGE = 82,
+    BRIG_OPCODE_QUERYSAMPLER = 83,
+    BRIG_OPCODE_CBR = 84,
+    BRIG_OPCODE_BR = 85,
+    BRIG_OPCODE_SBR = 86,
+    BRIG_OPCODE_BARRIER = 87,
+    BRIG_OPCODE_WAVEBARRIER = 88,
+    BRIG_OPCODE_ARRIVEFBAR = 89,
+    BRIG_OPCODE_INITFBAR = 90,
+    BRIG_OPCODE_JOINFBAR = 91,
+    BRIG_OPCODE_LEAVEFBAR = 92,
+    BRIG_OPCODE_RELEASEFBAR = 93,
+    BRIG_OPCODE_WAITFBAR = 94,
+    BRIG_OPCODE_LDF = 95,
+    BRIG_OPCODE_ACTIVELANECOUNT = 96,
+    BRIG_OPCODE_ACTIVELANEID = 97,
+    BRIG_OPCODE_ACTIVELANEMASK = 98,
+    BRIG_OPCODE_ACTIVELANEPERMUTE = 99,
+    BRIG_OPCODE_CALL = 100,
+    BRIG_OPCODE_SCALL = 101,
+    BRIG_OPCODE_ICALL = 102,
+    BRIG_OPCODE_RET = 103,
+    BRIG_OPCODE_ALLOCA = 104,
+    BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+    BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+    BRIG_OPCODE_DIM = 107,
+    BRIG_OPCODE_GRIDGROUPS = 108,
+    BRIG_OPCODE_GRIDSIZE = 109,
+    BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+    BRIG_OPCODE_PACKETID = 111,
+    BRIG_OPCODE_WORKGROUPID = 112,
+    BRIG_OPCODE_WORKGROUPSIZE = 113,
+    BRIG_OPCODE_WORKITEMABSID = 114,
+    BRIG_OPCODE_WORKITEMFLATABSID = 115,
+    BRIG_OPCODE_WORKITEMFLATID = 116,
+    BRIG_OPCODE_WORKITEMID = 117,
+    BRIG_OPCODE_CLEARDETECTEXCEPT = 118,
+    BRIG_OPCODE_GETDETECTEXCEPT = 119,
+    BRIG_OPCODE_SETDETECTEXCEPT = 120,
+    BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121,
+    BRIG_OPCODE_CASQUEUEWRITEINDEX = 122,
+    BRIG_OPCODE_LDQUEUEREADINDEX = 123,
+    BRIG_OPCODE_LDQUEUEWRITEINDEX = 124,
+    BRIG_OPCODE_STQUEUEREADINDEX = 125,
+    BRIG_OPCODE_STQUEUEWRITEINDEX = 126,
+    BRIG_OPCODE_CLOCK = 127,
+    BRIG_OPCODE_CUID = 128,
+    BRIG_OPCODE_DEBUGTRAP = 129,
+    BRIG_OPCODE_GROUPBASEPTR = 130,
+    BRIG_OPCODE_KERNARGBASEPTR = 131,
+    BRIG_OPCODE_LANEID = 132,
+    BRIG_OPCODE_MAXCUID = 133,
+    BRIG_OPCODE_MAXWAVEID = 134,
+    BRIG_OPCODE_NULLPTR = 135,
+    BRIG_OPCODE_WAVEID = 136,
+
+    BRIG_OPCODE_FIRST_USER_DEFINED = 32768, /* 0x8000, the top bit of the 16-bit typedef; 137..32767 are unassigned here */
+};
+
+typedef uint8_t BrigPack8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigPack */
+enum BrigPack { /* values 0..12; 7..12 repeat the names at 1..6 with a SAT suffix, in the same order */
+    BRIG_PACK_NONE = 0,
+    BRIG_PACK_PP = 1,
+    BRIG_PACK_PS = 2,
+    BRIG_PACK_SP = 3,
+    BRIG_PACK_SS = 4,
+    BRIG_PACK_S = 5,
+    BRIG_PACK_P = 6,
+    BRIG_PACK_PPSAT = 7,
+    BRIG_PACK_PSSAT = 8,
+    BRIG_PACK_SPSAT = 9,
+    BRIG_PACK_SSSAT = 10,
+    BRIG_PACK_SSAT = 11,
+    BRIG_PACK_PSAT = 12
+};
+
+typedef uint8_t BrigProfile8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigProfile */
+enum BrigProfile { /* two values only; what BASE/FULL imply is not stated in this header */
+    BRIG_PROFILE_BASE = 0,
+    BRIG_PROFILE_FULL = 1,
+};
+
+typedef uint16_t BrigRegisterKind16_t; /* 16-bit unsigned type (wider than most small enums here); presumably stores enum BrigRegisterKind */
+enum BrigRegisterKind { /* values 0..3, contiguous */
+    BRIG_REGISTER_KIND_CONTROL = 0,
+    BRIG_REGISTER_KIND_SINGLE = 1,
+    BRIG_REGISTER_KIND_DOUBLE = 2,
+    BRIG_REGISTER_KIND_QUAD = 3
+};
+
+typedef uint8_t BrigRound8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigRound */
+enum BrigRound { /* values 0..21, contiguous */
+    BRIG_ROUND_NONE = 0,
+    BRIG_ROUND_FLOAT_DEFAULT = 1, /* 1..5: FLOAT_* modes */
+    BRIG_ROUND_FLOAT_NEAR_EVEN = 2,
+    BRIG_ROUND_FLOAT_ZERO = 3,
+    BRIG_ROUND_FLOAT_PLUS_INFINITY = 4,
+    BRIG_ROUND_FLOAT_MINUS_INFINITY = 5,
+    BRIG_ROUND_INTEGER_NEAR_EVEN = 6, /* 6..9: INTEGER_* modes */
+    BRIG_ROUND_INTEGER_ZERO = 7,
+    BRIG_ROUND_INTEGER_PLUS_INFINITY = 8,
+    BRIG_ROUND_INTEGER_MINUS_INFINITY = 9,
+    BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, /* 10..13: the same four with a _SAT suffix */
+    BRIG_ROUND_INTEGER_ZERO_SAT = 11,
+    BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12,
+    BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13,
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, /* 14..17: INTEGER_SIGNALING_* modes */
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15,
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16,
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17,
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, /* 18..21: the same four with a _SAT suffix */
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19,
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20,
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21
+};
+
+typedef uint8_t BrigSamplerAddressing8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigSamplerAddressing */
+enum BrigSamplerAddressing { /* predefined values 0..4, contiguous */
+    BRIG_ADDRESSING_UNDEFINED = 0,
+    BRIG_ADDRESSING_CLAMP_TO_EDGE = 1,
+    BRIG_ADDRESSING_CLAMP_TO_BORDER = 2,
+    BRIG_ADDRESSING_REPEAT = 3,
+    BRIG_ADDRESSING_MIRRORED_REPEAT = 4,
+
+    BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 5..127 are unassigned here */
+};
+
+typedef uint8_t BrigSamplerCoordNormalization8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigSamplerCoordNormalization */
+enum BrigSamplerCoordNormalization { /* two values: 0 = unnormalized, 1 = normalized */
+    BRIG_COORD_UNNORMALIZED = 0,
+    BRIG_COORD_NORMALIZED = 1
+};
+
+typedef uint8_t BrigSamplerFilter8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigSamplerFilter */
+enum BrigSamplerFilter { /* two predefined values, 0 and 1 */
+    BRIG_FILTER_NEAREST = 0,
+    BRIG_FILTER_LINEAR = 1,
+
+    BRIG_FILTER_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 2..127 are unassigned here */
+};
+
+typedef uint8_t BrigSamplerQuery8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigSamplerQuery */
+enum BrigSamplerQuery { /* values 0..2; no user-defined range is declared, unlike BrigImageQuery */
+    BRIG_SAMPLER_QUERY_ADDRESSING = 0,
+    BRIG_SAMPLER_QUERY_COORD = 1,
+    BRIG_SAMPLER_QUERY_FILTER = 2
+};
+
+typedef uint32_t BrigSectionIndex32_t; /* 32-bit unsigned type; presumably the storage type for enum BrigSectionIndex */
+enum BrigSectionIndex { /* three fixed indices: data, code, operand */
+    BRIG_SECTION_INDEX_DATA = 0,
+    BRIG_SECTION_INDEX_CODE = 1,
+    BRIG_SECTION_INDEX_OPERAND = 2,
+
+    BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, /* starts right after the three fixed indices */
+};
+
+typedef uint8_t BrigSegCvtModifier8_t; /* 8-bit unsigned type; the enum name ends in Mask, so presumably OR-able flag bits */
+enum BrigSegCvtModifierMask {
+    BRIG_SEG_CVT_NONULL = 1 /* bit 0; the only flag defined */
+};
+
+typedef uint8_t BrigSegment8_t; /* 8-bit unsigned type; presumably the storage type for enum BrigSegment */
+enum BrigSegment { /* predefined values 0..8, contiguous */
+    BRIG_SEGMENT_NONE = 0,
+    BRIG_SEGMENT_FLAT = 1,
+    BRIG_SEGMENT_GLOBAL = 2,
+    BRIG_SEGMENT_READONLY = 3,
+    BRIG_SEGMENT_KERNARG = 4,
+    BRIG_SEGMENT_GROUP = 5,
+    BRIG_SEGMENT_PRIVATE = 6,
+    BRIG_SEGMENT_SPILL = 7,
+    BRIG_SEGMENT_ARG = 8,
+
+    BRIG_SEGMENT_FIRST_USER_DEFINED = 128 /* 0x80, the top bit of the 8-bit typedef; 9..127 are unassigned here */
+};
+
+enum { /* bit layout used to build the BrigType values: base in bits 4:0, pack in bits 6:5, array flag in bit 7 */
+    BRIG_TYPE_BASE_SIZE  = 5, /* field widths in bits */
+    BRIG_TYPE_PACK_SIZE  = 2,
+    BRIG_TYPE_ARRAY_SIZE = 1,
+
+    BRIG_TYPE_BASE_SHIFT  = 0,
+    BRIG_TYPE_PACK_SHIFT  = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, /* = 5 */
+    BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, /* = 7 */
+
+    BRIG_TYPE_BASE_MASK  = ((1 << BRIG_TYPE_BASE_SIZE)  - 1) << BRIG_TYPE_BASE_SHIFT, /* = 0x1f */
+    BRIG_TYPE_PACK_MASK  = ((1 << BRIG_TYPE_PACK_SIZE)  - 1) << BRIG_TYPE_PACK_SHIFT, /* = 0x60 */
+    BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, /* = 0x80 */
+
+    BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, /* = 0x00 */
+    BRIG_TYPE_PACK_32   = 1 << BRIG_TYPE_PACK_SHIFT, /* = 0x20 */
+    BRIG_TYPE_PACK_64   = 2 << BRIG_TYPE_PACK_SHIFT, /* = 0x40 */
+    BRIG_TYPE_PACK_128  = 3 << BRIG_TYPE_PACK_SHIFT, /* = 0x60 */
+
+    BRIG_TYPE_ARRAY     = 1 << BRIG_TYPE_ARRAY_SHIFT /* = 0x80 */
+};
+
+typedef uint16_t BrigType16_t; /* 16-bit unsigned type; presumably the storage type for enum BrigType */
+enum BrigType { /* base codes 0..23 fit the 5-bit base field; the rest are base | pack | array combinations */
+    BRIG_TYPE_NONE  = 0,
+    BRIG_TYPE_U8    = 1,
+    BRIG_TYPE_U16   = 2,
+    BRIG_TYPE_U32   = 3,
+    BRIG_TYPE_U64   = 4,
+    BRIG_TYPE_S8    = 5,
+    BRIG_TYPE_S16   = 6,
+    BRIG_TYPE_S32   = 7,
+    BRIG_TYPE_S64   = 8,
+    BRIG_TYPE_F16   = 9,
+    BRIG_TYPE_F32   = 10,
+    BRIG_TYPE_F64   = 11,
+    BRIG_TYPE_B1    = 12,
+    BRIG_TYPE_B8    = 13,
+    BRIG_TYPE_B16   = 14,
+    BRIG_TYPE_B32   = 15,
+    BRIG_TYPE_B64   = 16,
+    BRIG_TYPE_B128  = 17,
+    BRIG_TYPE_SAMP  = 18,
+    BRIG_TYPE_ROIMG = 19,
+    BRIG_TYPE_WOIMG = 20,
+    BRIG_TYPE_RWIMG = 21,
+    BRIG_TYPE_SIG32 = 22,
+    BRIG_TYPE_SIG64 = 23,
+
+    BRIG_TYPE_U8X4  = BRIG_TYPE_U8  | BRIG_TYPE_PACK_32, /* packed types: element base code OR'ed with the pack width; only U/S/F elements appear */
+    BRIG_TYPE_U8X8  = BRIG_TYPE_U8  | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_U8X16 = BRIG_TYPE_U8  | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32,
+    BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_S8X4  = BRIG_TYPE_S8  | BRIG_TYPE_PACK_32,
+    BRIG_TYPE_S8X8  = BRIG_TYPE_S8  | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_S8X16 = BRIG_TYPE_S8  | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32,
+    BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32,
+    BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64,
+    BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128,
+    BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128,
+
+    BRIG_TYPE_U8_ARRAY    = BRIG_TYPE_U8    | BRIG_TYPE_ARRAY, /* array types: scalar or packed code with bit 7 set; note there is no B1 array entry */
+    BRIG_TYPE_U16_ARRAY   = BRIG_TYPE_U16   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U32_ARRAY   = BRIG_TYPE_U32   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U64_ARRAY   = BRIG_TYPE_U64   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S8_ARRAY    = BRIG_TYPE_S8    | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S16_ARRAY   = BRIG_TYPE_S16   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S32_ARRAY   = BRIG_TYPE_S32   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S64_ARRAY   = BRIG_TYPE_S64   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F16_ARRAY   = BRIG_TYPE_F16   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F32_ARRAY   = BRIG_TYPE_F32   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F64_ARRAY   = BRIG_TYPE_F64   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_B8_ARRAY    = BRIG_TYPE_B8    | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_B16_ARRAY   = BRIG_TYPE_B16   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_B32_ARRAY   = BRIG_TYPE_B32   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_B64_ARRAY   = BRIG_TYPE_B64   | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_B128_ARRAY  = BRIG_TYPE_B128  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_SAMP_ARRAY  = BRIG_TYPE_SAMP  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U8X4_ARRAY  = BRIG_TYPE_U8X4  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U8X8_ARRAY  = BRIG_TYPE_U8X8  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S8X4_ARRAY  = BRIG_TYPE_S8X4  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S8X8_ARRAY  = BRIG_TYPE_S8X8  | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY,
+    BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY,
+};
+
+typedef uint8_t BrigVariableModifier8_t;  // 8-bit field type used for the `modifier` members of BrigDirectiveVariable / BrigDirectiveFbarrier
+enum BrigVariableModifierMask {  // single-bit values; presumably OR-ed together into a BrigVariableModifier8_t - confirm
+    BRIG_VARIABLE_DEFINITION = 1,  // bit 0
+    BRIG_VARIABLE_CONST = 2  // bit 1
+};
+
+typedef uint8_t BrigWidth8_t;  // 8-bit field type used for the `width` members of BrigInstBr / BrigInstLane / BrigInstMem
+enum BrigWidth {  // BRIG_WIDTH_<2^k> is encoded as k + 1 (not as the width itself); 0 is NONE
+    BRIG_WIDTH_NONE = 0,
+    BRIG_WIDTH_1 = 1,
+    BRIG_WIDTH_2 = 2,
+    BRIG_WIDTH_4 = 3,
+    BRIG_WIDTH_8 = 4,
+    BRIG_WIDTH_16 = 5,
+    BRIG_WIDTH_32 = 6,
+    BRIG_WIDTH_64 = 7,
+    BRIG_WIDTH_128 = 8,
+    BRIG_WIDTH_256 = 9,
+    BRIG_WIDTH_512 = 10,
+    BRIG_WIDTH_1024 = 11,
+    BRIG_WIDTH_2048 = 12,
+    BRIG_WIDTH_4096 = 13,
+    BRIG_WIDTH_8192 = 14,
+    BRIG_WIDTH_16384 = 15,
+    BRIG_WIDTH_32768 = 16,
+    BRIG_WIDTH_65536 = 17,
+    BRIG_WIDTH_131072 = 18,
+    BRIG_WIDTH_262144 = 19,
+    BRIG_WIDTH_524288 = 20,
+    BRIG_WIDTH_1048576 = 21,
+    BRIG_WIDTH_2097152 = 22,
+    BRIG_WIDTH_4194304 = 23,
+    BRIG_WIDTH_8388608 = 24,
+    BRIG_WIDTH_16777216 = 25,
+    BRIG_WIDTH_33554432 = 26,
+    BRIG_WIDTH_67108864 = 27,
+    BRIG_WIDTH_134217728 = 28,
+    BRIG_WIDTH_268435456 = 29,
+    BRIG_WIDTH_536870912 = 30,
+    BRIG_WIDTH_1073741824 = 31,
+    BRIG_WIDTH_2147483648 = 32,  // 2^31, the largest power-of-two entry
+    BRIG_WIDTH_WAVESIZE = 33,  // symbolic (non-numeric) width, numbered after the powers of two
+    BRIG_WIDTH_ALL = 34,  // symbolic (non-numeric) width
+};
+
+struct BrigUInt64 {  // 64-bit quantity stored as two 32-bit words
+    uint32_t lo;  // presumably the low-order 32 bits (per the field name) - confirm
+    uint32_t hi;  // presumably the high-order 32 bits (per the field name) - confirm
+};
+
+struct BrigBase {  // header placed first in the BrigDirective*/BrigOperand* structs that follow, and in BrigInstBase
+    uint16_t byteCount;  // presumably the size in bytes of the enclosing record - confirm against the BRIG specification
+    BrigKind16_t kind;  // presumably the tag identifying which record type follows this header - confirm
+};
+
+struct BrigData {  // byte count followed by a byte array
+    uint32_t byteCount;  // presumably the number of valid bytes in `bytes` - confirm
+    uint8_t bytes[1];  // NOTE(review): one-element trailing array; looks like a variable-length payload - confirm
+};
+
+struct BrigDirectiveArgBlock {  // carries only the common BrigBase header
+    BrigBase base;
+};
+
+struct BrigDirectiveComment {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveControl {
+    BrigBase base;
+    BrigControlDirective16_t control;
+    uint16_t reserved;  // NOTE(review): looks like explicit padding ahead of the 32-bit `operands` field - confirm
+    BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveExecutable {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    uint16_t outArgCount;
+    uint16_t inArgCount;
+    BrigCodeOffset32_t firstInArg;
+    BrigCodeOffset32_t firstCodeBlockEntry;
+    BrigCodeOffset32_t nextModuleEntry;
+    BrigExecutableModifier8_t modifier;
+    BrigLinkage8_t linkage;
+    uint16_t reserved;  // NOTE(review): looks like trailing padding after the two 8-bit fields - confirm
+};
+
+struct BrigDirectiveExtension {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveFbarrier {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVariableModifier8_t modifier;  // same 8-bit modifier type that BrigDirectiveVariable uses
+    BrigLinkage8_t linkage;
+    uint16_t reserved;
+};
+
+struct BrigDirectiveLabel {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveLoc {
+    BrigBase base;
+    BrigDataOffsetString32_t filename;
+    uint32_t line;
+    uint32_t column;
+};
+
+struct BrigDirectiveNone {  // carries only the common BrigBase header
+    BrigBase base;
+};
+
+struct BrigDirectivePragma {
+    BrigBase base;
+    BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveVariable {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigOperandOffset32_t init;
+    BrigType16_t type;
+    BrigSegment8_t segment;
+    BrigAlignment8_t align;
+    BrigUInt64 dim;  // 64-bit value held as a lo/hi pair of 32-bit words (see BrigUInt64)
+    BrigVariableModifier8_t modifier;
+    BrigLinkage8_t linkage;
+    BrigAllocation8_t allocation;
+    uint8_t reserved;
+};
+
+struct BrigDirectiveModule {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVersion32_t hsailMajor;
+    BrigVersion32_t hsailMinor;
+    BrigProfile8_t profile;
+    BrigMachineModel8_t machineModel;
+    BrigRound8_t defaultFloatRound;
+    uint8_t reserved;
+};
+
+struct BrigInstBase {  // common prefix: the first member of every BrigInst* struct below
+    BrigBase base;
+    BrigOpcode16_t opcode;
+    BrigType16_t type;
+    BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigInstAddr {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    uint8_t reserved[3];  // NOTE(review): looks like padding after the single 8-bit field - confirm
+};
+
+struct BrigInstAtomic {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigMemoryScope8_t memoryScope;
+    BrigAtomicOperation8_t atomicOperation;
+    uint8_t equivClass;
+    uint8_t reserved[3];
+};
+
+struct BrigInstBasic {  // adds no fields beyond BrigInstBase
+    BrigInstBase base;
+};
+
+struct BrigInstBr {
+    BrigInstBase base;
+    BrigWidth8_t width;  // 8-bit width field; presumably holds a BrigWidth value - confirm
+    uint8_t reserved[3];
+};
+
+struct BrigInstCmp {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier8_t modifier;
+    BrigCompareOperation8_t compare;
+    BrigPack8_t pack;
+    uint8_t reserved[3];
+};
+
+struct BrigInstCvt {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier8_t modifier;
+    BrigRound8_t round;
+};
+
+struct BrigInstImage {
+    BrigInstBase base;
+    BrigType16_t imageType;
+    BrigType16_t coordType;
+    BrigImageGeometry8_t geometry;
+    uint8_t equivClass;
+    uint16_t reserved;
+};
+
+struct BrigInstLane {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigWidth8_t width;
+    uint8_t reserved;
+};
+
+struct BrigInstMem {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigAlignment8_t align;
+    uint8_t equivClass;
+    BrigWidth8_t width;
+    BrigMemoryModifier8_t modifier;
+    uint8_t reserved[3];
+};
+
+struct BrigInstMemFence {  // one memory order plus a separate memory scope per segment (global, group, image)
+    BrigInstBase base;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigMemoryScope8_t globalSegmentMemoryScope;
+    BrigMemoryScope8_t groupSegmentMemoryScope;
+    BrigMemoryScope8_t imageSegmentMemoryScope;
+};
+
+struct BrigInstMod {
+    BrigInstBase base;
+    BrigAluModifier8_t modifier;
+    BrigRound8_t round;
+    BrigPack8_t pack;
+    uint8_t reserved;
+};
+
+struct BrigInstQueryImage {
+    BrigInstBase base;
+    BrigType16_t imageType;
+    BrigImageGeometry8_t geometry;
+    BrigImageQuery8_t query;
+};
+
+struct BrigInstQuerySampler {
+    BrigInstBase base;
+    BrigSamplerQuery8_t query;
+    uint8_t reserved[3];
+};
+
+struct BrigInstQueue {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigMemoryOrder8_t memoryOrder;
+    uint16_t reserved;
+};
+
+struct BrigInstSeg {  // same field layout as BrigInstAddr
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    uint8_t reserved[3];
+};
+
+struct BrigInstSegCvt {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigSegment8_t segment;
+    BrigSegCvtModifier8_t modifier;
+};
+
+struct BrigInstSignal {
+    BrigInstBase base;
+    BrigType16_t signalType;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigAtomicOperation8_t signalOperation;  // reuses the atomic-operation field type also used by BrigInstAtomic
+};
+
+struct BrigInstSourceType {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    uint16_t reserved;
+};
+
+struct BrigOperandAddress {  // like every BrigOperand* struct below, starts with the common BrigBase header
+    BrigBase base;
+    BrigCodeOffset32_t symbol;
+    BrigOperandOffset32_t reg;
+    BrigUInt64 offset;  // 64-bit value held as a lo/hi pair of 32-bit words (see BrigUInt64)
+};
+
+struct BrigOperandAlign {
+    BrigBase base;
+    BrigAlignment8_t align;
+    uint8_t reserved[3];  // NOTE(review): looks like padding after the single 8-bit field - confirm
+};
+
+struct BrigOperandCodeList {
+    BrigBase base;
+    BrigDataOffsetCodeList32_t elements;
+};
+
+struct BrigOperandCodeRef {
+    BrigBase base;
+    BrigCodeOffset32_t ref;
+};
+
+struct BrigOperandConstantBytes {
+    BrigBase base;
+    BrigType16_t type;
+    uint16_t reserved;
+    BrigDataOffsetString32_t bytes;
+};
+
+struct BrigOperandConstantOperandList {
+    BrigBase base;
+    BrigType16_t type;
+    uint16_t reserved;
+    BrigDataOffsetOperandList32_t elements;
+};
+
+struct BrigOperandConstantImage {
+    BrigBase base;
+    BrigType16_t type;
+    BrigImageGeometry8_t geometry;
+    BrigImageChannelOrder8_t channelOrder;
+    BrigImageChannelType8_t channelType;
+    uint8_t reserved[3];
+    BrigUInt64 width;  // the four extents below are each a lo/hi pair of 32-bit words
+    BrigUInt64 height;
+    BrigUInt64 depth;
+    BrigUInt64 array;
+};
+
+struct BrigOperandOperandList {
+    BrigBase base;
+    BrigDataOffsetOperandList32_t elements;
+};
+
+struct BrigOperandRegister {
+    BrigBase base;
+    BrigRegisterKind16_t regKind;
+    uint16_t regNum;
+};
+
+struct BrigOperandConstantSampler {
+    BrigBase base;
+    BrigType16_t type;
+    BrigSamplerCoordNormalization8_t coord;
+    BrigSamplerFilter8_t filter;
+    BrigSamplerAddressing8_t addressing;
+    uint8_t reserved[3];
+};
+
+struct BrigOperandString {
+    BrigBase base;
+    BrigDataOffsetString32_t string;
+};
+
+struct BrigOperandWavesize {  // carries only the common BrigBase header
+    BrigBase base;
+};
+
+typedef uint32_t BrigExceptions32_t;  // 32-bit type; presumably holds an OR of BrigExceptionsMask bits - confirm
+enum BrigExceptionsMask {  // one bit per exception kind
+    BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0,
+    BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1,
+    BRIG_EXCEPTIONS_OVERFLOW = 1 << 2,
+    BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3,
+    BRIG_EXCEPTIONS_INEXACT = 1 << 4,  // bits 5-15 are not assigned here
+
+    BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16  // lowest bit of the user-defined range (bit 16)
+};
+
+struct BrigSectionHeader {  // fixed fields followed by the section name bytes
+    uint64_t byteCount;  // presumably the size of the whole section in bytes - confirm
+    uint32_t headerByteCount;  // presumably the size of this header including the name - confirm
+    uint32_t nameLength;  // presumably the number of bytes in `name` - confirm
+    uint8_t name[1];  // NOTE(review): one-element trailing array; looks like a variable-length name - confirm
+};
+
+struct BrigModuleHeader {  // fixed-size header: identification, BRIG version, size, hash and section table info
+    char identification[8];  // 8-byte identification field
+    BrigVersion32_t brigMajor;
+    BrigVersion32_t brigMinor;
+    uint64_t byteCount;  // presumably the size of the whole module in bytes - confirm
+    uint8_t hash[64];  // 64-byte hash field; the algorithm is not stated here
+    uint32_t reserved;
+    uint32_t sectionCount;
+    uint64_t sectionIndex;  // NOTE(review): presumably an offset to the section index table - confirm
+};
+
+typedef BrigModuleHeader* BrigModule_t;  // NOTE(review): names the struct without the `struct` keyword, which only C++ accepts unless a typedef exists elsewhere - confirm C is not a consumer
+
+#ifdef __cplusplus
+}
+#endif  /*__cplusplus*/
+
+#endif // defined(INCLUDED_BRIG_H)
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h
new file mode 100644
index 0000000000..7c4ed3eea4
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h
@@ -0,0 +1,91 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The following set of header files provides definitions for AMD GPU
+// Architecture:
+//   - amd_hsa_common.h
+//   - amd_hsa_elf.h
+//   - amd_hsa_kernel_code.h
+//   - amd_hsa_queue.h
+//   - amd_hsa_signal.h
+//
+// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more
+// information.
+
+#ifndef AMD_HSA_COMMON_H
+#define AMD_HSA_COMMON_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Descriptive version of the HSA Application Binary Interface.
+#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)"
+
+// __ALIGNED__(x): minimum-alignment attribute, x in bytes. GCC-compatible and
+// MSVC spellings are provided; under RC_INVOKED it expands to nothing.
+#if defined(__GNUC__)
+#  define __ALIGNED__(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#  define __ALIGNED__(x) __declspec(align(x))
+#elif defined(RC_INVOKED)
+#  define __ALIGNED__(x)
+#else
+#  error "amd_hsa_common.h: __ALIGNED__ has no definition for this compiler"
+#endif
+
+// Creates three enumerators for one packed bit-field: name##_SHIFT (bit offset),
+// name##_WIDTH (bit count) and name (the field's mask, already shifted in place).
+#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width)                   \
+  name##_SHIFT = (shift),                                                      \
+  name##_WIDTH = (width),                                                      \
+  name = (((1 << (width)) - 1) << (shift))                                     \
+
+// Extracts the field selected by mask from packed value src, shifted down to bit 0; mask must be an enumerator name that has a matching mask##_SHIFT.
+#define AMD_HSA_BITS_GET(src, mask)                                            \
+  (((src) & (mask)) >> mask ## _SHIFT)
+
+// Sets val bits for specified mask in specified dst packed instance. Expands to a single do/while(0) statement, so it is safe as an unbraced if/else body; dst is evaluated twice.
+#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
+  do { (dst) &= (~(1 << mask##_SHIFT) & ~(mask));                              \
+       (dst) |= (((val) << mask##_SHIFT) & (mask)); } while (0)
+
+#endif // AMD_HSA_COMMON_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h
new file mode 100644
index 0000000000..2b6c4c9672
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h
@@ -0,0 +1,467 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Undefine the macro in case it is defined in the system elf.h.
+#undef EM_AMDGPU
+
+#ifndef AMD_HSA_ELF_H
+#define AMD_HSA_ELF_H
+
+// AMD GPU Specific ELF Header Enumeration Values.
+//
+// Values are copied from LLVM BinaryFormat/ELF.h . This file also contains
+// code object V1 definitions which are not part of the LLVM header. Code object
+// V1 was only supported by the Finalizer which is now deprecated and removed.
+//
+// TODO: Deprecate and remove V1 support and replace this header with using the
+// LLVM header.
+namespace ELF {  // C++ only: this namespace and the `enum : unsigned` forms below are not valid C
+
+// Machine architectures
+// See current registered ELF machine architectures at:
+//    http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html
+enum {
+  EM_AMDGPU = 224,        // AMD GPU architecture
+};
+
+// OS ABI identification.
+enum {
+  ELFOSABI_AMDGPU_HSA = 64,    // AMD HSA runtime
+};
+
+// AMDGPU OS ABI Version identification.
+enum {
+  // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification
+  // was never defined for V1.
+  ELFABIVERSION_AMDGPU_HSA_V2 = 0,
+  ELFABIVERSION_AMDGPU_HSA_V3 = 1,
+  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
+  ELFABIVERSION_AMDGPU_HSA_V5 = 3,
+  ELFABIVERSION_AMDGPU_HSA_V6 = 4,
+};
+
+// AMDGPU specific e_flags.
+enum : unsigned {
+  // Processor selection mask for EF_AMDGPU_MACH_* values.
+  EF_AMDGPU_MACH = 0x0ff,
+
+  // Not specified processor.
+  EF_AMDGPU_MACH_NONE = 0x000,
+
+  // AMDGCN-based processors.
+  // clang-format off
+  EF_AMDGPU_MACH_AMDGCN_GFX600          = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601          = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700          = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701          = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702          = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703          = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704          = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27   = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801          = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802          = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803          = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810          = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900          = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902          = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904          = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906          = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908          = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909          = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C          = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010         = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011         = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012         = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030         = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031         = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032         = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033         = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602          = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705          = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805          = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035         = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034         = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A          = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940          = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100         = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013         = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_GFX1150         = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103         = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036         = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101         = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102         = 0x047,
+  EF_AMDGPU_MACH_AMDGCN_GFX1200         = 0x048,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49   = 0x049,
+  EF_AMDGPU_MACH_AMDGCN_GFX1151         = 0x04a,
+  EF_AMDGPU_MACH_AMDGCN_GFX941          = 0x04b,
+  EF_AMDGPU_MACH_AMDGCN_GFX942          = 0x04c,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D   = 0x04d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1201         = 0x04e,
+  EF_AMDGPU_MACH_AMDGCN_GFX950          = 0x04f,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50   = 0x050,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC    = 0x051,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
+  EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC   = 0x054,
+  EF_AMDGPU_MACH_AMDGCN_GFX1152         = 0x055,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56   = 0x056,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57   = 0x057,
+  EF_AMDGPU_MACH_AMDGCN_GFX1153         = 0x058,
+  EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC   = 0x059,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC  = 0x05f,  // 0x05a-0x05e are not assigned in this list
+  // clang-format on
+
+  // First/last AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_XNACK_V2 = 0x01,
+  // Indicates if the trap handler is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
+  // Indicates if the "sramecc" target feature is enabled for all code
+  // contained in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
+
+  // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
+  // XNACK is not supported.
+  EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
+  // XNACK is any/default/unspecified.
+  EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
+  // XNACK is off.
+  EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
+  // XNACK is on.
+  EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
+
+  // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
+  // SRAMECC is not supported.
+  EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
+  // SRAMECC is any/default/unspecified.
+  EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
+  // SRAMECC is off.
+  EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
+  // SRAMECC is on.
+  EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
+
+  // Generic target versioning. This is contained in the most significant byte of EFLAGS.
+  EF_AMDGPU_GENERIC_VERSION = 0xff000000,
+  EF_AMDGPU_GENERIC_VERSION_OFFSET = 24,  // shift that brings the version byte down to bit 0
+  EF_AMDGPU_GENERIC_VERSION_MIN = 1,
+  EF_AMDGPU_GENERIC_VERSION_MAX = 0xff,
+};
+
+// ELF Relocation types for AMDGPU.
+enum : unsigned {
+  R_AMDGPU_ABS32_LO = 1,
+  R_AMDGPU_ABS32_HI = 2,
+  R_AMDGPU_ABS64 = 3,
+  R_AMDGPU_ABS32 = 6,
+  R_AMDGPU_RELATIVE64 = 13,
+};
+
+} // end namespace ELF
+
+// ELF Section Header Flag Enumeration Values. Each is masked with SHF_MASKOS, which this header does not define; the includer must provide it.
+#define SHF_AMDGPU_HSA_GLOBAL   (0x00100000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_CODE     (0x00400000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_AGENT    (0x00800000 & SHF_MASKOS)
+
+// AMD GPU HSA segment kinds; the PT_AMDGPU_HSA_LOAD_* macros below add these to PT_LOOS.
+typedef enum {
+  AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0,
+  AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1,
+  AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2,
+  AMDGPU_HSA_SEGMENT_CODE_AGENT = 3,
+  AMDGPU_HSA_SEGMENT_LAST,  // implicit value 4, i.e. the number of kinds listed above
+} amdgpu_hsa_elf_segment_t;
+
+// ELF Program Header Type Enumeration Values. PT_LOOS is not defined in this header; the includer must provide it.
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM)
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT   (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT)
+#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT)
+#define PT_AMDGPU_HSA_LOAD_CODE_AGENT     (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT)
+
+// ELF Symbol Type Enumeration Values. Offsets from STT_LOOS, which this header does not define.
+#define STT_AMDGPU_HSA_KERNEL            (STT_LOOS + 0)
+#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1)
+#define STT_AMDGPU_HSA_METADATA          (STT_LOOS + 2)
+
+// ELF Symbol Binding Enumeration Values. Offsets from STB_LOOS, which this header does not define.
+#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0)
+
+// ELF Symbol Other Information Creation/Retrieval. Packing: bits 0-1 = v, bits 2-3 = allocation (a), bits 4 and up = flags (f).
+#define ELF64_ST_AMDGPU_ALLOCATION(o)  (((o) >> 2) & 0x3)
+#define ELF64_ST_AMDGPU_FLAGS(o)       ((o) >> 4)
+#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3))
+
+typedef enum {  // values 0-3 fit the 2-bit allocation field that ELF64_ST_AMDGPU_ALLOCATION extracts
+  AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_LAST,  // implicit value 4, i.e. the number of kinds listed above
+} amdgpu_hsa_symbol_allocation_t;
+
+// ELF Symbol Allocation Enumeration Values. Short STA_* aliases for the enumerators above.
+#define STA_AMDGPU_HSA_DEFAULT        AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT
+#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM
+#define STA_AMDGPU_HSA_GLOBAL_AGENT   AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT
+#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT
+
+typedef enum {  // symbol flag values; presumably stored in the flags bits read by ELF64_ST_AMDGPU_FLAGS - confirm
+  AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_FLAG_CONST = 1,
+  AMDGPU_HSA_SYMBOL_FLAG_LAST,  // implicit value 2
+} amdgpu_hsa_symbol_flag_t;
+
+// ELF Symbol Flag Enumeration Values. Short STF_* alias for the enumerator above.
+#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST
+
+// Legacy/V1 AMD GPU Relocation Type Enumeration Values. Values 6-12 are not assigned here.
+#define R_AMDGPU_V1_NONE         0
+#define R_AMDGPU_V1_32_LOW       1
+#define R_AMDGPU_V1_32_HIGH      2
+#define R_AMDGPU_V1_64           3
+#define R_AMDGPU_V1_INIT_SAMPLER 4
+#define R_AMDGPU_V1_INIT_IMAGE   5
+#define R_AMDGPU_V1_RELATIVE64   13
+
+// AMD GPU Note Type Enumeration Values. The numbering is sparse: 7-10, 12-32 and 34-100 are not assigned here.
+#define NT_AMD_HSA_CODE_OBJECT_VERSION 1
+#define NT_AMD_HSA_HSAIL               2
+#define NT_AMD_HSA_ISA_VERSION         3
+#define NT_AMD_HSA_PRODUCER            4
+#define NT_AMD_HSA_PRODUCER_OPTIONS    5
+#define NT_AMD_HSA_EXTENSION           6
+#define NT_AMD_HSA_ISA_NAME            11
+/* AMDGPU snapshots of runtime, agent and queues state for use in core dump */
+#define NT_AMDGPU_CORE_STATE           33
+#define NT_AMD_HSA_HLDEBUG_DEBUG       101
+#define NT_AMD_HSA_HLDEBUG_TARGET      102
+
+// AMD GPU Metadata Kind Enumeration Values.
+typedef uint16_t amdgpu_hsa_metadata_kind16_t;  // NOTE(review): this header includes nothing itself, so <stdint.h> must already be in scope for uint16_t/uint8_t
+typedef enum {  // the descriptor structs below store the kind in the 16-bit typedef above, not in this enum type
+  AMDGPU_HSA_METADATA_KIND_NONE = 0,
+  AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1,
+  AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2,
+  AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3,
+  AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4
+} amdgpu_hsa_metadata_kind_t;
+
+// AMD GPU Sampler Coordinate Normalization Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_coord8_t;  // 8-bit storage type used by the sampler descriptor
+typedef enum {
+  AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0,
+  AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1
+} amdgpu_hsa_sampler_coord_t;
+
+// AMD GPU Sampler Filter Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_filter8_t;  // 8-bit storage type used by the sampler descriptor
+typedef enum {
+  AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0,
+  AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1
+} amdgpu_hsa_sampler_filter_t;
+
+// AMD GPU Sampler Addressing Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_addressing8_t;  // 8-bit storage type used by the sampler descriptor
+typedef enum {
+  AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4
+} amdgpu_hsa_sampler_addressing_t;
+
+// AMD GPU Sampler Descriptor. The declared fields total 8 bytes and use the fixed-width typedefs rather than the enum types.
+typedef struct amdgpu_hsa_sampler_descriptor_s {
+  uint16_t size;  // presumably the descriptor size in bytes - confirm
+  amdgpu_hsa_metadata_kind16_t kind;
+  amdgpu_hsa_sampler_coord8_t coord;
+  amdgpu_hsa_sampler_filter8_t filter;
+  amdgpu_hsa_sampler_addressing8_t addressing;
+  uint8_t reserved1;
+} amdgpu_hsa_sampler_descriptor_t;
+
+// AMD GPU Image Geometry Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_geometry8_t;  // 8-bit storage type used by the image descriptor
+typedef enum {
+  AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1,
+  AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7
+} amdgpu_hsa_image_geometry_t;
+
+// AMD GPU Image Channel Order Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_order8_t;  // 8-bit storage type used by the image descriptor
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} amdgpu_hsa_image_channel_order_t;
+
+// AMD GPU Image Channel Type Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_type8_t;  // 8-bit storage type used by the image descriptor
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} amdgpu_hsa_image_channel_type_t;
+
+// AMD GPU Image Descriptor. An 8-byte prefix of small fields followed by four 64-bit extents.
+typedef struct amdgpu_hsa_image_descriptor_s {
+  uint16_t size;  // presumably the descriptor size in bytes - confirm
+  amdgpu_hsa_metadata_kind16_t kind;
+  amdgpu_hsa_image_geometry8_t geometry;
+  amdgpu_hsa_image_channel_order8_t channel_order;
+  amdgpu_hsa_image_channel_type8_t channel_type;
+  uint8_t reserved1;
+  uint64_t width;
+  uint64_t height;
+  uint64_t depth;
+  uint64_t array;  // presumably the array size for the array geometries - confirm
+} amdgpu_hsa_image_descriptor_t;
+
+typedef struct amdgpu_hsa_note_code_object_version_s {  // presumably the payload of an NT_AMD_HSA_CODE_OBJECT_VERSION note - confirm
+  uint32_t major_version;
+  uint32_t minor_version;
+} amdgpu_hsa_note_code_object_version_t;
+
+typedef struct amdgpu_hsa_note_hsail_s {  // presumably the payload of an NT_AMD_HSA_HSAIL note - confirm
+  uint32_t hsail_major_version;
+  uint32_t hsail_minor_version;
+  uint8_t profile;
+  uint8_t machine_model;
+  uint8_t default_float_round;  // NOTE(review): fields total 11 bytes; any trailing padding is compiler-inserted, not declared
+} amdgpu_hsa_note_hsail_t;
+
+typedef struct amdgpu_hsa_note_isa_s {  // presumably the payload of an NT_AMD_HSA_ISA_VERSION note - confirm
+  uint16_t vendor_name_size;
+  uint16_t architecture_name_size;
+  uint32_t major;
+  uint32_t minor;
+  uint32_t stepping;
+  char vendor_and_architecture_name[1];  // NOTE(review): one-element trailing array; looks like both names stored back to back, sized by the two *_size fields - confirm
+} amdgpu_hsa_note_isa_t;
+
+typedef struct amdgpu_hsa_note_producer_s {  // presumably the payload of an NT_AMD_HSA_PRODUCER note - confirm
+  uint16_t producer_name_size;
+  uint16_t reserved;
+  uint32_t producer_major_version;
+  uint32_t producer_minor_version;
+  char producer_name[1];  // NOTE(review): one-element trailing array; presumably producer_name_size bytes long - confirm
+} amdgpu_hsa_note_producer_t;
+
+typedef struct amdgpu_hsa_note_producer_options_s {  // presumably the payload of an NT_AMD_HSA_PRODUCER_OPTIONS note - confirm
+  uint16_t producer_options_size;
+  char producer_options[1];  // NOTE(review): one-element trailing array; presumably producer_options_size bytes long - confirm
+} amdgpu_hsa_note_producer_options_t;
+
+typedef enum {  // rodata/data/bss, each in three variants (global program, global agent, readonly agent); values run 0-8
+  AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0,
+  AMDGPU_HSA_RODATA_GLOBAL_AGENT,
+  AMDGPU_HSA_RODATA_READONLY_AGENT,
+  AMDGPU_HSA_DATA_GLOBAL_PROGRAM,
+  AMDGPU_HSA_DATA_GLOBAL_AGENT,
+  AMDGPU_HSA_DATA_READONLY_AGENT,
+  AMDGPU_HSA_BSS_GLOBAL_PROGRAM,
+  AMDGPU_HSA_BSS_GLOBAL_AGENT,
+  AMDGPU_HSA_BSS_READONLY_AGENT,
+  AMDGPU_HSA_SECTION_LAST,  // implicit value 9, i.e. the number of section kinds listed above
+} amdgpu_hsa_elf_section_t;
+
+#endif // AMD_HSA_ELF_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h
new file mode 100644
index 0000000000..c00c88c024
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h
@@ -0,0 +1,270 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_KERNEL_CODE_H
+#define AMD_HSA_KERNEL_CODE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Kernel Code Version Enumeration Values (version 1.1 of the kernel code layout).
+typedef uint32_t amd_kernel_code_version32_t;  // Storage type of the amd_kernel_code_version_{major,minor} fields of amd_kernel_code_t.
+enum amd_kernel_code_version_t {
+  AMD_KERNEL_CODE_VERSION_MAJOR = 1,  // Major version number.
+  AMD_KERNEL_CODE_VERSION_MINOR = 1  // Minor version number.
+};
+
+// AMD Machine Kind Enumeration Values.
+typedef uint16_t amd_machine_kind16_t;  // Storage type of amd_kernel_code_t::amd_machine_kind.
+enum amd_machine_kind_t {
+  AMD_MACHINE_KIND_UNDEFINED = 0,  // No machine kind specified.
+  AMD_MACHINE_KIND_AMDGPU = 1  // AMD GPU.
+};
+
+// AMD Machine Version: 16-bit storage type shared by the major, minor and stepping fields of amd_kernel_code_t.
+typedef uint16_t amd_machine_version16_t;
+
+// AMD Float Round Mode Enumeration Values (0..3, so each fits in two bits).
+enum amd_float_round_mode_t {  // NOTE(review): presumably the encoding of the 2-bit FLOAT_ROUND_MODE_* fields of amd_compute_pgm_rsrc_one_t - confirm.
+  AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0,
+  AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+  AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+  AMD_FLOAT_ROUND_MODE_ZERO = 3
+};
+
+// AMD Float Denorm Mode Enumeration Values (0..3, so each fits in two bits).
+enum amd_float_denorm_mode_t {  // NOTE(review): presumably the encoding of the 2-bit FLOAT_DENORM_MODE_* fields of amd_compute_pgm_rsrc_one_t - confirm.
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0,
+  AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1,
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2,
+  AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
+};
+
+// AMD Compute Program Resource Register One: bit-field layout of a 32-bit word.
+typedef uint32_t amd_compute_pgm_rsrc_one32_t;  // Storage type of amd_kernel_code_t::compute_pgm_rsrc1.
+enum amd_compute_pgm_rsrc_one_t {  // Macro arguments appear to be (name, first bit, bit width): each entry starts where the previous one ends and the entries cover bits 0..31. The macro is defined outside this file (presumably amd_hsa_common.h) - TODO confirm which enumerators it generates.
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6)
+};
+
+// AMD System VGPR Workitem ID Enumeration Values (0..3, so each fits in two bits).
+enum amd_system_vgpr_workitem_id_t {  // NOTE(review): presumably the encoding of the 2-bit ENABLE_VGPR_WORKITEM_ID field of amd_compute_pgm_rsrc_two_t - confirm.
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
+};
+
+// AMD Compute Program Resource Register Two: bit-field layout of a 32-bit word.
+typedef uint32_t amd_compute_pgm_rsrc_two32_t;  // Storage type of amd_kernel_code_t::compute_pgm_rsrc2.
+enum amd_compute_pgm_rsrc_two_t {  // Macro arguments appear to be (name, first bit, bit width); the entries are contiguous and cover bits 0..31. The macro is defined outside this file - TODO confirm which enumerators it generates.
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1)
+};
+
+// AMD Element Byte Size Enumeration Values: the enumerator value is log2(bytes) - 1, i.e. 2, 4, 8, 16 bytes map to 0, 1, 2, 3.
+enum amd_element_byte_size_t {  // NOTE(review): presumably the encoding of the 2-bit PRIVATE_ELEMENT_SIZE field of amd_kernel_code_properties_t - confirm.
+  AMD_ELEMENT_BYTE_SIZE_2 = 0,
+  AMD_ELEMENT_BYTE_SIZE_4 = 1,
+  AMD_ELEMENT_BYTE_SIZE_8 = 2,
+  AMD_ELEMENT_BYTE_SIZE_16 = 3
+};
+
+// AMD Kernel Code Properties: bit-field layout of a 32-bit word.
+typedef uint32_t amd_kernel_code_properties32_t;  // Storage type of amd_kernel_code_t::kernel_code_properties.
+enum amd_kernel_code_properties_t {  // Macro arguments appear to be (name, first bit, bit width); the entries are contiguous and cover bits 0..31. The macro is defined outside this file - TODO confirm which enumerators it generates.
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9)
+};
+
+// AMD Power Of Two Enumeration Values: AMD_POWERTWO_<N> has the value log2(N), for N = 1..256.
+typedef uint8_t amd_powertwo8_t;  // Storage type of the *_segment_alignment and wavefront_size fields of amd_kernel_code_t.
+enum amd_powertwo_t {
+  AMD_POWERTWO_1 = 0,
+  AMD_POWERTWO_2 = 1,
+  AMD_POWERTWO_4 = 2,
+  AMD_POWERTWO_8 = 3,
+  AMD_POWERTWO_16 = 4,
+  AMD_POWERTWO_32 = 5,
+  AMD_POWERTWO_64 = 6,
+  AMD_POWERTWO_128 = 7,
+  AMD_POWERTWO_256 = 8
+};
+
+// AMD Enabled Control Directive Enumeration Values: each enumerator is a distinct single-bit mask (1 << 0 .. 1 << 8).
+typedef uint64_t amd_enabled_control_directive64_t;  // Storage type of amd_control_directives_t::enabled_control_directives.
+enum amd_enabled_control_directive_t {  // Most names mirror a same-named field of amd_control_directives_t; presumably a set bit marks that field as meaningful (TODO confirm).
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1,
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
+};
+
+// AMD Exception Kind Enumeration Values: each enumerator is a distinct single-bit mask (1 << 0 .. 1 << 4).
+typedef uint16_t amd_exception_kind16_t;  // 16-bit storage type for a combination of the masks below.
+enum amd_exception_kind_t {
+  AMD_EXCEPTION_KIND_INVALID_OPERATION = 1,
+  AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2,
+  AMD_EXCEPTION_KIND_OVERFLOW = 4,
+  AMD_EXCEPTION_KIND_UNDERFLOW = 8,
+  AMD_EXCEPTION_KIND_INEXACT = 16
+};
+
+// AMD Control Directives: 64-byte-aligned record whose fields total 128 bytes with no implicit padding.
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64  // Required alignment in bytes.
+#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES)  // __ALIGNED__ is defined outside this file (presumably amd_hsa_common.h).
+typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s {
+  amd_enabled_control_directive64_t enabled_control_directives;  // Combination of amd_enabled_control_directive_t masks.
+  uint16_t enable_break_exceptions;  // NOTE(review): presumably a combination of amd_exception_kind_t masks - confirm.
+  uint16_t enable_detect_exceptions;  // NOTE(review): presumably a combination of amd_exception_kind_t masks - confirm.
+  uint32_t max_dynamic_group_size;
+  uint64_t max_flat_grid_size;
+  uint32_t max_flat_workgroup_size;
+  uint8_t required_dim;
+  uint8_t reserved1[3];  // Pads required_dim so that required_grid_size starts at offset 32.
+  uint64_t required_grid_size[3];  // One entry per dimension.
+  uint32_t required_workgroup_size[3];  // One entry per dimension.
+  uint8_t reserved2[60];  // Brings the total size to 128 bytes.
+} amd_control_directives_t;
+
+// AMD Kernel Code: 64-byte-aligned kernel descriptor whose fields total 256 bytes with no implicit padding.
+#define AMD_ISA_ALIGN_BYTES 256  // NOTE(review): not used in this header; presumably the alignment required for ISA code - confirm.
+#define AMD_KERNEL_CODE_ALIGN_BYTES 64  // Required alignment of amd_kernel_code_t in bytes.
+#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES)  // __ALIGNED__ is defined outside this file (presumably amd_hsa_common.h).
+typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s {
+  amd_kernel_code_version32_t amd_kernel_code_version_major;  // See amd_kernel_code_version_t.
+  amd_kernel_code_version32_t amd_kernel_code_version_minor;  // See amd_kernel_code_version_t.
+  amd_machine_kind16_t amd_machine_kind;  // See amd_machine_kind_t.
+  amd_machine_version16_t amd_machine_version_major;
+  amd_machine_version16_t amd_machine_version_minor;
+  amd_machine_version16_t amd_machine_version_stepping;
+  int64_t kernel_code_entry_byte_offset;  // Signed byte offset; NOTE(review): presumably relative to the start of this struct - confirm.
+  int64_t kernel_code_prefetch_byte_offset;  // Signed byte offset; same caveat as kernel_code_entry_byte_offset.
+  uint64_t kernel_code_prefetch_byte_size;
+  uint64_t max_scratch_backing_memory_byte_size;
+  amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1;  // Bit fields described by amd_compute_pgm_rsrc_one_t.
+  amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2;  // Bit fields described by amd_compute_pgm_rsrc_two_t.
+  amd_kernel_code_properties32_t kernel_code_properties;  // Bit fields described by amd_kernel_code_properties_t.
+  uint32_t workitem_private_segment_byte_size;
+  uint32_t workgroup_group_segment_byte_size;
+  uint32_t gds_segment_byte_size;
+  uint64_t kernarg_segment_byte_size;
+  uint32_t workgroup_fbarrier_count;
+  uint16_t wavefront_sgpr_count;
+  uint16_t workitem_vgpr_count;
+  uint16_t reserved_vgpr_first;
+  uint16_t reserved_vgpr_count;
+  uint16_t reserved_sgpr_first;
+  uint16_t reserved_sgpr_count;
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+  uint16_t debug_private_segment_buffer_sgpr;
+  amd_powertwo8_t kernarg_segment_alignment;  // log2 encoding, see amd_powertwo_t.
+  amd_powertwo8_t group_segment_alignment;  // log2 encoding, see amd_powertwo_t.
+  amd_powertwo8_t private_segment_alignment;  // log2 encoding, see amd_powertwo_t.
+  amd_powertwo8_t wavefront_size;  // log2 encoding, see amd_powertwo_t.
+  int32_t call_convention;
+  uint8_t reserved1[12];  // Pads so that runtime_loader_kernel_symbol starts at offset 120.
+  uint64_t runtime_loader_kernel_symbol;
+  amd_control_directives_t control_directives;  // Occupies the second 128 bytes (offsets 128..255).
+} amd_kernel_code_t;
+
+// TODO: this struct should be removed completely once the debugger designs and
+// implements its Debugger APIs.
+typedef struct amd_runtime_loader_debug_info_s {  // Loader-side debug record for one kernel: raw ELF image, its size, kernel name and owning segment.
+  const void* elf_raw;  // Pointer to the raw ELF image.
+  size_t elf_size;  // Size of the ELF image; presumably in bytes (TODO confirm).
+  const char *kernel_name;  // Kernel name; presumably NUL-terminated (TODO confirm).
+  const void *owning_segment;  // Opaque pointer to the owning segment; type not visible here.
+} amd_runtime_loader_debug_info_t;
+
+#endif // AMD_HSA_KERNEL_CODE_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h
new file mode 100644
index 0000000000..9f16f9b2e5
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h
@@ -0,0 +1,154 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_QUEUE_H
+#define AMD_HSA_QUEUE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Queue Properties: bit-field layout of a 32-bit word.
+typedef uint32_t amd_queue_properties32_t;  // Storage type of the queue_properties field of amd_queue_t / amd_queue_v2_t.
+enum amd_queue_properties_t {  // Macro arguments appear to be (name, first bit, bit width); the entries are contiguous and cover bits 0..31. The macro is defined outside this file - TODO confirm which enumerators it generates.
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27)
+};
+
+// AMD Queue: alignment shared by amd_queue_t and amd_queue_v2_t.
+#define AMD_QUEUE_ALIGN_BYTES 64  // Required alignment in bytes.
+#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES)  // __ALIGNED__ is defined outside this file (presumably amd_hsa_common.h).
+
+// AMD Queue Capabilities: bit-field layout of a 32-bit word; only bits 0 and 1 are assigned here.
+typedef uint32_t amd_queue_capabilities32_t;  // NOTE(review): the caps fields of amd_queue_t / amd_queue_v2_t are declared as plain uint32_t, not this typedef.
+enum amd_queue_capabilities_t {
+  /* This version of CP FW supports dual-scratch and async-reclaim. */
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_CP_ASYNC_RECLAIM, 0, 1),
+
+  /*
+   * This version of ROCr supports async-reclaim, and CP FW may access the
+   * V2 fields of amd_queue_v2_t.
+   */
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM, 1, 1),
+};
+
+/* This is the original amd_queue_t structure. The definition is kept only
+ * for reference; new code should use amd_queue_v2_t instead. */
+typedef struct AMD_QUEUE_ALIGN amd_queue_s {
+  hsa_queue_t hsa_queue;  // Embedded public queue descriptor; must stay the first member.
+  uint32_t caps;  // Bit fields described by amd_queue_capabilities_t.
+  uint32_t reserved1[3];
+  volatile uint64_t write_dispatch_id;
+  uint32_t group_segment_aperture_base_hi;
+  uint32_t private_segment_aperture_base_hi;
+  uint32_t max_cu_id;
+  uint32_t max_wave_id;
+  volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
+  volatile uint32_t legacy_doorbell_lock;
+  uint32_t reserved2[9];
+  volatile uint64_t read_dispatch_id;
+  uint32_t read_dispatch_id_field_base_byte_offset;
+  uint32_t compute_tmpring_size;
+  uint32_t scratch_resource_descriptor[4];
+  uint64_t scratch_backing_memory_location;
+  uint32_t reserved3[2];  // amd_queue_v2_t stores scratch_backing_memory_byte_size in these 8 bytes.
+  uint32_t scratch_wave64_lane_byte_size;
+  amd_queue_properties32_t queue_properties;  // Bit fields described by amd_queue_properties_t.
+  uint32_t reserved4[2];  // amd_queue_v2_t stores scratch_max_use_index in these 8 bytes.
+  hsa_signal_t queue_inactive_signal;
+  uint32_t reserved5[14];  // amd_queue_v2_t stores its alt_* V2 fields in these 56 bytes.
+} amd_queue_t;
+
+/*
+ * AMD_QUEUE Version 2
+ * amd_queue_v2_t is backwards compatible with the amd_queue_t structure and can
+ * be used with previous versions of CP FW. The added fields tagged as V2 are
+ * ignored when running previous versions of CP FW.
+ * CP FW will not try to access elements beyond the original layout
+ * (sizeof(amd_queue_t)) unless the AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM bit is set.
+ */
+
+#define MAX_NUM_XCC 128  // Number of entries in amd_queue_v2_t::scratch_last_used_index.
+typedef struct scratch_last_used_index_xcc_s {  // One pair of scratch indices; amd_queue_v2_t holds MAX_NUM_XCC of these.
+  volatile uint64_t main;  // Index for the main scratch; exact semantics not visible here.
+  volatile uint64_t alt;  // Index for the alternate scratch; exact semantics not visible here.
+} scratch_last_used_index_xcc_t;
+
+typedef struct AMD_QUEUE_ALIGN amd_queue_v2_s {  // Version 2 queue: same field offsets as amd_queue_t, with its reserved3/4/5 areas reused and a per-XCC array appended.
+  hsa_queue_t hsa_queue;  // Embedded public queue descriptor; must stay the first member.
+  uint32_t caps;  // Bit fields described by amd_queue_capabilities_t.
+  uint32_t reserved1[3];
+  volatile uint64_t write_dispatch_id;
+  uint32_t group_segment_aperture_base_hi;
+  uint32_t private_segment_aperture_base_hi;
+  uint32_t max_cu_id;
+  uint32_t max_wave_id;
+  volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
+  volatile uint32_t legacy_doorbell_lock;
+  uint32_t reserved2[9];
+  volatile uint64_t read_dispatch_id;
+  uint32_t read_dispatch_id_field_base_byte_offset;
+  uint32_t compute_tmpring_size;
+  uint32_t scratch_resource_descriptor[4];
+  uint64_t scratch_backing_memory_location;
+  uint64_t scratch_backing_memory_byte_size;  // Occupies the bytes of amd_queue_t::reserved3.
+  uint32_t scratch_wave64_lane_byte_size;
+  amd_queue_properties32_t queue_properties;  // Bit fields described by amd_queue_properties_t.
+  volatile uint64_t scratch_max_use_index;       /* V2; occupies the bytes of amd_queue_t::reserved4 */
+  hsa_signal_t queue_inactive_signal;
+  volatile uint64_t alt_scratch_max_use_index;  /* V2; first of the fields that reuse amd_queue_t::reserved5 */
+  uint32_t alt_scratch_resource_descriptor[4];   /* V2 */
+  uint64_t alt_scratch_backing_memory_location;  /* V2 */
+  uint32_t alt_scratch_dispatch_limit_x;         /* V2 */
+  uint32_t alt_scratch_dispatch_limit_y;         /* V2 */
+  uint32_t alt_scratch_dispatch_limit_z;         /* V2 */
+  uint32_t alt_scratch_wave64_lane_byte_size;    /* V2 */
+  uint32_t alt_compute_tmpring_size;             /* V2 */
+  uint32_t reserved5;  // Last 4 bytes of the area that amd_queue_t::reserved5 covers.
+
+  scratch_last_used_index_xcc_t scratch_last_used_index[MAX_NUM_XCC];  // Appended after the extent of amd_queue_t; MAX_NUM_XCC entries of 16 bytes each.
+} amd_queue_v2_t;
+
+#endif // AMD_HSA_QUEUE_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h
new file mode 100644
index 0000000000..fa797599a0
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h
@@ -0,0 +1,79 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_SIGNAL_H
+#define AMD_HSA_SIGNAL_H
+
+#include "amd_hsa_common.h"
+#include "amd_hsa_queue.h"
+
+// AMD Signal Kind Enumeration Values: user signals are positive, doorbell kinds are negative, 0 is invalid.
+typedef int64_t amd_signal_kind64_t;  // Signed storage type of amd_signal_t::kind.
+enum amd_signal_kind_t {
+  AMD_SIGNAL_KIND_INVALID = 0,
+  AMD_SIGNAL_KIND_USER = 1,
+  AMD_SIGNAL_KIND_DOORBELL = -1,
+  AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
+};
+
+// AMD Signal: 64-byte-aligned record whose fields total 64 bytes when pointers are 8 bytes wide.
+#define AMD_SIGNAL_ALIGN_BYTES 64  // Required alignment in bytes.
+#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES)  // __ALIGNED__ is defined outside this file (presumably amd_hsa_common.h).
+typedef struct AMD_SIGNAL_ALIGN amd_signal_s {
+  amd_signal_kind64_t kind;  // One of amd_signal_kind_t.
+  union {  // NOTE(review): presumably `kind` selects which member is active (value for user signals, doorbell pointer for doorbell kinds) - confirm.
+    volatile int64_t value;
+    volatile uint64_t* hardware_doorbell_ptr;
+  };
+  uint64_t event_mailbox_ptr;  // Stored as a 64-bit integer rather than a pointer type.
+  uint32_t event_id;
+  uint32_t reserved1;
+  uint64_t start_ts;  // Start timestamp; units not visible here.
+  uint64_t end_ts;  // End timestamp; units not visible here.
+  union {  // Pointer to the associated queue, overlaid on a 64-bit reserved word.
+    amd_queue_v2_t* queue_ptr;
+    uint64_t reserved2;
+  };
+  uint32_t reserved3[2];
+} amd_signal_t;
+
+#endif // AMD_HSA_SIGNAL_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h
new file mode 100644
index 0000000000..00753e992e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h
@@ -0,0 +1,5752 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_H_
+#define HSA_RUNTIME_INC_HSA_H_
+
+#include <stddef.h>   /* size_t */
+#include <stdint.h>   /* uintXX_t */
+
+#ifndef __cplusplus
+#include <stdbool.h>  /* bool */
+#endif /* __cplusplus */
+
+// Placeholder for calling convention and import/export macros; HSA_CALL, HSA_EXPORT_DECORATOR and HSA_API may each be predefined by the includer.
+#ifndef HSA_CALL
+#define HSA_CALL  /* default: no explicit calling convention */
+#endif
+
+#ifndef HSA_EXPORT_DECORATOR
+#ifdef __GNUC__
+#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default")))
+#else
+#define HSA_EXPORT_DECORATOR  /* no default export decoration for non-GNU compilers */
+#endif
+#endif
+#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL
+#define HSA_API_IMPORT HSA_CALL
+
+#if !defined(HSA_API) && defined(HSA_EXPORT)  /* building the runtime: export the API */
+#define HSA_API HSA_API_EXPORT
+#elif !defined(HSA_API)  /* consuming the runtime; a caller-supplied HSA_API is kept instead of being redefined */
+#define HSA_API HSA_API_IMPORT
+#endif
+
+// Detect and set large model builds: HSA_LARGE_MODEL ends up defined only when __LP64__ or _M_X64 is defined.
+#undef HSA_LARGE_MODEL  /* discard any caller-supplied definition so the result depends only on the check below */
+#if defined(__LP64__) || defined(_M_X64)
+#define HSA_LARGE_MODEL
+#endif
+
+// Try to detect CPU endianness unless the includer already defined LITTLEENDIAN_CPU or BIGENDIAN_CPU.
+#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU)
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)  /* compiler-provided byte order is preferred */
+#define LITTLEENDIAN_CPU
+#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define BIGENDIAN_CPU
+#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+      defined(_M_X64) || defined(__loongarch64) || defined(__riscv)  /* fallback: architectures treated as little-endian */
+#define LITTLEENDIAN_CPU
+#endif
+#endif
+
+#undef HSA_LITTLE_ENDIAN  /* discard any caller-supplied definition */
+#if defined(LITTLEENDIAN_CPU)
+#define HSA_LITTLE_ENDIAN
+#elif defined(BIGENDIAN_CPU)  /* big-endian: HSA_LITTLE_ENDIAN intentionally stays undefined */
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+#ifndef HSA_DEPRECATED  /* the includer may predefine HSA_DEPRECATED to get deprecation diagnostics */
+#define HSA_DEPRECATED  /* default: expands to nothing, so deprecated APIs compile without warnings */
+// Disabled alternative that would map HSA_DEPRECATED to a compiler attribute:
+//#ifdef __GNUC__
+//#define HSA_DEPRECATED __attribute__((deprecated))
+//#else
+//#define HSA_DEPRECATED __declspec(deprecated)
+//#endif
+#endif
+
+#define HSA_VERSION_1_0                              1
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+/** \addtogroup error-codes Error codes
+ *  @{
+ */
+
+/**
+ * @brief Status codes.
+ */
+typedef enum {  /* Success/info codes are 0x0-0x1 and error codes start at 0x1000; enumerators are not listed in strictly increasing numeric order. */
+  /**
+   * The function has been executed successfully.
+   */
+  HSA_STATUS_SUCCESS = 0x0,
+  /**
+   * A traversal over a list of elements has been interrupted by the
+   * application before completing.
+   */
+  HSA_STATUS_INFO_BREAK = 0x1,
+  /**
+   * A generic error has occurred.
+   */
+  HSA_STATUS_ERROR = 0x1000,
+  /**
+   * One of the actual arguments does not meet a precondition stated in the
+   * documentation of the corresponding formal argument.
+   */
+  HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001,
+  /**
+   * The requested queue creation is not valid.
+   */
+  HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002,
+  /**
+   * The requested allocation is not valid.
+   */
+  HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003,
+  /**
+   * The agent is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_AGENT = 0x1004,
+  /**
+   * The memory region is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_REGION = 0x1005,
+  /**
+   * The signal is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006,
+  /**
+   * The queue is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007,
+  /**
+   * The HSA runtime failed to allocate the necessary resources. This error
+   * may also occur when the HSA runtime needs to spawn threads or create
+   * internal OS-specific events.
+   */
+  HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008,
+  /**
+   * The AQL packet is malformed.
+   */
+  HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009,
+  /**
+   * An error has been detected while releasing a resource.
+   */
+  HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A,
+  /**
+   * An API other than ::hsa_init has been invoked while the reference count
+   * of the HSA runtime is 0.
+   */
+  HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+  /**
+   * The maximum reference count for the object has been reached.
+   */
+  HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C,
+  /**
+   * The arguments passed to a function are not compatible.
+   */
+  HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D,
+  /**
+   * The index is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_INDEX = 0x100E,
+  /**
+   * The instruction set architecture is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_ISA = 0x100F,
+  /**
+   * The instruction set architecture name is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017,
+  /**
+   * The code object is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
+  /**
+   * The executable is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011,
+  /**
+   * The executable is frozen.
+   */
+  HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012,
+  /**
+   * There is no symbol with the given name.
+   */
+  HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013,
+  /**
+   * The variable is already defined.
+   */
+  HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014,
+  /**
+   * The variable is undefined.
+   */
+  HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015,
+  /**
+   * An HSAIL operation resulted in a hardware exception.
+   */
+  HSA_STATUS_ERROR_EXCEPTION = 0x1016,
+  /**
+   * The code object symbol is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018,
+  /**
+   * The executable symbol is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019,
+  /**
+   * The file descriptor is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_FILE = 0x1020,
+  /**
+   * The code object reader is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021,
+  /**
+   * The cache is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_CACHE = 0x1022,
+  /**
+   * The wavefront is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023,
+  /**
+   * The signal group is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024,
+  /**
+   * The HSA runtime is not in the configuration state.
+   */
+  HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025,
+  /**
+   * The queue received an error that may require process termination.
+   */
+  HSA_STATUS_ERROR_FATAL = 0x1026
+} hsa_status_t;
+
+/**
+ * @brief Query additional information about a status code.
+ *
+ * @param[in] status Status code.
+ *
+ * @param[out] status_string Receives a pointer to a NUL-terminated string
+ * that describes the error status.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid
+ * status code, or @p status_string is NULL.
+ */
+hsa_status_t HSA_API hsa_status_string(
+    hsa_status_t status,
+    const char ** status_string);
+
+/** @} */
+
+/** \defgroup common Common Definitions
+ *  @{
+ */
+
+/**
+ * @brief Three-dimensional coordinate made of three unsigned 32-bit values.
+ */
+typedef struct hsa_dim3_s {
+  /**
+   * X dimension.
+   */
+   uint32_t x;
+
+  /**
+   * Y dimension.
+   */
+   uint32_t y;
+
+   /**
+    * Z dimension.
+    */
+   uint32_t z;
+} hsa_dim3_t;
+
+/**
+ * @brief Access permissions.
+ */
+typedef enum {
+  /**
+   * No access. Used to remove existing access.
+   */
+  HSA_ACCESS_PERMISSION_NONE = 0,
+  /**
+   * Read-only access.
+   */
+  HSA_ACCESS_PERMISSION_RO = 1,
+  /**
+   * Write-only access.
+   */
+  HSA_ACCESS_PERMISSION_WO = 2,
+  /**
+   * Read and write access.
+   */
+  HSA_ACCESS_PERMISSION_RW = 3
+} hsa_access_permission_t;
+
+/**
+ * @brief POSIX file descriptor.
+ */
+typedef int hsa_file_t;
+
+/** @} **/
+
+
+/** \defgroup initshutdown Initialization and Shut Down
+ *  @{
+ */
+
+/**
+ * @brief Initialize the HSA runtime.
+ *
+ * @details Initializes the HSA runtime if it is not already initialized, and
+ * increases the reference counter associated with the HSA runtime for the
+ * current process. Invocation of any HSA function other than ::hsa_init results
+ * in undefined behavior if the current HSA runtime reference counter is less
+ * than one.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference
+ * count reaches INT32_MAX.
+ */
+hsa_status_t HSA_API hsa_init(void);
+
+/**
+ * @brief Shut down the HSA runtime.
+ *
+ * @details Decreases the reference count of the HSA runtime instance. When the
+ * reference count reaches 0, the HSA runtime is no longer considered valid
+ * but the application might call ::hsa_init to initialize the HSA runtime
+ * again.
+ *
+ * Once the reference count of the HSA runtime reaches 0, all the resources
+ * associated with it (queues, signals, agent information, etc.) are
+ * considered invalid and any attempt to reference them in subsequent API calls
+ * results in undefined behavior. When the reference count reaches 0, the HSA
+ * runtime may release resources associated with it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_shut_down(void);
+
+/** @} **/
+
+/** \defgroup agentinfo System and Agent Information
+ *  @{
+ */
+
+/**
+ * @brief Endianness. A convention used to interpret the bytes making up a data
+ * word.
+ */
+typedef enum {
+    /**
+     * The least significant byte is stored in the smallest address.
+     */
+    HSA_ENDIANNESS_LITTLE = 0,
+    /**
+     * The most significant byte is stored in the smallest address.
+     */
+    HSA_ENDIANNESS_BIG = 1
+} hsa_endianness_t;
+
+/**
+ * @brief Machine model. A machine model determines the size of certain data
+ * types in HSA runtime and an agent.
+ */
+typedef enum {
+    /**
+     * Small machine model. Addresses use 32 bits.
+     */
+    HSA_MACHINE_MODEL_SMALL = 0,
+    /**
+     * Large machine model. Addresses use 64 bits.
+     */
+    HSA_MACHINE_MODEL_LARGE = 1
+} hsa_machine_model_t;
+
+/**
+ * @brief Profile. A profile indicates a particular level of feature
+ * support. For example, in the base profile the application must use the HSA
+ * runtime allocator to reserve shared virtual memory, while in the full profile
+ * any host pointer can be shared across all the agents.
+ */
+typedef enum {
+    /**
+     * Base profile.
+     */
+    HSA_PROFILE_BASE = 0,
+    /**
+     * Full profile.
+     */
+    HSA_PROFILE_FULL = 1
+} hsa_profile_t;
+
+/**
+ * @brief System attributes.
+ */
+typedef enum {
+  /**
+   * Major version of the HSA runtime specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
+  /**
+   * Minor version of the HSA runtime specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+  /**
+   * Current timestamp. The value of this attribute monotonically increases at a
+   * constant rate. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_TIMESTAMP = 2,
+  /**
+   * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+   * in the range 1-400MHz. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
+  /**
+   * Maximum duration of a signal wait operation. Expressed as a count based on
+   * the timestamp frequency. The type of this attribute is uint64_t.
+   */
+  HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4,
+  /**
+   * Endianness of the system. The type of this attribute is ::hsa_endianness_t.
+   */
+  HSA_SYSTEM_INFO_ENDIANNESS = 5,
+  /**
+   * Machine model supported by the HSA runtime. The type of this attribute is
+   * ::hsa_machine_model_t.
+   */
+  HSA_SYSTEM_INFO_MACHINE_MODEL = 6,
+  /**
+   * Bit-mask indicating which extensions are supported by the
+   * implementation. An extension with an ID of @p i is supported if the bit at
+   * position @p i is set. The type of this attribute is uint8_t[128].
+   */
+  HSA_SYSTEM_INFO_EXTENSIONS = 7,
+  /**
+   * String containing the ROCr build identifier.
+   */
+  HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
+  /**
+   * Returns true if hsa_amd_svm_* APIs are supported by the driver.  The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+  // TODO: Should this be per Agent?
+  /**
+   * Returns true if all Agents have access to system allocated memory (such as
+   * that allocated by mmap, malloc, or new) by default.
+   * If false then system allocated memory may only be made SVM accessible to
+   * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202,
+  /**
+   * Returns true if mwaitx is enabled on this system.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203,
+  /**
+   * Returns true if DMABUF APIs are supported by the driver.  The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204,
+  /**
+   * Returns true if Virtual Memory APIs are supported by the driver.  The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED = 0x205,
+  /**
+   * Returns true if XNACK is enabled on this system.  The type of
+   * this attribute is bool.
+   */
+  HSA_AMD_SYSTEM_INFO_XNACK_ENABLED = 0x206,
+  /**
+   * Major version of the HSA runtime extension specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_AMD_SYSTEM_INFO_EXT_VERSION_MAJOR = 0x207,
+  /**
+   * Minor version of the HSA runtime extension specification supported by the
+   * implementation. The type of this attribute is uint16_t.
+   */
+  HSA_AMD_SYSTEM_INFO_EXT_VERSION_MINOR = 0x208,
+} hsa_system_info_t;
+
+/**
+ * @brief Get the current value of a system attribute.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * system attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_info(
+    hsa_system_info_t attribute,
+    void* value);
+
+/**
+ * @brief HSA extensions.
+ */
+typedef enum {
+  /**
+   * Finalizer extension.
+   */
+  HSA_EXTENSION_FINALIZER = 0,
+  /**
+   * Images extension.
+   */
+  HSA_EXTENSION_IMAGES = 1,
+
+  /**
+   * Performance counter extension.
+   */
+  HSA_EXTENSION_PERFORMANCE_COUNTERS = 2,
+
+  /**
+   * Profiling events extension.
+   */
+  HSA_EXTENSION_PROFILING_EVENTS = 3,
+  /**
+   * Last standard extension number (not a count of extensions).
+   */
+  HSA_EXTENSION_STD_LAST = 3,
+  /**
+   * First AMD extension number.
+   */
+  HSA_AMD_FIRST_EXTENSION = 0x200,
+  /**
+   * Profiler extension.
+   */
+  HSA_EXTENSION_AMD_PROFILER = 0x200,
+  /**
+   * Loader extension.
+   */
+  HSA_EXTENSION_AMD_LOADER = 0x201,
+  /**
+   * AqlProfile extension.
+   */
+  HSA_EXTENSION_AMD_AQLPROFILE = 0x202,
+  /**
+   * PC Sampling extension.
+   */
+  HSA_EXTENSION_AMD_PC_SAMPLING = 0x203,
+  /**
+   * Last AMD extension number.
+   */
+  HSA_AMD_LAST_EXTENSION = 0x203
+} hsa_extension_t;
+
+/**
+ * @brief Query the name of a given extension.
+ *
+ * @param[in] extension Extension identifier. If the extension is not supported
+ * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior
+ * is undefined.
+ *
+ * @param[out] name Pointer to a memory location where the HSA runtime stores
+ * the extension name. The extension name is a NUL-terminated string.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p name is NULL.
+ */
+hsa_status_t HSA_API hsa_extension_get_name(
+    uint16_t extension,
+    const char **name);
+
+/**
+ * @deprecated
+ *
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t version_minor,
+    bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation. All minor versions from 0 up to the returned @p version_minor
+ * must be supported by the implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_system_major_extension_supported(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t *version_minor,
+    bool* result);
+
+
+/**
+ * @deprecated
+ *
+ * @brief Retrieve the function pointers corresponding to a given version of an
+ * extension. Portable applications are expected to invoke the extension API
+ * using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given version
+ * of the extension is supported by the HSA implementation (see
+ * ::hsa_system_extension_supported). If the given combination of extension,
+ * major version, and minor version is not supported by the implementation, the
+ * behavior is undefined.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] version_minor Minor version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table(
+    uint16_t extension,
+    uint16_t version_major,
+    uint16_t version_minor,
+    void *table);
+
+/**
+ * @brief Retrieve the function pointers corresponding to a given major version
+ * of an extension. Portable applications are expected to invoke the extension
+ * API using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given major
+ * version of the extension is supported by the HSA implementation (see
+ * ::hsa_system_major_extension_supported). If the given combination of extension
+ * and major version is not supported by the implementation, the behavior is
+ * undefined. Additionally if the length doesn't allow space for a full minor
+ * version, it is implementation defined if only some of the function pointers for
+ * that minor version get written.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] table_length Size in bytes of the function pointer table to be
+ * populated. The implementation will not write more than this many bytes to the
+ * table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_major_extension_table(
+    uint16_t extension,
+    uint16_t version_major,
+    size_t table_length,
+    void *table);
+
+/**
+ * @brief Struct containing an opaque handle to an agent, a device that participates in
+ * the HSA memory model. An agent can submit AQL packets for execution, and
+ * may also accept AQL packets for execution (agent dispatch packets or kernel
+ * dispatch packets launching HSAIL-derived binaries).
+ */
+typedef struct hsa_agent_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_agent_t;
+
+/**
+ * @brief Agent features.
+ */
+typedef enum {
+    /**
+     * The agent supports AQL packets of kernel dispatch type. If this
+     * feature is enabled, the agent is also a kernel agent.
+     */
+    HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
+    /**
+     * The agent supports AQL packets of agent dispatch type.
+     */
+    HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
+} hsa_agent_feature_t;
+
+/**
+ * @brief Hardware device type.
+ */
+typedef enum {
+  /**
+   * CPU device.
+   */
+  HSA_DEVICE_TYPE_CPU = 0,
+  /**
+   * GPU device.
+   */
+  HSA_DEVICE_TYPE_GPU = 1,
+  /**
+   * DSP device.
+   */
+  HSA_DEVICE_TYPE_DSP = 2,
+  /**
+   * AI Engine (AIE) device.
+   */
+  HSA_DEVICE_TYPE_AIE = 3
+} hsa_device_type_t;
+
+/**
+ * @brief Default floating-point rounding mode.
+ */
+typedef enum {
+  /**
+   * Use a default floating-point rounding mode specified elsewhere.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0,
+  /**
+   * Operations that specify the default floating-point mode are rounded to zero
+   * by default.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1,
+  /**
+   * Operations that specify the default floating-point mode are rounded to the
+   * nearest representable number, with ties broken by selecting the value
+   * with an even least significant bit.
+   */
+  HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2
+} hsa_default_float_rounding_mode_t;
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum {
+  /**
+   * Agent name. The type of this attribute is a NUL-terminated char[64]. The
+   * name must be at most 63 characters long (not including the NUL terminator)
+   * and all array elements not used for the name must be NUL.
+   */
+  HSA_AGENT_INFO_NAME = 0,
+  /**
+   * Name of vendor. The type of this attribute is a NUL-terminated char[64].
+   * The name must be at most 63 characters long (not including the NUL
+   * terminator) and all array elements not used for the name must be NUL.
+   */
+  HSA_AGENT_INFO_VENDOR_NAME = 1,
+  /**
+   * Agent capability. The type of this attribute is ::hsa_agent_feature_t.
+   */
+  HSA_AGENT_INFO_FEATURE = 2,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Machine model supported by the agent. The type of this attribute is
+   * ::hsa_machine_model_t.
+   */
+  HSA_AGENT_INFO_MACHINE_MODEL = 3,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Profile supported by the agent. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_AGENT_INFO_PROFILE = 4,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given
+   * instruction set architecture supported by the agent instead.  If more than
+   * one ISA is supported by the agent, the returned value corresponds to the
+   * first ISA enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Default floating-point rounding mode. The type of this attribute is
+   * ::hsa_default_float_rounding_mode_t, but the value
+   * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
+   */
+  HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES
+   * for a given instruction set architecture supported by the agent instead.  If
+   * more than one ISA is supported by the agent, the returned value corresponds
+   * to the first ISA enumerated by ::hsa_agent_iterate_isas.
+   *
+   * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the
+   * default floating-point rounding modes supported by the agent in the Base
+   * profile. The type of this attribute is uint32_t. The default floating-point
+   * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not
+   * be set.
+   */
+  HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Flag indicating that the f16 HSAIL operation is at least as fast as the
+   * f32 operation in the current agent. The value of this attribute is
+   * undefined if the agent is not a kernel agent. The type of this
+   * attribute is bool.
+   */
+  HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
+  /**
+   * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and
+   * instruction set architecture supported by the agent instead.  If more than
+   * one ISA is supported by the agent, the returned value corresponds to the
+   * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront
+   * enumerated by ::hsa_isa_iterate_wavefronts for that ISA.
+   *
+   * Number of work-items in a wavefront. Must be a power of 2 in the range
+   * [1,256]. The value of this attribute is undefined if the agent is not
+   * a kernel agent. The type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum number of work-items of each dimension of a work-group.  Each
+   * maximum must be greater than 0. No maximum can exceed the value of
+   * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is
+   * undefined if the agent is not a kernel agent. The type of this
+   * attribute is uint16_t[3].
+   */
+  HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum total number of work-items in a work-group. The value of this
+   * attribute is undefined if the agent is not a kernel agent. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set
+   * architecture supported by the agent instead.
+   *
+   * Maximum number of work-items of each dimension of a grid. Each maximum must
+   * be greater than 0, and must not be smaller than the corresponding value in
+   * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+   * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined
+   * if the agent is not a kernel agent. The type of this attribute is
+   * ::hsa_dim3_t.
+   */
+  HSA_AGENT_INFO_GRID_MAX_DIM = 9,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set
+   * architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum total number of work-items in a grid. The value of this attribute
+   * is undefined if the agent is not a kernel agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
+  /**
+   * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction
+   * set architecture supported by the agent instead.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Maximum number of fbarriers per work-group. Must be at least 32. The value
+   * of this attribute is undefined if the agent is not a kernel agent. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
+  /**
+   * @deprecated The maximum number of queues is not statically determined.
+   *
+   * Maximum number of queues that can be active (created but not destroyed) at
+   * one time in the agent. The type of this attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUES_MAX = 12,
+  /**
+   * Minimum number of packets that a queue created in the agent
+   * can hold. Must be a power of 2 greater than 0. Must not exceed
+   * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
+  /**
+   * Maximum number of packets that a queue created in the agent can
+   * hold. Must be a power of 2 greater than 0. The type of this attribute
+   * is uint32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
+  /**
+   * Type of a queue created in the agent. The type of this attribute is
+   * ::hsa_queue_type32_t.
+   */
+  HSA_AGENT_INFO_QUEUE_TYPE = 15,
+  /**
+   * @deprecated NUMA information is not exposed anywhere else in the API.
+   *
+   * Identifier of the NUMA node associated with the agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AGENT_INFO_NODE = 16,
+  /**
+   * Type of hardware device associated with the agent. The type of this
+   * attribute is ::hsa_device_type_t.
+   */
+  HSA_AGENT_INFO_DEVICE = 17,
+  /**
+   * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about
+   * the caches present in a given agent.
+   *
+   * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size
+   * of 0 for a particular level indicates that there is no cache information
+   * for that level. The type of this attribute is uint32_t[4].
+   */
+  HSA_AGENT_INFO_CACHE_SIZE = 18,
+  /**
+   * @deprecated An agent may support multiple instruction set
+   * architectures. See ::hsa_agent_iterate_isas.  If more than one ISA is
+   * supported by the agent, the returned value corresponds to the first ISA
+   * enumerated by ::hsa_agent_iterate_isas.
+   *
+   * Instruction set architecture of the agent. The type of this attribute
+   * is ::hsa_isa_t.
+   */
+  HSA_AGENT_INFO_ISA = 19,
+  /**
+   * Bit-mask indicating which extensions are supported by the agent. An
+   * extension with an ID of @p i is supported if the bit at position @p i is
+   * set. The type of this attribute is uint8_t[128].
+   */
+  HSA_AGENT_INFO_EXTENSIONS = 20,
+  /**
+   * Major version of the HSA runtime specification supported by the
+   * agent. The type of this attribute is uint16_t.
+   */
+  HSA_AGENT_INFO_VERSION_MAJOR = 21,
+  /**
+   * Minor version of the HSA runtime specification supported by the
+   * agent. The type of this attribute is uint16_t.
+   */
+  HSA_AGENT_INFO_VERSION_MINOR = 22,
+  /**
+   * This enum does not have a fixed underlying type, thus in C++ post D2338:
+   * If the enumeration type does not have a fixed underlying type, the value is
+   * unchanged if the original value is within the range of the enumeration
+   * values (9.7.1 [dcl.enum]), and otherwise, the behavior is
+   * undefined.
+   * Thus increase the range of this enum to encompass vendor extensions.
+   */
+  HSA_AGENT_INFO_LAST = INT32_MAX
+} hsa_agent_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * agent attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_get_info(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the available agents, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] callback Callback to be invoked once per agent. The HSA
+ * runtime passes two arguments to the callback: the agent and the
+ * application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_iterate_agents returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+*/
+hsa_status_t HSA_API hsa_iterate_agents(
+    hsa_status_t (*callback)(hsa_agent_t agent, void* data),
+    void* data);
+
+/*
+
+// If we do not know the size of an attribute, we need to query it first
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_get_info_size(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    size_t* size);
+
+// Set the value of an agents attribute
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_set_info(
+    hsa_agent_t agent,
+    hsa_agent_info_t attribute,
+    void* value);
+
+*/
+
+/**
+ * @brief Exception policies applied in the presence of hardware exceptions.
+ */
+typedef enum {
+    /**
+     * If a hardware exception is detected, a work-item signals an exception.
+     */
+    HSA_EXCEPTION_POLICY_BREAK = 1,
+    /**
+     * If a hardware exception is detected, a hardware status bit is set.
+     */
+    HSA_EXCEPTION_POLICY_DETECT = 2
+} hsa_exception_policy_t;
+
+/**
+ * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, this function uses the first value returned by
+ * ::hsa_agent_iterate_isas.
+ *
+ * @brief Retrieve the exception policy support for a given combination of
+ * agent and profile.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] profile Profile.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ *
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies(
+    hsa_agent_t agent,
+    hsa_profile_t profile,
+    uint16_t *mask);
+
+/**
+ * @brief Cache handle.
+ */
+typedef struct hsa_cache_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_cache_t;
+
+/**
+ * @brief Cache attributes.
+ */
+typedef enum {
+  /**
+   * The length of the cache name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_CACHE_INFO_NAME_LENGTH = 0,
+  /**
+   * Human-readable description.  The type of this attribute is a NUL-terminated
+   * character array with the length equal to the value of
+   * ::HSA_CACHE_INFO_NAME_LENGTH attribute.
+   */
+  HSA_CACHE_INFO_NAME = 1,
+  /**
+   * Cache level. An L1 cache must return a value of 1, an L2 must return a value
+   * of 2, and so on.  The type of this attribute is uint8_t.
+   */
+  HSA_CACHE_INFO_LEVEL = 2,
+  /**
+   * Cache size, in bytes. A value of 0 indicates that there is no size
+   * information available. The type of this attribute is uint32_t.
+   */
+  HSA_CACHE_INFO_SIZE = 3
+} hsa_cache_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given cache object.
+ *
+ * @param[in] cache Cache.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * cache attribute (see ::hsa_cache_info_t), or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_cache_get_info(
+    hsa_cache_t cache,
+    hsa_cache_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the memory caches of a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details Caches are visited in ascending order according to the value of the
+ * ::HSA_CACHE_INFO_LEVEL attribute.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per cache that is present in
+ * the agent.  The HSA runtime passes two arguments to the callback: the cache
+ * and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * that value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_caches(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_cache_t cache, void* data),
+    void* data);
+
+/**
+ * @deprecated
+ *
+ * @brief Query if a given version of an extension is supported by an agent
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported(
+    uint16_t extension,
+    hsa_agent_t agent,
+    uint16_t version_major,
+    uint16_t version_minor,
+    bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by an agent. All
+ * minor versions from 0 up to the returned @p version_minor must be supported.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Pointer to the returned minor version. Must not be NULL.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_major_extension_supported(
+    uint16_t extension,
+    hsa_agent_t agent,
+    uint16_t version_major,
+    uint16_t *version_minor,
+    bool* result);
+
+
+/** @} */
+
+
+/** \defgroup signals Signals
+ *  @{
+ */
+
+/**
+ * @brief Signal handle.
+ */
+typedef struct hsa_signal_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal. The value 0 is reserved.
+   */
+  uint64_t handle;
+} hsa_signal_t;
+
+/**
+ * @brief Signal value. The value occupies 32 bits in small machine mode, and 64
+ * bits in large machine mode.
+ */
+#ifdef HSA_LARGE_MODEL
+  typedef int64_t hsa_signal_value_t;
+#else
+  typedef int32_t hsa_signal_value_t;
+#endif
+
+/**
+ * @brief Create a signal.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_signal_create(
+    hsa_signal_value_t initial_value,
+    uint32_t num_consumers,
+    const hsa_agent_t *consumers,
+    hsa_signal_t *signal);
+
+/**
+ * @brief Destroy a signal previously created by ::hsa_signal_create.
+ *
+ * @param[in] signal Signal.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0.
+ */
+hsa_status_t HSA_API hsa_signal_destroy(
+    hsa_signal_t signal);
+
+/**
+ * @brief Atomically read the current value of a signal.
+ *
+ * @param[in] signal Signal.
+ *
+ * @return Value of the signal.
+*/
+hsa_signal_value_t HSA_API hsa_signal_load_scacquire(
+    hsa_signal_t signal);
+
+/**
+ * @copydoc hsa_signal_load_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_load_relaxed(
+    hsa_signal_t signal);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_load_scacquire.
+ *
+ * @copydoc hsa_signal_load_scacquire
+*/
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire(
+    hsa_signal_t signal);
+
+/**
+ * @brief Atomically set the value of a signal.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_store_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_store_relaxed
+ */
+void HSA_API hsa_signal_store_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_store_screlease.
+ *
+ * @copydoc hsa_signal_store_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_store_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal without necessarily notifying the
+ * agents waiting on it.
+ *
+ * @details The agents waiting on @p signal may not wake up even when the new
+ * value satisfies their wait condition. If the application wants to update the
+ * signal and there is no need to notify any agent, invoking this function can
+ * be more efficient than calling the non-silent counterpart.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_silent_store_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_silent_store_relaxed
+ */
+void HSA_API hsa_signal_silent_store_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal and return its previous value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value New value.
+ *
+ * @return Value of the signal prior to the exchange.
+ *
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl.
+ *
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacquire.
+ *
+ * @copydoc hsa_signal_exchange_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_screlease.
+ *
+ * @copydoc hsa_signal_exchange_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal if the observed value is equal to
+ * the expected value. The observed value is returned regardless of whether the
+ * replacement was done.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue
+ * doorbell signal, the behavior is undefined.
+ *
+ * @param[in] expected Value to compare with.
+ *
+ * @param[in] value New value.
+ *
+ * @return Observed value of the signal.
+ *
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacq_screl.
+ *
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacquire.
+ *
+ * @copydoc hsa_signal_cas_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_screlease.
+ *
+ * @copydoc hsa_signal_cas_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t expected,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically increment the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to add to the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_add_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacq_screl.
+ *
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacquire.
+ *
+ * @copydoc hsa_signal_add_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_screlease.
+ *
+ * @copydoc hsa_signal_add_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically decrement the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to subtract from the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_subtract_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl.
+ *
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacquire.
+ *
+ * @copydoc hsa_signal_subtract_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_screlease.
+ *
+ * @copydoc hsa_signal_subtract_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise AND operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to AND with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_and_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacq_screl.
+ *
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacquire.
+ *
+ * @copydoc hsa_signal_and_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_screlease.
+ *
+ * @copydoc hsa_signal_and_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise OR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to OR with the value of the signal.
+ */
+void HSA_API hsa_signal_or_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacq_screl.
+ *
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacquire.
+ *
+ * @copydoc hsa_signal_or_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_screlease.
+ *
+ * @copydoc hsa_signal_or_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise XOR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to XOR with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_xor_scacq_screl(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacq_screl.
+ *
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacquire.
+ *
+ * @copydoc hsa_signal_xor_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_screlease(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_screlease.
+ *
+ * @copydoc hsa_signal_xor_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_release(
+    hsa_signal_t signal,
+    hsa_signal_value_t value);
+
+/**
+ * @brief Wait condition operator.
+ */
+typedef enum {
+    /**
+     * The two operands are equal.
+     */
+    HSA_SIGNAL_CONDITION_EQ = 0,
+    /**
+     * The two operands are not equal.
+     */
+    HSA_SIGNAL_CONDITION_NE = 1,
+    /**
+     * The first operand is less than the second operand.
+     */
+    HSA_SIGNAL_CONDITION_LT = 2,
+    /**
+     * The first operand is greater than or equal to the second operand.
+     */
+    HSA_SIGNAL_CONDITION_GTE = 3
+} hsa_signal_condition_t;
+
+/**
+ * @brief State of the application thread during a signal wait.
+ */
+typedef enum {
+    /**
+     * The application thread may be rescheduled while waiting on the signal.
+     */
+    HSA_WAIT_STATE_BLOCKED = 0,
+    /**
+     * The application thread stays active while waiting on a signal.
+     */
+    HSA_WAIT_STATE_ACTIVE = 1
+} hsa_wait_state_t;
+
+
+/**
+ * @brief Wait until a signal value satisfies a specified condition, or a
+ * certain amount of time has elapsed.
+ *
+ * @details A wait operation can spuriously resume at any time sooner than the
+ * timeout (for example, due to system or other external factors) even when the
+ * condition has not been met.
+ *
+ * The function is guaranteed to return if the signal value satisfies the
+ * condition at some point in time during the wait, but the value returned to
+ * the application might not satisfy the condition. The application must ensure
+ * that signals are used in such a way that wait wakeup conditions are not
+ * invalidated before dependent threads have woken up.
+ *
+ * When the wait operation internally loads the value of the passed signal, it
+ * uses the memory order indicated in the function name.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] condition Condition used to compare the signal value with @p
+ * compare_value.
+ *
+ * @param[in] compare_value Value to compare with.
+ *
+ * @param[in] timeout_hint Maximum duration of the wait.  Specified in the same
+ * unit as the system timestamp. The operation might block for a shorter or
+ * longer time even if the condition is not met. A value of UINT64_MAX indicates
+ * no maximum.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is ultimately decided by
+ * HSA runtime and may not match the provided hint. A value of
+ * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal
+ * update by avoiding rescheduling overhead.
+ *
+ * @return Observed value of the signal, which might not satisfy the specified
+ * condition.
+ *
+*/
+hsa_signal_value_t HSA_API hsa_signal_wait_scacquire(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_wait_relaxed(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_wait_scacquire.
+ *
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire(
+    hsa_signal_t signal,
+    hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value,
+    uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+
+/**
+ * @brief Group of signals.
+ */
+typedef struct hsa_signal_group_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_signal_group_t;
+
+/**
+ * @brief Create a signal group.
+ *
+ * @param[in] num_signals Number of elements in @p signals. Must not be 0.
+ *
+ * @param[in] signals List of signals in the group. The list must not contain
+ * any repeated elements. Must not be NULL.
+ *
+ * @param[in] num_consumers Number of elements in @p consumers. Must not be 0.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the signal
+ * group. The list must not contain repeated elements, and must be a subset of
+ * the set of agents that are allowed to wait on all the signals in the
+ * group. If an agent not listed in @p consumers waits on the returned group,
+ * the behavior is undefined. The memory associated with @p consumers can be
+ * reused or freed after the function returns. Must not be NULL.
+ *
+ * @param[out] signal_group Pointer to newly created signal group. Must not be
+ * NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals
+ * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_create(
+    uint32_t num_signals,
+    const hsa_signal_t *signals,
+    uint32_t num_consumers,
+    const hsa_agent_t *consumers,
+    hsa_signal_group_t *signal_group);
+
+/**
+ * @brief Destroy a signal group previously created by ::hsa_signal_group_create.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ */
+hsa_status_t HSA_API hsa_signal_group_destroy(
+    hsa_signal_group_t signal_group);
+
+/**
+ * @brief Wait until the value of at least one of the signals in a signal group
+ * satisfies its associated condition.
+ *
+ * @details The function is guaranteed to return if the value of at least one of
+ * the signals in the group satisfies its associated condition at some point in
+ * time during the wait, but the signal value returned to the application may no
+ * longer satisfy the condition. The application must ensure that signals in the
+ * group are used in such a way that wait wakeup conditions are not invalidated
+ * before dependent threads have woken up.
+ *
+ * When this operation internally loads the value of the passed signal, it uses
+ * the memory order indicated in the function name.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @param[in] conditions List of conditions. Each condition, and the value at
+ * the same index in @p compare_values, is used to compare the value of the
+ * signal at that index in @p signal_group (the signal passed by the application
+ * to ::hsa_signal_group_create at that particular index). The size of @p
+ * conditions must not be smaller than the number of signals in @p signal_group;
+ * any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] compare_values List of comparison values.  The size of @p
+ * compare_values must not be smaller than the number of signals in @p
+ * signal_group; any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is decided by the HSA runtime
+ * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may
+ * improve the latency of response to a signal update by avoiding rescheduling
+ * overhead.
+ *
+ * @param[out] signal Signal in the group that satisfied the associated
+ * condition. If several signals satisfied their condition, the function can
+ * return any of those signals. Must not be NULL.
+ *
+ * @param[out] value Observed value for @p signal, which might no longer satisfy
+ * the specified condition. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p
+ * compare_values is NULL, @p signal is NULL, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire(
+    hsa_signal_group_t signal_group,
+    const hsa_signal_condition_t *conditions,
+    const hsa_signal_value_t *compare_values,
+    hsa_wait_state_t wait_state_hint,
+    hsa_signal_t *signal,
+    hsa_signal_value_t *value);
+
+/**
+ * @copydoc hsa_signal_group_wait_any_scacquire
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed(
+    hsa_signal_group_t signal_group,
+    const hsa_signal_condition_t *conditions,
+    const hsa_signal_value_t *compare_values,
+    hsa_wait_state_t wait_state_hint,
+    hsa_signal_t *signal,
+    hsa_signal_value_t *value);
+
+/** @} */
+
+/** \defgroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief A memory region represents a block of virtual memory with certain
+ * properties. For example, the HSA runtime represents fine-grained memory in
+ * the global segment using a region. A region might be associated with more
+ * than one agent.
+ */
+typedef struct hsa_region_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_region_t;
+
+/** @} */
+
+
+/** \defgroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Queue type. Intended to be used for dynamic queue protocol
+ * determination.
+ */
+typedef enum {
+  /**
+   * Queue supports multiple producers. Use of multiproducer queue mechanics is
+   * required.
+   */
+  HSA_QUEUE_TYPE_MULTI = 0,
+  /**
+   * Queue only supports a single producer. In some scenarios, the application
+   * may want to limit the submission of AQL packets to a single agent. Queues
+   * that support a single producer may be more efficient than queues supporting
+   * multiple producers. Use of multiproducer queue mechanics is not supported.
+   */
+  HSA_QUEUE_TYPE_SINGLE = 1,
+  /**
+   * Queue supports multiple producers and cooperative dispatches. Cooperative
+   * dispatches are able to use GWS synchronization. Queues of this type may be
+   * limited in number. The runtime may return the same queue to serve multiple
+   * ::hsa_queue_create calls when this type is given. Callers must inspect the
+   * returned queue to discover queue size. Queues of this type are reference
+   * counted and require a matching number of ::hsa_queue_destroy calls to
+   * release. Use of multiproducer queue mechanics is required. See
+   * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this
+   * type.
+   */
+  HSA_QUEUE_TYPE_COOPERATIVE = 2
+} hsa_queue_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_queue_type_t constants.
+ */
+typedef uint32_t hsa_queue_type32_t;
+
+/**
+ * @brief Queue features.
+ */
+typedef enum {
+  /**
+   * Queue supports kernel dispatch packets.
+   */
+  HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1,
+
+  /**
+   * Queue supports agent dispatch packets.
+   */
+  HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
+} hsa_queue_feature_t;
+
+/**
+ * @brief User mode queue.
+ *
+ * @details The queue structure is read-only and allocated by the HSA runtime,
+ * but agents can directly modify the contents of the buffer pointed to by @a
+ * base_address, or use HSA runtime APIs to access the doorbell signal.
+ *
+ */
+typedef struct hsa_queue_s {
+  /**
+   * Queue type.
+   */
+  hsa_queue_type32_t type;
+
+  /**
+   * Queue features mask. This is a bit-field of ::hsa_queue_feature_t
+   * values. Applications should ignore any unknown set bits.
+   */
+  uint32_t features;
+
+#ifdef HSA_LARGE_MODEL
+  void* base_address;
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Starting address of the HSA runtime-allocated buffer used to store the AQL
+   * packets. Must be aligned to the size of an AQL packet.
+   */
+  void* base_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+#else
+  uint32_t reserved0;
+  void* base_address;
+#endif
+
+  /**
+   * Signal object used by the application to indicate the ID of a packet that
+   * is ready to be processed. The HSA runtime manages the doorbell signal. If
+   * the application tries to replace or destroy this signal, the behavior is
+   * undefined.
+   *
+   * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be
+   * updated in a monotonically increasing fashion. If @a type is
+   * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any
+   * value.
+   */
+  hsa_signal_t doorbell_signal;
+
+  /**
+   * Maximum number of packets the queue can hold. Must be a power of 2.
+   */
+  uint32_t size;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+  /**
+   * Queue identifier, which is unique over the lifetime of the application.
+   */
+  uint64_t id;
+
+} hsa_queue_t;
+
+/**
+ * @brief Create a user mode queue.
+ *
+ * @details The HSA runtime creates the queue structure, the underlying packet
+ * buffer, the completion signal, and the write and read indexes. The initial
+ * value of the write and read indexes is 0. The type of every packet in the
+ * buffer is initialized to ::HSA_PACKET_TYPE_INVALID.
+ *
+ * The application should only rely on the error code returned to determine if
+ * the queue is valid.
+ *
+ * @param[in] agent Agent where to create the queue.
+ *
+ * @param[in] size Number of packets the queue is expected to
+ * hold. Must be a power of 2 between 1 and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly
+ * created queue is the maximum of @p size and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent.
+ *
+ * @param[in] type Type of the queue, one of the ::hsa_queue_type_t values.
+ * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE,
+ * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE.
+ *
+ * @param[in] callback Callback invoked by the HSA runtime for every
+ * asynchronous event related to the newly created queue. May be NULL. The HSA
+ * runtime passes three arguments to the callback: a code identifying the event
+ * that triggered the invocation, a pointer to the queue where the event
+ * originated, and the application data.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * invocation. May be NULL.
+ *
+ * @param[in] private_segment_size Hint indicating the maximum
+ * expected private segment usage per work-item, in bytes. There may
+ * be performance degradation if the application places a kernel
+ * dispatch packet in the queue and the corresponding private segment
+ * usage exceeds @p private_segment_size. If the application does not
+ * want to specify any particular value for this argument, @p
+ * private_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[in] group_segment_size Hint indicating the maximum expected
+ * group segment usage per work-group, in bytes. There may be
+ * performance degradation if the application places a kernel dispatch
+ * packet in the queue and the corresponding group segment usage
+ * exceeds @p group_segment_size. If the application does not want to
+ * specify any particular value for this argument, @p
+ * group_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not
+ * support queues of the given type.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two,
+ * @p size is 0, @p type is an invalid queue type, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_queue_create(
+    hsa_agent_t agent,
+    uint32_t size,
+    hsa_queue_type32_t type,
+    void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+    void *data,
+    uint32_t private_segment_size,
+    uint32_t group_segment_size,
+    hsa_queue_t **queue);
+
+/**
+ * @brief Create a queue for which the application or a kernel is responsible
+ * for processing the AQL packets.
+ *
+ * @details The application can use this function to create queues where AQL
+ * packets are not parsed by the packet processor associated with an agent,
+ * but rather by a unit of execution running on that agent (for example, a
+ * thread in the host application).
+ *
+ * The application is responsible for ensuring that all the producers and
+ * consumers of the resulting queue can access the provided doorbell signal
+ * and memory region. The application is also responsible for ensuring that the
+ * unit of execution processing the queue packets supports the indicated
+ * features (AQL packet types).
+ *
+ * When the queue is created, the HSA runtime allocates the packet buffer using
+ * @p region, and the write and read indexes. The initial value of the write and
+ * read indexes is 0, and the type of every packet in the buffer is initialized
+ * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features,
+ * and @e doorbell_signal fields in the returned queue match the values passed
+ * by the application.
+ *
+ * @param[in] region Memory region that the HSA runtime should use to allocate
+ * the AQL packet buffer and any other queue metadata.
+ *
+ * @param[in] size Number of packets the queue is expected to hold. Must be a
+ * power of 2 greater than 0.
+ *
+ * @param[in] type Queue type.
+ *
+ * @param[in] features Supported queue features. This is a bit-field of
+ * ::hsa_queue_feature_t values.
+ *
+ * @param[in] doorbell_signal Doorbell signal that the HSA runtime must
+ * associate with the returned queue. The signal handle must not be 0.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue. The application should not rely on the value
+ * returned for this argument but only on the status code to determine if the
+ * queue is valid. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p
+ * size is 0, @p type is an invalid queue type, the doorbell signal handle is
+ * 0, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_soft_queue_create(
+    hsa_region_t region,
+    uint32_t size,
+    hsa_queue_type32_t type,
+    uint32_t features,
+    hsa_signal_t doorbell_signal,
+    hsa_queue_t **queue);
+
+/**
+ * @brief Destroy a user mode queue.
+ *
+ * @details When a queue is destroyed, the state of the AQL packets that have
+ * not yet been fully processed (their completion phase has not finished)
+ * becomes undefined. It is the responsibility of the application to ensure that
+ * all pending queue operations are finished if their results are required.
+ *
+ * The resources allocated by the HSA runtime during queue creation (queue
+ * structure, ring buffer, doorbell signal) are released. The queue should not
+ * be accessed after being destroyed.
+ *
+ * @param[in] queue Pointer to a queue created using ::hsa_queue_create.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_destroy(
+    hsa_queue_t *queue);
+
+/**
+ * @brief Inactivate a queue.
+ *
+ * @details Inactivating the queue aborts any pending executions and prevents
+ * any new packets from being processed. Any more packets written to the queue
+ * once it is inactivated will be ignored by the packet processor.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_inactivate(
+    hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the read index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Read index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_read_index_scacquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_read_index_relaxed(
+    const hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the write index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Write index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_write_index_scacquire(
+    const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_write_index_relaxed(
+    const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically set the write index of a queue.
+ *
+ * @details It is recommended that the application uses this function to update
+ * the write index when there is a single agent submitting work to the queue
+ * (the queue type is ::HSA_QUEUE_TYPE_SINGLE).
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the write index.
+ *
+ */
+void HSA_API hsa_queue_store_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_write_index_screlease.
+ *
+ * @copydoc hsa_queue_store_write_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_write_index_relaxed
+ */
+void HSA_API hsa_queue_store_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @brief Atomically set the write index of a queue if the observed value is
+ * equal to the expected value. The application can inspect the returned value
+ * to determine if the replacement was done.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] expected Expected value.
+ *
+ * @param[in] value Value to assign to the write index if @p expected matches
+ * the observed write index. Must be greater than @p expected.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacquire(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease.
+ *
+ * @copydoc hsa_queue_cas_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t expected,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @brief Atomically increment the write index of a queue by an offset.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to add to the write index.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacq_screl(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_add_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacquire(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_screlease.
+ *
+ * @copydoc hsa_queue_add_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_screlease(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @brief Atomically set the read index of a queue.
+ *
+ * @details Modifications of the read index are not allowed and result in
+ * undefined behavior if the queue is associated with an agent for which
+ * only the corresponding packet processor is permitted to update the read
+ * index.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the read index.
+ *
+ */
+void HSA_API hsa_queue_store_read_index_relaxed(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_read_index_screlease.
+ *
+ * @copydoc hsa_queue_store_read_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release(
+    const hsa_queue_t *queue,
+    uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_read_index_relaxed
+ */
+void HSA_API hsa_queue_store_read_index_screlease(
+   const hsa_queue_t *queue,
+   uint64_t value);
+/** @} */
+
+
+/** \defgroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief Packet type.
+ */
+typedef enum {
+  /**
+   * Vendor-specific packet.
+   */
+  HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
+  /**
+   * The packet has been processed in the past, but has not been reassigned to
+   * the packet processor. A packet processor must not process a packet of this
+   * type. All queues support this packet type.
+   */
+  HSA_PACKET_TYPE_INVALID = 1,
+  /**
+   * Packet used by agents for dispatching jobs to kernel agents. Not all
+   * queues support packets of this type (see ::hsa_queue_feature_t).
+   */
+  HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
+  /**
+   * Packet used by agents to delay processing of subsequent packets, and to
+   * express complex dependencies between multiple packets. All queues support
+   * this packet type.
+   */
+  HSA_PACKET_TYPE_BARRIER_AND = 3,
+  /**
+   * Packet used by agents for dispatching jobs to agents.  Not all
+   * queues support packets of this type (see ::hsa_queue_feature_t).
+   */
+  HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
+  /**
+   * Packet used by agents to delay processing of subsequent packets, and to
+   * express complex dependencies between multiple packets. All queues support
+   * this packet type.
+   */
+  HSA_PACKET_TYPE_BARRIER_OR = 5
+} hsa_packet_type_t;
+
+/**
+ * @brief Scope of the memory fence operation associated with a packet.
+ */
+typedef enum {
+  /**
+   * No scope (no fence is applied). The packet relies on external fences to
+   * ensure visibility of memory updates.
+   */
+  HSA_FENCE_SCOPE_NONE = 0,
+  /**
+   * The fence is applied with agent scope for the global segment.
+   */
+  HSA_FENCE_SCOPE_AGENT = 1,
+  /**
+   * The fence is applied across both agent and system scope for the global
+   * segment.
+   */
+  HSA_FENCE_SCOPE_SYSTEM = 2
+} hsa_fence_scope_t;
+
+/**
+ * @brief Sub-fields of the @a header field that is present in any AQL
+ * packet. The offset (with respect to the address of @a header) of a sub-field
+ * is identical to its enumeration constant. The width of each sub-field is
+ * determined by the corresponding value in ::hsa_packet_header_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+  /**
+   * Packet type. The value of this sub-field must be one of
+   * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the
+   * packet layout is vendor-specific.
+   */
+   HSA_PACKET_HEADER_TYPE = 0,
+  /**
+   * Barrier bit. If the barrier bit is set, the processing of the current
+   * packet only launches when all preceding packets (within the same queue) are
+   * complete.
+   */
+   HSA_PACKET_HEADER_BARRIER = 8,
+  /**
+   * Acquire fence scope. The value of this sub-field determines the scope and
+   * type of the memory fence operation applied before the packet enters the
+   * active phase. An acquire fence ensures that any subsequent global segment
+   * or image loads by any unit of execution that belongs to a dispatch that has
+   * not yet entered the active phase on any queue of the same kernel agent,
+   * sees any data previously released at the scopes specified by the acquire
+   * fence. The value of this sub-field must be one of ::hsa_fence_scope_t.
+   */
+   HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
+   /**
+    * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
+  /**
+   * Release fence scope. The value of this sub-field determines the scope and
+   * type of the memory fence operation applied after kernel completion but
+   * before the packet is completed. A release fence makes any global segment or
+   * image data that was stored by any unit of execution that belonged to a
+   * dispatch that has completed the active phase on any queue of the same
+   * kernel agent visible in all the scopes specified by the release fence. The
+   * value of this sub-field must be one of ::hsa_fence_scope_t.
+   */
+   HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11,
+   /**
+    * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
+ } hsa_packet_header_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t.
+ */
+ typedef enum {
+   HSA_PACKET_HEADER_WIDTH_TYPE = 8,
+   HSA_PACKET_HEADER_WIDTH_BARRIER = 1,
+   HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2,
+   /**
+    * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2,
+   HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2,
+   /**
+    * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+    */
+   HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2
+ } hsa_packet_header_width_t;
+
+/**
+ * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset
+ * (with respect to the address of @a setup) of a sub-field is identical to its
+ * enumeration constant. The width of each sub-field is determined by the
+ * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+  /**
+   * Number of dimensions of the grid. Valid values are 1, 2, or 3.
+   *
+   */
+   HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
+ } hsa_kernel_dispatch_packet_setup_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in
+ * ::hsa_kernel_dispatch_packet_setup_t.
+ */
+ typedef enum {
+   HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2
+ } hsa_kernel_dispatch_packet_setup_width_t;
+
+/**
+ * @brief AQL kernel dispatch packet
+ */
+typedef struct hsa_kernel_dispatch_packet_s {
+  union {
+    struct {
+        /**
+         * Packet header. Used to configure multiple packet parameters such as the
+         * packet type. The parameters are described by ::hsa_packet_header_t.
+         */
+        uint16_t header;
+
+        /**
+         * Dispatch setup parameters. Used to configure kernel dispatch parameters
+         * such as the number of dimensions in the grid. The parameters are described
+         * by ::hsa_kernel_dispatch_packet_setup_t.
+         */
+        uint16_t setup;
+    };
+    uint32_t full_header;
+  };
+
+  /**
+   * X dimension of work-group, in work-items. Must be greater than 0.
+   */
+  uint16_t workgroup_size_x;
+
+  /**
+   * Y dimension of work-group, in work-items. Must be greater than
+   * 0. If the grid has 1 dimension, the only valid value is 1.
+   */
+  uint16_t workgroup_size_y;
+
+  /**
+   * Z dimension of work-group, in work-items. Must be greater than
+   * 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
+   */
+  uint16_t workgroup_size_z;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * X dimension of grid, in work-items. Must be greater than 0. Must
+   * not be smaller than @a workgroup_size_x.
+   */
+  uint32_t grid_size_x;
+
+  /**
+   * Y dimension of grid, in work-items. Must be greater than 0. If the grid has
+   * 1 dimension, the only valid value is 1. Must not be smaller than @a
+   * workgroup_size_y.
+   */
+  uint32_t grid_size_y;
+
+  /**
+   * Z dimension of grid, in work-items. Must be greater than 0. If the grid has
+   * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
+   * workgroup_size_z.
+   */
+  uint32_t grid_size_z;
+
+  /**
+   * Size in bytes of private memory allocation request (per work-item).
+   */
+  uint32_t private_segment_size;
+
+  /**
+   * Size in bytes of group memory allocation request (per work-group). Must not
+   * be less than the sum of the group memory used by the kernel (and the
+   * functions it calls directly or indirectly) and the dynamically allocated
+   * group segment variables.
+   */
+  uint32_t group_segment_size;
+
+  /**
+   * Opaque handle to a code object that includes an implementation-defined
+   * executable code for the kernel.
+   */
+  uint64_t kernel_object;
+
+#ifdef HSA_LARGE_MODEL
+  void* kernarg_address;
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Pointer to a buffer containing the kernel arguments. May be NULL.
+   *
+   * The buffer must be allocated using ::hsa_memory_allocate, and must not be
+   * modified once the kernel dispatch packet is enqueued until the dispatch has
+   * completed execution.
+   */
+  void* kernarg_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+#else
+  uint32_t reserved1;
+  void* kernarg_address;
+#endif
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_kernel_dispatch_packet_t;
+
+/**
+ * @brief Agent dispatch packet.
+ */
+typedef struct hsa_agent_dispatch_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Application-defined function to be performed by the destination agent.
+   */
+  uint16_t type;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+#ifdef HSA_LARGE_MODEL
+  void* return_address;
+#elif defined HSA_LITTLE_ENDIAN
+  /**
+   * Address where to store the function return values, if any.
+   */
+  void* return_address;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+#else
+  uint32_t reserved1;
+  void* return_address;
+#endif
+
+  /**
+   * Function arguments.
+   */
+  uint64_t arg[4];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_agent_dispatch_packet_t;
+
+/**
+ * @brief Barrier-AND packet.
+ */
+typedef struct hsa_barrier_and_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Array of dependent signal objects. Signals with a handle value of 0 are
+   * allowed and are interpreted by the packet processor as satisfied
+   * dependencies.
+   */
+  hsa_signal_t dep_signal[5];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_barrier_and_packet_t;
+
+/**
+ * @brief Barrier-OR packet.
+ */
+typedef struct hsa_barrier_or_packet_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint16_t reserved0;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Array of dependent signal objects. Signals with a handle value of 0 are
+   * allowed and are interpreted by the packet processor as dependencies not
+   * satisfied.
+   */
+  hsa_signal_t dep_signal[5];
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+
+} hsa_barrier_or_packet_t;
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Memory segments associated with a region.
+ */
+typedef enum {
+  /**
+   * Global segment. Used to hold data that is shared by all agents.
+   */
+  HSA_REGION_SEGMENT_GLOBAL = 0,
+  /**
+   * Read-only segment. Used to hold data that remains constant during the
+   * execution of a kernel.
+   */
+  HSA_REGION_SEGMENT_READONLY = 1,
+  /**
+   * Private segment. Used to hold data that is local to a single work-item.
+   */
+  HSA_REGION_SEGMENT_PRIVATE = 2,
+  /**
+   * Group segment. Used to hold data that is shared by the work-items of a
+   * work-group.
+   */
+  HSA_REGION_SEGMENT_GROUP = 3,
+  /**
+   * Kernarg segment. Used to store kernel arguments.
+   */
+  HSA_REGION_SEGMENT_KERNARG = 4
+} hsa_region_segment_t;
+
+/**
+ * @brief Global region flags.
+ */
+typedef enum {
+  /**
+   * The application can use memory in the region to store kernel arguments, and
+   * provide the values for the kernarg segment of a kernel dispatch. If this
+   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
+   */
+  HSA_REGION_GLOBAL_FLAG_KERNARG = 1,
+  /**
+   * Updates to memory in this region are immediately visible to all the
+   * agents under the terms of the HSA memory model. If this
+   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
+   */
+  HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2,
+  /**
+   * Updates to memory in this region can be performed by a single agent at
+   * a time. If a different agent in the system is allowed to access the
+   * region, the application must explicitly invoke ::hsa_memory_assign_agent
+   * in order to transfer ownership to that agent for a particular buffer.
+   */
+  HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4,
+
+  /**
+   * Updates to memory in this region have extended scope, where the device-scope atomics
+   * to this memory type act as system-scope with respect to all variables located in
+   * memory regions of this type.
+   * Note: On non-compliant systems, the application may still be responsible for performing
+   * device-specific actions necessary to achieve system-scope coherence.
+   */
+  HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8
+} hsa_region_global_flag_t;
+
+/**
+ * @brief Attributes of a memory region.
+ *
+ * The enumerator values are assigned explicitly and do not follow declaration
+ * order (value 3 is not assigned). When this header is compiled as C++, the
+ * underlying type of the enumeration is fixed to int.
+ */
+
+#ifdef __cplusplus
+typedef enum : int {
+#else
+typedef enum {
+#endif
+  /**
+   * Segment where memory in the region can be used. The type of this
+   * attribute is ::hsa_region_segment_t.
+   */
+  HSA_REGION_INFO_SEGMENT = 0,
+  /**
+   * Flag mask. The value of this attribute is undefined if the value of
+   * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
+   * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
+   * values.
+   */
+  HSA_REGION_INFO_GLOBAL_FLAGS = 1,
+  /**
+   * Size of this region, in bytes. The type of this attribute is size_t.
+   */
+  HSA_REGION_INFO_SIZE = 2,
+  /**
+   * Maximum allocation size in this region, in bytes. Must not exceed the value
+   * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
+   *
+   * If the region is in the global or readonly segments, this is the maximum
+   * size that the application can pass to ::hsa_memory_allocate.
+   *
+   * If the region is in the group segment, this is the maximum size (per
+   * work-group) that can be requested for a given kernel dispatch. If the
+   * region is in the private segment, this is the maximum size (per work-item)
+   * that can be requested for a specific kernel dispatch, and must be at least
+   * 256 bytes.
+   */
+  HSA_REGION_INFO_ALLOC_MAX_SIZE = 4,
+  /**
+   * Maximum size (per work-group) of private memory that can be requested for a
+   * specific kernel dispatch. Must be at least 65536 bytes. The type of this
+   * attribute is uint32_t. The value of this attribute is undefined if the
+   * region is not in the private segment.
+   */
+  HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8,
+  /**
+   * Indicates whether memory in this region can be allocated using
+   * ::hsa_memory_allocate. The type of this attribute is bool.
+   *
+   * The value of this flag is always false for regions in the group and private
+   * segments.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+  /**
+   * Allocation granularity of buffers allocated by ::hsa_memory_allocate in
+   * this region. The size of a buffer allocated in this region is a multiple of
+   * the value of this attribute. The value of this attribute is only defined if
+   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
+   * of this attribute is size_t.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6,
+  /**
+   * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
+   * value of this attribute is only defined if
+   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be
+   * a power of 2. The type of this attribute is size_t.
+   */
+  HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
+} hsa_region_info_t;
+
+/**
+ * @brief Get the current value of an attribute of a region.
+ *
+ * @param[in] region A valid region.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * region attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_region_get_info(
+    hsa_region_t region,
+    hsa_region_info_t attribute,
+    void* value);
+
+/**
+ * @brief Iterate over the memory regions associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per region that is
+ * accessible from the agent. The HSA runtime passes two arguments to the
+ * callback: the region and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and ::hsa_agent_iterate_regions returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_regions(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_region_t region, void* data),
+    void* data);
+
+/**
+ * @brief Allocate a block of memory in a given region.
+ *
+ * @param[in] region Region where to allocate memory from. The region must have
+ * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
+ * in @p region.
+ *
+ * @param[out] ptr Pointer to the location where to store the base address of
+ * the allocated block. The returned base address is aligned to the value of
+ * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation
+ * fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p region, or @p size is greater than the value of
+ * ::HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0.
+ */
+hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region,
+    size_t size,
+    void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_memory_allocate.
+ *
+ * @param[in] ptr Pointer to the memory block to deallocate. If @p ptr does not
+ * match a value previously returned by ::hsa_memory_allocate, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_memory_free(void* ptr);
+
+/**
+ * @brief Copy a block of memory from the location pointed to by @p src to the
+ * memory block pointed to by @p dst.
+ *
+ * @param[out] dst Buffer where the content is to be copied. If @p dst is in
+ * coarse-grained memory, the copied data is only visible to the agent currently
+ * assigned (::hsa_memory_assign_agent) to @p dst.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer. If the source buffer is
+ * in coarse-grained memory then it must be assigned to an agent, from which the
+ * data will be retrieved.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed to by @p dst or @p src results in
+ * undefined behavior.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL.
+ */
+hsa_status_t HSA_API hsa_memory_copy(
+    void *dst,
+    const void *src,
+    size_t size);
+
+/**
+ * @brief Change the ownership of a global, coarse-grained buffer.
+ *
+ * @details The contents of a coarse-grained buffer are visible to an agent
+ * only after ownership has been explicitly transferred to that agent. Once the
+ * operation completes, the previous owner can no longer access the data in the
+ * buffer.
+ *
+ * An implementation of the HSA runtime is allowed, but not required, to change
+ * the physical location of the buffer when ownership is transferred to a
+ * different agent. In general the application must not assume this
+ * behavior. The virtual location (address) of the passed buffer is never
+ * modified.
+ *
+ * @param[in] ptr Base address of a global buffer. The pointer must match an
+ * address previously returned by ::hsa_memory_allocate. The size of the buffer
+ * affected by the ownership change is identical to the size of that previous
+ * allocation. If @p ptr points to a fine-grained global buffer, no operation is
+ * performed and the function returns success. If @p ptr does not point to
+ * global memory, the behavior is undefined.
+ *
+ * @param[in] agent Agent that becomes the owner of the buffer. The
+ * application is responsible for ensuring that @p agent has access to the
+ * region that contains the buffer. It is allowed to change ownership to an
+ * agent that is already the owner of the buffer, with the same or different
+ * access permissions.
+ *
+ * @param[in] access Access permissions requested for the new owner.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is
+ * not a valid access value.
+ */
+hsa_status_t HSA_API hsa_memory_assign_agent(
+    void *ptr,
+    hsa_agent_t agent,
+    hsa_access_permission_t access);
+
+/**
+ * @brief Register a global, fine-grained buffer.
+ *
+ * @details Registering a buffer serves as an indication to the HSA runtime that
+ * the memory might be accessed from a kernel agent other than the
+ * host. Registration is a performance hint that allows the HSA runtime
+ * implementation to know which buffers will be accessed by some of the kernel
+ * agents ahead of time.
+ *
+ * Registration is only recommended for buffers in the global segment that have
+ * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS
+ * allocator instead. Registering an OS-allocated buffer in the base profile is
+ * equivalent to a no-op.
+ *
+ * Registrations should not overlap.
+ *
+ * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is
+ * passed, no operation is performed. If the buffer has been allocated using
+ * ::hsa_memory_allocate, or has already been registered, no operation is
+ * performed.
+ *
+ * @param[in] size Requested registration size in bytes. A size of 0 is
+ * only allowed if @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr
+ * is not NULL.
+ */
+hsa_status_t HSA_API hsa_memory_register(
+    void *ptr,
+    size_t size);
+
+/**
+ * @brief Deregister memory previously registered using ::hsa_memory_register.
+ *
+ * @details If the memory interval being deregistered does not match a previous
+ * registration (start and end addresses), the behavior is undefined.
+ *
+ * @param[in] ptr A pointer to the base of the buffer to be deregistered. If
+ * a NULL pointer is passed, no operation is performed.
+ *
+ * @param[in] size Size of the buffer to be deregistered.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_memory_deregister(
+    void *ptr,
+    size_t size);
+
+/** @} */
+
+
+/** \defgroup instruction-set-architecture Instruction Set Architecture.
+ *  @{
+ */
+
+/**
+ * @brief Instruction set architecture (ISA) handle.
+ */
+typedef struct hsa_isa_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_isa_t;
+
+/**
+ * @brief Retrieve a reference to an instruction set architecture handle out of
+ * a symbolic name.
+ *
+ * @param[in] name Vendor-specific name associated with a particular
+ * instruction set architecture. @p name must start with the vendor name and a
+ * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be
+ * a NUL-terminated string.
+ *
+ * @param[out] isa Memory location where the HSA runtime stores the ISA handle
+ * corresponding to the given name. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not
+ * correspond to any instruction set architecture.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_from_name(
+    const char *name,
+    hsa_isa_t *isa);
+
+/**
+ * @brief Iterate over the instruction sets supported by the given agent, and
+ * invoke an application-defined callback on every iteration. The iterator is
+ * deterministic: if an agent supports several instruction set architectures,
+ * they are traversed in the same order in every invocation of this function.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per instruction set
+ * architecture. The HSA runtime passes two arguments to the callback: the
+ * ISA and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * that status value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_isas(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_isa_t isa, void *data),
+    void *data);
+
+/**
+ * @brief Instruction set architecture attributes.
+ *
+ * The enumerator values are assigned explicitly; values 10, 11 and 15 are not
+ * assigned in this enumeration.
+ */
+typedef enum {
+  /**
+   * The length of the ISA name in bytes, not including the NUL terminator. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_NAME_LENGTH = 0,
+  /**
+   * Human-readable description. The type of this attribute is a character
+   * array with the length equal to the value of the
+   * ::HSA_ISA_INFO_NAME_LENGTH attribute.
+   */
+  HSA_ISA_INFO_NAME = 1,
+  /**
+   * @deprecated
+   *
+   * Number of call conventions supported by the instruction set architecture.
+   * Must be greater than zero. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2,
+  /**
+   * @deprecated
+   *
+   * Number of work-items in a wavefront for a given call convention. Must be a
+   * power of 2 in the range [1,256]. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3,
+  /**
+   * @deprecated
+   *
+   * Number of wavefronts per compute unit for a given call convention. In
+   * practice, other factors (for example, the amount of group memory used by a
+   * work-group) may further limit the number of wavefronts per compute
+   * unit. The type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4,
+  /**
+   * Machine models supported by the instruction set architecture. The type of
+   * this attribute is a bool[2]. If the ISA supports the small machine model,
+   * the element at index ::HSA_MACHINE_MODEL_SMALL is true. If the ISA supports
+   * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true.
+   */
+  HSA_ISA_INFO_MACHINE_MODELS = 5,
+  /**
+   * Profiles supported by the instruction set architecture. The type of this
+   * attribute is a bool[2]. If the ISA supports the base profile, the element
+   * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile,
+   * the element at index ::HSA_PROFILE_FULL is true.
+   */
+  HSA_ISA_INFO_PROFILES = 6,
+  /**
+   * Default floating-point rounding modes supported by the instruction set
+   * architecture. The type of this attribute is a bool[3]. The value at a given
+   * index is true if the corresponding rounding mode in
+   * ::hsa_default_float_rounding_mode_t is supported. At least one default mode
+   * has to be supported.
+   *
+   * If the default mode is supported, then
+   * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that
+   * both the zero and the near rounding modes are supported.
+   */
+  HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7,
+  /**
+   * Default floating-point rounding modes supported by the instruction set
+   * architecture in the Base profile. The type of this attribute is a
+   * bool[3]. The value at a given index is true if the corresponding rounding
+   * mode in ::hsa_default_float_rounding_mode_t is supported. The value at
+   * index ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one
+   * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or
+   * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true.
+   */
+  HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8,
+  /**
+   * Flag indicating that the f16 HSAIL operation is at least as fast as the
+   * f32 operation in the instruction set architecture. The type of this
+   * attribute is bool.
+   */
+  HSA_ISA_INFO_FAST_F16_OPERATION = 9,
+  /**
+   * Maximum number of work-items of each dimension of a work-group. Each
+   * maximum must be greater than 0. No maximum can exceed the value of
+   * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is
+   * uint16_t[3].
+   */
+  HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12,
+  /**
+   * Maximum total number of work-items in a work-group. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13,
+  /**
+   * Maximum number of work-items of each dimension of a grid. Each maximum must
+   * be greater than 0, and must not be smaller than the corresponding value in
+   * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+   * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is
+   * ::hsa_dim3_t.
+   */
+  HSA_ISA_INFO_GRID_MAX_DIM = 14,
+  /**
+   * Maximum total number of work-items in a grid. The type of this
+   * attribute is uint64_t.
+   */
+  HSA_ISA_INFO_GRID_MAX_SIZE = 16,
+  /**
+   * Maximum number of fbarriers per work-group. Must be at least 32. The
+   * type of this attribute is uint32_t.
+   */
+  HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17
+} hsa_isa_info_t;
+
+/**
+ * @deprecated The concept of call convention has been deprecated. If the
+ * application wants to query the value of an attribute for a given instruction
+ * set architecture, use ::hsa_isa_get_info_alt instead. If the application
+ * wants to query an attribute that is specific to a given combination of ISA
+ * and wavefront, use ::hsa_wavefront_get_info.
+ *
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[in] index Call convention index. Used only for call convention
+ * attributes, otherwise ignored. Must have a value between 0 (inclusive) and
+ * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (exclusive)
+ * in @p isa.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info(
+    hsa_isa_t isa,
+    hsa_isa_info_t attribute,
+    uint32_t index,
+    void *value);
+
+/**
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @details Unlike the deprecated ::hsa_isa_get_info, this function does not
+ * take a call convention index.
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_info_alt(
+    hsa_isa_t isa,
+    hsa_isa_info_t attribute,
+    void *value);
+
+/**
+ * @brief Retrieve the exception policy support for a given combination of
+ * instruction set architecture and profile.
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] profile Profile to query.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_exception_policies(
+    hsa_isa_t isa,
+    hsa_profile_t profile,
+    uint16_t *mask);
+
+/**
+ * @brief Floating-point types. The enumerator values are distinct powers of
+ * two (1, 2, 4).
+ */
+typedef enum {
+  /**
+   * 16-bit floating-point type.
+   */
+  HSA_FP_TYPE_16 = 1,
+  /**
+   * 32-bit floating-point type.
+   */
+  HSA_FP_TYPE_32 = 2,
+  /**
+   * 64-bit floating-point type.
+   */
+  HSA_FP_TYPE_64 = 4
+} hsa_fp_type_t;
+
+/**
+ * @brief Flush-to-zero (FTZ) modes.
+ */
+typedef enum {
+  /**
+   * Flush to zero.
+   */
+  HSA_FLUSH_MODE_FTZ = 1,
+  /**
+   * Do not flush to zero.
+   */
+  HSA_FLUSH_MODE_NON_FTZ = 2
+} hsa_flush_mode_t;
+
+/**
+ * @brief Round methods, as reported by ::hsa_isa_get_round_method.
+ */
+typedef enum {
+  /**
+   * Single round method.
+   */
+  HSA_ROUND_METHOD_SINGLE = 1,
+  /**
+   * Double round method.
+   */
+  HSA_ROUND_METHOD_DOUBLE = 2
+} hsa_round_method_t;
+
+/**
+ * @brief Retrieve the round method (single or double) used to implement the
+ * floating-point multiply add instruction (mad) for a given combination of
+ * instruction set architecture, floating-point type, and flush-to-zero
+ * modifier.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] fp_type Floating-point type.
+ *
+ * @param[in] flush_mode Flush-to-zero modifier.
+ *
+ * @param[out] round_method Pointer to a memory location where the HSA
+ * runtime stores the round method used by the implementation. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid
+ * floating-point type, or @p flush_mode is not a valid flush-to-zero modifier,
+ * or @p round_method is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_round_method(
+    hsa_isa_t isa,
+    hsa_fp_type_t fp_type,
+    hsa_flush_mode_t flush_mode,
+    hsa_round_method_t *round_method);
+
+/**
+ * @brief Wavefront handle.
+ */
+typedef struct hsa_wavefront_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_wavefront_t;
+
+/**
+ * @brief Wavefront attributes, queried using ::hsa_wavefront_get_info.
+ */
+typedef enum {
+  /**
+   * Number of work-items in the wavefront. Must be a power of 2 in the range
+   * [1,256]. The type of this attribute is uint32_t.
+   */
+  HSA_WAVEFRONT_INFO_SIZE = 0
+} hsa_wavefront_info_t;
+
+/**
+ * @brief Get the current value of a wavefront attribute.
+ *
+ * @param[in] wavefront A wavefront handle.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * wavefront attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_wavefront_get_info(
+    hsa_wavefront_t wavefront,
+    hsa_wavefront_info_t attribute,
+    void *value);
+
+/**
+ * @brief Iterate over the different wavefronts supported by an instruction set
+ * architecture, and invoke an application-defined callback on every iteration.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] callback Callback to be invoked once per wavefront that is
+ * supported by the instruction set architecture. The HSA runtime passes two
+ * arguments to the callback: the wavefront handle and the application data.
+ * If @p callback returns a status other than ::HSA_STATUS_SUCCESS for a
+ * particular iteration, the traversal stops and that value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_iterate_wavefronts(
+    hsa_isa_t isa,
+    hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data),
+    void *data);
+
+/**
+ * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set
+ * architectures are supported by a given agent.
+ *
+ * @brief Check if the instruction set architecture of a code object can be
+ * executed on an agent associated with another architecture.
+ *
+ * @param[in] code_object_isa Instruction set architecture associated with a
+ * code object.
+ *
+ * @param[in] agent_isa Instruction set architecture associated with an agent.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. If the two architectures are compatible, the result
+ * is true; if they are incompatible, the result is false.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible(
+    hsa_isa_t code_object_isa,
+    hsa_isa_t agent_isa,
+    bool *result);
+
+/** @} */
+
+
+/** \defgroup executable Executable
+ *  @{
+ */
+
+/**
+ * @brief Code object reader handle. A code object reader is used to
+ * load a code object from a file (when created using
+ * ::hsa_code_object_reader_create_from_file), or from memory (when created
+ * using ::hsa_code_object_reader_create_from_memory).
+ */
+typedef struct hsa_code_object_reader_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_object_reader_t;
+
+/**
+ * @brief Create a code object reader to operate on a file.
+ *
+ * @param[in] file File descriptor. The file must have been opened by the
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_file(
+    hsa_file_t file,
+    hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Create a code object reader to operate on memory.
+ *
+ * @param[in] code_object Memory buffer that contains a vendor-specific code
+ * object. The buffer is owned and managed by the application; the lifetime of
+ * the buffer must exceed that of any associated code object reader.
+ *
+ * @param[in] size Size of the buffer pointed to by @p code_object. Must not be
+ * 0.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size
+ * is zero, or @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_memory(
+    const void *code_object,
+    size_t size,
+    hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Destroy a code object reader.
+ *
+ * @details The code object reader handle becomes invalid after completion of
+ * this function. Any file or memory used to create the code object reader is
+ * not closed, removed, or deallocated by this function.
+ *
+ * @param[in] code_object_reader Code object reader to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_destroy(
+    hsa_code_object_reader_t code_object_reader);
+
+/**
+ * @brief Struct containing an opaque handle to an executable, which contains
+ * ISA for finalized kernels and indirect functions together with the allocated
+ * global or readonly segment variables they reference.
+ *
+ * @details Executables are created with ::hsa_executable_create_alt and
+ * destroyed with ::hsa_executable_destroy.
+ */
+typedef struct hsa_executable_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_executable_t;
+
+/**
+ * @brief Executable state.
+ *
+ * @details An executable is moved to the frozen state by
+ * ::hsa_executable_freeze. The current state can be queried with the
+ * ::HSA_EXECUTABLE_INFO_STATE attribute.
+ */
+typedef enum {
+  /**
+   * Executable state, which allows the user to load code objects and define
+   * external variables. Variable addresses, kernel code handles, and
+   * indirect function code handles are not available in query operations until
+   * the executable is frozen (zero is always returned).
+   */
+  HSA_EXECUTABLE_STATE_UNFROZEN = 0,
+  /**
+   * Executable state, which allows the user to query variable addresses,
+   * kernel code handles, and indirect function code handles using query
+   * operations. Loading new code objects, as well as defining external
+   * variables, is not allowed in this state.
+   */
+  HSA_EXECUTABLE_STATE_FROZEN = 1
+} hsa_executable_state_t;
+
+/**
+ * @deprecated Use ::hsa_executable_create_alt instead, which allows the
+ * application to specify the default floating-point rounding mode of the
+ * executable and assumes an unfrozen initial state.
+ *
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] executable_state Executable state. If the state is
+ * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no
+ * code objects can be loaded, and no variables can be defined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the newly
+ * created executable handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create(
+    hsa_profile_t profile,
+    hsa_executable_state_t executable_state,
+    const char *options,
+    hsa_executable_t *executable);
+
+/**
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] default_float_rounding_mode Default floating-point rounding mode
+ * used in the executable. Allowed rounding modes are near and zero (default is
+ * not allowed).
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the newly
+ * created executable handle. The initial state of the executable is
+ * ::HSA_EXECUTABLE_STATE_UNFROZEN. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_create_alt(
+    hsa_profile_t profile,
+    hsa_default_float_rounding_mode_t default_float_rounding_mode,
+    const char *options,
+    hsa_executable_t *executable);
+
+/**
+ * @brief Destroy an executable.
+ *
+ * @details An executable handle becomes invalid after the executable has been
+ * destroyed. Code object handles that were loaded into this executable are
+ * still valid after the executable has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with this executable
+ * (such as external global or readonly variables) can be released after the
+ * executable has been destroyed.
+ *
+ * An executable should not be destroyed while kernels are in flight.
+ *
+ * @param[in] executable Executable.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ */
+hsa_status_t HSA_API hsa_executable_destroy(
+    hsa_executable_t executable);
+
+/**
+ * @brief Loaded code object handle.
+ *
+ * @details Handles of this type are produced by
+ * ::hsa_executable_load_program_code_object and
+ * ::hsa_executable_load_agent_code_object.
+ */
+typedef struct hsa_loaded_code_object_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_loaded_code_object_t;
+
+/**
+ * @brief Load a program code object into an executable.
+ *
+ * @details A program code object contains information about resources that are
+ * accessible by all kernel agents that run the executable, and can be loaded
+ * at most once into an executable.
+ *
+ * If the program code object uses extensions, the implementation must support
+ * them for this operation to return successfully.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] code_object_reader A code object reader that holds the program
+ * code object to load. If a code object reader is destroyed before all the
+ * associated executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is
+ * not compatible with the executable or the implementation (for example, the
+ * code object uses an extension that is not supported by the implementation).
+ */
+hsa_status_t HSA_API hsa_executable_load_program_code_object(
+    hsa_executable_t executable,
+    hsa_code_object_reader_t code_object_reader,
+    const char *options,
+    hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Load an agent code object into an executable.
+ *
+ * @details The agent code object contains all defined agent
+ * allocation variables, functions, indirect functions, and kernels in a given
+ * program for a given instruction set architecture.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * The default floating-point rounding mode of the code object associated with
+ * @p code_object_reader must match that of the executable
+ * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which
+ * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used).
+ * If the agent code object uses extensions, the implementation and the agent
+ * must support them for this operation to return successfully.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent to load code object for. A code object can be loaded
+ * into an executable at most once for a given agent. The instruction set
+ * architecture of the code object must be supported by the agent.
+ *
+ * @param[in] code_object_reader A code object reader that holds the code object
+ * to load. If a code object reader is destroyed before all the associated
+ * executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p
+ * code_object_reader is not compatible with the agent (for example, the agent
+ * does not support the instruction set architecture of the code object), the
+ * executable (for example, there is a default floating-point rounding mode
+ * mismatch between the two), or the implementation.
+ */
+hsa_status_t HSA_API hsa_executable_load_agent_code_object(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_code_object_reader_t code_object_reader,
+    const char *options,
+    hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Freeze the executable.
+ *
+ * @details No modifications to the executable can be made after freezing: no
+ * code objects can be loaded into the executable, and no external variables
+ * can be defined. Freezing the executable does not prevent querying the
+ * executable's attributes. The application must define all the external
+ * variables in an executable before freezing it.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are
+ * undefined in the executable.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen.
+ */
+hsa_status_t HSA_API hsa_executable_freeze(
+    hsa_executable_t executable,
+    const char *options);
+
+/**
+ * @brief Executable attributes.
+ *
+ * @details These attributes are queried with ::hsa_executable_get_info.
+ */
+typedef enum {
+  /**
+   * Profile this executable is created for. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_EXECUTABLE_INFO_PROFILE = 1,
+  /**
+   * Executable state. The type of this attribute is ::hsa_executable_state_t.
+   */
+  HSA_EXECUTABLE_INFO_STATE = 2,
+  /**
+   * Default floating-point rounding mode specified when the executable was
+   * created. The type of this attribute is
+   * ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3
+} hsa_executable_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer in which to
+ * store the value of the attribute. If the buffer passed by the application is
+ * not large enough to hold the value of @p attribute, the behavior is
+ * undefined. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_info(
+    hsa_executable_t executable,
+    hsa_executable_info_t attribute,
+    void *value);
+
+/**
+ * @brief Define an external global variable with program allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with program allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme. Must not be NULL.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * be in global memory and can be read and written by any agent in the
+ * system. The application cannot deallocate the buffer pointed to by
+ * @p address before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_global_variable_define(
+    hsa_executable_t executable,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Define an external global variable with agent allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with agent allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme. Must not be NULL.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a global region
+ * that is only visible to @p agent. The application cannot deallocate the
+ * buffer pointed to by @p address before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_agent_global_variable_define(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Define an external readonly variable.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the readonly segment memory. The variable must be defined
+ * before loading a code object into an executable. In addition, code objects
+ * loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme. Must not be NULL.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a readonly
+ * region associated with @p agent. The buffer pointed to by @p address is
+ * owned by the application, and cannot be deallocated before @p executable is
+ * destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_readonly_variable_define(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    const char *variable_name,
+    void *address);
+
+/**
+ * @brief Validate an executable. Checks that all code objects have matching
+ * machine model, profile, and default floating-point rounding mode. Checks that
+ * all declarations have definitions. Checks declaration-definition
+ * compatibility (see the HSA Programmer's Reference Manual for compatibility
+ * rules). Invoking this function is equivalent to invoking
+ * ::hsa_executable_validate_alt with no options.
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ * Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate(
+    hsa_executable_t executable,
+    uint32_t *result);
+
+/**
+ * @brief Validate an executable. Checks that all code objects have matching
+ * machine model, profile, and default floating-point rounding mode. Checks that
+ * all declarations have definitions. Checks declaration-definition
+ * compatibility (see the HSA Programmer's Reference Manual for compatibility
+ * rules).
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ * Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate_alt(
+    hsa_executable_t executable,
+    const char *options,
+    uint32_t *result);
+
+/**
+ * @brief Executable symbol handle.
+ *
+ * @details The lifetime of an executable symbol matches that of the executable
+ * associated with it. An operation on a symbol whose associated executable has
+ * been destroyed results in undefined behavior.
+ */
+typedef struct hsa_executable_symbol_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_executable_symbol_t;
+
+/**
+ * @deprecated Use ::hsa_executable_get_symbol_by_name instead.
+ *
+ * @brief Get the symbol handle for a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name. Must not be NULL.
+ *
+ * @param[in] agent Agent associated with the symbol. If the symbol is
+ * independent of any agent (for example, a variable with program
+ * allocation), this argument is ignored.
+ *
+ * @param[in] call_convention Call convention associated with the symbol. If the
+ * symbol does not correspond to an indirect function, this argument is ignored.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol(
+    hsa_executable_t executable,
+    const char *module_name,
+    const char *symbol_name,
+    hsa_agent_t agent,
+    int32_t call_convention,
+    hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Retrieve the symbol handle corresponding to a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] symbol_name Symbol name. Must be a NUL-terminated character
+ * array. The Programmer's Reference Manual describes the standard name mangling
+ * scheme. Must not be NULL.
+ *
+ * @param[in] agent Pointer to the agent for which the symbol with the given
+ * name is defined. If the symbol corresponding to the given name has program
+ * allocation, @p agent must be NULL.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p
+ * symbol is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_symbol_by_name(
+    hsa_executable_t executable,
+    const char *symbol_name,
+    const hsa_agent_t *agent,
+    hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Symbol type.
+ *
+ * @details This is the type of the ::HSA_EXECUTABLE_SYMBOL_INFO_TYPE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * Variable.
+   */
+  HSA_SYMBOL_KIND_VARIABLE = 0,
+  /**
+   * Kernel.
+   */
+  HSA_SYMBOL_KIND_KERNEL = 1,
+  /**
+   * Indirect function.
+   */
+  HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
+} hsa_symbol_kind_t;
+
+/**
+ * @brief Linkage type of a symbol.
+ *
+ * @details This is the type of the ::HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * Module linkage.
+   */
+  HSA_SYMBOL_LINKAGE_MODULE = 0,
+  /**
+   * Program linkage.
+   */
+  HSA_SYMBOL_LINKAGE_PROGRAM = 1
+} hsa_symbol_linkage_t;
+
+/**
+ * @brief Allocation type of a variable.
+ *
+ * @details This is the type of the
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION attribute.
+ */
+typedef enum {
+  /**
+   * Agent allocation.
+   */
+  HSA_VARIABLE_ALLOCATION_AGENT = 0,
+  /**
+   * Program allocation.
+   */
+  HSA_VARIABLE_ALLOCATION_PROGRAM = 1
+} hsa_variable_allocation_t;
+
+/**
+ * @brief Memory segment associated with a variable.
+ *
+ * @details This is the type of the
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT attribute.
+ */
+typedef enum {
+  /**
+   * Global memory segment.
+   */
+  HSA_VARIABLE_SEGMENT_GLOBAL = 0,
+  /**
+   * Readonly memory segment.
+   */
+  HSA_VARIABLE_SEGMENT_READONLY = 1
+} hsa_variable_segment_t;
+
+/**
+ * @brief Executable symbol attributes.
+ *
+ * @details The numeric values of the enumerators are assigned explicitly and
+ * do not follow declaration order (for example, 20 and 21 are declared between
+ * 4 and 5).
+ */
+typedef enum {
+  /**
+   * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0,
+  /**
+   * The length of the symbol name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1,
+  /**
+   * The name of the symbol. The type of this attribute is character array with
+   * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
+   * attribute.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2,
+  /**
+   * @deprecated
+   *
+   * The length of the module name in bytes (not including the NUL terminator)
+   * to which this symbol belongs if this symbol has module linkage, otherwise 0
+   * is returned. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+  /**
+   * @deprecated
+   *
+   * The module name to which this symbol belongs if this symbol has module
+   * linkage, otherwise an empty string is returned. The type of this attribute
+   * is character array with the length equal to the value of
+   * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4,
+  /**
+   * @deprecated
+   *
+   * Agent associated with this symbol. If the symbol is a variable, the
+   * value of this attribute is only defined if
+   * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is
+   * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is
+   * ::hsa_agent_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20,
+  /**
+   * The address of the variable. The value of this attribute is undefined if
+   * the symbol is not a variable. The type of this attribute is uint64_t.
+   *
+   * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+   * is returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21,
+  /**
+   * The linkage kind of the symbol. The type of this attribute is
+   * ::hsa_symbol_linkage_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5,
+  /**
+   * Indicates whether the symbol corresponds to a definition. The type of this
+   * attribute is bool.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17,
+  /**
+   * @deprecated
+   *
+   * The allocation kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_allocation_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+  /**
+   * @deprecated
+   *
+   * The segment kind of the variable. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_segment_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+  /**
+   * @deprecated
+   *
+   * Alignment of the symbol in memory. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * The current alignment of the variable in memory may be greater than the
+   * value specified in the source program variable declaration.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+  /**
+   * @deprecated
+   *
+   * Size of the variable. The value of this attribute is undefined if
+   * the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * A value of 0 is returned if the variable is an external variable and has an
+   * unknown dimension.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+  /**
+   * @deprecated
+   *
+   * Indicates whether the variable is constant. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * bool.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+  /**
+   * Kernel object handle, used in the kernel dispatch packet. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint64_t.
+   *
+   * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+   * is returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22,
+  /**
+   * Size of kernarg segment memory that is required to hold the values of the
+   * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+  /**
+   * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+   * which is the maximum of 16 and the maximum alignment of any of the kernel
+   * arguments. The value of this attribute is undefined if the symbol is not a
+   * kernel. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+  /**
+   * Size of static group segment memory required by the kernel (per
+   * work-group), in bytes. The value of this attribute is undefined
+   * if the symbol is not a kernel. The type of this attribute is uint32_t.
+   *
+   * The reported amount does not include any dynamically allocated group
+   * segment memory that may be requested by the application when a kernel is
+   * dispatched.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+  /**
+   * Size of static private, spill, and arg segment memory required by
+   * this kernel (per work-item), in bytes. The value of this attribute is
+   * undefined if the symbol is not a kernel. The type of this attribute is
+   * uint32_t.
+   *
+   * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is
+   * true, the kernel may use more private memory than the reported value, and
+   * the application must add the dynamic call stack usage to @a
+   * private_segment_size when populating a kernel dispatch packet.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+  /**
+   * Dynamic callstack flag. The value of this attribute is undefined if the
+   * symbol is not a kernel. The type of this attribute is bool.
+   *
+   * If this flag is set (the value is true), the kernel uses a dynamically
+   * sized call stack. This can happen if recursive calls, calls to indirect
+   * functions, or the HSAIL alloca instruction are present in the kernel.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+  /**
+   * @deprecated
+   *
+   * Call convention of the kernel. The value of this attribute is undefined if
+   * the symbol is not a kernel. The type of this attribute is uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+  /**
+   * Indirect function object handle. The value of this attribute is undefined
+   * if the symbol is not an indirect function, or the associated agent does
+   * not support the Full Profile. The type of this attribute depends on the
+   * machine model: the type is uint32_t for small machine model, and uint64_t
+   * for large model.
+   *
+   * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+   * is returned.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23,
+  /**
+   * @deprecated
+   *
+   * Call convention of the indirect function. The value of this attribute is
+   * undefined if the symbol is not an indirect function, or the associated
+   * agent does not support the Full Profile. The type of this attribute is
+   * uint32_t.
+   */
+  HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
+} hsa_executable_symbol_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable symbol.
+ *
+ * @param[in] executable_symbol Executable symbol.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_symbol_get_info(
+    hsa_executable_symbol_t executable_symbol,
+    hsa_executable_symbol_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in an executable, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/**
+ * @brief Iterate over the kernels, indirect functions, and agent allocation
+ * variables in an executable for a given agent, and invoke an application-
+ * defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes four arguments to the callback: the executable, the
+ * agent, a symbol, and the application data.  If @p callback returns a status
+ * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal
+ * stops and ::hsa_executable_iterate_agent_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_agent_symbols(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_agent_t agent,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/**
+ * @brief Iterate over the program allocation variables in an executable, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_program_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_program_symbols(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(hsa_executable_t exec,
+                             hsa_executable_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/** @} */
+
+
+/** \defgroup code-object Code Objects (deprecated).
+ *  @{
+ */
+
+/**
+ * @deprecated
+ *
+ * @brief Struct containing an opaque handle to a code object, which contains
+ * ISA for finalized kernels and indirect functions together with information
+ * about the global or readonly segment variables they reference.
+ */
+typedef struct hsa_code_object_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_object_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Application data handle that is passed to the serialization
+ * and deserialization functions.
+ */
+typedef struct hsa_callback_data_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_callback_data_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Serialize a code object. Can be used for offline finalization,
+ * install-time finalization, disk code caching, etc.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] alloc_callback Callback function for memory allocation. Must not
+ * be NULL. The HSA runtime passes three arguments to the callback: the
+ * allocation size, the application data, and a pointer to a memory location
+ * where the application stores the allocation result. The HSA runtime invokes
+ * @p alloc_callback once to allocate a buffer that contains the serialized
+ * version of @p code_object.  If the callback returns a status code other than
+ * ::HSA_STATUS_SUCCESS, this function returns the same code.
+ *
+ * @param[in] callback_data Application data that is passed to @p
+ * alloc_callback. May be NULL.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] serialized_code_object Memory location where the HSA runtime
+ * stores a pointer to the serialized code object. Must not be NULL.
+ *
+ * @param[out] serialized_code_object_size Memory location where the HSA runtime
+ * stores the size (in bytes) of @p serialized_code_object. The returned value
+ * matches the allocation size passed by the HSA runtime to @p
+ * alloc_callback. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p
+ * serialized_code_object, or @p serialized_code_object_size are NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize(
+    hsa_code_object_t code_object,
+    hsa_status_t (*alloc_callback)(size_t size,
+                                   hsa_callback_data_t data,
+                                   void **address),
+    hsa_callback_data_t callback_data,
+    const char *options,
+    void **serialized_code_object,
+    size_t *serialized_code_object_size);
+
+/**
+ * @deprecated
+ *
+ * @brief Deserialize a code object.
+ *
+ * @param[in] serialized_code_object A serialized code object. Must not be NULL.
+ *
+ * @param[in] serialized_code_object_size The size (in bytes) of @p
+ * serialized_code_object. Must not be 0.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] code_object Memory location where the HSA runtime stores the
+ * deserialized code object.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p
+ * code_object are NULL, or @p serialized_code_object_size is 0.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize(
+    void *serialized_code_object,
+    size_t serialized_code_object_size,
+    const char *options,
+    hsa_code_object_t *code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Destroy a code object.
+ *
+ * @details The lifetime of a code object must exceed that of any executable
+ * where it has been loaded. If an executable that loaded @p code_object has not
+ * been destroyed, the behavior is undefined.
+ *
+ * @param[in] code_object Code object. The handle becomes invalid after it has
+ * been destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy(
+    hsa_code_object_t code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object type.
+ */
+typedef enum {
+  /**
+   * Produces code object that contains ISA for all kernels and indirect
+   * functions in HSA source.
+   */
+  HSA_CODE_OBJECT_TYPE_PROGRAM = 0
+} hsa_code_object_type_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Code object attributes.
+ */
+typedef enum {
+  /**
+   * The version of the code object. The type of this attribute is a
+   * NUL-terminated char[64]. The name must be at most 63 characters long (not
+   * including the NUL terminator) and all array elements not used for the name
+   * must be NUL.
+   */
+  HSA_CODE_OBJECT_INFO_VERSION = 0,
+  /**
+   * Type of code object. The type of this attribute is
+   * ::hsa_code_object_type_t.
+   */
+  HSA_CODE_OBJECT_INFO_TYPE = 1,
+  /**
+   * Instruction set architecture this code object is produced for. The type of
+   * this attribute is ::hsa_isa_t.
+   */
+  HSA_CODE_OBJECT_INFO_ISA = 2,
+  /**
+   * Machine model this code object is produced for. The type of this attribute
+   * is ::hsa_machine_model_t.
+   */
+  HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3,
+  /**
+   * Profile this code object is produced for. The type of this attribute is
+   * ::hsa_profile_t.
+   */
+  HSA_CODE_OBJECT_INFO_PROFILE = 4,
+  /**
+   * Default floating-point rounding mode used when the code object is
+   * produced. The type of this attribute is
+   * ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5
+} hsa_code_object_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code object.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code object attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info(
+    hsa_code_object_t code_object,
+    hsa_code_object_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Load code object into the executable.
+ *
+ * @details Every global or readonly variable that is external must be defined
+ * before loading the code object. An internal global or readonly variable is
+ * allocated once the code object, that is being loaded, references this
+ * variable and this variable is not allocated.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent to load code object for. The agent must support the
+ * default floating-point rounding mode used by @p code_object.
+ *
+ * @param[in] code_object Code object to load.  The lifetime of the code object
+ * must exceed that of the executable: if @p code_object is destroyed before @p
+ * executable, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible
+ * with @p code_object (for example, @p agent does not support the default
+ * floating-point rounding mode specified by @p code_object), or @p code_object
+ * is not compatible with @p executable (for example, @p code_object and @p
+ * executable have different machine models or profiles).
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object(
+    hsa_executable_t executable,
+    hsa_agent_t agent,
+    hsa_code_object_t code_object,
+    const char *options);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol handle.
+ *
+ * The lifetime of a code object symbol matches that of the code object
+ * associated with it. An operation on a symbol whose associated code object has
+ * been destroyed results in undefined behavior.
+ */
+typedef struct hsa_code_symbol_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_code_symbol_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol(
+    hsa_code_object_t code_object,
+    const char *symbol_name,
+    hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name(
+    hsa_code_object_t code_object,
+    const char *module_name,
+    const char *symbol_name,
+    hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol attributes.
+ */
+typedef enum {
+  /**
+   * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+   */
+  HSA_CODE_SYMBOL_INFO_TYPE = 0,
+  /**
+   * The length of the symbol name in bytes, not including the NUL terminator.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1,
+  /**
+   * The name of the symbol. The type of this attribute is character array with
+   * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH
+   * attribute.
+   */
+  HSA_CODE_SYMBOL_INFO_NAME = 2,
+  /**
+   * The length of the module name in bytes (not including the NUL terminator)
+   * to which this symbol belongs if this symbol has module linkage, otherwise 0
+   * is returned. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+  /**
+   * The module name to which this symbol belongs if this symbol has module
+   * linkage, otherwise an empty string is returned. The type of this attribute
+   * is character array with the length equal to the value of
+   * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+   */
+  HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4,
+  /**
+   * The linkage kind of the symbol. The type of this attribute is
+   * ::hsa_symbol_linkage_t.
+   */
+  HSA_CODE_SYMBOL_INFO_LINKAGE = 5,
+  /**
+   * Indicates whether the symbol corresponds to a definition. The type of this
+   * attribute is bool.
+   */
+  HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17,
+  /**
+   * The allocation kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_allocation_t.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+  /**
+   * The segment kind of the variable. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * ::hsa_variable_segment_t.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+  /**
+   * Alignment of the symbol in memory. The value of this attribute is undefined
+   * if the symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * The current alignment of the variable in memory may be greater than the
+   * value specified in the source program variable declaration.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+  /**
+   * Size of the variable. The value of this attribute is undefined if the
+   * symbol is not a variable. The type of this attribute is uint32_t.
+   *
+   * A size of 0 is returned if the variable is an external variable and has an
+   * unknown dimension.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+  /**
+   * Indicates whether the variable is constant. The value of this attribute is
+   * undefined if the symbol is not a variable. The type of this attribute is
+   * bool.
+   */
+  HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+  /**
+   * Size of kernarg segment memory that is required to hold the values of the
+   * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+   * attribute is undefined if the symbol is not a kernel. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+  /**
+   * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+   * which is the maximum of 16 and the maximum alignment of any of the kernel
+   * arguments. The value of this attribute is undefined if the symbol is not a
+   * kernel. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+  /**
+   * Size of static group segment memory required by the kernel (per
+   * work-group), in bytes. The value of this attribute is undefined
+   * if the symbol is not a kernel. The type of this attribute is uint32_t.
+   *
+   * The reported amount does not include any dynamically allocated group
+   * segment memory that may be requested by the application when a kernel is
+   * dispatched.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+  /**
+   * Size of static private, spill, and arg segment memory required by
+   * this kernel (per work-item), in bytes. The value of this attribute is
+   * undefined if the symbol is not a kernel. The type of this attribute is
+   * uint32_t.
+   *
+   * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true,
+   * the kernel may use more private memory than the reported value, and the
+   * application must add the dynamic call stack usage to @a
+   * private_segment_size when populating a kernel dispatch packet.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+  /**
+   * Dynamic callstack flag. The value of this attribute is undefined if the
+   * symbol is not a kernel. The type of this attribute is bool.
+   *
+   * If this flag is set (the value is true), the kernel uses a dynamically
+   * sized call stack. This can happen if recursive calls, calls to indirect
+   * functions, or the HSAIL alloca instruction are present in the kernel.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+  /**
+   * Call convention of the kernel. The value of this attribute is undefined if
+   * the symbol is not a kernel. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+  /**
+   * Call convention of the indirect function. The value of this attribute is
+   * undefined if the symbol is not an indirect function. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16,
+  /**
+   * Wavefront size used by the kernel. The value of this attribute is either
+   * 32 or 64. The type of this attribute is uint32_t.
+   */
+  HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19
+} hsa_code_symbol_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code symbol.
+ *
+ * @param[in] code_symbol Code symbol.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info(
+    hsa_code_symbol_t code_symbol,
+    hsa_code_symbol_info_t attribute,
+    void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in a code object, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] callback Callback to be invoked once per code object symbol. The
+ * HSA runtime passes three arguments to the callback: the code object, a
+ * symbol, and the application data.  If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_code_object_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols(
+    hsa_code_object_t code_object,
+    hsa_status_t (*callback)(hsa_code_object_t code_object,
+                             hsa_code_symbol_t symbol,
+                             void *data),
+    void *data);
+
+/** @} */
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif
+
+#endif  // header guard
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h
new file mode 100644
index 0000000000..22847a8a44
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+ 
+#ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_
+#define HSA_RUNTIME_AMD_TOOL_EVENTS_H_
+
+// Tool event kinds, event payload structs, and the event callback type.
+
+#include <stddef.h>
+#include <stdint.h>
+#include "hsa.h"
+
+
+typedef enum {
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE = 0,
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE =
+      (1 << 0),  // This scratch allocation is only valid for 1 dispatch.
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT =
+      (1 << 1),  // Used alternate scratch instead of main scratch
+} hsa_amd_event_scratch_alloc_flag_t;
+
+typedef enum {
+  HSA_AMD_TOOL_EVENT_MIN = 0,  // Lower bound of the event-kind range
+
+  // Scratch memory tracking
+  HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END,
+  HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END,
+
+  // Add new events above ^
+  HSA_AMD_TOOL_EVENT_MAX  // Upper bound; stays one past the last event kind
+} hsa_amd_tool_event_kind_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+} hsa_amd_tool_event_none_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+  uint64_t dispatch_id;  // Dispatch ID of the AQL packet that needs more scratch memory
+} hsa_amd_event_scratch_alloc_start_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+  uint64_t dispatch_id;  // Dispatch ID of the AQL packet that needs more scratch memory
+  size_t size;           // Amount of scratch allocated - in bytes
+  size_t num_slots;      // limit of number of waves
+} hsa_amd_event_scratch_alloc_end_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_free_start_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_free_end_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_async_reclaim_start_t;
+
+typedef struct {
+  hsa_amd_tool_event_kind_t kind;
+  const hsa_queue_t* queue;
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_async_reclaim_end_t;
+
+typedef union {  // Each pointee struct begins with a `kind` member.
+  const hsa_amd_tool_event_none_t* none;
+  const hsa_amd_event_scratch_alloc_start_t* scratch_alloc_start;
+  const hsa_amd_event_scratch_alloc_end_t* scratch_alloc_end;
+  const hsa_amd_event_scratch_free_start_t* scratch_free_start;
+  const hsa_amd_event_scratch_free_end_t* scratch_free_end;
+  const hsa_amd_event_scratch_async_reclaim_start_t* scratch_async_reclaim_start;
+  const hsa_amd_event_scratch_async_reclaim_end_t* scratch_async_reclaim_end;
+} hsa_amd_tool_event_t;
+
+typedef hsa_status_t (*hsa_amd_tool_event)(hsa_amd_tool_event_t);
+
+
+#endif
\ No newline at end of file
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h
new file mode 100644
index 0000000000..cc33320269
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h
@@ -0,0 +1,587 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H
+#define HSA_RUNTIME_INC_HSA_API_TRACE_H
+
+#include "hsa.h"
+#include "hsa_api_trace_version.h"
+#ifdef AMD_INTERNAL_BUILD
+#include "hsa_ext_image.h"
+#include "hsa_ext_amd.h"
+#include "hsa_ext_finalize.h"
+#include "hsa_amd_tool.h"
+#include "hsa_ven_amd_pc_sampling.h"
+#else
+#include "inc/hsa_ext_image.h"
+#include "inc/hsa_ext_amd.h"
+#include "inc/hsa_ext_finalize.h"
+#include "inc/hsa_amd_tool.h"
+#include "inc/hsa_ven_amd_pc_sampling.h"
+#endif
+
+#include <string.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h
+
+// Returns the smaller of two unsigned 32-bit values; used when copying Api Tables
+static inline uint32_t Min(const uint32_t a, const uint32_t b) {
+  return (b < a) ? b : a;
+}
+
+// Declarations of APIs intended for use only by tools.
+
+// An AQL packet that can be put in an intercept queue to cause a callback to
+// be invoked when the packet is about to be submitted to the underlying
+// hardware queue. These packets are not copied to the underlying hardware
+// queue. These packets should come immediately before the regular AQL packet
+// they relate to. This implies that packet rewriters should always keep these
+// packets adjacent to the regular AQL packet that follows them.
+const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE;  // Required value of amd_aql_intercept_marker_t::format
+
+struct amd_aql_intercept_marker_s;  // Forward declaration so the handler typedef below can name the packet type
+
+// When an intercept queue is processing rewritten packets to put them on the
+// underlying hardware queue, if it encounters an
+// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following
+// handler. packet points to the packet, queue is the underlying hardware
+// queue, and packet_id is the packet id of the next packet to be put on the
+// underlying hardware queue. The intercept queue does not put these packets
+// onto the underlying hardware queue.
+typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet,
+                                             hsa_queue_t* queue, uint64_t packet_id);
+// An AQL vendor packet used by the intercept queue to mark the following
+// packet. The callback will be invoked to allow a tool to know where in the
+// underlying hardware queue the following packet will be placed. user_data can
+// be used to hold any data useful to the tool.
+typedef struct amd_aql_intercept_marker_s {
+  uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC.
+  uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER.
+  uint8_t reserved[5]; // Must be 0.
+#ifdef HSA_LARGE_MODEL
+  amd_intercept_marker_handler callback;  // Large model: handler only, no padding member
+#elif defined HSA_LITTLE_ENDIAN
+  amd_intercept_marker_handler callback;  // Non-large model, little endian: handler, then reserved1
+  uint32_t reserved1; // Must be 0.
+#else
+  uint32_t reserved1; // Must be 0.
+  amd_intercept_marker_handler callback;  // Remaining case: reserved1, then handler
+#endif
+  uint64_t user_data[6];  // Tool-defined data (see the comment above the struct)
+} amd_aql_intercept_marker_t;
+
+typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count);
+typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count,
+                                                uint64_t user_pkt_index, void* data,
+                                                hsa_amd_queue_intercept_packet_writer writer);
+hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue,
+                                              hsa_amd_queue_intercept_handler callback,
+                                              void* user_data);  // presumably forwarded to callback as `data` — TODO confirm
+hsa_status_t hsa_amd_queue_intercept_create(
+    hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type,
+    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
+    uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue);
+
+typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent,
+                                               void* data);
+hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback,
+                                                   void* user_data);  // presumably forwarded to callback as `data` — TODO confirm
+
+// Structure of Version used to identify an instance of Api table
+// Must be the first member (offsetof == 0) of all API tables.
+// This is the root of the table passing ABI.
+struct ApiTableVersion {
+  uint32_t major_id;  // Must match between source and destination for copyTables/copyElement to copy
+  uint32_t minor_id;  // Table size in bytes: HsaApiTableContainer stores sizeof(<table>) here
+  uint32_t step_id;   // Taken from the source table by copyTables/copyElement
+  uint32_t reserved;  // Not written by HsaApiTableContainer's constructor or by the copy helpers
+};
+
+struct ToolsApiTable {  // Tool event callbacks: one hsa_amd_tool_event pointer per scratch event
+  ApiTableVersion version;  // Must stay the first member (see ApiTableVersion)
+
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_end_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_free_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_free_end_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_end_fn;
+};
+
+// Function-pointer table exporting the HSA Finalizer Extension Apis
+struct FinalizerExtTable {
+  ApiTableVersion version;
+  decltype(hsa_ext_program_create)* hsa_ext_program_create_fn;
+  decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn;
+  decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn;
+  decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn;
+  decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn;
+  decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn;
+};
+
+// Function-pointer table exporting the HSA Image Extension Apis.
+// The version header comes first, followed by one pointer per exported entry point.
+struct ImageExtTable {
+  ApiTableVersion version;
+  decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn;
+  decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn;
+  decltype(hsa_ext_image_create)* hsa_ext_image_create_fn;
+  decltype(hsa_ext_image_import)* hsa_ext_image_import_fn;
+  decltype(hsa_ext_image_export)* hsa_ext_image_export_fn;
+  decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn;
+  decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn;
+  decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn;
+  decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn;
+  decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn;
+  decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn;
+  decltype(hsa_ext_image_data_get_info_with_layout)* hsa_ext_image_data_get_info_with_layout_fn;
+  decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn;
+  decltype(hsa_ext_sampler_create_v2)* hsa_ext_sampler_create_v2_fn;
+};
+
+// Function-pointer table exporting the HSA PC Sampling Extension Apis (hsa_ven_amd_pcs_*)
+struct PcSamplingExtTable {
+  ApiTableVersion version;  // Must stay the first member (see ApiTableVersion)
+  decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn;
+  decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn;
+  decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn;
+  decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn;
+  decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn;
+  decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn;
+  decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn;
+};
+
+
+// Function-pointer table exporting the AMD Extension Apis (version header first)
+struct AmdExtTable {
+  ApiTableVersion version;
+  decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn;
+  decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn;
+  decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn;
+  decltype(hsa_amd_profiling_async_copy_enable)* hsa_amd_profiling_async_copy_enable_fn;
+  decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn;
+  decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time_fn;
+  decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn;
+  decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn;
+  decltype(hsa_amd_async_function)* hsa_amd_async_function_fn;
+  decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn;
+  decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn;
+  decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn;
+  decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn;
+  decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn;
+  decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn;
+  decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
+  decltype(hsa_amd_memory_async_copy_on_engine)* hsa_amd_memory_async_copy_on_engine_fn;
+  decltype(hsa_amd_memory_copy_engine_status)* hsa_amd_memory_copy_engine_status_fn;
+  decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn;
+  decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn;
+  decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn;
+  decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn;
+  decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn;
+  decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn;
+  decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn;
+  decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn;
+  decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn;
+  decltype(hsa_amd_image_create)* hsa_amd_image_create_fn;
+  decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn;
+  decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn;
+  decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn;
+  decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn;
+  decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn;
+  decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn;
+  decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn;
+  decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn;
+  decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn;
+  decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
+  decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
+  decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
+  decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
+  decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn;
+  decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn;
+  decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
+  decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
+  decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn;
+  decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
+  decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
+  decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
+  decltype(hsa_amd_spm_acquire)* hsa_amd_spm_acquire_fn;
+  decltype(hsa_amd_spm_release)* hsa_amd_spm_release_fn;
+  decltype(hsa_amd_spm_set_dest_buffer)* hsa_amd_spm_set_dest_buffer_fn;
+  decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn;
+  decltype(hsa_amd_portable_export_dmabuf)* hsa_amd_portable_export_dmabuf_fn;
+  decltype(hsa_amd_portable_close_dmabuf)* hsa_amd_portable_close_dmabuf_fn;
+  decltype(hsa_amd_vmem_address_reserve)* hsa_amd_vmem_address_reserve_fn;
+  decltype(hsa_amd_vmem_address_free)* hsa_amd_vmem_address_free_fn;
+  decltype(hsa_amd_vmem_handle_create)* hsa_amd_vmem_handle_create_fn;
+  decltype(hsa_amd_vmem_handle_release)* hsa_amd_vmem_handle_release_fn;
+  decltype(hsa_amd_vmem_map)* hsa_amd_vmem_map_fn;
+  decltype(hsa_amd_vmem_unmap)* hsa_amd_vmem_unmap_fn;
+  decltype(hsa_amd_vmem_set_access)* hsa_amd_vmem_set_access_fn;
+  decltype(hsa_amd_vmem_get_access)* hsa_amd_vmem_get_access_fn;
+  decltype(hsa_amd_vmem_export_shareable_handle)* hsa_amd_vmem_export_shareable_handle_fn;
+  decltype(hsa_amd_vmem_import_shareable_handle)* hsa_amd_vmem_import_shareable_handle_fn;
+  decltype(hsa_amd_vmem_retain_alloc_handle)* hsa_amd_vmem_retain_alloc_handle_fn;
+  decltype(hsa_amd_vmem_get_alloc_properties_from_handle)*
+      hsa_amd_vmem_get_alloc_properties_from_handle_fn;
+  decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn;
+  decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn;
+  decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn;
+  decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn;
+  decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn;
+  decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn;
+  decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn;
+  decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn;
+  decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn;
+};
+
+// Function-pointer table exporting the HSA Core Runtime Apis; copyApi copies it bytewise after `version`
+struct CoreApiTable {
+  ApiTableVersion version;
+  decltype(hsa_init)* hsa_init_fn;
+  decltype(hsa_shut_down)* hsa_shut_down_fn;
+  decltype(hsa_system_get_info)* hsa_system_get_info_fn;
+  decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn;
+  decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn;
+  decltype(hsa_iterate_agents)* hsa_iterate_agents_fn;
+  decltype(hsa_agent_get_info)* hsa_agent_get_info_fn;
+  decltype(hsa_queue_create)* hsa_queue_create_fn;
+  decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn;
+  decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
+  decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn;
+  decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
+  decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
+  decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
+  decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
+  decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
+  decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
+  decltype(hsa_queue_cas_write_index_scacq_screl)* hsa_queue_cas_write_index_scacq_screl_fn;
+  decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn;
+  decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn;
+  decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn;
+  decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn;
+  decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn;
+  decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn;
+  decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn;
+  decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn;
+  decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn;
+  decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn;
+  decltype(hsa_region_get_info)* hsa_region_get_info_fn;
+  decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn;
+  decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn;
+  decltype(hsa_memory_register)* hsa_memory_register_fn;
+  decltype(hsa_memory_deregister)* hsa_memory_deregister_fn;
+  decltype(hsa_memory_allocate)* hsa_memory_allocate_fn;
+  decltype(hsa_memory_free)* hsa_memory_free_fn;
+  decltype(hsa_memory_copy)* hsa_memory_copy_fn;
+  decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn;
+  decltype(hsa_signal_create)* hsa_signal_create_fn;
+  decltype(hsa_signal_destroy)* hsa_signal_destroy_fn;
+  decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn;
+  decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn;
+  decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
+  decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn;
+  decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn;
+  decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn;
+  decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn;
+  decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn;
+  decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn;
+  decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn;
+  decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn;
+  decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn;
+  decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn;
+  decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn;
+  decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn;
+  decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn;
+  decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn;
+  decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn;
+  decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn;
+  decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn;
+  decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn;
+  decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn;
+  decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn;
+  decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn;
+  decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn;
+  decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn;
+  decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn;
+  decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn;
+  decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn;
+  decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn;
+  decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn;
+  decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn;
+  decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn;
+  decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn;
+
+  //===--- Instruction Set Architecture -----------------------------------===//
+
+  decltype(hsa_isa_from_name)* hsa_isa_from_name_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_isa_get_info)* hsa_isa_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_isa_compatible)* hsa_isa_compatible_fn;
+
+  //===--- Code Objects (deprecated) --------------------------------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn;
+
+  //===--- Executable -----------------------------------------------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_executable_create)* hsa_executable_create_fn;
+  decltype(hsa_executable_destroy)* hsa_executable_destroy_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn;
+  decltype(hsa_executable_freeze)* hsa_executable_freeze_fn;
+  decltype(hsa_executable_get_info)* hsa_executable_get_info_fn;
+  decltype(hsa_executable_global_variable_define)*
+      hsa_executable_global_variable_define_fn;
+  decltype(hsa_executable_agent_global_variable_define)*
+      hsa_executable_agent_global_variable_define_fn;
+  decltype(hsa_executable_readonly_variable_define)*
+      hsa_executable_readonly_variable_define_fn;
+  decltype(hsa_executable_validate)* hsa_executable_validate_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn;
+  decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn;
+
+  //===--- Runtime Notifications ------------------------------------------===//
+
+  decltype(hsa_status_string)* hsa_status_string_fn;
+
+  // Start HSA v1.1 additions
+  decltype(hsa_extension_get_name)* hsa_extension_get_name_fn;
+  decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn;
+  decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn;
+  decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn;
+  decltype(hsa_cache_get_info)* hsa_cache_get_info_fn;
+  decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn;
+  decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn;
+  decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn;
+  decltype(hsa_signal_group_create)* hsa_signal_group_create_fn;
+  decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn;
+  decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn;
+  decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn;
+
+  //===--- Instruction Set Architecture - HSA v1.1 additions --------------===//
+
+  decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn;
+  decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn;
+  decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn;
+  decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn;
+  decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn;
+  decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn;
+
+  //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_symbol_from_name)*
+      hsa_code_object_get_symbol_from_name_fn;
+
+  //===--- Executable - HSA v1.1 additions --------------------------------===//
+
+  decltype(hsa_code_object_reader_create_from_file)*
+      hsa_code_object_reader_create_from_file_fn;
+  decltype(hsa_code_object_reader_create_from_memory)*
+      hsa_code_object_reader_create_from_memory_fn;
+  decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn;
+  decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn;
+  decltype(hsa_executable_load_program_code_object)*
+      hsa_executable_load_program_code_object_fn;
+  decltype(hsa_executable_load_agent_code_object)*
+      hsa_executable_load_agent_code_object_fn;
+  decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn;
+  decltype(hsa_executable_get_symbol_by_name)*
+      hsa_executable_get_symbol_by_name_fn;
+  decltype(hsa_executable_iterate_agent_symbols)*
+      hsa_executable_iterate_agent_symbols_fn;
+  decltype(hsa_executable_iterate_program_symbols)*
+      hsa_executable_iterate_program_symbols_fn;
+};
+
+// Root Api table: a version header followed by one pointer per child table
+// (core runtime, AMD extensions, finalizer, images, tools, PC sampling)
+struct HsaApiTable {
+  // Root table version; must remain the first member. copyTables compares
+  // version.minor_id with each pointer's offset to decide whether to copy it
+  ApiTableVersion version;
+
+  // HSA Core Runtime function pointers
+  CoreApiTable* core_;
+
+  // AMD extension function pointers
+  AmdExtTable* amd_ext_;
+
+  // HSA Finalizer Extension function pointers
+  FinalizerExtTable* finalizer_ext_;
+
+  // HSA Image Extension function pointers
+  ImageExtTable* image_ext_;
+
+  // Function pointers for tools to use
+  ToolsApiTable* tools_;
+
+  // AMD PC Sampling Extension function pointers
+  PcSamplingExtTable* pc_sampling_ext_;
+};
+
+// Owns one instance of every Api table and wires the child tables into the root table
+struct HsaApiTableContainer {
+  HsaApiTable root;
+  CoreApiTable core;
+  AmdExtTable amd_ext;
+  FinalizerExtTable finalizer_ext;
+  ImageExtTable image_ext;
+  ToolsApiTable tools;
+  PcSamplingExtTable pc_sampling_ext;
+
+  // Stamps each table's version (minor_id = sizeof the table) and links it into root
+  HsaApiTableContainer() {
+    root.version.major_id = HSA_API_TABLE_MAJOR_VERSION;
+    root.version.step_id = HSA_API_TABLE_STEP_VERSION;
+    root.version.minor_id = sizeof(HsaApiTable);
+
+    root.core_ = &core;
+    core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION;
+    core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION;
+    core.version.minor_id = sizeof(CoreApiTable);
+
+    root.amd_ext_ = &amd_ext;
+    amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION;
+    amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION;
+    amd_ext.version.minor_id = sizeof(AmdExtTable);
+
+    root.finalizer_ext_ = &finalizer_ext;
+    finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION;
+    finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION;
+    finalizer_ext.version.minor_id = sizeof(FinalizerExtTable);
+
+    root.image_ext_ = &image_ext;
+    image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION;
+    image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION;
+    image_ext.version.minor_id = sizeof(ImageExtTable);
+
+    root.tools_ = &tools;
+    tools.version.major_id = HSA_TOOLS_API_TABLE_MAJOR_VERSION;
+    tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION;
+    tools.version.minor_id = sizeof(ToolsApiTable);
+
+    root.pc_sampling_ext_ = &pc_sampling_ext;
+    pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION;
+    pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION;
+    pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable);
+  }
+};
+
+// Copy a table's function pointers (bytes past ApiTableVersion) from src to dest; size is in bytes
+static void inline copyApi(void* dest, void* src, size_t size) {
+  assert(size >= sizeof(ApiTableVersion));
+  // With NDEBUG the assert vanishes; bail out so the unsigned length below cannot wrap around
+  if (size <= sizeof(ApiTableVersion)) return;
+  memcpy((char*)dest + sizeof(ApiTableVersion), (char*)src + sizeof(ApiTableVersion),
+         size - sizeof(ApiTableVersion));
+}
+
+// Copy an Api child table if src has a non-zero major id equal to dest's; else zero dest's ids.
+static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) {
+  if (!src->major_id || (dest->major_id != src->major_id)) {
+    dest->major_id = 0;
+    dest->minor_id = 0;
+    dest->step_id = 0;
+    return;
+  }
+  dest->step_id = src->step_id;
+  dest->minor_id = Min(dest->minor_id, src->minor_id);
+  copyApi(dest, src, dest->minor_id);
+}
+
+// Copies every compatible Api table from src into dest. The caller must
+// have initialized the tables container correctly beforehand: root and
+// child tables need their Major, Minor and Stepping Ids set. For each
+// table that is copied, the Minor Id becomes the minimum of the source
+// and destination values and the Stepping Id is taken from the source.
+// A root Major Id mismatch zeroes dest's root version ids and copies nothing.
+static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) {
+  // Root tables with different Major Ids are incompatible: zero dest's ids and stop
+  if (src->version.major_id != dest->version.major_id) {
+    dest->version.major_id = 0;
+    dest->version.minor_id = 0;
+    dest->version.step_id = 0;
+    return;
+  }
+
+  // The root Stepping Id follows the source. The root Minor Id encodes
+  // the struct size, so keep the smaller of the source and destination
+  // sizes as the size both sides have in common
+  dest->version.step_id = src->version.step_id;
+  dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id);
+
+  // A child table is copied only if its pointer member starts inside that common size
+  if (dest->version.minor_id > offsetof(HsaApiTable, core_))
+    copyElement(&dest->core_->version, &src->core_->version);
+  if (dest->version.minor_id > offsetof(HsaApiTable, amd_ext_))
+    copyElement(&dest->amd_ext_->version, &src->amd_ext_->version);
+  if (dest->version.minor_id > offsetof(HsaApiTable, finalizer_ext_))
+    copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version);
+  if (dest->version.minor_id > offsetof(HsaApiTable, image_ext_))
+    copyElement(&dest->image_ext_->version, &src->image_ext_->version);
+  if (dest->version.minor_id > offsetof(HsaApiTable, tools_))
+    copyElement(&dest->tools_->version, &src->tools_->version);
+  if (dest->version.minor_id > offsetof(HsaApiTable, pc_sampling_ext_))
+    copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version);
+}
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h
new file mode 100644
index 0000000000..6cf1054823
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h
@@ -0,0 +1,70 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
+#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
+
+// CODE IN THIS FILE **MUST** BE C-COMPATIBLE
+
+// Major Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_MAJOR_VERSION                 0x03
+#define HSA_CORE_API_TABLE_MAJOR_VERSION            0x02
+#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION         0x02
+#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION       0x02
+#define HSA_IMAGE_API_TABLE_MAJOR_VERSION           0x02
+#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION      0x01
+#define HSA_TOOLS_API_TABLE_MAJOR_VERSION           0x01
+#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION     0x01
+
+// Step Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_STEP_VERSION                  0x01
+#define HSA_CORE_API_TABLE_STEP_VERSION             0x00
+#define HSA_AMD_EXT_API_TABLE_STEP_VERSION          0x08
+#define HSA_FINALIZER_API_TABLE_STEP_VERSION        0x00
+#define HSA_IMAGE_API_TABLE_STEP_VERSION            0x01
+// Rocprofiler just checks HSA_IMAGE_EXT_API_TABLE_STEP_VERSION
+#define HSA_IMAGE_EXT_API_TABLE_STEP_VERSION        HSA_IMAGE_API_TABLE_STEP_VERSION
+#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION       0x00
+#define HSA_TOOLS_API_TABLE_STEP_VERSION            0x00
+#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION      0x00
+
+#endif  // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h
new file mode 100644
index 0000000000..3fd1f9348e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h
@@ -0,0 +1,3782 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension.
+
+#ifndef HSA_RUNTIME_EXT_AMD_H_
+#define HSA_RUNTIME_EXT_AMD_H_
+
+#include "hsa.h"
+#include "hsa_ext_image.h"
+#include "hsa_ven_amd_pc_sampling.h"
+
+/**
+ * - 1.0 - initial version
+ * - 1.1 - dmabuf export
+ * - 1.2 - hsa_amd_memory_async_copy_on_engine
+ * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool
+ * - 1.4 - Virtual Memory API
+ * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES
+ * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align
+ * - 1.7 - hsa_amd_signal_wait_all
+ * - 1.8 - hsa_amd_memory_get_preferred_copy_engine
+ * - 1.9 - hsa_amd_portable_export_dmabuf_v2
+ * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER
+ * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS
+ * - 1.12 - hsa_amd_pointer_info: HSA_EXT_POINTER_TYPE_HSA_VMEM and HSA_EXT_POINTER_TYPE_RESERVED_ADDR
+ * - 1.13 - hsa_amd_pointer_info: Added new registered field to hsa_amd_pointer_info_t
+ * - 1.14 - hsa_amd_ais_file_write, hsa_amd_ais_file_read
+ */
+#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
+#define HSA_AMD_INTERFACE_VERSION_MINOR 14
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \addtogroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief Set a flag bit within a uint8_t[8] bit-field.
+ *
+ * @param[in,out] value Pointer to the first byte of the uint8_t[8] flag array.
+ *
+ * @param[in] bit Zero-based index of the flag to set. Flag @p bit is stored in
+ * byte (@p bit / 8) at bit position (@p bit % 8). Indices of 64 and above lie
+ * outside the 8-byte array and are ignored.
+ */
+static inline void hsa_flag_set64(uint8_t* value, uint32_t bit) {
+  /* A uint8_t[8] holds 64 flags; never write past its last byte. */
+  if (bit >= 64) return;
+  value[bit / 8] |= (uint8_t)(1u << (bit % 8));
+}
+
+/**
+ * @brief Determine whether a flag bit is set within a uint8_t[8] bit-field.
+ *
+ * @param[in] value Pointer to the first byte of the uint8_t[8] flag array.
+ *
+ * @param[in] bit Zero-based index of the flag to test. Flag @p bit is stored
+ * in byte (@p bit / 8) at bit position (@p bit % 8).
+ *
+ * @return true if the flag is set; false if it is clear or if @p bit is 64 or
+ * above, which lies outside the 8-byte array.
+ */
+static inline bool hsa_flag_isset64(uint8_t* value, uint32_t bit) {
+  /* A uint8_t[8] holds 64 flags; never read past its last byte. */
+  if (bit >= 64) return false;
+  return (value[bit / 8] & (1u << (bit % 8))) != 0;
+}
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants.
+ */
+typedef uint32_t hsa_signal_condition32_t;
+
+/**
+ * @brief AMD vendor specific packet type.
+ */
+typedef enum {
+  /**
+   * Packet used by agents to delay processing of subsequent packets until a
+   * configurable condition is satisfied by an HSA signal.  Only kernel dispatch
+   * queues created from AMD GPU Agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+  /**
+   * Packet used to send commands to an AIE agent's embedded runtime (ERT). The
+   * ERT is responsible for, among other things, handling dispatches. Only
+   * queues created on AIE agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_AIE_ERT = 3
+} hsa_amd_packet_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants.
+ */
+typedef uint8_t hsa_amd_packet_type8_t;
+
+/**
+ * @brief AMD vendor specific AQL packet header
+ */
+typedef struct hsa_amd_packet_header_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   * Format of the vendor specific packet.
+   */
+  hsa_amd_packet_type8_t AmdFormat;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved;
+} hsa_amd_vendor_packet_header_t;
+
+/**
+ * @brief AMD barrier value packet.  Halts packet processing and waits for
+ * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value
+ * is the value of the signal ::signal.
+ */
+typedef struct hsa_amd_barrier_value_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Dependent signal object. A signal with a handle value of 0 is
+   * allowed and is interpreted by the packet processor as a satisfied
+   * dependency.
+   */
+  hsa_signal_t signal;
+
+  /**
+   * Value to compare against.
+   */
+  hsa_signal_value_t value;
+
+  /**
+   * Bit mask to be combined by bitwise AND with ::signal's value.
+   */
+  hsa_signal_value_t mask;
+
+  /**
+   * Comparison operation.  See ::hsa_signal_condition_t.
+   */
+  hsa_signal_condition32_t cond;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+} hsa_amd_barrier_value_packet_t;
+
+/**
+ * State of an AIE ERT command.
+ */
+typedef enum {
+  /**
+   * Set by the host before submitting a command to the scheduler.
+   */
+  HSA_AMD_AIE_ERT_STATE_NEW = 1,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_QUEUED = 2,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_RUNNING = 3,
+  /**
+   * Set by the scheduler when a command completes.
+   */
+  HSA_AMD_AIE_ERT_STATE_COMPLETED = 4,
+  /**
+   * Set by the scheduler if a command failed.
+   */
+  HSA_AMD_AIE_ERT_STATE_ERROR = 5,
+  /**
+   * Set by the scheduler if a command aborted.
+   */
+  HSA_AMD_AIE_ERT_STATE_ABORT = 6,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7,
+  /**
+   * Set by the scheduler on a timeout and reset.
+   */
+  HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8,
+  /**
+   * Set by the scheduler on a timeout followed by a failed reset.
+   */
+  HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9,
+  HSA_AMD_AIE_ERT_STATE_SKERROR = 10,
+  HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11,
+  HSA_AMD_AIE_ERT_STATE_MAX
+} hsa_amd_aie_ert_state;
+
+/**
+ * Opcode types for HSA AIE ERT commands.
+ */
+typedef enum {
+  /**
+   * Start a workgroup on a compute unit (CU).
+   */
+  HSA_AMD_AIE_ERT_START_CU = 0,
+  /**
+   * Currently aliased to HSA_AMD_AIE_ERT_START_CU.
+   */
+  HSA_AMD_AIE_ERT_START_KERNEL = 0,
+  /**
+   * Configure command scheduler.
+   */
+  HSA_AMD_AIE_ERT_CONFIGURE = 2,
+  HSA_AMD_AIE_ERT_EXIT = 3,
+  HSA_AMD_AIE_ERT_ABORT = 4,
+  /**
+   * Execute a specified CU after writing.
+   */
+  HSA_AMD_AIE_ERT_EXEC_WRITE = 5,
+  /**
+   * Get stats about a CU's execution.
+   */
+  HSA_AMD_AIE_ERT_CU_STAT = 6,
+  /**
+   * Start KDMA CU or P2P.
+   */
+  HSA_AMD_AIE_ERT_START_COPYBO = 7,
+  /**
+   * Configure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_CONFIG = 8,
+  /**
+   * Start a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_START = 9,
+  /**
+   * Unconfigure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_UNCONFIG = 10,
+  /**
+   * Initialize a CU.
+   */
+  HSA_AMD_AIE_ERT_INIT_CU = 11,
+  HSA_AMD_AIE_ERT_START_FA = 12,
+  HSA_AMD_AIE_ERT_CLK_CALIB = 13,
+  HSA_AMD_AIE_ERT_MB_VALIDATE = 14,
+  /**
+   * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair.
+   */
+  HSA_AMD_AIE_ERT_START_KEY_VAL = 15,
+  HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16,
+  HSA_AMD_AIE_ERT_ACCESS_TEST = 17,
+  /**
+   * Instruction buffer command format.
+   */
+  HSA_AMD_AIE_ERT_START_DPU = 18,
+  /**
+   * Command chain.
+   */
+  HSA_AMD_AIE_ERT_CMD_CHAIN = 19,
+  /**
+   * Instruction buffer command format on NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU = 20,
+  /**
+   * Instruction buffer command with pre-emption format on the NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21
+} hsa_amd_aie_ert_cmd_opcode_t;
+
+/**
+ * Payload data for AIE ERT start kernel packets (i.e., when the opcode is
+ * HSA_AMD_AIE_ERT_START_KERNEL).
+ */
+typedef struct hsa_amd_aie_ert_start_kernel_data_s {
+  /**
+   * Address to the PDI.
+   */
+  void* pdi_addr;
+  /**
+   * Opcode, instructions and kernel arguments.
+   */
+  uint32_t data[];
+} hsa_amd_aie_ert_start_kernel_data_t;
+
+/**
+ * AMD AIE ERT packet. Used for sending a command to an AIE agent.
+ */
+typedef struct hsa_amd_aie_ert_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+  /**
+   * Format for packets interpreted by the ERT to understand the command and
+   * payload data.
+   */
+  struct {
+    /**
+     * Current state of a command.
+     */
+    uint32_t state : 4;
+    /**
+     * Flexible field that can be interpreted on a per-command basis.
+     */
+    uint32_t custom : 8;
+    /**
+     * Number of DWORDs in the payload data.
+     */
+    uint32_t count : 11;
+    /**
+     * Opcode identifying the command.
+     */
+    uint32_t opcode : 5;
+    /**
+     * Type of a command (currently 0).
+     */
+    uint32_t type : 4;
+  };
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved0;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved1;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved4;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved5;
+  /**
+   * Address of packet data payload. ERT commands contain arbitrarily sized
+   * data payloads.
+   */
+  uint64_t payload_data;
+} hsa_amd_aie_ert_packet_t;
+
+/** @} */
+
+/** \defgroup error-codes Error codes
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t.
+ *
+ * @remark Additions to hsa_status_t
+ */
+enum {
+  /**
+   * The memory pool is invalid.
+   */
+  HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40,
+
+  /**
+   * Agent accessed memory beyond the maximum legal address.
+   */
+  HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41,
+
+  /**
+   * Agent executed an invalid shader instruction.
+   */
+  HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+
+  /**
+   * Agent attempted to access an inaccessible address.
+   * See hsa_amd_register_system_event_handler and
+   * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
+   */
+  HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
+
+  /**
+   * Exceeded number of VGPRs available on this agent
+   */
+  HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45,
+
+  /**
+   * Resource is busy or temporarily unavailable
+   */
+  HSA_STATUS_ERROR_RESOURCE_BUSY = 46,
+
+  /**
+   * Request is not supported by this system
+   */
+  HSA_STATUS_ERROR_NOT_SUPPORTED = 47,
+};
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief IOMMU version supported
+ */
+typedef enum {
+  /**
+   * IOMMU not supported
+   */
+  HSA_IOMMU_SUPPORT_NONE = 0,
+  /* IOMMU V1 support is not relevant to user applications, so not reporting it */
+  /**
+   * IOMMU V2 supported
+   */
+  HSA_IOMMU_SUPPORT_V2 = 1,
+} hsa_amd_iommu_version_t;
+
+/**
+ * @brief Structure containing information on the agent's clock counters.
+ */
+typedef struct hsa_amd_clock_counters_s {
+  uint64_t gpu_clock_counter;
+  uint64_t cpu_clock_counter;
+  uint64_t system_clock_counter;
+  uint64_t system_clock_frequency;
+} hsa_amd_clock_counters_t;
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum hsa_amd_agent_info_s {
+  /**
+   * Chip identifier. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
+  /**
+   * Size of a cacheline in bytes. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
+  /**
+   * The number of compute unit available in the agent. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
+  /**
+   * The maximum clock frequency of the agent in MHz. The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+  /**
+   * Internal driver node identifier. The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004,
+  /**
+   * Max number of watch points on memory address ranges to generate exception
+   * events when the watched addresses are accessed.  The type of this
+   * attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005,
+  /**
+   * Agent BDF_ID, named LocationID in thunk. The type of this attribute is
+   * uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_BDFID = 0xA006,
+  /**
+   * Memory Interface width, the return value type is uint32_t.
+   * This attribute is deprecated.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007,
+  /**
+   * Max Memory Clock, the return value type is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008,
+  /**
+   * Board name of Agent - populated from MarketingName of Kfd Node
+   * The value is an Ascii string of 64 chars.
+   */
+  HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
+  /**
+   * Maximum number of waves possible in a Compute Unit.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
+  /**
+   * Number of SIMDs per compute unit (CU).
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
+  /**
+   * Number of Shader Engines (SE) in Gpu
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C,
+  /**
+   * Number of Shader Arrays Per Shader Engines in Gpu
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D,
+  /**
+   * Address of the HDP flush registers.  Use of these registers does not conform to the HSA memory
+   * model and should be treated with caution.
+   * The type of this attribute is hsa_amd_hdp_flush_t.
+   */
+  HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E,
+  /**
+   * PCIe domain for the agent.  Pairs with HSA_AMD_AGENT_INFO_BDFID
+   * to give the full physical location of the Agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F,
+  /**
+   * Queries for support of cooperative queues.  See ::HSA_QUEUE_TYPE_COOPERATIVE.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010,
+  /**
+   * Queries UUID of an agent. The value is an Ascii string with a maximum
+   * of 21 chars including NUL. The string value consists of two parts: header
+   * and body. The header identifies device type (GPU, CPU, DSP) while body
+   * encodes UUID as a 16 digit hex string
+   *
+   * Agents that do not support UUID will return the string "GPU-XX" or
+   * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t
+   */
+  HSA_AMD_AGENT_INFO_UUID = 0xA011,
+  /**
+   * Queries for the ASIC revision of an agent. The value is an integer that
+   * increments for each revision. This can be used by user-level software to
+   * change how it operates, depending on the hardware version. This allows
+   * selective workarounds for hardware errata.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012,
+  /**
+   * Queries whether or not the host can directly access SVM memory that is
+   * physically resident in the agent's local memory.
+   * The type of this attribute is bool.
+   */
+  HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013,
+  /**
+   * Some processors support more CUs than can reliably be used in a cooperative
+   * dispatch.  This queries the count of CUs which are fully enabled for
+   * cooperative dispatch.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014,
+  /**
+   * Queries the amount of memory available in bytes across all global pools
+   * owned by the agent.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015,
+  /**
+   * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+   * in the range 1-400MHz.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016,
+  /**
+   * Queries for the ASIC family ID of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107,
+  /**
+   * Queries for the Packet Processor(CP Firmware) ucode version of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_UCODE_VERSION = 0xA108,
+  /**
+   * Queries for the SDMA engine ucode of an agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION = 0xA109,
+  /**
+   * Queries the number of SDMA engines.
+   * If HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG query returns non-zero,
+   * this query returns the number of SDMA engines optimized for
+   * host to device bidirectional traffic.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SDMA_ENG = 0xA10A,
+  /**
+   * Queries the number of additional SDMA engines optimized for D2D xGMI copies.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG = 0xA10B,
+  /**
+   * Queries for version of IOMMU supported by agent.
+   * The type of this attribute is hsa_amd_iommu_version_t.
+   */
+  HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110,
+  /**
+   * Queries for number of XCCs within the agent.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111,
+  /**
+   * Queries for driver unique identifier.
+   * The type of this attribute is uint32_t.
+   */
+  HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112,
+  /**
+   * Returns the hsa_agent_t of the nearest CPU agent
+   * The type of this attribute is hsa_agent_t.
+   */
+  HSA_AMD_AGENT_INFO_NEAREST_CPU = 0xA113,
+  /**
+   * Bit-mask indicating memory properties of this agent. A memory property is set if the flag bit
+   * is set at that position. User may use the hsa_flag_isset64 function to verify whether a flag
+   * is set. The type of this attribute is uint8_t[8].
+   */
+  HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES = 0xA114,
+  /**
+   * Bit-mask indicating AQL Extensions supported by this agent. An AQL extension is set if the flag
+   * bit is set at that position. User may use the hsa_flag_isset64 function to verify whether a flag
+   * is set. The type of this attribute is uint8_t[8].
+   */
+  HSA_AMD_AGENT_INFO_AQL_EXTENSIONS = 0xA115, /* Not implemented yet */
+  /**
+   * Maximum allowed value in bytes for scratch limit for this agent. This amount
+   * is shared across all queues created on this agent.
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX = 0xA116,
+  /**
+   * Current scratch limit threshold in bytes for this agent. This limit can be
+   * modified using the hsa_amd_agent_set_async_scratch_limit call.
+   * - AQL dispatches that require scratch-memory above this threshold will trigger a
+   *   scratch use-once.
+   * - For AQL dispatches using less scratch-memory than this threshold, ROCr will
+   *   permanently assign the allocated scratch memory to the queue handling the dispatch.
+   *   This memory can be reclaimed by calling hsa_amd_agent_set_async_scratch_limit
+   *   with a threshold lower than the current value.
+   *
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT = 0xA117,
+  /**
+   * Queries the driver for clock counters of the agent.
+   * The type of this attribute is hsa_amd_clock_counters_t.
+   */
+  HSA_AMD_AGENT_INFO_CLOCK_COUNTERS = 0xA118
+} hsa_amd_agent_info_t;
+
+/**
+ * @brief Agent memory properties attributes
+ */
+typedef enum hsa_amd_agent_memory_properties_s {
+  HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU = (1 << 0),
+} hsa_amd_agent_memory_properties_t;
+
+/**
+ * @brief SDMA engine IDs unique by single set bit position.
+ */
+typedef enum hsa_amd_sdma_engine_id {
+  HSA_AMD_SDMA_ENGINE_0 = 0x1,
+  HSA_AMD_SDMA_ENGINE_1 = 0x2,
+  HSA_AMD_SDMA_ENGINE_2 = 0x4,
+  HSA_AMD_SDMA_ENGINE_3 = 0x8,
+  HSA_AMD_SDMA_ENGINE_4 = 0x10,
+  HSA_AMD_SDMA_ENGINE_5 = 0x20,
+  HSA_AMD_SDMA_ENGINE_6 = 0x40,
+  HSA_AMD_SDMA_ENGINE_7 = 0x80,
+  HSA_AMD_SDMA_ENGINE_8 = 0x100,
+  HSA_AMD_SDMA_ENGINE_9 = 0x200,
+  HSA_AMD_SDMA_ENGINE_10 = 0x400,
+  HSA_AMD_SDMA_ENGINE_11 = 0x800,
+  HSA_AMD_SDMA_ENGINE_12 = 0x1000,
+  HSA_AMD_SDMA_ENGINE_13 = 0x2000,
+  HSA_AMD_SDMA_ENGINE_14 = 0x4000,
+  HSA_AMD_SDMA_ENGINE_15 = 0x8000
+} hsa_amd_sdma_engine_id_t;
+
+/**
+ * @brief Addresses of the HDP flush registers of an agent. This is the type
+ * of the ::HSA_AMD_AGENT_INFO_HDP_FLUSH agent attribute. Use of these
+ * registers does not conform to the HSA memory model and should be treated
+ * with caution.
+ */
+typedef struct hsa_amd_hdp_flush_s {
+  uint32_t* HDP_MEM_FLUSH_CNTL;
+  uint32_t* HDP_REG_FLUSH_CNTL;
+} hsa_amd_hdp_flush_t;
+
+/**
+ * @brief Region attributes.
+ */
+#ifdef __cplusplus
+typedef enum hsa_amd_region_info_s : int {
+#else
+typedef enum hsa_amd_region_info_s {
+#endif
+  /**
+   * Determine if host can access the region. The type of this attribute
+   * is bool.
+   */
+  HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000,
+  /**
+   * Base address of the region in flat address space.
+   */
+  HSA_AMD_REGION_INFO_BASE = 0xA001,
+  /**
+   * Memory Interface width, the return value type is uint32_t.
+   * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
+   */
+  HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002,
+  /**
+   * Max Memory Clock, the return value type is uint32_t.
+   * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
+   */
+  HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+} hsa_amd_region_info_t;
+
+/**
+ * @brief Coherency attributes of fine grain region.
+ */
+typedef enum hsa_amd_coherency_type_s {
+  /**
+   * Coherent region.
+   */
+  HSA_AMD_COHERENCY_TYPE_COHERENT = 0,
+  /**
+   * Non coherent region.
+   */
+  HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
+} hsa_amd_coherency_type_t;
+
+
+/**
+ * @brief dmabuf attributes
+ */
+#ifdef __cplusplus
+typedef enum hsa_amd_dma_buf_mapping_type_s : int {
+#else
+typedef enum hsa_amd_dma_buf_mapping_type_s {
+#endif
+  HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0,
+  HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1
+} hsa_amd_dma_buf_mapping_type_t;
+/**
+ * @brief Get the coherency type of the fine grain region of an agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[out] type Pointer to a memory location where the HSA runtime will
+ * store the coherency type of the fine grain region.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t* type);
+
+/**
+ * @brief Set the coherency type of the fine grain region of an agent.
+ * Deprecated.  This is supported on KV platforms.  For backward compatibility
+ * other platforms will spuriously succeed.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] type The coherency type to be set.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t type);
+
+/** @} */
+
+/** \defgroup profile Profiling
+ *  @{
+ */
+
+/**
+ * @brief Structure containing profiling dispatch time information.
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency is obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_dispatch_time_s {
+  /**
+   * Dispatch packet processing start time.
+   */
+  uint64_t start;
+  /**
+   * Dispatch packet completion time.
+   */
+  uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+/**
+ * @brief Structure containing profiling async copy time information.
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency is obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_async_copy_time_s {
+  /**
+   * Async copy processing start time.
+   */
+  uint64_t start;
+  /**
+   * Async copy completion time.
+   */
+  uint64_t end;
+} hsa_amd_profiling_async_copy_time_t;
+
+/**
+ * @brief Enable or disable profiling capability of a queue.
+ *
+ * @param[in] queue A valid queue.
+ *
+ * @param[in] enable 1 to enable profiling. 0 to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
+
+/**
+ * @brief Enable or disable asynchronous memory copy profiling.
+ *
+ * @details The runtime will provide the copy processing start timestamp and
+ * completion timestamp of each call to hsa_amd_memory_async_copy if the
+ * async copy profiling is enabled prior to the call to
+ * hsa_amd_memory_async_copy. The completion signal object is used to
+ * hold the last async copy start and end timestamp. The client can retrieve
+ * these timestamps via call to hsa_amd_profiling_get_async_copy_time.
+ *
+ * @param[in] enable True to enable profiling. False to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources
+ * needed to profile the asynchronous copy.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_async_copy_enable(bool enable);
+
+/**
+ * @brief Retrieve packet processing time stamps.
+ *
+ * @param[in] agent The agent with which the signal was last used.  For
+ * instance, if the profiled dispatch packet is dispatched onto queue Q,
+ * which was created on agent A, then this parameter must be A.
+ *
+ * @param[in] signal A signal used as the completion signal of the dispatch
+ * packet to retrieve time stamps from.  This dispatch packet must have been
+ * issued to a queue with profiling enabled and have already completed.  Also
+ * the signal must not have yet been used in any other packet following the
+ * completion of the profiled dispatch packet.
+ *
+ * @param[out] time Packet processing timestamps in the HSA system clock
+ * domain.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+    hsa_agent_t agent, hsa_signal_t signal,
+    hsa_amd_profiling_dispatch_time_t* time);
+
+/**
+ * @brief Retrieve asynchronous copy timestamps.
+ *
+ * @details Async copy profiling is enabled via call to
+ * hsa_amd_profiling_async_copy_enable.
+ *
+ * @param[in] signal A signal used as the completion signal of the call to
+ * hsa_amd_memory_async_copy.
+ *
+ * @param[out] time Async copy processing timestamps in the HSA system clock
+ * domain.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time);
+
+/**
+ * @brief Computes the frequency ratio and offset between the agent clock and
+ * HSA system clock and converts the agent's tick to HSA system domain tick.
+ *
+ * @param[in] agent The agent used to retrieve @p agent_tick. It is the user's
+ * responsibility to make sure the tick number is from this agent, otherwise,
+ * the behavior is undefined.
+ *
+ * @param[in] agent_tick The tick count retrieved from the specified @p agent.
+ *
+ * @param[out] system_tick The translated HSA system domain clock counter tick.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
+                                                    uint64_t agent_tick,
+                                                    uint64_t* system_tick);
+
+/** @} */
+
+/** \defgroup status Runtime notifications
+ *  @{
+ */
+
+/**
+ * @brief Signal attribute flags.
+ */
+typedef enum {
+  /**
+   * Signal will only be consumed by AMD GPUs.  Limits signal consumption to
+   * AMD GPU agents only.  Ignored if @p num_consumers is not zero (all agents).
+   */
+  HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1,
+  /**
+   * Signal may be used for interprocess communication.
+   * IPC signals can be read, written, and waited on from any process.
+   * Profiling using an IPC enabled signal is only supported in a single process
+   * at a time.  Producing profiling data in one process and consuming it in
+   * another process is undefined.
+   */
+  HSA_AMD_SIGNAL_IPC = 2,
+} hsa_amd_signal_attribute_t;
+
+/**
+ * @brief Create a signal with specific attributes.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[in] attributes Requested signal attributes.  Multiple signal attributes
+ * may be requested by combining them with bitwise OR.  Requesting no attributes
+ * (@p attributes == 0) results in the same signal as would have been obtained
+ * via hsa_signal_create.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers,
+                                           const hsa_agent_t* consumers, uint64_t attributes,
+                                           hsa_signal_t* signal);
+
+/**
+ * @brief Returns a pointer to the value of a signal.
+ *
+ * Use of this API does not modify the lifetime of @p signal and any
+ * hsa_signal_value_t retrieved by this API has lifetime equal to that of
+ * @p signal.
+ *
+ * This API is intended for partial interoperability with non-HSA compatible
+ * devices and should not be used where HSA interfaces are available.
+ *
+ * Use of the signal value must comply with use restrictions of @p signal.
+ * Use may result in data races if the operations performed are not platform
+ * atomic.  Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC
+ * attributed signals is required.
+ *
+ * @param[in] signal Signal handle to extract the signal value pointer from.
+ *
+ * @param[out] value_ptr Location where the extracted signal value pointer will
+ * be placed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is not a valid
+ * hsa_signal_t.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p value_ptr is NULL.
+ */
+hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
+                                          volatile hsa_signal_value_t** value_ptr);
+
+/**
+ * @brief Asynchronous signal handler function type.
+ *
+ * @details Type definition of callback function to be used with
+ * hsa_amd_signal_async_handler. This callback is invoked if the associated
+ * signal and condition are met. The callback receives the value of the signal
+ * which satisfied the associated wait condition and a user provided value. If
+ * the callback returns true then the callback will be called again if the
+ * associated signal and condition are satisfied again. If the callback returns
+ * false then it will not be called again.
+ *
+ * @param[in] value Contains the value of the signal observed by
+ * hsa_amd_signal_async_handler which caused the signal handler to be invoked.
+ *
+ * @param[in] arg Contains the user provided value given when the signal handler
+ * was registered with hsa_amd_signal_async_handler.
+ *
+ * @retval true resumes monitoring the signal with this handler (as if calling
+ * hsa_amd_signal_async_handler again with identical parameters)
+ *
+ * @retval false stops monitoring the signal with this handler (handler will
+ * not be called again for this signal)
+ *
+ */
+typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg);
+
+/**
+ * @brief Register asynchronous signal handler function.
+ *
+ * @details Allows registering a callback function and user provided value with
+ * a signal and wait condition. The callback will be invoked if the associated
+ * signal and wait condition are satisfied. Callbacks will be invoked serially
+ * but in an arbitrary order so callbacks should be independent of each other.
+ * After being invoked a callback may continue to wait for its associated signal
+ * and condition and, possibly, be invoked again. Or the callback may stop
+ * waiting. If the callback returns true then it will continue waiting and may
+ * be called again. If false then the callback will not wait again and will not
+ * be called again for the associated signal and condition. It is possible to
+ * register the same callback multiple times with the same or different signals
+ * and/or conditions. Each registration of the callback will be treated entirely
+ * independently.
+ *
+ * @param[in] signal hsa signal to be asynchronously monitored
+ *
+ * @param[in] cond condition value to monitor for
+ *
+ * @param[in] value signal value used in condition expression
+ *
+ * @param[in] handler asynchronous signal handler invoked when signal's
+ * condition is met
+ *
+ * @param[in] arg user provided value which is provided to handler when handler
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_signal_async_handler(hsa_signal_t signal,
+                                 hsa_signal_condition_t cond,
+                                 hsa_signal_value_t value,
+                                 hsa_amd_signal_handler handler, void* arg);
+
+/**
+ * @brief Wait for all signal-condition pairs to be satisfied.
+ *
+ * @details Allows waiting for all of several signal and condition pairs to be
+ * satisfied. The function returns 0 if all signals met their conditions and -1
+ * on a timeout (the return type is uint32_t, so -1 is observed by the caller
+ * as UINT32_MAX). The value of each signal's satisfying value is returned in
+ * @p satisfying_values unless @p satisfying_values is nullptr. NULL and
+ * invalid signals are considered to have value 0 and their conditions already
+ * satisfied. This function provides only relaxed memory semantics.
+ */
+uint32_t HSA_API hsa_amd_signal_wait_all(uint32_t signal_count, hsa_signal_t* signals,
+                                         hsa_signal_condition_t* conds, hsa_signal_value_t* values,
+                                         uint64_t timeout_hint, hsa_wait_state_t wait_hint,
+                                         hsa_signal_value_t* satisfying_values);
+
+/**
+ * @brief Wait for any signal-condition pair to be satisfied.
+ *
+ * @details Allows waiting for any of several signal and condition pairs to be
+ * satisfied. The function returns the index into the list of signals of the
+ * first satisfying signal-condition pair. The function returns
+ * std::numeric_limits<uint32_t>::max() if no valid signal is provided. The value
+ * of the satisfying signal's value is returned in @p satisfying_value, unless
+ * @p satisfying_value is nullptr or there's no valid signal in the
+ * signal-condition pairs. NULL and invalid signals are ignored. This function
+ * provides only relaxed memory semantics.
+ */
+uint32_t HSA_API
+    hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
+                            hsa_signal_condition_t* conds,
+                            hsa_signal_value_t* values, uint64_t timeout_hint,
+                            hsa_wait_state_t wait_hint,
+                            hsa_signal_value_t* satisfying_value);
+
+/** @} */
+
+/**
+ * @brief Call a function asynchronously
+ *
+ * @details Provides access to the runtime's asynchronous event handling thread
+ * for general asynchronous functions.  Functions queued this way are executed
+ * in the same manner as if they were a signal handler whose signal is
+ * satisfied.
+ *
+ * @param[in] callback asynchronous function to be invoked
+ *
+ * @param[in] arg user provided value which is provided to @p callback when it
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_async_function(void (*callback)(void* arg), void* arg);
+
+/** \addtogroup ext-images Images and samplers
+ *  @{
+ */
+
+/**
+ * @brief Encodes an opaque vendor specific image format.  The length of data
+ * depends on the underlying format.  This structure must not be copied as its
+ * true length can not be determined.
+ */
+typedef struct hsa_amd_image_descriptor_s {
+  /*
+  Version number of the descriptor
+  */
+  uint32_t version;
+
+  /*
+  Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID.
+  */
+  uint32_t deviceID;
+
+  /*
+  Start of vendor specific data.
+  */
+  uint32_t data[1];
+} hsa_amd_image_descriptor_t;
+
+/**
+ * @brief Creates an image from an opaque vendor specific image format.
+ * Does not modify data at @p image_data.  Intended initially for
+ * accessing interop images.
+ *
+ * @param[in] agent Agent on which to create the image
+ *
+ * @param[in] image_descriptor HSA image descriptor
+ * (::hsa_ext_image_descriptor_t) for the image
+ *
+ * @param[in] image_layout Vendor specific image format
+ * (::hsa_amd_image_descriptor_t)
+ *
+ * @param[in] image_data Pointer to image backing store
+ *
+ * @param[in] access_permission Access permissions for the image object
+ *
+ * @param[out] image Created image object.
+ *
+ * @retval HSA_STATUS_SUCCESS Image created successfully
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor,
+ * null image_data, or mismatched access_permission.
+ */
+hsa_status_t HSA_API hsa_amd_image_create(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const hsa_amd_image_descriptor_t *image_layout,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image
+);
+
+/**
+ * @brief Query image limits.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute HSA image info attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p value is NULL or @p attribute <
+ * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute >
+ * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS.
+ * NOTE(review): this function takes no queue; confirm against the
+ * implementation whether ::HSA_STATUS_ERROR_INVALID_ARGUMENT is the code
+ * actually returned for these conditions.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
+                                                    hsa_agent_info_t attribute,
+                                                    void* value);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
+ * If @p num_cu_mask_count is 0 then the request is interpreted as a request to
+ * enable all CUs and no @p cu_mask array need be given.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[in] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
+ * a multiple of 32 or @p num_cu_mask_count is not 0 and @p cu_mask is NULL.
+ * Devices with work group processors must enable CUs in contiguous pairs
+ * starting at an even index, e.g. 0x33(b'110011) is valid while 0x5(b'101)
+ * and 0x6(b'0110) are invalid.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask);
+
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Memory segments associated with a memory pool.
+ */
+typedef enum {
+  /**
+   * Global segment. Used to hold data that is shared by all agents.
+   */
+  HSA_AMD_SEGMENT_GLOBAL = 0,
+  /**
+   * Read-only segment. Used to hold data that remains constant during the
+   * execution of a kernel.
+   */
+  HSA_AMD_SEGMENT_READONLY = 1,
+  /**
+   * Private segment. Used to hold data that is local to a single work-item.
+   */
+  HSA_AMD_SEGMENT_PRIVATE = 2,
+  /**
+   * Group segment. Used to hold data that is shared by the work-items of a
+   * work-group.
+   */
+  HSA_AMD_SEGMENT_GROUP = 3,
+} hsa_amd_segment_t;
+
+/**
+ * @brief A memory pool encapsulates physical storage on an agent
+ * along with a memory access model.
+ *
+ * @details A memory pool encapsulates a physical partition of an agent's
+ * memory system along with a memory access model.  Division of a single
+ * memory system into separate pools allows querying each partition's access
+ * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations
+ * from a pool are preferentially bound to that pool's physical partition.
+ * Binding to the pool's preferential physical partition may not be
+ * possible or persistent depending on the system's memory policy
+ * and/or state which is beyond the scope of HSA APIs.
+ *
+ * For example, a multi-node NUMA memory system may be represented by multiple
+ * pools with each pool providing size and access path information for the
+ * partition it represents.  Allocations from a pool are preferentially bound
+ * to the pool's partition (which in this example is a NUMA node) while
+ * following its memory access model. The actual placement may vary or migrate
+ * due to the system's NUMA policy and state, which is beyond the scope of
+ * HSA APIs.
+ */
+typedef struct hsa_amd_memory_pool_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_amd_memory_pool_t;
+
+/**
+ * @brief Flags describing a memory pool in the global segment. The values are
+ * bit flags (1, 2, 4, 8) reported as a bit-field through
+ * ::HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS.
+ */
+typedef enum hsa_amd_memory_pool_global_flag_s {
+  /**
+   * The application can use allocations in the memory pool to store kernel
+   * arguments, and provide the values for the kernarg segment of
+   * a kernel dispatch.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1,
+  /**
+   * Updates to memory in this pool conform to HSA memory consistency model.
+   * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
+   * must not be set.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2,
+  /**
+   * Writes to memory in this pool can be performed by a single agent at a time.
+   */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4,
+
+  /** Updates to memory in this memory pool have extended scope, acting as
+   * system-scope atomics for variables in memory regions of this type.
+   * Note: On non-compliant systems, device-specific actions may be required
+   * for system-scope coherence. */
+  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8,
+
+} hsa_amd_memory_pool_global_flag_t;
+
+/**
+ * @brief Location of a memory pool (host or GPU). This is the type of the
+ * ::HSA_AMD_MEMORY_POOL_INFO_LOCATION attribute.
+ */
+typedef enum hsa_amd_memory_pool_location_s {
+    /**
+     * This memory pool resides on the host (CPU)
+     */
+    HSA_AMD_MEMORY_POOL_LOCATION_CPU = 0,
+    /**
+     * This memory pool resides on a GPU
+     */
+    HSA_AMD_MEMORY_POOL_LOCATION_GPU = 1
+} hsa_amd_memory_pool_location_t;
+
+/**
+ * @brief Memory pool features.
+ */
+typedef enum {
+  /**
+  * Segment where the memory pool resides. The type of this attribute is
+  * ::hsa_amd_segment_t.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
+  /**
+  * Flag mask. The value of this attribute is undefined if the value of
+  * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type
+  * of this attribute is uint32_t, a bit-field of
+  * ::hsa_amd_memory_pool_global_flag_t values.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
+  /**
+  * Size of this pool, in bytes. The type of this attribute is size_t.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
+  /**
+  * Indicates whether memory in this pool can be allocated using
+  * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool.
+  *
+  * The value of this flag is always false for memory pools in the group and
+  * private segments.
+  */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+  /**
+   * Allocation granularity of buffers allocated by
+   * ::hsa_amd_memory_pool_allocate
+   * in this memory pool. The size of a buffer allocated in this pool is a
+   * multiple of the value of this attribute. While this is the minimum size of
+   * allocation allowed, it is recommended to use
+   * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE to obtain the recommended
+   * allocation granularity size for this pool.
+   * The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
+   * this pool. The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
+  /**
+   * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this
+   * pool. The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
+   * must be a power of 2. The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
+  /**
+   * This memory_pool can be made directly accessible by all the agents in the
+   * system (::hsa_amd_agent_memory_pool_get_info does not return
+   * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this
+   * attribute is bool.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+  /**
+   * Maximum aggregate allocation size in bytes. The type of this attribute
+   * is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
+  /**
+   * Location of this memory pool. The type of this attribute
+   * is hsa_amd_memory_pool_location_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_LOCATION = 17,
+  /**
+   * Internal block size for allocations. This would also be the recommended
+   * granularity size for allocations as this prevents internal fragmentation.
+   * The value of this attribute is only defined if
+   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool.
+   * The type of this attribute is size_t.
+   */
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18,
+} hsa_amd_memory_pool_info_t;
+
+/**
+ * @brief Memory pool flag used to specify allocation directives
+ *
+ */
+typedef enum hsa_amd_memory_pool_flag_s {
+  /**
+   * Allocates memory that conforms to standard HSA memory consistency model
+   */
+  HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0,
+  /**
+   * Allocates fine grain memory type where memory ordering is per point to point
+   * connection. Atomic memory operations on these memory buffers are not
+   * guaranteed to be visible at system scope.
+   */
+  HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0),
+  /**
+   *  Allocates physically contiguous memory
+   */
+  HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1),
+  /**
+   *  Allocates executable memory
+   */
+  HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2),
+  /**
+   *  Allocates uncached memory
+   */
+  HSA_AMD_MEMORY_POOL_UNCACHED_FLAG = (1 << 3),
+} hsa_amd_memory_pool_flag_t;
+
+/**
+ * @brief Get the current value of an attribute of a memory pool.
+ *
+ * @param[in] memory_pool A valid memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
+                                 hsa_amd_memory_pool_info_t attribute,
+                                 void* value);
+
+/**
+ * @brief Iterate over the memory pools associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details An agent can directly access buffers located in some memory pool, or
+ * be enabled to access them by the application (see ::hsa_amd_agents_allow_access),
+ * yet that memory pool may not be returned by this function for that given
+ * agent.
+ *
+ * A memory pool of fine-grained type must be associated only with the host.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked on the same thread that called
+ * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is
+ * associated with the agent.  The HSA runtime passes two arguments to the
+ * callback: the memory pool, and the application data.  If @p callback
+ * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration,
+ * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status
+ * value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
+    void* data);
+
+/**
+ * @brief Allocate a block of memory (or buffer) in the specified pool.
+ *
+ * @param[in] memory_pool Memory pool where to allocate memory from. The memory
+ * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives.
+ *
+ * @param[out] ptr Pointer to the location where to store the base virtual
+ * address of
+ * the allocated block. The returned base address is aligned to the value of
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in @p memory_pool. If the
+ * allocation fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p memory_pool, or @p size is greater than
+ * the value of HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0,
+ * or flags is not 0.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
+                                 uint32_t flags, void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_amd_memory_pool_allocate.
+ *
+ * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
+ * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
+
+/**
+ * @brief Asynchronously copy a block of memory from the location pointed to by
+ * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p
+ * dst_agent.
+ * Because the DMA engines used may not be in the same coherency domain, the caller must ensure
+ * that buffers are system-level coherent. In general this requires the sending device to have
+ * released the buffer to system scope prior to executing the copy API and the receiving device
+ * must execute a system scope acquire fence prior to use of the destination buffer.
+ *
+ * @param[out] dst Buffer where the content is to be copied.
+ *
+ * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly
+ * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer, otherwise the copy will succeed
+ * but the contents of @p dst are undefined.
+ *
+ * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly
+ * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the source agent.
+ * Discovery may have variable and/or high latency.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed by @p dst or @p src results in undefined
+ * behavior.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the copy
+ * operation starts. The copy will start after every signal has been observed with
+ * the value 0. The dependent signal should not include completion signal from
+ * hsa_amd_memory_async_copy operation to be issued in future as that can result
+ * in a deadlock. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the copy
+ * operation. When the copy operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. The signal handle must not be 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The
+ * application is responsible for checking for asynchronous error conditions
+ * (see the description of @p completion_signal).
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL, or the completion signal is 0.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal);
+
+/**
+ * @brief Asynchronously copy a block of memory from the location pointed to by
+ * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p
+ * dst_agent, using the engine selected by @p engine_id.
+ *
+ * WARNING: Concurrent use of this call with hsa_amd_memory_async_copy can result
+ * in resource conflicts as HSA runtime will auto assign engines with the latter
+ * call.  Approach using both calls concurrently with caution.
+ *
+ * All param definitions are identical to hsa_amd_memory_async_copy with the
+ * exception of engine_id and force_copy_on_sdma.
+ *
+ * @param[in] engine_id Target engine defined by hsa_amd_sdma_engine_id_t.
+ * Client should use hsa_amd_memory_copy_engine_status first to get the ID
+ * availability.
+ *
+ * @param[in] force_copy_on_sdma By default, blit kernel copies are used when
+ * dst_agent == src_agent.  Setting this to true will force the copy over SDMA1.
+ *
+ * All return definitions are identical to hsa_amd_memory_async_copy with the
+ * following amendments:
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL, or the completion signal is 0 or engine_id is improperly
+ * bounded.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy_on_engine(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal,
+                              hsa_amd_sdma_engine_id_t engine_id,
+                              bool force_copy_on_sdma);
+/**
+ * @brief Reports the availability of SDMA copy engines.
+ *
+ * @param[in] dst_agent Destination agent of copy status direction.
+ *
+ * @param[in] src_agent Source agent of copy status direction.
+ *
+ * @param[out] engine_ids_mask Returns the available SDMA engine IDs as a bitmask that can
+ * be masked with hsa_amd_sdma_engine_id_t values.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Agent has available SDMA engines.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines.
+ *
+ */
+hsa_status_t HSA_API
+hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent,
+                                      uint32_t *engine_ids_mask);
+ /**
+ * @brief Returns the preferred SDMA engine mask.
+ *
+ * @param[in] dst_agent Destination agent of copy status direction.
+ *
+ * @param[in] src_agent Source agent of copy status direction.
+ *
+ * @param[out] recommended_ids_mask Returns available SDMA engine IDs for max bandwidth
+ * that can be masked with hsa_amd_sdma_engine_id_t. Can be 0 if there is no preference.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The mask has been returned.
+ *
+ */
+hsa_status_t HSA_API
+hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent,
+                                         uint32_t* recommended_ids_mask);
+
+/**
+ * @brief [Provisional API] Pitched memory descriptor.
+ *
+ * All elements must be 4 byte aligned.  Pitch and slice are in bytes.
+ */
+typedef struct hsa_pitched_ptr_s {
+  void* base;
+  size_t pitch;
+  size_t slice;
+} hsa_pitched_ptr_t;
+
+/**
+ * @brief [Provisional API]
+ * Copy direction flag.
+ */
+typedef enum {
+  hsaHostToHost = 0,
+  hsaHostToDevice = 1,
+  hsaDeviceToHost = 2,
+  hsaDeviceToDevice = 3
+} hsa_amd_copy_direction_t;
+
+/**
+ * @brief [Provisional API] SDMA 3D memory copy API.
+ *
+ * The same requirements must be met by src and dst as in hsa_amd_memory_async_copy.
+ * Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects
+ * must not overlap.
+ * CPU agents are not supported.  API requires SDMA and will return an error if SDMA is not available.
+ * Offsets and range carry x in bytes, y and z in rows and layers.
+ */
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal);
+
+/**
+ * @brief Type of access to a memory pool from a given agent.
+ */
+typedef enum {
+  /**
+  * The agent cannot directly access any buffer in the memory pool.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0,
+  /**
+  * The agent can directly access a buffer located in the pool; the application
+  * does not need to invoke ::hsa_amd_agents_allow_access.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1,
+  /**
+  * The agent can directly access a buffer located in the pool, but only if the
+  * application has previously requested access to that buffer using
+  * ::hsa_amd_agents_allow_access.
+  */
+  HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
+} hsa_amd_memory_pool_access_t;
+
+/**
+ * @brief Type of bus/link, as reported in ::hsa_amd_memory_pool_link_info_t.
+ */
+typedef enum {
+  /**
+  * Hyper-transport bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0,
+
+  /**
+  * QPI bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_QPI = 1,
+
+  /**
+  * PCIe bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_PCIE = 2,
+
+  /**
+  * Infiniband bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3,
+
+  /**
+  * xGMI link type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_XGMI = 4
+
+} hsa_amd_link_info_type_t;
+
+/**
+ * @brief Link properties when accessing the memory pool from the specified
+ * agent.
+ */
+typedef struct hsa_amd_memory_pool_link_info_s {
+  /**
+  * Minimum transfer latency (rounded to ns).
+  */
+  uint32_t min_latency;
+
+  /**
+  * Maximum transfer latency (rounded to ns).
+  */
+  uint32_t max_latency;
+
+  /**
+  * Minimum link interface bandwidth in MB/s.
+  */
+  uint32_t min_bandwidth;
+
+  /**
+  * Maximum link interface bandwidth in MB/s.
+  */
+  uint32_t max_bandwidth;
+
+  /**
+  * Support for 32-bit atomic transactions.
+  */
+  bool atomic_support_32bit;
+
+  /**
+  * Support for 64-bit atomic transactions.
+  */
+  bool atomic_support_64bit;
+
+  /**
+  * Support for cache coherent transactions.
+  */
+  bool coherent_support;
+
+  /**
+  * The type of bus/link.
+  */
+  hsa_amd_link_info_type_t link_type;
+
+  /**
+   * NUMA distance of the memory pool relative to the querying agent.
+   */
+  uint32_t numa_distance;
+} hsa_amd_memory_pool_link_info_t;
+
+/**
+ * @brief Properties of the relationship between an agent and a memory pool.
+ */
+typedef enum {
+  /**
+  * Access to buffers located in the memory pool. The type of this attribute
+  * is ::hsa_amd_memory_pool_access_t.
+  *
+  * An agent can always directly access buffers currently located in a memory
+  * pool that is associated (the memory_pool is one of the values returned by
+  * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
+  * buffer is currently located in a memory pool that is not associated with
+  * the agent, and the value returned by this function for the given
+  * combination of agent and memory pool is not
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke
+  * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer.
+  *
+  * If the given agent can directly access buffers in the pool, the result is not
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated with
+  * the agent, or it is of fine-grained type, the result must not be
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated
+  * with the agent, and does not reside in the global segment, the result must
+  * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0,
+
+  /**
+  * Number of links to hop when accessing the memory pool from the specified
+  * agent. The value of this attribute is zero if the memory pool is associated
+  * with the agent, or if the access type is
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is
+  * uint32_t.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1,
+
+  /**
+  * Details of each link hop when accessing the memory pool starting from the
+  * specified agent. The type of this attribute is an array of
+  * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type
+  * ::hsa_amd_memory_pool_link_info_t.
+  */
+  HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
+
+} hsa_amd_agent_memory_pool_info_t;
+
+/**
+ * @brief Get the current value of an attribute of the relationship between an
+ * agent and a memory pool.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] memory_pool Memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+    hsa_amd_agent_memory_pool_info_t attribute, void* value);
+
+/**
+ * @brief Enable direct access to a buffer from a given set of agents.
+ *
+ * @details
+ *
+ * Upon return, only the listed agents and the agent associated with the
+ * buffer's memory pool have direct access to the @p ptr.
+ *
+ * Any agent that has access to the buffer before and after the call to
+ * ::hsa_amd_agents_allow_access will also have access while
+ * ::hsa_amd_agents_allow_access is in progress.
+ *
+ * The caller is responsible for ensuring that each agent in the list
+ * is able to access the memory pool containing @p ptr
+ * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute),
+ * otherwise an error code is returned.
+ *
+ * @param[in] num_agents Size of @p agents.
+ *
+ * @param[in] agents List of agents. If @p num_agents is 0, this argument is
+ * ignored.
+ *
+ * @param[in] flags A list of bit-fields that is used to specify access
+ * information on a per-agent basis. This is currently reserved and must be NULL.
+ *
+ * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents
+ * is NULL, @p flags is not NULL, or access cannot be enabled for the agent(s)
+ * because @p ptr is allocated from an inaccessible pool.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
+                                const uint32_t* flags, const void* ptr);
+
+/**
+ * @brief Query if buffers currently located in some memory pool can be
+ * relocated to a destination memory pool.
+ *
+ * @details If the returned @p result is true, a migration of a buffer to @p
+ * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to
+ * resource limitations.
+ *
+ * @param[in] src_memory_pool Source memory pool.
+ *
+ * @param[in] dst_memory_pool Destination memory pool.
+ *
+ * @param[out] result Pointer to a memory location where the result of the query
+ * is stored. Must not be NULL. If buffers currently located in @p
+ * src_memory_pool can be relocated to @p dst_memory_pool, the result is
+ * true.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
+                                    hsa_amd_memory_pool_t dst_memory_pool,
+                                    bool* result);
+
+/**
+ * @brief Relocate a buffer to a new memory pool.
+ *
+ * @details When a buffer is migrated, its virtual address remains the same but
+ * its physical contents are moved to the indicated memory pool.
+ *
+ * After migration, only the agent associated with the destination pool will have access.
+ *
+ * The caller is also responsible for ensuring that the allocation in the
+ * source memory pool where the buffer is currently located can be migrated to the
+ * specified destination memory pool (i.e. ::hsa_amd_memory_pool_can_migrate returns a value of true
+ * for the source and destination memory pools), otherwise behavior is undefined.
+ *
+ * The caller must ensure that the buffer is not accessed while it is migrated.
+ *
+ * @param[in] ptr Buffer to be relocated. The buffer must have been released to system
+ * prior to calling this API.  The buffer will be released to system upon completion.
+ *
+ * @param[in] memory_pool Memory pool where to place the buffer.
+ *
+ * @param[in] flags A bit-field that is used to specify migration
+ * information. Must be zero.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+                                            hsa_amd_memory_pool_t memory_pool,
+                                            uint32_t flags);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+ * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously
+ * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). In
+ * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does,
+ * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent).
+ * Accesses to @p agent_ptr are coarse grained.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr. If this parameter
+ * is NULL and @p num_agent is 0, all agents in the platform will gain access to the @p host_ptr.
+ * @param[in] num_agent Number of entries in @p agents.
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents are
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or
+ * @p agent_ptr is NULL or @p agents is not NULL but @p num_agent is 0 or @p agents
+ * is NULL but @p num_agent is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+                                         hsa_agent_t* agents, int num_agent,
+                                         void** agent_ptr);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+ * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously
+ * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted).
+ * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it
+ * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent).
+ * Accesses to the memory via @p agent_ptr have the same access properties as memory allocated from
+ * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info
+ * (ex. coarse/fine grain, platform atomic support, link info).  Physical composition and placement
+ * of the memory (ex. page size, NUMA binding) is not changed.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr. If this parameter
+ * is NULL and @p num_agent is 0, all agents in the platform will gain access to the @p host_ptr.
+ * @param[in] num_agent Number of entries in @p agents.
+ *
+ * @param[in] pool Global memory pool owned by a CPU agent.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives. Reserved parameter, must be 0.
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents are
+ * invalid or can not access @p pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned
+ * by a CPU agent.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or
+ * @p agent_ptr is NULL or @p agents is not NULL but @p num_agent is 0 or @p agents
+ * is NULL but @p num_agent is not 0 or @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents,
+                                                 int num_agent, hsa_amd_memory_pool_t pool,
+                                                 uint32_t flags, void** agent_ptr);
+
+/**
+ *
+ * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or
+ * ::hsa_amd_memory_lock_to_pool.
+ *
+ * @details The behavior is undefined if the host pointer being unpinned does not
+ * match a previously pinned address or if the host pointer was already deallocated.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was
+ * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
+
+/**
+ * @brief Sets the first @p count uint32_t elements of the block of memory pointed to by
+ * @p ptr to the specified @p value.
+ *
+ * @param[in] ptr Pointer to the block of memory to fill.
+ *
+ * @param[in] value Value to be set.
+ *
+ * @param[in] count Number of uint32_t elements to be set to the value.
+ *
+ * @retval HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
+ * not 4-byte aligned.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory
+ * region was not allocated with HSA runtime APIs.
+ *
+ */
+hsa_status_t HSA_API
+    hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
+
+/**
+ * @brief Maps an interop object into the HSA flat address space and establishes
+ * memory residency.  The metadata pointer is valid during the lifetime of the
+ * map (until hsa_amd_interop_unmap_buffer is called).
+ * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle
+ * result in multiple mappings with potentially different addresses and
+ * different metadata pointers.  Concurrent operations on these addresses are
+ * not coherent.  Memory must be fenced to system scope to ensure consistency,
+ * between mappings and with any views of this buffer in the originating
+ * software stack.
+ *
+ * @param[in] num_agents Number of agents which require access to the memory.
+ *
+ * @param[in] agents List of accessing agents.
+ *
+ * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux).
+ *
+ * @param[in] flags Reserved, must be 0.
+ *
+ * @param[out] size Size in bytes of the mapped object.
+ *
+ * @param[out] ptr Base address of the mapped object.
+ *
+ * @param[out] metadata_size Size of metadata in bytes, may be NULL.
+ *
+ * @param[out] metadata Pointer to metadata, may be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS if successfully mapped
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors
+ */
+hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,
+                                        hsa_agent_t* agents,
+                                        int interop_handle,
+                                        uint32_t flags,
+                                        size_t* size,
+                                        void** ptr,
+                                        size_t* metadata_size,
+                                        const void** metadata);
+
+/**
+ * @brief Removes a previously mapped interop object from HSA's flat address space.
+ * Ends the lifetime of the mapping's associated metadata pointer.
+ */
+hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
+
+/**
+ * @brief Denotes the type of memory in a pointer info query.
+ */
+typedef enum {
+  /**
+  Memory is not known to the HSA driver.  Unallocated or unlocked system memory.
+  */
+  HSA_EXT_POINTER_TYPE_UNKNOWN = 0,
+  /**
+  Memory was allocated with an HSA memory allocator.
+  */
+  HSA_EXT_POINTER_TYPE_HSA = 1,
+  /**
+  System memory which has been locked for use with an HSA agent.
+
+  Memory of this type is normal malloc'd memory and is always accessible to
+  the CPU.  Pointer info queries may not include CPU agents in the accessible
+  agents list as the CPU has implicit access.
+  */
+  HSA_EXT_POINTER_TYPE_LOCKED = 2,
+  /**
+  Memory originated in a graphics component and is shared with ROCr.
+  */
+  HSA_EXT_POINTER_TYPE_GRAPHICS = 3,
+  /**
+  Memory has been shared with the local process via ROCr IPC APIs.
+  */
+  HSA_EXT_POINTER_TYPE_IPC = 4,
+  /**
+  No backend memory but virtual address.
+  */
+  HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5,
+  /**
+  Memory was allocated with an HSA virtual memory allocator.
+  */
+  HSA_EXT_POINTER_TYPE_HSA_VMEM = 6
+} hsa_amd_pointer_type_t;
+
+/**
+ * @brief Describes a memory allocation known to ROCr.
+ * Within a ROCr major version this structure can only grow.
+ */
+typedef struct hsa_amd_pointer_info_s {
+  /*
+  Size in bytes of this structure.  Used for version control within a major ROCr
+  revision.  Set to sizeof(hsa_amd_pointer_info_t) prior to calling
+  hsa_amd_pointer_info.  If the runtime supports an older version of pointer
+  info then size will be smaller on return.  Members starting after the return
+  value of size will not be updated by hsa_amd_pointer_info.
+  */
+  uint32_t size;
+  /*
+  The type of allocation referenced.
+  */
+  hsa_amd_pointer_type_t type;
+  /*
+  Base address at which non-host agents may access the allocation. This field is
+  not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* agentBaseAddress;
+  /*
+  Base address at which the host agent may access the allocation. This field is
+  not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* hostBaseAddress;
+  /*
+  Size of the allocation. This field is not meaningful if the type of the allocation
+  is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  size_t sizeInBytes;
+  /*
+  Application provided value. This field is not meaningful if the type of the
+  allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  void* userData;
+  /*
+  Reports an agent which "owns" (i.e. has preferred access to) the pool in which the
+  allocation was
+  made.  When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die
+  GPU boards) any such agent may be returned. This field is not meaningful if
+  the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in
+  this process, e.g. if this agent is masked using ROCR_VISIBLE_DEVICES.
+  */
+  hsa_agent_t agentOwner;
+  /*
+  Contains a bitfield of hsa_amd_memory_pool_global_flag_t values.
+  Reports the effective global flags bitmask for the allocation.  This field is not
+  meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  uint32_t global_flags;
+
+  /*
+  Set to true if this allocation was registered with the underlying driver.
+  This field is not meaningful if the type of the allocation is
+  HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  bool registered;
+} hsa_amd_pointer_info_t;
+
+/**
+ * @brief Retrieves information about the allocation referenced by the given
+ * pointer.  Optionally returns the number and list of agents which can
+ * directly access the allocation. In case this virtual address is unknown, the
+ * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields
+ * that are valid after hsa_amd_pointer_info returns are size and type.
+ *
+ * @param[in] ptr Pointer which references the allocation to retrieve info for.
+ *
+ * @param[in, out] info Pointer to structure to be filled with allocation info.
+ * Data member size must be set to the size of the structure prior to calling
+ * hsa_amd_pointer_info.  On return size will be set to the size of the
+ * pointer info structure supported by the runtime, if smaller.  Members
+ * beyond the returned value of size will not be updated by the API.
+ * Must not be NULL.
+ *
+ * @param[in] alloc Function pointer to an allocator used to allocate the
+ * @p accessible array.  If NULL @p accessible will not be returned.
+ *
+ * @param[out] num_agents_accessible Receives the count of agents in
+ * @p accessible.  If NULL @p accessible will not be returned.
+ *
+ * @param[out] accessible Receives a pointer to the array, allocated by @p alloc,
+ * holding the list of agents which may directly access the allocation.
+ * May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS Info retrieved successfully
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr,
+                                          hsa_amd_pointer_info_t* info,
+                                          void* (*alloc)(size_t),
+                                          uint32_t* num_agents_accessible,
+                                          hsa_agent_t** accessible);
+
+/**
+ * @brief Associates an arbitrary pointer with an allocation known to ROCr.
+ * The pointer can be fetched by hsa_amd_pointer_info in the userData field.
+ *
+ * @param[in] ptr Pointer to the first byte of an allocation known to ROCr
+ * with which to associate @p userdata.
+ *
+ * @param[in] userdata Arbitrary pointer to associate with the allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS @p userdata successfully stored.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr,
+                                                       void* userdata);
+
+/**
+ * @brief 256-bit process-independent identifier for a ROCr shared memory
+ * allocation.
+ */
+typedef struct hsa_amd_ipc_memory_s {
+  uint32_t handle[8];
+} hsa_amd_ipc_memory_t;
+
+/**
+ * @brief Prepares an allocation for interprocess sharing and creates a
+ * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation.  A
+ * handle is valid while the allocation it references remains accessible in
+ * any process.  In general applications should confirm that a shared memory
+ * region has been attached (via hsa_amd_ipc_memory_attach) in the remote
+ * process prior to releasing that memory in the local process.
+ * Repeated calls for the same allocation may, but are not required to, return
+ * unique handles. The allocation needs to be in memory on an agent of type
+ * HSA_DEVICE_TYPE_GPU.
+ *
+ * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for
+ * sharing.
+ *
+ * @param[in] len Length in bytes of the allocation to share.
+ *
+ * @param[out] handle Process-independent identifier referencing the shared
+ * allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the
+ * first byte of an allocation made through ROCr, or @p len is not the full length
+ * of the allocation or @p handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len,
+                                               hsa_amd_ipc_memory_t* handle);
+
+/**
+ * @brief Imports shared memory into the local process and makes it accessible
+ * by the given agents.  If a shared memory handle is attached multiple times
+ * in a process each attach may return a different address.  Each returned
+ * address is refcounted and requires a matching number of calls to
+ * hsa_amd_ipc_memory_detach to release the shared memory mapping.
+ *
+ * @param[in] handle Pointer to the identifier for the shared memory.
+ *
+ * @param[in] len Length of the shared memory to import.
+ * Reserved.  Must be the full length of the shared allocation in this version.
+ *
+ * @param[in] num_agents Count of agents in @p mapping_agents.
+ * May be zero if all agents are to be allowed access.
+ *
+ * @param[in] mapping_agents List of agents to access the shared memory.
+ * Ignored if @p num_agents is zero.
+ *
+ * @param[out] mapped_ptr Receives a process local pointer to the shared memory.
+ *
+ * @retval HSA_STATUS_SUCCESS if memory is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is
+ * incorrect, @p mapped_ptr is NULL, or some agent for which access was
+ * requested can not access the shared memory.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_attach(
+    const hsa_amd_ipc_memory_t* handle, size_t len,
+    uint32_t num_agents,
+    const hsa_agent_t* mapping_agents,
+    void** mapped_ptr);
+
+/**
+ * @brief Decrements the reference count for the shared memory mapping and
+ * releases access to shared memory imported with hsa_amd_ipc_memory_attach.
+ * Addresses returned by hsa_amd_ipc_memory_attach are refcounted, so every
+ * attach needs a matching call to this function.
+ *
+ * @param[in] mapped_ptr Pointer to the first byte of a shared allocation
+ * imported with hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_SUCCESS @p mapped_ptr was imported with
+ * hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED HSA is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported
+ * with hsa_amd_ipc_memory_attach.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr);
+
+/** @} */
+
+/** \addtogroup status Runtime notifications
+ *  @{
+ */
+
+/**
+ * @brief 256-bit process independent identifier for a ROCr IPC signal.
+ * Declared as an alias of hsa_amd_ipc_memory_t, so both handle types share
+ * the same layout.
+ */
+typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t;
+
+/**
+ * @brief Obtains an interprocess sharing handle for a signal.  The handle is
+ * valid while the signal it references remains valid in any process.  In
+ * general, applications should confirm that the signal has been attached (via
+ * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that
+ * signal in the local process.
+ * Repeated calls for the same signal may, but are not required to, return
+ * unique handles.
+ *
+ * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC.
+ *
+ * @param[out] handle Process independent identifier referencing the shared
+ * signal.
+ *
+ * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED HSA is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in allocating
+ * necessary resources.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal
+ * created with attribute HSA_AMD_SIGNAL_IPC, or @p handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle);
+
+/**
+ * @brief Imports an IPC capable signal into the local process.  If an IPC
+ * signal handle is attached multiple times in a process, each attach may return
+ * a different signal handle.  Each returned signal handle is refcounted and
+ * requires a matching number of calls to hsa_signal_destroy to release the
+ * shared signal.
+ *
+ * @param[in] handle Pointer to the identifier for the shared signal.
+ *
+ * @param[out] signal Receives a process local signal handle to the shared signal.
+ *
+ * @retval HSA_STATUS_SUCCESS The signal is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED HSA is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in allocating
+ * necessary resources.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle,
+                                               hsa_signal_t* signal);
+
+/**
+ * @brief GPU system event type.  Stored in hsa_amd_event_t::event_type, where
+ * it selects which member of the event's union is valid.
+ */
+typedef enum hsa_amd_event_type_s {
+  /*
+   AMD GPU memory fault.  Event data: hsa_amd_gpu_memory_fault_info_t.
+   */
+  HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
+  /*
+   AMD GPU HW Exception.  Event data: hsa_amd_gpu_hw_exception_info_t.
+   */
+  HSA_AMD_GPU_HW_EXCEPTION_EVENT,
+  /*
+   AMD GPU memory error.  Event data: hsa_amd_gpu_memory_error_info_t.
+   */
+  HSA_AMD_GPU_MEMORY_ERROR_EVENT,
+} hsa_amd_event_type_t;
+
+/**
+ * @brief Flags denoting the cause of a memory fault.  Each value is a single
+ * bit; several may be set together in
+ * hsa_amd_gpu_memory_fault_info_t::fault_reason_mask.
+ */
+typedef enum {
+  // Page not present or supervisor privilege.
+  HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
+  // Write access to a read-only page.
+  HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
+  // Execute access to a page marked NX.
+  HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
+  // GPU attempted access to a host only page.
+  HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
+  // DRAM ECC failure.
+  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
+  // Can't determine the exact fault address.
+  HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
+  // SRAM ECC failure (i.e. registers, no fault address).
+  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
+  // GPU reset following unspecified hang.
+  HSA_AMD_MEMORY_FAULT_HANG = 1U << 31
+} hsa_amd_memory_fault_reason_t;
+
+/**
+ * @brief AMD GPU memory fault event data.  Carried in
+ * hsa_amd_event_t::memory_fault.
+ */
+typedef struct hsa_amd_gpu_memory_fault_info_s {
+  /*
+  The agent where the memory fault occurred.
+  */
+  hsa_agent_t agent;
+  /*
+  Virtual address accessed.
+  */
+  uint64_t virtual_address;
+  /*
+  Bit mask encoding the memory access failure reasons.  Multiple bits may be set
+  for one fault.  Bits are defined in hsa_amd_memory_fault_reason_t.
+  */
+  uint32_t fault_reason_mask;
+} hsa_amd_gpu_memory_fault_info_t;
+
+/**
+ * @brief Flags denoting the cause of a memory error.  Values are bits of
+ * hsa_amd_gpu_memory_error_info_t::error_reason_mask.
+ */
+typedef enum {
+  // Memory was in use by a low-level HW component and cannot be released.
+  HSA_AMD_MEMORY_ERROR_MEMORY_IN_USE = (1 << 0),
+} hsa_amd_memory_error_reason_t;
+
+/**
+ * @brief AMD GPU memory error event data.  Carried in
+ * hsa_amd_event_t::memory_error.
+ */
+typedef struct hsa_amd_gpu_memory_error_info_s {
+  /*
+  The agent where the memory error occurred.
+  */
+  hsa_agent_t agent;
+  /*
+  Virtual address involved.
+  */
+  uint64_t virtual_address;
+  /*
+  Bit mask encoding the memory error reasons.  Multiple bits may be set
+  for one error.  Bits are defined in hsa_amd_memory_error_reason_t.
+  */
+  uint32_t error_reason_mask;
+} hsa_amd_gpu_memory_error_info_t;
+
+/**
+ * @brief Flags denoting the type of a HW exception.  Used as the type of
+ * hsa_amd_gpu_hw_exception_info_t::reset_type.
+ */
+typedef enum {
+  // Unused for now.
+  HSA_AMD_HW_EXCEPTION_RESET_TYPE_OTHER = 1 << 0,
+} hsa_amd_hw_exception_reset_type_t;
+
+/**
+ * @brief Flags denoting the cause of a HW exception.  Used as the type of
+ * hsa_amd_gpu_hw_exception_info_t::reset_cause.
+ */
+typedef enum {
+  // GPU hang.
+  HSA_AMD_HW_EXCEPTION_CAUSE_GPU_HANG = 1 << 0,
+  // SRAM ECC.
+  HSA_AMD_HW_EXCEPTION_CAUSE_ECC = 1 << 1,
+} hsa_amd_hw_exception_reset_cause_t;
+
+/**
+ * @brief AMD GPU HW Exception event data.  Carried in
+ * hsa_amd_event_t::hw_exception.
+ */
+typedef struct hsa_amd_gpu_hw_exception_info_s {
+  /*
+  The agent where the HW exception occurred.
+  */
+  hsa_agent_t agent;
+  hsa_amd_hw_exception_reset_type_t reset_type;  // Exception type; see hsa_amd_hw_exception_reset_type_t.
+  hsa_amd_hw_exception_reset_cause_t reset_cause;  // Exception cause; see hsa_amd_hw_exception_reset_cause_t.
+} hsa_amd_gpu_hw_exception_info_t;
+
+/**
+ * @brief AMD GPU event data passed to event handler.  Exactly one member of
+ * the anonymous union is meaningful, selected by @p event_type.
+ */
+typedef struct hsa_amd_event_s {
+  /*
+  The event type.
+  */
+  hsa_amd_event_type_t event_type;
+  union {
+    /*
+    The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT.
+    */
+    hsa_amd_gpu_memory_fault_info_t memory_fault;
+    /*
+    The HW exception info, only valid when @p event_type is HSA_AMD_GPU_HW_EXCEPTION_EVENT.
+    */
+    hsa_amd_gpu_hw_exception_info_t hw_exception;
+    /*
+    The memory error info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_ERROR_EVENT.
+    */
+    hsa_amd_gpu_memory_error_info_t memory_error;
+  };
+} hsa_amd_event_t;
+
+/**
+ * @brief System event handler function type, registered through
+ * hsa_amd_register_system_event_handler.  Receives the event description in
+ * @p event and the user data given at registration in @p data.
+ * NOTE(review): how the runtime interprets the returned hsa_status_t is not
+ * documented here - confirm against the implementation.
+ */
+typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data);
+
+/**
+ * @brief Register AMD GPU event handler.
+ *
+ * @param[in] callback Callback to be invoked when an event is triggered.
+ * The HSA runtime passes two arguments to the callback: @p event
+ * is defined per event by the HSA runtime, and @p data is the user data.
+ *
+ * @param[in] data User data that is passed to @p callback. May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS The handler has been registered successfully.
+ *
+ * @retval HSA_STATUS_ERROR An event handler has already been registered.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT An argument is invalid.
+ * NOTE(review): this entry used to read "@p event is invalid", but this
+ * function has no parameter named event; presumably @p callback is meant -
+ * confirm against the implementation.
+ */
+hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback,
+                                                   void* data);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Per-queue dispatch and wavefront scheduling priority.  Applied to a
+ * compute queue with hsa_amd_queue_set_priority, whose documented default is
+ * HSA_AMD_QUEUE_PRIORITY_NORMAL.
+ */
+typedef enum hsa_amd_queue_priority_s {
+  /*
+  Below normal/high priority compute and all graphics.
+  */
+  HSA_AMD_QUEUE_PRIORITY_LOW = 0,
+  /*
+  Above low priority compute, below high priority compute and all graphics.
+  */
+  HSA_AMD_QUEUE_PRIORITY_NORMAL = 1,
+  /*
+  Above low/normal priority compute and all graphics.
+  */
+  HSA_AMD_QUEUE_PRIORITY_HIGH = 2,
+} hsa_amd_queue_priority_t;
+
+/**
+ * @brief Modifies the dispatch and wavefront scheduling priority for a
+ * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL.
+ *
+ * @param[in] queue Compute queue to apply new priority to.
+ *
+ * @param[in] priority Priority to associate with @p queue.
+ *
+ * @retval HSA_STATUS_SUCCESS The priority was changed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_QUEUE @p queue is not a valid
+ * compute queue handle.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p priority is not a valid
+ * value from hsa_amd_queue_priority_t.
+ */
+hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue,
+                                                hsa_amd_queue_priority_t priority);
+
+/**
+ * @brief Queue creation attributes.  HSA_AMD_QUEUE_CREATE_SYSTEM_MEM is zero;
+ * the other values are single-bit flags.
+ */
+typedef enum {
+  /**
+   * The queue's packet buffer and queue descriptor struct should be
+   * allocated in system memory (default). Mutually exclusive with
+   * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF and
+   * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR.
+   */
+  HSA_AMD_QUEUE_CREATE_SYSTEM_MEM = 0,
+  /**
+   * The queue's packet buffer should be allocated in the agent's
+   * fine-grain device memory region.
+   */
+  HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF = (1 << 0),
+  /**
+   * The queue descriptor struct should be allocated in the agent's
+   * fine-grain device memory region. Not supported for devices
+   * connected via PCIe because the CPU's atomic read-modify-write
+   * operations cannot be promoted to PCIe atomic read-modify-write
+   * operations.
+   */
+  HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR = (1 << 1),
+} hsa_amd_queue_create_flag_t;
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Deallocation notifier function type, registered through
+ * ::hsa_amd_register_deallocation_callback.  Invoked with the monitored
+ * address in @p ptr and the value supplied at registration in @p user_data.
+ */
+typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data);
+
+/**
+ * @brief Registers a deallocation notifier monitoring for release of agent
+ * accessible address @p ptr.  If successful, @p callback will be invoked when
+ * @p ptr is removed from accessibility from all agents.
+ *
+ * Notification callbacks are automatically deregistered when they are invoked.
+ *
+ * Note: The current version supports notifications of address release
+ * originating from ::hsa_amd_memory_pool_free.  Support for other address
+ * release APIs will follow.
+ *
+ * @param[in] ptr Agent accessible address to monitor for deallocation.  Passed
+ * to @p callback.
+ *
+ * @param[in] callback Notifier to be invoked when @p ptr is released from
+ * agent accessibility.
+ *
+ * @param[in] user_data User provided value passed to @p callback.  May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier was registered successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible
+ * address.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in allocating
+ * necessary resources.
+ */
+hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
+                                                    hsa_amd_deallocation_callback_t callback,
+                                                    void* user_data);
+
+/**
+ * @brief Removes a deallocation notifier previously registered with
+ * ::hsa_amd_register_deallocation_callback.  Arguments must be identical to
+ * those given in ::hsa_amd_register_deallocation_callback (this function takes
+ * only @p ptr and @p callback; the user data value is not passed again).
+ *
+ * @param[in] ptr Agent accessible address which was monitored for deallocation.
+ *
+ * @param[in] callback Notifier to be removed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered.
+ */
+hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
+                                                      hsa_amd_deallocation_callback_t callback);
+
+/**
+ * @brief SVM memory model values.  This is the value type of the
+ * HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG attribute.
+ */
+typedef enum hsa_amd_svm_model_s {
+  /**
+   * Updates to memory with this attribute conform to HSA memory consistency
+   * model.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+  /**
+   * Writes to memory with this attribute can be performed by a single agent
+   * at a time.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1,
+  /**
+   * Memory region queried contains subregions with both
+   * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and
+   * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes.
+   *
+   * This attribute cannot be used in hsa_amd_svm_attributes_set.  It is a
+   * possible return from hsa_amd_svm_attributes_get indicating that the query
+   * region contains both coarse and fine grained memory.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2
+} hsa_amd_svm_model_t;
+
+/**
+ * @brief SVM attribute identifiers.  Stored in
+ * hsa_amd_svm_attribute_pair_t::attribute; each entry below states the type
+ * used to interpret the paired value.
+ */
+typedef enum hsa_amd_svm_attribute_s {
+  // Memory model attribute.
+  // Type of this attribute is hsa_amd_svm_model_t.
+  HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+  // Marks the range read only.  This allows multiple physical copies to be
+  // placed local to each accessing device.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+  // Automatic migrations should attempt to keep the memory within the xgmi hive
+  // containing accessible agents.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+  // Page granularity to migrate at once.  Page granularity is specified as
+  // log2(page_count).
+  // Type of this attribute is uint64_t.
+  HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+  // Physical location to prefer when automatic migration occurs.
+  // Set to the null agent handle (handle == 0) to indicate there
+  // is no preferred location.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+  // This attribute cannot be used in ::hsa_amd_svm_attributes_set (see
+  // ::hsa_amd_svm_prefetch_async).
+  // Queries the physical location of most recent prefetch command.
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range.  If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // Optimizes with the anticipation that the majority of operations to the
+  // range will be read operations.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6,
+  // Allows the execution on GPU.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7,
+  // This attribute cannot be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range.  Access may incur a page fault
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute cannot be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults.  Access
+  // will not incur a page fault and will not cause access based migration.
+  // Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+  // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute cannot be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range.  Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute cannot be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// Element type of the attribute lists passed to
+// hsa_amd_svm_attributes_set and hsa_amd_svm_attributes_get.
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // Attribute identifier: an hsa_amd_svm_attribute_t value stored as uint64_t.
+  uint64_t attribute;
+  // Attribute value.  Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false, then enabling
+ * access for an agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Number of entries in @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
+ * NOTE(review): the hsa_amd_svm_attribute_t comments describe
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION as a query-only attribute and list
+ * HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS as unusable with this API; the list above
+ * disagrees with them - confirm which is correct.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in,out] attribute_list List of attributes to query for the address
+ * range.  Results are reported through the list entries.
+ *
+ * @param[in] attribute_count Number of entries in @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the migration
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required, this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
+/** @} */
+
+/** \addtogroup profile Profiling
+ *  @{
+ */
+
+/**
+ * @brief Acquire Stream Performance Monitor (SPM) on an agent.
+ *
+ * Acquire exclusive use of SPM on @p preferred_agent.
+ * See hsa_amd_spm_set_dest_buffer to provide a destination buffer to KFD to start recording and
+ * retrieve this data.
+ *
+ * @param[in] preferred_agent Agent on which to acquire SPM.
+ */
+hsa_status_t hsa_amd_spm_acquire(hsa_agent_t preferred_agent);
+
+/**
+ * @brief Release Stream Performance Monitor (SPM) on an agent.
+ *
+ * Release exclusive use of SPM on @p preferred_agent. This will stop KFD writing SPM data.
+ * If a destination buffer is set, then data in the destination buffer is available to the user
+ * when this function returns.
+ *
+ * @param[in] preferred_agent Agent on which to release SPM.
+ */
+hsa_status_t hsa_amd_spm_release(hsa_agent_t preferred_agent);
+
+/**
+ * @brief Set up the current destination user mode buffer for stream performance
+ * counter data. KFD will start writing SPM data into the destination buffer. KFD will continue
+ * to copy data into the current destination buffer until any of the following functions are called:
+ * - hsa_amd_spm_release
+ * - hsa_amd_spm_set_dest_buffer with dest set to NULL
+ * - hsa_amd_spm_set_dest_buffer with dest set to a new buffer
+ *
+ * If @p timeout is non-zero, the call will wait for up to @p timeout ms for the previous
+ * buffer to be filled. If the previous buffer is filled before the timeout expires, @p timeout
+ * is updated with the time remaining. If the timeout is exceeded, the function
+ * copies any partial data available into the previous user buffer and returns success.
+ * The user should not access destination data while KFD is copying data.
+ * If the previous destination buffer was full, then the @p is_data_loss flag is set.
+ * @p dest is CPU accessible memory. It could be malloc'ed memory or host allocated memory.
+ *
+ * @param[in] preferred_agent Agent on which to set the dest buffer
+ *
+ * @param[in] size_in_bytes size of the buffer
+ *
+ * @param[in,out] timeout timeout in milliseconds
+ *
+ * @param[out] size_copied number of bytes copied
+ *
+ * @param[in] dest destination address. Set to NULL to stop copy on previous buffer
+ *
+ * @param[out] is_data_loss true if data was lost
+ */
+hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t size_in_bytes,
+                                         uint32_t* timeout, uint32_t* size_copied, void* dest,
+                                         bool* is_data_loss);
+
+/** @} */
+
+/** \addtogroup memory Memory
+ *  @{
+ */
+
+/**
+ * @brief Older version of export dmabuf.
+ *
+ * This is the same as calling the v2 version of export dmabuf
+ * (hsa_amd_portable_export_dmabuf_v2) with the flags argument set to
+ * HSA_AMD_DMABUF_MAPPING_TYPE_NONE.
+ *
+ * @param[in] ptr Pointer to the allocation being exported.
+ *
+ * @param[in] size Size in bytes to export following @p ptr.  The entire range
+ * being exported must be contained within a single allocation.
+ *
+ * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the
+ * allocation.  Contents will not be altered in the event of failure.
+ *
+ * @param[out] offset Offset in bytes into the memory referenced by the dma-buf
+ * object at which @p ptr resides.  Contents will not be altered in the event
+ * of failure.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Export completed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by
+ * @p ptr and @p size is not contained within a single allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr
+ * and @p size was allocated on a device which cannot export memory.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor,
+ * @p dmabuf, could not be created.
+ */
+hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf,
+                                            uint64_t* offset);
+
+/**
+ * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation.
+ *
+ * Obtains an OS specific handle to GPU agent memory.  The memory must be part
+ * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent.
+ * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access
+ * to the allocation.
+ *
+ * Shared access to the memory is not guaranteed to be fine grain coherent even
+ * if the allocation exported is from a fine grain pool.  The shared memory
+ * consistency model will be no stronger than the model exported from; consult
+ * the importing API to determine the final consistency model.
+ *
+ * The allocation's memory remains valid as long as the handle and any mapping
+ * of the handle remains valid.  When the handle and all mappings are closed
+ * the backing memory will be released for reuse.
+ *
+ * @param[in] ptr Pointer to the allocation being exported.
+ *
+ * @param[in] size Size in bytes to export following @p ptr.  The entire range
+ * being exported must be contained within a single allocation.
+ *
+ * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the
+ * allocation.  Contents will not be altered in the event of failure.
+ *
+ * @param[out] offset Offset in bytes into the memory referenced by the dma-buf
+ * object at which @p ptr resides.  Contents will not be altered in the event
+ * of failure.
+ *
+ * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Export completed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by
+ * @p ptr and @p size is not contained within a single allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr
+ * and @p size was allocated on a device which cannot export memory.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor,
+ * @p dmabuf, could not be created.
+ */
+hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size,
+                               int* dmabuf, uint64_t* offset, uint64_t flags);
+
+/**
+ * @brief Closes an OS specific, vendor neutral, handle to a memory allocation.
+ *
+ * Closes an OS specific handle to GPU agent memory.
+ *
+ * Applications should close a handle after imports are complete.  The handle
+ * is not required to remain open for the lifetime of imported mappings.  The
+ * referenced allocation will remain valid until all handles and mappings
+ * are closed.
+ *
+ * @param[in] dmabuf Handle to be closed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Handle closed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE A generic error was encountered
+ * when closing the handle.  The handle may have been closed already or an
+ * async IO error may have occurred.
+ */
+hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf);
+
+/**
+ * @brief Optional flags for the @p flags argument of
+ * hsa_amd_vmem_address_reserve and hsa_amd_vmem_address_reserve_align.
+ */
+typedef enum hsa_amd_vmem_address_reserve_flag_s {
+  // Only reserve a VA range without registering it to the underlying driver.
+  HSA_AMD_VMEM_ADDRESS_NO_REGISTER = (1UL << 0),
+} hsa_amd_vmem_address_reserve_flag_t;
+
+/**
+ * @brief Allocate a reserved address range
+ *
+ * Reserve a virtual address range. The size must be a multiple of the system page size.
+ * If it is not possible to allocate the address specified by @p address, then @p va will be
+ * a different address range.
+ * Address range should be released by calling hsa_amd_vmem_address_free.
+ *
+ * @param[out] va virtual address allocated
+ * @param[in] size of address range requested
+ * @param[in] address requested
+ * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
+ * range of this size.
+ *
+ * @note This API will be deprecated in a future release and replaced by
+ * hsa_amd_vmem_address_reserve_align.
+ */
+hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address,
+                                          uint64_t flags);
+
+/**
+ * @brief Allocate a reserved address range with a requested alignment
+ *
+ * Reserve a virtual address range. The size must be a multiple of the system page size.
+ * If it is not possible to allocate the address specified by @p address, then @p va will be
+ * a different address range.
+ * Address range should be released by calling hsa_amd_vmem_address_free.
+ *
+ * @param[out] va virtual address allocated
+ * @param[in] size of address range requested
+ * @param[in] address requested
+ * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2
+ * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
+ * range of this size.
+ */
+hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address,
+                                          uint64_t alignment, uint64_t flags);
+
+/**
+ * @brief Free a reserved address range
+ *
+ * Free a previously allocated address range. The size must match the size of a previously
+ * allocated address range.
+ *
+ * @param[in] va virtual address to be freed
+ * @param[in] size of address range
+ *
+ * @retval ::HSA_STATUS_SUCCESS Address range released successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid va specified
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid size specified
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE Address range is still in use
+ * @retval ::HSA_STATUS_ERROR Internal unexpected error
+ */
+hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size);
+
+/**
+ * @brief Struct containing an opaque handle to a memory allocation handle
+ */
+typedef struct hsa_amd_vmem_alloc_handle_s {
+  /**
+   * Opaque handle. Two handles reference the same object of the enclosing type
+   * if and only if they are equal.
+   */
+  uint64_t handle;
+} hsa_amd_vmem_alloc_handle_t;
+
+typedef enum {
+  MEMORY_TYPE_NONE,
+  MEMORY_TYPE_PINNED,
+} hsa_amd_memory_type_t;
+
+/**
+ * @brief Create a virtual memory handle
+ *
+ * Create a virtual memory handle within this pool.
+ * @p size must be aligned to the allocation granule size for this memory pool, see
+ * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE
+ * To minimize internal memory fragmentation, align the size to the recommended allocation granule
+ * size, see HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE
+ *
+ * @param[in] pool memory to use
+ * @param[in] size of the memory allocation
+ * @param[in] type of memory
+ * @param[in] flags - currently unsupported
+ * @param[out] memory_handle - handle for the allocation
+ *
+ * @retval ::HSA_STATUS_SUCCESS memory allocated successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid arguments
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION This memory pool does not support allocations
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate this memory
+ */
+hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size,
+                                        hsa_amd_memory_type_t type, uint64_t flags,
+                                        hsa_amd_vmem_alloc_handle_t* memory_handle);
+
+/**
+ * @brief Release a virtual memory handle
+ *
+ * @param[in] memory_handle handle that was previously allocated
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory handle released successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ */
+hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle);
+
+/**
+ * @brief Map a virtual memory handle
+ *
+ * Map a virtual memory handle to a reserved address range. The virtual address requested must be
+ * within a previously reserved address range. @p va and (@p va + size) must be within
+ * (va + size) of the previously allocated address range.
+ * @p size must be equal to size of the @p memory_handle
+ * hsa_amd_vmem_set_access needs to be called to make the memory accessible to specific agents
+ *
+ * @param[in] va virtual address range where memory will be mapped
+ * @param[in] size of memory mapping
+ * @param[in] in_offset offset into memory. Currently unsupported
+ * @param[in] memory_handle virtual memory handle to be mapped
+ * @param[in] flags. Currently unsupported
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory mapped successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset,
+                              hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags);
+
+/**
+ * @brief Unmap a virtual memory handle
+ *
+ * Unmap previously mapped virtual address range
+ *
+ * @param[in] va virtual address range to be unmapped
+ * @param[in] size of memory mapping
+ *
+ * @retval ::HSA_STATUS_SUCCESS Memory backing unmapped successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT size is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_unmap(void* va, size_t size);
+
+typedef struct hsa_amd_memory_access_desc_s {
+  hsa_access_permission_t permissions;
+  hsa_agent_t agent_handle;
+} hsa_amd_memory_access_desc_t;
+
+/**
+ * @brief Make a memory mapping accessible
+ *
+ * Make previously mapped virtual address accessible to specific agents. @p size must be equal to
+ * size of previously mapped virtual memory handle.
+ * Calling hsa_amd_vmem_set_access multiple times on the same @p va:
+ *  - Will overwrite permissions for agents specified in @p desc
+ *  - Will leave permissions unchanged for agents not specified in @p desc
+ *
+ * @param[in] va previously mapped virtual address
+ * @param[in] size of memory mapping
+ * @param[in] desc list of access permissions for each agent
+ * @param[in] desc_cnt number of elements in desc
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent in desc
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size,
+                                     const hsa_amd_memory_access_desc_t* desc,
+                                     size_t desc_cnt);
+
+/**
+ * @brief Get current access permissions for memory mapping
+ *
+ * Get access permissions for memory mapping for specific agent.
+ *
+ * @param[in] va previously mapped virtual address
+ * @param[out] perms current permissions
+ * @param[in] agent_handle agent
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION va is not mapped or permissions never set for this
+ * agent
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms,
+                                     hsa_agent_t agent_handle);
+
+/**
+ * @brief Get an exportable shareable handle
+ *
+ * Get an exportable shareable handle for a memory_handle. This shareable handle can then be used
+ * to re-create a virtual memory handle using hsa_amd_vmem_import_shareable_handle. The shareable
+ * handle can be transferred using mechanisms that support POSIX file descriptors. Once all
+ * shareable handles are closed, the memory_handle is released.
+ *
+ * @param[out] dmabuf_fd shareable handle
+ * @param[in] handle previously allocated virtual memory handle
+ * @param[in] flags Currently unsupported
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd,
+                                                  hsa_amd_vmem_alloc_handle_t handle,
+                                                  uint64_t flags);
+/**
+ * @brief Import a shareable handle
+ *
+ * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed
+ * and released results in undefined behavior.
+ *
+ * @param[in] dmabuf_fd shareable handle exported with hsa_amd_vmem_export_shareable_handle
+ * @param[out] handle virtual memory handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources
+ *
+ * @retval ::HSA_STATUS_ERROR Unexpected internal error
+ */
+hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd,
+                                                  hsa_amd_vmem_alloc_handle_t* handle);
+
+/**
+ * @brief Returns memory handle for mapped memory
+ *
+ * Return a memory handle for previously mapped memory. The handle will be the same value of handle
+ * used to map the memory. The returned handle must be released with corresponding number of calls
+ * to hsa_amd_vmem_handle_release.
+ *
+ * @param[out] memory_handle memory handle for this mapped address
+ * @param[in] addr mapped address
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid address
+ */
+hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle,
+                                              void* addr);
+
+/**
+ * @brief Returns the current allocation properties of a handle
+ *
+ * Returns the allocation properties of an existing handle
+ *
+ * @param[in] memory_handle memory handle to be queried
+ * @param[out] pool memory pool that owns this handle
+ * @param[out] type memory type
+ *
+ * @retval ::HSA_STATUS_SUCCESS
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle
+ */
+hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(
+    hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool,
+    hsa_amd_memory_type_t* type);
+
+/** @} */
+
+/** \addtogroup queue Queues
+ *  @{
+ */
+
+/**
+ * @brief Set the asynchronous scratch limit threshold on all the queues for this agent.
+ * Dispatches that are enqueued on HW queues on this agent that are smaller than threshold will not
+ * result in a scratch use-once method.
+ *
+ * Increasing this threshold will only increase the internal limit and not cause immediate allocation
+ * of additional scratch memory. Decreasing this threshold will result in a release in scratch memory
+ * on queues where the current amount of allocated scratch exceeds the new limit.
+ *
+ * If this API call would result in a release in scratch memory and there are dispatches that are
+ * currently using scratch memory on this agent, this will result in a blocking call until the
+ * current dispatches are completed.
+ *
+ * This API is only supported on devices that support asynchronous scratch reclaim.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] threshold Threshold size in bytes
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT This agent does not support asynchronous scratch
+ * reclaim
+ */
+hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold);
+
+typedef enum {
+  /*
+   * Returns the agent that owns the underlying HW queue.
+   * The type of this attribute is hsa_agent_t.
+   */
+  HSA_AMD_QUEUE_INFO_AGENT,
+  /*
+   * Returns the doorbell ID of the completion signal of the queue
+   * The type of this attribute is uint64_t.
+   */
+  HSA_AMD_QUEUE_INFO_DOORBELL_ID,
+} hsa_queue_info_attribute_t;
+
+hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
+                                    void* value);
+
+typedef struct hsa_amd_ais_file_handle_s {
+  /*
+   * File handle for AIS read & write. Linux will use fd.
+   * pad keeps the size consistent across different platforms.
+   */
+  union {
+    void*      handle;
+    int        fd;
+    uint8_t    pad[8];
+  };
+} hsa_amd_ais_file_handle_t;
+
+/**
+ * @brief Write data from device memory to a file
+ *
+ * Writes data from device memory buffer to a file at the specified offset.
+ * The device memory pointer must be accessible from the host and point to
+ * a valid allocation.
+ *
+ * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and
+ *  APIs may be modified
+ *
+ * @param[in] handle Handle of the file to write to.
+ *
+ * @param[in] devicePtr Device memory buffer pointer containing data to write.
+ *
+ * @param[in] size Size in bytes of the data to write.
+ *
+ * @param[in] file_offset Offset in bytes into the file where data will be written.
+ *
+ * @param[in,out] size_copied Actual number of bytes copied
+ *
+ * @param[in,out] status Additional status if any
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr
+ * is NULL, or @p size is 0.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to
+ * a valid allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR An error occurred during the write operation.
+ */
+hsa_status_t HSA_API hsa_amd_ais_file_write(hsa_amd_ais_file_handle_t handle, void *devicePtr,
+                                            uint64_t size, int64_t file_offset,
+                                            uint64_t *size_copied, int32_t *status);
+
+/**
+ * @brief Read data from a file to device memory
+ *
+ * Reads data from a file at the specified offset into a device memory buffer.
+ * The device memory pointer must be accessible from the host and point to
+ * a valid allocation.
+ *
+ * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and
+ *  APIs may be modified
+ * @param[in] handle Handle of the file to read from.
+ *
+ * @param[in] devicePtr Device memory buffer pointer to store the read data.
+ *
+ * @param[in] size Size in bytes of the data to read.
+ *
+ * @param[in] file_offset Offset in bytes into the file where data will be read from.
+ *
+ * @param[in,out] size_copied Actual number of bytes copied
+ *
+ * @param[in,out] status Additional status if any
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr
+ * is NULL, or @p size is 0.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to
+ * a valid allocation.
+ *
+ * @retval ::HSA_STATUS_ERROR An error occurred during the read operation.
+ */
+hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devicePtr,
+                                           uint64_t size, int64_t file_offset,
+                                           uint64_t *size_copied, int32_t *status);
+
+/**
+ * @brief logging types
+ */
+typedef enum hsa_amd_log_flag_s {
+  /* Log AQL packets internally enqueued by ROCr */
+  HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0,
+  HSA_AMD_LOG_FLAG_AQL = 0,
+  /* Log SDMA packets */
+  HSA_AMD_LOG_FLAG_SDMA = 1,
+  /* Log INFO */
+  HSA_AMD_LOG_FLAG_INFO = 2,
+} hsa_amd_log_flag_t;
+
+/**
+ * @brief Enable logging via external file
+ * If this function is called multiple times, the last call to this function will overwrite the
+ * previous @p flags and @p file.
+ *
+ * @param[in] flags is used to filter types of logging. Type is uint8_t[8].
+ * Can be set using the hsa_flag_set64 macro. Setting @p flags to 0 will disable logging.
+ * @param[in] file file stream to output logging. If file is NULL, prints are sent to stderr.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void* file);
+
+/** @} */
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif
+
+#endif  // header guard
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h
new file mode 100644
index 0000000000..94c4582055
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h
@@ -0,0 +1,531 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_FINALIZER
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct BrigModuleHeader;
+typedef struct BrigModuleHeader* BrigModule_t;
+
+/** \defgroup ext-alt-finalizer-extensions Finalization Extensions
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ */
+enum {
+  /**
+   * The HSAIL program is invalid.
+   */
+  HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000,
+  /**
+   * The HSAIL module is invalid.
+   */
+  HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001,
+  /**
+   * Machine model or profile of the HSAIL module do not match the machine model
+   * or profile of the HSAIL program.
+   */
+  HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002,
+  /**
+   * The HSAIL module is already a part of the HSAIL program.
+   */
+  HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003,
+  /**
+   * Compatibility mismatch between symbol declaration and symbol definition.
+   */
+  HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004,
+  /**
+   * The finalization encountered an error while finalizing a kernel or
+   * indirect function.
+   */
+  HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005,
+  /**
+   * Mismatch between a directive in the control directive structure and in
+   * the HSAIL kernel.
+   */
+  HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006
+};
+
+/** @} */
+
+/** \defgroup ext-alt-finalizer-program Finalization Program
+ *  @{
+ */
+
+/**
+ * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains
+ * the definition of the BrigModule_t type.
+ */
+typedef BrigModule_t hsa_ext_module_t;
+
+/**
+ * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL
+ * modules that collectively define functions and variables used by kernels and
+ * indirect functions.
+ */
+typedef struct hsa_ext_program_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_ext_program_t;
+
+/**
+ * @brief Create an empty HSAIL program.
+ *
+ * @param[in] machine_model Machine model used in the HSAIL program.
+ *
+ * @param[in] profile Profile used in the HSAIL program.
+ *
+ * @param[in] default_float_rounding_mode Default float rounding mode used in
+ * the HSAIL program.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[out] program Memory location where the HSA runtime stores the newly
+ * created HSAIL program handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid,
+ * @p profile is invalid, @p default_float_rounding_mode is invalid, or
+ * @p program is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_create(
+    hsa_machine_model_t machine_model,
+    hsa_profile_t profile,
+    hsa_default_float_rounding_mode_t default_float_rounding_mode,
+    const char *options,
+    hsa_ext_program_t *program);
+
+/**
+ * @brief Destroy a HSAIL program.
+ *
+ * @details The HSAIL program handle becomes invalid after it has been
+ * destroyed. Code object handles produced by ::hsa_ext_program_finalize are
+ * still valid after the HSAIL program has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with the HSAIL program
+ * (such as HSAIL modules that are added to the HSAIL program) can be released
+ * after the finalization program has been destroyed.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ */
+hsa_status_t HSA_API hsa_ext_program_destroy(
+    hsa_ext_program_t program);
+
+/**
+ * @brief Add a HSAIL module to an existing HSAIL program.
+ *
+ * @details The HSA runtime does not perform a deep copy of the HSAIL module
+ * upon addition. Instead, it stores a pointer to the HSAIL module. The
+ * ownership of the HSAIL module belongs to the application, which must ensure
+ * that @p module is not released before destroying the HSAIL program.
+ *
+ * The HSAIL module is successfully added to the HSAIL program if @p module is
+ * valid, if all the declarations and definitions for the same symbol are
+ * compatible, and if @p module specifies a machine model and profile that match
+ * the HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] module HSAIL module. The application can add the same HSAIL module
+ * to @p program at most once. The HSAIL module must specify the same machine
+ * model and profile as @p program. If the floating-mode rounding mode of @p
+ * module is not default, then it should match that of @p program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p
+ * module does not match machine model of @p program, or the profile of @p
+ * module does not match profile of @p program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is
+ * already a part of the HSAIL program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol
+ * definition compatibility mismatch. See the symbol compatibility rules in the
+ * HSA Programming Reference Manual.
+ */
+hsa_status_t HSA_API hsa_ext_program_add_module(
+    hsa_ext_program_t program,
+    hsa_ext_module_t module);
+
+/**
+ * @brief Iterate over the HSAIL modules in a program, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] callback Callback to be invoked once per HSAIL module in the
+ * program. The HSA runtime passes three arguments to the callback: the program,
+ * a HSAIL module, and the application data.  If @p callback returns a status
+ * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal
+ * stops and ::hsa_ext_program_iterate_modules returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_iterate_modules(
+    hsa_ext_program_t program,
+    hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module,
+                             void* data),
+    void* data);
+
+/**
+ * @brief HSAIL program attributes.
+ */
+typedef enum {
+  /**
+   * Machine model specified when the HSAIL program was created. The type
+   * of this attribute is ::hsa_machine_model_t.
+   */
+  HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0,
+  /**
+   * Profile specified when the HSAIL program was created. The type of
+   * this attribute is ::hsa_profile_t.
+   */
+  HSA_EXT_PROGRAM_INFO_PROFILE = 1,
+  /**
+   * Default float rounding mode specified when the HSAIL program was
+   * created. The type of this attribute is ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2
+} hsa_ext_program_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behaviour is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * HSAIL program attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_get_info(
+    hsa_ext_program_t program,
+    hsa_ext_program_info_t attribute,
+    void *value);
+
+/**
+ * @brief Finalizer-determined call convention.
+ */
+typedef enum {
+ /**
+  * Finalizer-determined call convention.
+  */
+  HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1
+} hsa_ext_finalizer_call_convention_t;
+
+/**
+ * @brief Control directives specify low-level information about the
+ * finalization process.
+ */
+typedef struct hsa_ext_control_directives_s {
+  /**
+   * Bitset indicating which control directives are enabled. The bit assigned to
+   * a control directive is determined by the corresponding value in
+   * BrigControlDirective.
+   *
+   * If a control directive is disabled, its corresponding field value (if any)
+   * must be 0. Control directives that are only present or absent (such as
+   * partial workgroups) have no corresponding field as the presence of the bit
+   * in this mask is sufficient.
+   */
+  uint64_t control_directives_mask;
+  /**
+   * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit
+   * assigned to an HSAIL exception is determined by the corresponding value
+   * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions
+   * control directive, the finalizer uses the union of the two masks.
+   */
+  uint16_t break_exceptions_mask;
+  /**
+   * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The
+   * bit assigned to an HSAIL exception is determined by the corresponding value
+   * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions
+   * control directive, the finalizer uses the union of the two masks.
+   */
+  uint16_t detect_exceptions_mask;
+  /**
+   * Maximum size (in bytes) of dynamic group memory that will be allocated by
+   * the application for any dispatch of the kernel.  If the kernel contains a
+   * maxdynamicsize control directive, the two values should match.
+   */
+  uint32_t max_dynamic_group_size;
+  /**
+   * Maximum number of grid work-items that will be used by the application to
+   * launch the kernel. If the kernel contains a maxflatgridsize control
+   * directive, the value of @a max_flat_grid_size must not be greater than the
+   * value of the directive, and takes precedence.
+   *
+   * The value specified for maximum absolute grid size must be greater than or
+   * equal to the product of the values specified by @a required_grid_size.
+   *
+   * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a
+   * control_directives_mask, this field must be greater than 0.
+   */
+  uint64_t max_flat_grid_size;
+  /**
+   * Maximum number of work-group work-items that will be used by the
+   * application to launch the kernel. If the kernel contains a
+   * maxflatworkgroupsize control directive, the value of @a
+   * max_flat_workgroup_size must not be greater than the value of the
+   * directive, and takes precedence.
+   *
+   * The value specified for maximum absolute work-group size must be greater than or
+   * equal to the product of the values specified by @a required_workgroup_size.
+   *
+   * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a
+   * control_directives_mask, this field must be greater than 0.
+   */
+  uint32_t max_flat_workgroup_size;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+  /**
+   * Grid size that will be used by the application in any dispatch of the
+   * kernel. If the kernel contains a requiredgridsize control directive, the
+   * dimensions should match.
+   *
+   * The specified grid size must be consistent with @a required_workgroup_size
+   * and @a required_dim. Also, the product of the three dimensions must not
+   * exceed @a max_flat_grid_size. Note that the listed invariants must hold
+   * only if all the corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a
+   * control_directives_mask, the three dimension values must be greater than 0.
+   */
+  uint64_t required_grid_size[3];
+  /**
+   * Work-group size that will be used by the application in any dispatch of the
+   * kernel. If the kernel contains a requiredworkgroupsize control directive,
+   * the dimensions should match.
+   *
+   * The specified work-group size must be consistent with @a required_grid_size
+   * and @a required_dim. Also, the product of the three dimensions must not
+   * exceed @a max_flat_workgroup_size. Note that the listed invariants must
+   * hold only if all the corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a
+   * control_directives_mask, the three dimension values must be greater than 0.
+   */
+  hsa_dim3_t required_workgroup_size;
+  /**
+   * Number of dimensions that will be used by the application to launch the
+   * kernel. If the kernel contains a requireddim control directive, the two
+   * values should match.
+   *
+   * The specified dimensions must be consistent with @a required_grid_size and
+   * @a required_workgroup_size. This invariant must hold only if all the
+   * corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a
+   * control_directives_mask, this field must be 1, 2, or 3.
+   */
+  uint8_t required_dim;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved2[75];
+} hsa_ext_control_directives_t;
+
+/**
+ * @brief Finalize an HSAIL program for a given instruction set architecture.
+ *
+ * @details Finalize all of the kernels and indirect functions that belong to
+ * the same HSAIL program for a specific instruction set architecture (ISA). The
+ * transitive closure of all functions specified by call or scall must be
+ * defined. Kernels and indirect functions that are being finalized must be
+ * defined. Kernels and indirect functions that are referenced in kernels and
+ * indirect functions being finalized may or may not be defined, but must be
+ * declared. All the global/readonly segment variables that are referenced in
+ * kernels and indirect functions being finalized may or may not be defined, but
+ * must be declared.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] isa Instruction set architecture to finalize for.
+ *
+ * @param[in] call_convention A call convention used in a finalization. Must
+ * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive)
+ * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p
+ * isa (not inclusive).
+ *
+ * @param[in] control_directives Low-level control directives that influence
+ * the finalization process.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[in] code_object_type Type of code object to produce.
+ *
+ * @param[out] code_object Code object generated by the Finalizer, which
+ * contains the machine code for the kernels and indirect functions in the HSAIL
+ * program. The code object is independent of the HSAIL module that was used to
+ * generate it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in
+ * the control directive structure and in the HSAIL kernel mismatch, or if the
+ * same directive is used with a different value in one of the functions used by
+ * this kernel.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer
+ * encountered an error while compiling a kernel or an indirect function.
+ */
+hsa_status_t HSA_API hsa_ext_program_finalize(
+    hsa_ext_program_t program,
+    hsa_isa_t isa,
+    int32_t call_convention,
+    hsa_ext_control_directives_t control_directives,
+    const char *options,
+    hsa_code_object_type_t code_object_type,
+    hsa_code_object_t *code_object);
+
+/** @} */
+
+#define hsa_ext_finalizer_1_00
+/** @brief Function-pointer table for the finalizer extension; members carry the hsa_ext_program_* names. */
+typedef struct hsa_ext_finalizer_1_00_pfn_s {
+  hsa_status_t (*hsa_ext_program_create)(
+      hsa_machine_model_t machine_model, hsa_profile_t profile,
+      hsa_default_float_rounding_mode_t default_float_rounding_mode,
+      const char *options, hsa_ext_program_t *program);
+
+  hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program);
+
+  hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program,
+                                                 hsa_ext_module_t module);
+
+  hsa_status_t (*hsa_ext_program_iterate_modules)(
+      hsa_ext_program_t program,
+      hsa_status_t (*callback)(hsa_ext_program_t program,
+                               hsa_ext_module_t module, void *data),
+      void *data);
+
+  hsa_status_t (*hsa_ext_program_get_info)(
+      hsa_ext_program_t program, hsa_ext_program_info_t attribute,
+      void *value);
+
+  hsa_status_t (*hsa_ext_program_finalize)(
+      hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention,
+      hsa_ext_control_directives_t control_directives, const char *options,
+      hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object);
+} hsa_ext_finalizer_1_00_pfn_t;
+
+#ifdef __cplusplus
+} // extern "C" block
+#endif // __cplusplus
+
+#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h
new file mode 100644
index 0000000000..cad9b50820
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h
@@ -0,0 +1,1515 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_EXT_IMAGE_H
+#define HSA_EXT_IMAGE_H
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_IMAGES
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/ 
+
+/** \defgroup ext-images Images and Samplers
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ *
+ * @remark Additions to hsa_status_t
+ */
+enum {
+    /**
+     * Image format is not supported.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000,
+    /**
+     * Image size is not supported.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001,
+    /**
+     * Image pitch is not supported or invalid.
+     */
+    HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002,
+    /**
+     * Sampler descriptor is not supported or invalid.
+     */
+    HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003
+};
+
+/**
+ * @brief Enumeration constants added to ::hsa_agent_info_t by this
+ * extension.
+ *
+ * @remark Additions to hsa_agent_info_t
+ */
+enum {
+  /**
+   * Maximum number of elements in 1D images. Must be at least 16384. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000,
+  /**
+   * Maximum number of elements in 1DA images. Must be at least 16384. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001,
+  /**
+   * Maximum number of elements in 1DB images. Must be at least 65536. The type
+   * of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002,
+  /**
+   * Maximum dimensions (width, height) of 2D images, in image elements. The X
+   * and Y maximums must be at least 16384. The type of this attribute is
+   * size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003,
+  /**
+   * Maximum dimensions (width, height) of 2DA images, in image elements. The X
+   * and Y maximums must be at least 16384. The type of this attribute is
+   * size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004,
+  /**
+   * Maximum dimensions (width, height) of 2DDEPTH images, in image
+   * elements. The X and Y maximums must be at least 16384. The type of this
+   * attribute is size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005,
+  /**
+   * Maximum dimensions (width, height) of 2DADEPTH images, in image
+   * elements. The X and Y maximums must be at least 16384. The type of this
+   * attribute is size_t[2].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006,
+  /**
+   * Maximum dimensions (width, height, depth) of 3D images, in image
+   * elements. The maximum along any dimension must be at least 2048. The type
+   * of this attribute is size_t[3].
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007,
+  /**
+   * Maximum number of image layers in an image array. Must be at least 2048. The
+   * type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008,
+  /**
+   * Maximum number of read-only image handles that can be created for an agent at any one
+   * time. Must be at least 128. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009,
+  /**
+   * Maximum number of write-only and read-write image handles (combined) that
+   * can be created for an agent at any one time. Must be at least 64. The type of this
+   * attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A,
+  /**
+   * Maximum number of sampler handlers that can be created for an agent at any one
+   * time. Must be at least 16. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B,
+  /**
+   * Image pitch alignment. The agent only supports linear image data
+   * layouts with a row pitch that is a multiple of this value. Must be
+   * a power of 2. The type of this attribute is size_t.
+   */
+  HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C
+};
+
+/**
+ * @brief Image handle, populated by ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout. Image
+ * handles are only unique within an agent, not across agents.
+ *
+ */
+typedef struct hsa_ext_image_s {
+  /**
+   *  Opaque handle. For a given agent, two handles reference the same object of
+   *  the enclosing type if and only if they are equal.
+   */
+    uint64_t handle;
+
+} hsa_ext_image_t;
+
+/**
+ * @brief Geometry associated with the image. This specifies the
+ * number of image dimensions and whether the image is an image
+ * array. See the <em>Image Geometry</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions on each
+ * geometry. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_geometry_t.
+ */
+typedef enum {
+  /**
+   * One-dimensional image addressed by width coordinate.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1D = 0,
+
+  /**
+   * Two-dimensional image addressed by width and height coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2D = 1,
+
+  /**
+   * Three-dimensional image addressed by width, height, and depth coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_3D = 2,
+
+  /**
+   * Array of one-dimensional images with the same size and format. 1D arrays
+   * are addressed by width and index coordinate.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1DA = 3,
+
+  /**
+   * Array of two-dimensional images with the same size and format. 2D arrays
+   * are addressed by width, height, and index coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DA = 4,
+
+  /**
+   * One-dimensional image addressed by width coordinate. It has
+   * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An
+   * image with an opaque image data layout will always use a linear
+   * image data layout, and one with an explicit image data layout
+   * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_1DB = 5,
+
+  /**
+   * Two-dimensional depth image addressed by width and height coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6,
+
+  /**
+   * Array of two-dimensional depth images with the same size and format. 2D
+   * arrays are addressed by width, height, and index coordinates.
+   */
+  HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7
+} hsa_ext_image_geometry_t;
+
+/**
+ * @brief Channel type associated with the elements of an image. See
+ * the <em>Channel Type</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel type. The
+ * enumeration values and definition match the BRIG type @p
+ * hsa_ext_brig_image_channel_type_t.
+ */
+typedef enum {
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+    HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} hsa_ext_image_channel_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_type32_t;
+    
+/**
+ *
+ * @brief Channel order associated with the elements of an image. See
+ * the <em>Channel Order</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel order. The
+ * enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_channel_order_t.
+ */
+typedef enum {
+    HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+    HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} hsa_ext_image_channel_order_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_order32_t;
+    
+
+/**
+ * @brief Image format.
+ */
+typedef struct hsa_ext_image_format_s {
+  /**
+    * Channel type.
+    */
+    hsa_ext_image_channel_type32_t channel_type;
+
+   /**
+    * Channel order.
+    */
+    hsa_ext_image_channel_order32_t channel_order;
+} hsa_ext_image_format_t;
+
+/**
+ * @brief Implementation independent image descriptor.
+ */
+typedef struct hsa_ext_image_descriptor_s {
+    /**
+     * Image geometry.
+     */
+    hsa_ext_image_geometry_t geometry;
+    /**
+     * Width of the image, in components.
+     */
+    size_t width;
+    /**
+     * Height of the image, in components. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D,
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DA, ::HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+     */
+    size_t height;
+    /**
+     * Depth of the image, in components. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0.
+     */
+    size_t depth;
+    /**
+     * Number of image layers in the image array. Only used if the geometry is
+     * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+     * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+     */
+    size_t array_size;
+    /**
+     * Image format.
+     */
+    hsa_ext_image_format_t format;
+} hsa_ext_image_descriptor_t;
+
+/**
+ * @brief Image capability.
+ */
+typedef enum  {
+   /**
+    * Images of this geometry, format, and layout are not supported by
+    * the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0,
+   /**
+    * Read-only images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1,
+   /**
+    * Write-only images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2,
+   /**
+    * Read-write images of this geometry, format, and layout are
+    * supported by the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4,
+   /**
+    * @deprecated Images of this geometry, format, and layout can be accessed from
+    * read-modify-write atomic operations in the agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8,
+    /**
+    * Images of this geometry, format, and layout are guaranteed to
+    * have a consistent data layout regardless of how they are
+    * accessed by the associated agent.
+    */
+    HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10
+} hsa_ext_image_capability_t;
+
+/**
+ * @brief Image data layout.
+ *
+ * @details An image data layout denotes such aspects of image data
+ * layout as tiling and organization of channels in memory. Some image
+ * data layouts may only apply to specific image geometries, formats,
+ * and access permissions. Different agents may support different
+ * image layout identifiers, including vendor specific layouts. Note
+ * that an agent may not support the same image data layout for
+ * different access permissions to images with the same image
+ * geometry, size, and format. If multiple agents support the same
+ * image data layout then it is possible to use separate image handles
+ * for each agent that references the same image data.
+ */
+
+typedef enum  {
+   /**
+    * An implementation specific opaque image data layout which can
+    * vary depending on the agent, geometry, image format, image size,
+    * and access permissions.
+    */
+    HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0,
+   /**
+    * The image data layout is specified by the following rules in
+    * ascending byte address order. For a 3D image, 2DA image array,
+    * or 1DA image array, the image data is stored as a linear sequence
+    * of adjacent 2D image slices, 2D images, or 1D images
+    * respectively, spaced according to the slice pitch. Each 2D image
+    * is stored as a linear sequence of adjacent image rows, spaced
+    * according to the row pitch. Each 1D or 1DB image is stored as a
+    * single image row. Each image row is stored as a linear sequence
+    * of image elements. Each image element is stored as a linear
+    * sequence of image components specified by the left to right
+    * channel order definition. Each image component is stored using
+    * the memory type specified by the channel type.
+    *
+    * The 1DB image geometry always uses the linear image data layout.
+    */
+    HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1
+} hsa_ext_image_data_layout_t;
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, and image format for an image created with an opaque image
+ * data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, image format, and image layout for an image created with
+ * an explicit image data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[in] image_data_layout The image data layout.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_get_capability instead.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    hsa_ext_image_data_layout_t image_data_layout,
+    uint32_t *capability_mask);
+
+/**
+ * @brief Agent specific image size and alignment requirements, populated by
+ * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout.
+ */
+typedef struct hsa_ext_image_data_info_s {
+  /**
+   * Image data size, in bytes.
+   */
+  size_t size;
+
+  /**
+   * Image data alignment, in bytes. Must always be a power of 2.
+   */
+  size_t alignment;
+
+} hsa_ext_image_data_info_t;
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of agent, image
+ * descriptor, and access permission for an image created with an opaque image
+ * data layout.
+ *
+ * @details The optimal image data size and alignment requirements may
+ * vary depending on the image attributes specified in @p
+ * image_descriptor, the @p access_permission, and the @p agent. Also,
+ * different implementations of the HSA runtime may return different
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for
+ * different access permissions with matching image descriptors as long
+ * as ::hsa_ext_image_get_capability reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image
+ * descriptors match if they have the same values, with the exception
+ * that s-form channel orders match the corresponding non-s-form
+ * channel order and vice versa.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by @p agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The @p agent must support the image format
+ * specified in @p image_descriptor for the given @p
+ * access_permission.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p
+ * agent does not support the image format specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * access_permission is not a valid access permission value, or @p
+ * image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of
+ * image descriptor, access permission, image data layout, image data row pitch,
+ * and image data slice pitch for an image created with an explicit image
+ * data layout.
+ *
+ * @details The image data size and alignment requirements may vary
+ * depending on the image attributes specified in @p image_descriptor,
+ * the @p access_permission, and the image layout. However, different
+ * implementations of the HSA runtime will return the same
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for different
+ * access permissions with matching image descriptors and matching image layouts
+ * as long as ::hsa_ext_image_get_capability reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image descriptors
+ * match if they have the same values, with the exception that s-form channel
+ * orders match the corresponding non-s-form channel order and vice versa. Image
+ * layouts match if they are the same image data layout and use the same image
+ * row and slice pitch values.
+ *
+ * @param[in] agent Agent to query; presumably as in ::hsa_ext_image_data_get_info (TODO confirm).
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by an agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type.
+ *
+ * @param[in] image_data_layout The image data layout to use.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_data_get_info instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image
+ * format specified by @p image_descriptor is not supported for the
+ * @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image
+ * dimensions specified by @p image_descriptor are not supported for
+ * the @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and
+ * slice pitch specified by @p image_data_row_pitch and @p
+ * image_data_slice_pitch are invalid or not supported.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an
+ * opaque image data layout.
+ *
+ * @details Images with an opaque image data layout created with
+ * different access permissions but matching image descriptors and
+ * same agent can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability for the image format specified in
+ * the image descriptor. Image descriptors match if they have the same
+ * values, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa.
+ *
+ * If necessary, an application can use image operations (import,
+ * export, copy, clear) to prepare the image for the intended use
+ * regardless of the access permissions.
+ *
+ * @param[in] agent agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by agent. The access permission defines how the agent
+ * is allowed to access the image using the image handle created and
+ * must match the corresponding HSAIL image handle type. The agent
+ * must support the image format specified in @p image_descriptor for
+ * the given @p access_permission.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent
+ * does not have the capability to support the image format contained
+ * in @p image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources (for example, the agent does not
+ * support the creation of more image handles with the given
+ * @p access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p access_permission is not a valid access permission
+ * value, or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an explicit
+ * image data layout.
+ *
+ * @details Images with an explicit image data layout created with
+ * different access permissions but matching image descriptors and
+ * matching image layout can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability_with_layout for the image format
+ * specified in the image descriptor and specified image data
+ * layout. Image descriptors match if they have the same values, with
+ * the exception that s-form channel orders match the corresponding
+ * non-s-form channel order and vice versa. Image layouts match if
+ * they are the same image data layout and use the same image row and
+ * slice values.
+ *
+ * If necessary, an application can use image operations (import, export, copy,
+ * clear) to prepare the image for the intended use regardless of the access
+ * permissions.
+ *
+ * @param[in] agent Agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by the agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The agent must support the image format
+ * specified in @p image_descriptor for the given @p access_permission
+ * and @p image_data_layout.
+ *
+ * @param[in] image_data_layout The image data layout to use for the
+ * @p image_data. It is invalid to use
+ * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create
+ * instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does
+ * not have the capability to support the image format contained in the image
+ * descriptor using the specified @p access_permission and @p image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission and @p
+ * image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does
+ * not support the row and slice pitch specified by @p image_data_row_pitch
+ * and @p image_data_slice_pitch, or the values are invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources (for example, the agent does not
+ * support the creation of more image handles with the given
+ * @p access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create_with_layout(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_t *image);
+
+/**
+ * @brief Destroy an image handle previously created using ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout.
+ *
+ * @details Destroying the image handle does not free the associated image data,
+ * or modify its contents. The application should not destroy an image handle while
+ * there are references to it queued for execution or currently being used in a
+ * kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_image_destroy(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+/**
+ * @brief Copies a portion of one image (the source) to another image (the
+ * destination).
+ *
+ * @details The source and destination image formats should be the
+ * same, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa. For example,
+ * it is allowed to copy a source image with a channel order of
+ * ::HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a
+ * channel order of ::HSA_EXT_IMAGE_CHANNEL_ORDER_RGB.
+ *
+ * The source and destination images do not have to be of the same geometry and
+ * appropriate scaling is performed by the HSA runtime. It is possible to copy
+ * subregions between any combinations of source and destination geometries, provided
+ * that the dimensions of the subregions are the same. For example, it is
+ * allowed to copy a rectangular region from a 2D image to a slice of a 3D
+ * image.
+ *
+ * If the source and destination image data overlap, or the combination of
+ * offset and range references an out-of-bounds element in any of the images,
+ * the behavior is undefined.
+ *
+ * @param[in] agent Agent associated with both the source and destination image handles.
+ *
+ * @param[in] src_image Image handle of source image. The agent associated with the source
+ * image handle must be identical to that of the destination image.
+ *
+ * @param[in] src_offset Pointer to the offset within the source image where to
+ * copy the data from. Must not be NULL.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] dst_offset Pointer to the offset within the destination
+ * image where to copy the data. Must not be NULL.
+ *
+ * @param[in] range Dimensions of the image portion to be copied. The HSA
+ * runtime computes the size of the image data to be copied using this
+ * argument. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is
+ * NULL, @p dst_offset is NULL, or @p range is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_copy(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+/**
+ * @brief Image region.
+ */
+typedef struct hsa_ext_image_region_s {
+   /**
+    * Offset within an image (in coordinates).
+    */
+    hsa_dim3_t offset;
+
+   /**
+    * Dimension size of the image range (in coordinates). The x, y, and z dimensions
+    * correspond to width, height, and depth or index respectively.
+    */
+    hsa_dim3_t range;
+} hsa_ext_image_region_t;
+
+/**
+ * @brief Import a linearly organized image data from memory directly to an
+ * image handle.
+ *
+ * @details This operation updates the image data referenced by the image handle
+ * from the source memory. The size of the data imported from memory is
+ * implicitly derived from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * None of the source memory or destination image data memory can
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the import operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_memory Source memory. Must not be NULL.
+ *
+ * @param[in] src_row_pitch The size in bytes of a single row of the image in the
+ * source memory. If the value is smaller than the destination image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image layer in an image array in the source memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p src_row_pitch * destination image region height, then the value used for
+ * @p src_row_pitch * destination image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] image_region Pointer to the image region to be updated. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p
+ * image_region is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_ext_image_import(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Export the image data to linearly organized memory.
+ *
+ * @details The operation updates the destination memory with the image data of
+ * @p src_image. The size of the data exported to memory is implicitly derived
+ * from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * None of the destination memory or source image data memory can
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the export operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_image Image handle of source image.
+ *
+ * @param[in] dst_memory Destination memory. Must not be NULL.
+ *
+ * @param[in] dst_row_pitch The size in bytes of a single row of the image in the
+ * destination memory. If the value is smaller than the source image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image in an image array in the destination memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p dst_row_pitch * source image region height, then the value used for
+ * @p dst_row_pitch * source image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] image_region Pointer to the image region to be exported. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_export(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Clear a region of an image so that every image element has
+ * the specified value.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle for image to be cleared.
+ *
+ * @param[in] data The value to which to set each image element being
+ * cleared. It is specified as an array of image component values. The
+ * number of array elements must match the number of access components
+ * for the image channel order. The type of each array element must
+ * match the image access type of the image channel type. When the
+ * value is used to set the value of an image element, the conversion
+ * method corresponding to the image channel type is used. See the
+ * <em>Channel Order</em> section and <em>Channel Type</em> section in
+ * the <em>HSA Programming Reference Manual</em> for more
+ * information. Must not be NULL.
+ *
+ * @param[in] image_region Pointer to the image region to clear. Must not be
+ * NULL. If the region references an out-of-bounds element, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_clear(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Sampler handle. Samplers are populated by
+ * ::hsa_ext_sampler_create or ::hsa_ext_sampler_create_v2. Sampler handles are only unique
+ *  within an agent, not across agents.
+ */
+typedef struct hsa_ext_sampler_s {
+  /**
+   *  Opaque handle. For a given agent, two handles reference the same object of
+   *  the enclosing type if and only if they are equal.
+   */
+    uint64_t handle;
+} hsa_ext_sampler_t;
+
+/**
+ * @brief Sampler address modes. The sampler address mode describes
+ * the processing of out-of-range image coordinates. See the
+ * <em>Addressing Mode</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each address mode. The values
+ * match the BRIG type @p hsa_ext_brig_sampler_addressing_t.
+ */
+typedef enum {
+  /**
+   * Out-of-range coordinates are not handled.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0,
+
+  /**
+   * Clamp out-of-range coordinates to the image edge.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1,
+
+  /**
+   * Clamp out-of-range coordinates to the image border color.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2,
+
+  /**
+   * Wrap out-of-range coordinates back into the valid coordinate
+   * range so the image appears as repeated tiles.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3,
+
+  /**
+   * Mirror out-of-range coordinates back into the valid coordinate
+   * range so the image appears as repeated tiles with every other
+   * tile a reflection.
+   */
+  HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4
+
+} hsa_ext_sampler_addressing_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_addressing_mode32_t;
+
+/**
+ * @brief Sampler coordinate normalization modes. See the
+ * <em>Coordinate Normalization Mode</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions on each
+ * coordinate normalization mode. The values match the BRIG type @p
+ * hsa_ext_brig_sampler_coord_normalization_t.
+ */
+typedef enum {
+
+  /**
+   * Coordinates are used to directly address an image element.
+   */
+  HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0,
+
+  /**
+   * Coordinates are scaled by the image dimension size before being
+   * used to address an image element.
+   */
+  HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1
+
+} hsa_ext_sampler_coordinate_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_coordinate_mode32_t;
+    
+
+/**
+ * @brief Sampler filter modes. See the <em>Filter Mode</em> section
+ * in the <em>HSA Programming Reference Manual</em> for definitions
+ * on each filter mode. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_sampler_filter_t.
+ */
+typedef enum {
+  /**
+   * Filter to the image element nearest (in Manhattan distance) to the
+   * specified coordinate.
+   */
+  HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0,
+
+  /**
+   * Filter to the image element calculated by combining the elements in a 2x2
+   * square block or 2x2x2 cube block around the specified coordinate. The
+   * elements are combined using linear interpolation.
+   */
+  HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1
+
+} hsa_ext_sampler_filter_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants.
+ */
+typedef uint32_t hsa_ext_sampler_filter_mode32_t;
+
+/**
+ * @brief Implementation independent sampler descriptor.
+ */
+typedef struct hsa_ext_sampler_descriptor_s {
+  /**
+   * Sampler coordinate mode describes the normalization of image coordinates.
+   */
+  hsa_ext_sampler_coordinate_mode32_t coordinate_mode;
+
+  /**
+   * Sampler filter type describes the type of sampling performed.
+   */
+  hsa_ext_sampler_filter_mode32_t filter_mode;
+
+  /**
+   * Sampler address mode describes the processing of out-of-range image
+   * coordinates.
+   */
+  hsa_ext_sampler_addressing_mode32_t address_mode;
+} hsa_ext_sampler_descriptor_t;
+
+/**
+ * @brief Implementation independent sampler descriptor v2 which supports
+ * a different address mode along each of the X, Y and Z axes.
+ */
+typedef struct hsa_ext_sampler_descriptor_v2_s {
+  /**
+   * Sampler coordinate mode describes the normalization of image coordinates.
+   */
+  hsa_ext_sampler_coordinate_mode32_t coordinate_mode;
+
+  /**
+   * Sampler filter type describes the type of sampling performed.
+   */
+  hsa_ext_sampler_filter_mode32_t filter_mode;
+
+  /**
+   * Sampler address modes describe the processing of out-of-range image
+   * coordinates, one entry per axis.
+   */
+  hsa_ext_sampler_addressing_mode32_t address_modes[3]; // X, Y and Z axes
+} hsa_ext_sampler_descriptor_v2_t;
+
+/**
+ * @brief Create an agent specific sampler handle for a given agent
+ * independent sampler descriptor and agent.
+ *
+ * @param[in] agent Agent to be associated with the sampler handle created.
+ *
+ * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be
+ * NULL.
+ *
+ * @param[out] sampler Memory location where the HSA runtime stores the newly
+ * created sampler handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
+ * @p agent does not have the capability to support the properties
+ * specified by @p sampler_descriptor or it is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or
+ * @p sampler is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_create(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+/**
+ * @brief Create an agent specific sampler handle for a given agent
+ * independent sampler descriptor v2 and agent.
+ *
+ * @param[in] agent Agent to be associated with the sampler handle created.
+ *
+ * @param[in] sampler_descriptor Pointer to a v2 sampler descriptor. Must not be
+ * NULL.
+ *
+ * @param[out] sampler Memory location where the HSA runtime stores the newly
+ * created sampler handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
+ * @p agent does not have the capability to support the properties
+ * specified by @p sampler_descriptor or it is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or
+ * @p sampler is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_create_v2(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+/**
+ * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create or
+ * ::hsa_ext_sampler_create_v2.
+ *
+ * @details The sampler handle should not be destroyed while there are
+ * references to it queued for execution or currently being used in a
+ * kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the sampler handle.
+ *
+ * @param[in] sampler Sampler handle to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_destroy(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+
+#define hsa_ext_images_1_00
+
+/**
+ * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ext_images_1_00_pfn_s {
+
+  hsa_status_t (*hsa_ext_image_get_capability)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_image_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+  hsa_status_t (*hsa_ext_image_copy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+  hsa_status_t (*hsa_ext_image_import)(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_export)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_clear)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_sampler_create)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+  hsa_status_t (*hsa_ext_sampler_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+} hsa_ext_images_1_00_pfn_t;
+
+#define hsa_ext_images_1
+
+/**
+ * @brief The function pointer table for the images v1 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ext_images_1_pfn_s {
+
+  hsa_status_t (*hsa_ext_image_get_capability)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_image_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image);
+
+  hsa_status_t (*hsa_ext_image_copy)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    const hsa_dim3_t* src_offset,
+    hsa_ext_image_t dst_image,
+    const hsa_dim3_t* dst_offset,
+    const hsa_dim3_t* range);
+
+  hsa_status_t (*hsa_ext_image_import)(
+    hsa_agent_t agent,
+    const void *src_memory,
+    size_t src_row_pitch,
+    size_t src_slice_pitch,
+    hsa_ext_image_t dst_image,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_export)(
+    hsa_agent_t agent,
+    hsa_ext_image_t src_image,
+    void *dst_memory,
+    size_t dst_row_pitch,
+    size_t dst_slice_pitch,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_image_clear)(
+    hsa_agent_t agent,
+    hsa_ext_image_t image,
+    const void* data,
+    const hsa_ext_image_region_t *image_region);
+
+  hsa_status_t (*hsa_ext_sampler_create)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+  hsa_status_t (*hsa_ext_sampler_destroy)(
+    hsa_agent_t agent,
+    hsa_ext_sampler_t sampler);
+
+  hsa_status_t (*hsa_ext_image_get_capability_with_layout)(
+    hsa_agent_t agent,
+    hsa_ext_image_geometry_t geometry,
+    const hsa_ext_image_format_t *image_format,
+    hsa_ext_image_data_layout_t image_data_layout,
+    uint32_t *capability_mask);
+
+  hsa_status_t (*hsa_ext_image_data_get_info_with_layout)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_data_info_t *image_data_info);
+
+  hsa_status_t (*hsa_ext_image_create_with_layout)(
+    hsa_agent_t agent,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_access_permission_t access_permission,
+    hsa_ext_image_data_layout_t image_data_layout,
+    size_t image_data_row_pitch,
+    size_t image_data_slice_pitch,
+    hsa_ext_image_t *image);
+
+  hsa_status_t (*hsa_ext_sampler_create_v2)(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler);
+
+} hsa_ext_images_1_pfn_t;
+/** @} */
+    
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif /*__cplusplus*/ 
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h
new file mode 100644
index 0000000000..a49221c49e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h
@@ -0,0 +1,488 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+
+#include <stdint.h>
+#include "hsa.h"
+
+#define HSA_AQLPROFILE_VERSION_MAJOR 2
+#define HSA_AQLPROFILE_VERSION_MINOR 0
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+////////////////////////////////////////////////////////////////////////////////
+// Library version (explicit void parameter lists keep these true C prototypes)
+uint32_t hsa_ven_amd_aqlprofile_version_major(void);
+uint32_t hsa_ven_amd_aqlprofile_version_minor(void);
+
+///////////////////////////////////////////////////////////////////////
+// Library API:
+// The library provides helper methods for instantiating
+// the profile context object and for populating the start
+// and stop AQL packets. The profile object contains a list of
+// profiling events and the buffer descriptors needed for profiling:
+// a command buffer and an output data buffer. To report errors,
+// the library methods return a status code. The library also
+// provides methods for querying the required buffer attributes,
+// for validating the event attributes and for getting the profiling
+// output data.
+//
+// Returned status:
+//     hsa_status_t - HSA status codes from the hsa.h header are used
+//
+// Supported profiling features:
+//
+// Supported profiling events
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0,
+  HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1,
+} hsa_ven_amd_aqlprofile_event_type_t;
+
+// Supported performance counters (PMC) blocks
+// The block ID is the same for a block instances set, for example
+// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN
+// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC.
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14,
+  // Memory related blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24,
+  // System blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25,
+  // GFX10 added blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31,
+
+  // UMC & MMEA System Blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33,
+
+  HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER
+} hsa_ven_amd_aqlprofile_block_name_t;
+
+// PMC event object structure
+// 'counter_id' value is specified in GFXIPs perfcounter user guides
+// which is the counters select value, "Performance Counters Selection"
+// chapter.
+typedef struct {
+  hsa_ven_amd_aqlprofile_block_name_t block_name;
+  uint32_t block_index;
+  uint32_t counter_id;
+} hsa_ven_amd_aqlprofile_event_t;
+
+// Check if the event is valid for the specific GPU
+hsa_status_t hsa_ven_amd_aqlprofile_validate_event(
+    hsa_agent_t agent,                            // [in] HSA handle for the profiling GPU
+    const hsa_ven_amd_aqlprofile_event_t* event,  // [in] Pointer to the event to validate
+    bool* result);                                // [out] True if the event is valid, False otherwise
+
+// Profiling parameters
+// All parameters are generic and if not applicable for a specific
+// profile configuration then error status will be returned.
+typedef enum {
+  /**
+   * Select the target compute unit (wgp) for profiling.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0,
+  /**
+   * VMID Mask
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4,
+  /**
+   * Shader engine mask for selection.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7,
+  /**
+   * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi)
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8,
+  /**
+   * Set true for occupancy collection only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9,
+  /**
+   * ATT collection max data size, in MB. Shared among shader engines.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10,
+  /**
+   * Mask of which compute units to generate perfcounters. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240,
+  /**
+   * Select collection period for perfcounters. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241,
+  /**
+   * Select perfcounter ID (SQ block) for collection. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242,
+} hsa_ven_amd_aqlprofile_parameter_name_t;
+
+// Profile parameter object
+typedef struct {
+  hsa_ven_amd_aqlprofile_parameter_name_t parameter_name;
+  uint32_t value;
+} hsa_ven_amd_aqlprofile_parameter_t;
+
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 = 0,
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1,
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2,
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3
+} hsa_ven_amd_aqlprofile_att_marker_channel_t;
+
+//
+// Profile context object:
+// The library provides a profile object structure which contains
+// the events array, a buffer for the profiling start/stop commands
+// and a buffer for the output data.
+// The buffers are specified by the buffer descriptors and allocated
+// by the application. The buffers allocation attributes, the command
+// buffer size, the PMC output buffer size as well as profiling output
+// data can be obtained using the generic get profile info helper _get_info.
+//
+// Buffer descriptor
+typedef struct {
+  void* ptr;
+  uint32_t size;
+} hsa_ven_amd_aqlprofile_descriptor_t;
+
+// Profile context object structure, contains the profiling events list and
+// the buffer descriptors needed for profiling: a command buffer and
+// an output data buffer
+typedef struct {
+  hsa_agent_t agent;                                     // GFXIP handle
+  hsa_ven_amd_aqlprofile_event_type_t type;              // Events type
+  const hsa_ven_amd_aqlprofile_event_t* events;          // Events array
+  uint32_t event_count;                                  // Events count
+  const hsa_ven_amd_aqlprofile_parameter_t* parameters;  // Parameters array
+  uint32_t parameter_count;                              // Parameters count
+  hsa_ven_amd_aqlprofile_descriptor_t output_buffer;     // Output buffer
+  hsa_ven_amd_aqlprofile_descriptor_t command_buffer;    // PM4 commands
+} hsa_ven_amd_aqlprofile_profile_t;
+
+//
+// AQL packets populating methods:
+// Helper methods to populate the application-provided START and STOP
+// AQL packets, which the application is required to submit before and
+// after the profiled GPU task packets respectively.
+//
+// AQL Vendor Specific packet which carries a PM4 command
+typedef struct {
+  uint16_t header;
+  uint16_t pm4_command[27];
+  hsa_signal_t completion_signal;
+} hsa_ext_amd_aql_pm4_packet_t;
+
+// Method to populate the provided AQL packet with profiling start commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_start(
+    hsa_ven_amd_aqlprofile_profile_t* profile,        // [in,out] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);  // [out] profile start AQL packet
+
+// Method to populate the provided AQL packet with profiling stop commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_stop(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);   // [out] profile stop AQL packet
+
+// Method to populate the provided AQL packet with profiling read commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_read(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_read_packet);   // [out] profile read AQL packet
+
+// Legacy devices, PM4 profiling packet size (static: no duplicate external definitions in C)
+static const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192;
+// Legacy devices, converting the profiling AQL packet to PM4 packet blob
+hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4(
+    const hsa_ext_amd_aql_pm4_packet_t* aql_packet,  // [in] AQL packet
+    void* data);                                     // [out] PM4 packet blob
+
+// Method to add a marker (correlation ID) into the ATT buffer.
+hsa_status_t hsa_ven_amd_aqlprofile_att_marker(
+    hsa_ven_amd_aqlprofile_profile_t* profile,            // [in,out] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_marker_packet,      // [out] profile marker AQL packet
+    uint32_t data,                                        // [in] Data to be inserted
+    hsa_ven_amd_aqlprofile_att_marker_channel_t channel); // [in] Comm channel
+
+//
+// Get profile info:
+// Generic method for getting various profile info including profile buffers
+// attributes like the command buffer size and the profiling PMC results.
+// It's implied that all counters are 64bit values.
+//
+// Profile generic output data:
+typedef struct {
+  uint32_t sample_id;  // PMC sample or trace buffer index
+  union {
+    struct {
+      hsa_ven_amd_aqlprofile_event_t event;  // PMC event
+      uint64_t result;                       // PMC result
+    } pmc_data;
+    hsa_ven_amd_aqlprofile_descriptor_t trace_data;  // Trace output data descriptor
+  };
+} hsa_ven_amd_aqlprofile_info_data_t;
+
+// ID query type
+typedef struct {
+  const char* name;
+  uint32_t id;
+  uint32_t instance_count;
+} hsa_ven_amd_aqlprofile_id_query_t;
+
+// Profile attributes
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0,  // get_info returns uint32_t value
+  HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1,        // get_info returns uint32_t value
+  HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2,             // get_info returns PMC uint64_t value
+                                                        // in info_data object
+  HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3,           // get_info returns trace buffer ptr/size
+                                                        // in info_data object
+  HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4,       // get_info returns number of block counter
+  HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5,             // get_info returns block id, instances
+                                                        // by name string using _id_query_t
+  HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6,           // get_info returns size/pointer for
+                                                        // counters enable command buffer
+  HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7,          // get_info returns size/pointer for
+                                                        // counters disable command buffer
+} hsa_ven_amd_aqlprofile_info_type_t;
+
+
+// Definition of output data iterator callback
+typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)(
+    hsa_ven_amd_aqlprofile_info_type_t info_type,   // [in] data type, PMC or trace data
+    hsa_ven_amd_aqlprofile_info_data_t* info_data,  // [in] info_data object
+    void* callback_data);                           // [in,out] data passed to the callback
+
+// Method for getting the profile info
+hsa_status_t hsa_ven_amd_aqlprofile_get_info(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ven_amd_aqlprofile_info_type_t attribute,     // [in] requested profile attribute
+    void* value);                                     // [in,out] returned value
+
+// Method for iterating the events output data
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_data(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ven_amd_aqlprofile_data_callback_t callback,  // [in] callback to iterate the output data
+    void* data);                                      // [in,out] data passed to the callback
+
+// Return error string
+hsa_status_t hsa_ven_amd_aqlprofile_error_string(
+    const char** str);  // [out] pointer to the error string
+
+/**
+ * @brief Callback for iteration of all possible event coordinate IDs and coordinate names.
+ */
+typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name);
+/**
+ * @brief Iterate over all possible event coordinate IDs and their names.
+ */
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t);
+
+/**
+ * @brief Callback used to return the event coordinates for a given agent_t and event_t.
+ * @param position A counting sequence indicating callback number.
+ * @param id Coordinate ID as in _iterate_event_ids.
+ * @param extent Coordinate extent indicating maximum allowed instances.
+ * @param coordinate The coordinate, in the range [0,extent-1].
+ * @param name Coordinate name as in _iterate_event_ids.
+ * @param userdata Userdata pointer passed to the _iterate_event_coord function.
+ */
+typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)(
+  int position,
+  int id,
+  int extent,
+  int coordinate,
+  const char* name,
+  void* userdata
+);
+
+/**
+ * @brief Iterate over all event coordinates for a given agent_t and event_t.
+ * @param[in] agent HSA agent.
+ * @param[in] event The event ID and block ID to iterate for.
+ * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data.
+ * @param[in] callback Callback function to return the coordinates.
+ * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
+ */
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord(
+  hsa_agent_t agent,
+  hsa_ven_amd_aqlprofile_event_t event,
+  uint32_t sample_id,
+  hsa_ven_amd_aqlprofile_coordinate_callback_t callback,
+  void* userdata
+);
+
+/**
+ * @brief Extension version.
+ */
+#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1
+#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so"
+
+#ifdef HSA_LARGE_MODEL
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64");
+#else
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("");
+#endif
+
+/**
+ * @brief Extension function table: one pointer per API function declared above.
+ */
+typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s {
+  uint32_t (*hsa_ven_amd_aqlprofile_version_major)(void);
+  uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(void);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)(
+      const char** str);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)(
+      hsa_agent_t agent,
+      const hsa_ven_amd_aqlprofile_event_t* event,
+      bool* result);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_start)(
+      hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_stop)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_read)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_read_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)(
+      const hsa_ext_amd_aql_pm4_packet_t* aql_packet,
+      void* data);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ven_amd_aqlprofile_info_type_t attribute,
+      void* value);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ven_amd_aqlprofile_data_callback_t callback,
+      void* data);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_ids)(
+      hsa_ven_amd_aqlprofile_eventname_callback_t
+  );
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_coord)(
+      hsa_agent_t agent,
+      hsa_ven_amd_aqlprofile_event_t event,
+      uint32_t sample_id,
+      hsa_ven_amd_aqlprofile_coordinate_callback_t callback,
+      void* userdata
+  );
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_att_marker)(
+      hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_packet,
+      uint32_t data,
+      hsa_ven_amd_aqlprofile_att_marker_channel_t channel
+  );
+} hsa_ven_amd_aqlprofile_1_00_pfn_t;
+
+typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t;
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h
new file mode 100644
index 0000000000..47236c86e9
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h
@@ -0,0 +1,667 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension for additional loader functionality.
+
+#ifndef HSA_VEN_AMD_LOADER_H
+#define HSA_VEN_AMD_LOADER_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @brief Queries equivalent host address for given @p device_address, and
+ * records it in @p host_address.
+ *
+ *
+ * @details Contents of memory pointed to by @p host_address would be identical
+ * to contents of memory pointed to by @p device_address. Only difference
+ * between the two is host accessibility: @p host_address is always accessible
+ * from host, @p device_address might not be accessible from host.
+ *
+ * If @p device_address already points to host accessible memory, then the value
+ * of @p device_address is simply copied into @p host_address.
+ *
+ * The lifetime of @p host_address is the same as the lifetime of @p
+ * device_address, and both lifetimes are limited by the lifetime of the
+ * executable that is managing these addresses.
+ *
+ *
+ * @param[in] device_address Device address to query equivalent host address
+ * for.
+ *
+ * @param[out] host_address Pointer to application-allocated buffer to record
+ * queried equivalent host address in.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or
+ * null, or @p host_address is null.
+ */
+hsa_status_t hsa_ven_amd_loader_query_host_address(
+  const void *device_address,
+  const void **host_address);
+
+/**
+ * @brief The storage type of the code object that is backing loaded memory
+ * segment.
+ */
+typedef enum {
+  /**
+   * Loaded memory segment is not backed by any code object (anonymous), as the
+   * case would be with BSS (uninitialized data).
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0,
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * file.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1,
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * memory.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2
+} hsa_ven_amd_loader_code_object_storage_type_t;
+
+/**
+ * @brief Loaded memory segment descriptor.
+ *
+ *
+ * @details Loaded memory segment descriptor describes underlying loaded memory
+ * segment. Loaded memory segment is created/allocated by the executable during
+ * the loading of the code object that is backing underlying memory segment.
+ *
+ * The lifetime of underlying memory segment is limited by the lifetime of the
+ * executable that is managing underlying memory segment.
+ */
+typedef struct hsa_ven_amd_loader_segment_descriptor_s {
+  /**
+   * Agent underlying memory segment is allocated on. If the code object that is
+   * backing underlying memory segment is program code object, then 0.
+   */
+  hsa_agent_t agent;
+  /**
+   * Executable that is managing this underlying memory segment.
+   */
+  hsa_executable_t executable;
+  /**
+   * Storage type of the code object that is backing underlying memory segment.
+   */
+  hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated
+   *     filepath to the code object;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host
+   *     accessible pointer to the first byte of the code object.
+   */
+  const void *code_object_storage_base;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of
+   *     the filepath to the code object (including null-terminating character);
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in
+   *     bytes, of the memory occupied by the code object.
+   */
+  size_t code_object_storage_size;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - other, then offset, in bytes, from the beginning of the code object to
+   *     the first byte in the code object data is copied from.
+   */
+  size_t code_object_storage_offset;
+  /**
+   * Starting address of the underlying memory segment.
+   */
+  const void *segment_base;
+  /**
+   * Size, in bytes, of the underlying memory segment.
+   */
+  size_t segment_size;
+} hsa_ven_amd_loader_segment_descriptor_t;
+
+/**
+ * @brief Either queries loaded memory segment descriptors, or total number of
+ * loaded memory segment descriptors.
+ *
+ *
+ * @details If @p segment_descriptors is not null and @p num_segment_descriptors
+ * points to number that exactly matches total number of loaded memory segment
+ * descriptors, then queries loaded memory segment descriptors, and records them
+ * in @p segment_descriptors. If @p segment_descriptors is null and @p
+ * num_segment_descriptors points to zero, then queries total number of loaded
+ * memory segment descriptors, and records it in @p num_segment_descriptors. In
+ * all other cases returns appropriate error code (see below).
+ *
+ * The caller of this function is responsible for the allocation/deallocation
+ * and the lifetime of @p segment_descriptors and @p num_segment_descriptors.
+ *
+ * The lifetime of loaded memory segments that are described by queried loaded
+ * memory segment descriptors is limited by the lifetime of the executable that
+ * is managing loaded memory segments.
+ *
+ * Queried loaded memory segment descriptors are always self-consistent: they
+ * describe a complete set of loaded memory segments that are being backed by
+ * fully loaded code objects that are present at the time (i.e. this function
+ * is blocked until all executable manipulations are fully complete).
+ *
+ *
+ * @param[out] segment_descriptors Pointer to application-allocated buffer to
+ * record queried loaded memory segment descriptors in. Can be null if @p
+ * num_segment_descriptors points to zero.
+ *
+ * @param[in,out] num_segment_descriptors Pointer to application-allocated
+ * buffer that contains either total number of loaded memory segment descriptors
+ * or zero.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null
+ * while @p num_segment_descriptors points to non-zero number, @p
+ * segment_descriptors is not null while @p num_segment_descriptors points to
+ * zero, or @p num_segment_descriptors is null.
+ *
+ * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors
+ * does not point to number that exactly matches total number of loaded memory
+ * segment descriptors.
+ */
+hsa_status_t hsa_ven_amd_loader_query_segment_descriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors);
+
+/**
+ * @brief Obtains the handle of executable to which the device address belongs.
+ *
+ * @details This method should not be used to obtain executable handle by using
+ * a host address. The executable returned is expected to be alive until it is
+ * destroyed by the user.
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there
+ * is no executable found for this kernel code object.
+ */
+hsa_status_t hsa_ven_amd_loader_query_executable(
+  const void *device_address,
+  hsa_executable_t *executable);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the loaded code objects in an executable, and invoke
+ * an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per loaded code object. The
+ * HSA runtime passes three arguments to the callback: the executable, a
+ * loaded code object, and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and
+ * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that
+ * status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+  hsa_executable_t executable,
+  hsa_status_t (*callback)(
+    hsa_executable_t executable,
+    hsa_loaded_code_object_t loaded_code_object,
+    void *data),
+  void *data);
+
+/**
+ * @brief Loaded code object kind.
+ */
+typedef enum {
+  /**
+   * Program code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1,
+  /**
+   * Agent code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2
+} hsa_ven_amd_loader_loaded_code_object_kind_t;
+
+/**
+ * @brief Loaded code object attributes.
+ */
+typedef enum hsa_ven_amd_loader_loaded_code_object_info_e {
+  /**
+   * The executable in which this loaded code object is loaded. The
+   * type of this attribute is ::hsa_executable_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1,
+  /**
+   * The kind of this loaded code object. The type of this attribute is
+   * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2,
+  /**
+   * The agent on which this loaded code object is loaded. The
+   * value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this
+   * attribute is ::hsa_agent_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3,
+  /**
+   * The storage type of the code object reader used to load the loaded code object.
+   * The type of this attribute is ::uint32_t interpreted as a
+   * ::hsa_ven_amd_loader_code_object_storage_type_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4,
+  /**
+   * The memory address of the first byte of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5,
+  /**
+   * The memory size in bytes of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6,
+  /**
+   * The file descriptor of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this
+   * attribute is ::int.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7,
+  /**
+   * The signed byte address difference of the memory address at which the code
+   * object is loaded minus the virtual address specified in the code object
+   * that is loaded. The value of this attribute is only defined if the
+   * executable in which the code object is loaded is frozen. The type of this
+   * attribute is ::int64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8,
+  /**
+   * The base memory address at which the code object is loaded. This is the
+   * base address of the allocation for the lowest addressed segment of the code
+   * object that is loaded. Note that any non-loaded segments before the first
+   * loaded segment are ignored. The value of this attribute is only defined if
+   * the executable in which the code object is loaded is frozen. The type of
+   * this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9,
+  /**
+   * The byte size of the loaded code object's contiguous memory allocation. The
+   * value of this attribute is only defined if the executable in which the code
+   * object is loaded is frozen. The type of this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10,
+  /**
+   * The length of the URI in bytes, not including the NUL terminator. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11,
+  /**
+   * The URI name from which the code object was loaded. The type of this
+   * attribute is a NUL terminated \p char* with the length equal to the value
+   * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute.
+   * The URI name syntax is defined by the following BNF syntax:
+   *
+   *     code_object_uri ::== file_uri | memory_uri
+   *     file_uri        ::== "file://" file_path [ range_specifier ]
+   *     memory_uri      ::== "memory://" process_id range_specifier
+   *     range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
+   *     file_path       ::== URI_ENCODED_OS_FILE_PATH
+   *     process_id      ::== DECIMAL_NUMBER
+   *     number          ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
+   *
+   * ``number`` is a C integral literal where hexadecimal values are prefixed by
+   * "0x" or "0X", and octal values by "0".
+   *
+   * ``file_path`` is the file's path specified as a URI encoded UTF-8 string.
+   * In URI encoding, every character that is not in the regular expression
+   * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits
+   * preceded by "%".  Directories in the path are separated by "/".
+   *
+   * ``offset`` is a 0-based byte offset to the start of the code object.  For a
+   * file URI, it is from the start of the file specified by the ``file_path``,
+   * and if omitted defaults to 0. For a memory URI, it is the memory address
+   * and is required.
+   *
+   * ``size`` is the number of bytes in the code object.  For a file URI, if
+   * omitted it defaults to the size of the file.  It is required for a memory
+   * URI.
+   *
+   * ``process_id`` is the identity of the process owning the memory.  For Linux
+   * it is the C unsigned integral decimal literal for the process ID (PID).
+   *
+   * For example:
+   *
+   *     file:///dir1/dir2/file1
+   *     file:///dir3/dir4/file2#offset=0x2000&size=3000
+   *     memory://1234#offset=0x20000&size=3000
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12,
+} hsa_ven_amd_loader_loaded_code_object_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given loaded code
+ * object.
+ *
+ * @param[in] loaded_code_object Loaded code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * loaded code object attribute, or @p value is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info(
+  hsa_loaded_code_object_t loaded_code_object,
+  hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+  void *value);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Create a code object reader to operate on a file with size and offset.
+ *
+ * @param[in] file File descriptor. The file must have been opened by
+ * application with at least read permissions prior to calling this function. The
+ * file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[in] size Size of the code object embedded in @p file.
+ *
+ * @param[in] offset 0-based offset relative to the beginning of the @p file
+ * that denotes the beginning of the code object embedded within the @p file.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least
+ * read permissions. This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset
+ * do not form a valid code object, the file size is 0, or offset > file size.
+ * This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size(
+    hsa_file_t file,
+    size_t offset,
+    size_t size,
+    hsa_code_object_reader_t *code_object_reader);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the available executables, and invoke an
+ * application-defined callback on every iteration. While
+ * ::hsa_ven_amd_loader_iterate_executables is executing any calls to
+ * ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy will be blocked.
+ *
+ * @param[in] callback Callback to be invoked once per executable. The HSA
+ * runtime passes two arguments to the callback: the executable and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_ven_amd_loader_iterate_executables returns that status value. If
+ * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy then the behavior is undefined.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+*/
+hsa_status_t
+hsa_ven_amd_loader_iterate_executables(
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      void *data),
+    void *data);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Extension version. NOTE(review): the leading zeros make 001003 an octal
+ * literal (decimal 515), not decimal 1003; presumably it encodes version 1.03. */
+#define hsa_ven_amd_loader 001003
+
+/**
+ * @brief Extension function table version 1.00.
+ */
+typedef struct hsa_ven_amd_loader_1_00_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+} hsa_ven_amd_loader_1_00_pfn_t;
+
+/**
+ * @brief Extension function table version 1.01.
+ */
+typedef struct hsa_ven_amd_loader_1_01_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+} hsa_ven_amd_loader_1_01_pfn_t;
+
+/**
+ * @brief Extension function table version 1.02.
+ */
+typedef struct hsa_ven_amd_loader_1_02_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+} hsa_ven_amd_loader_1_02_pfn_t;
+
+/**
+ * @brief Extension function table version 1.03.
+ */
+typedef struct hsa_ven_amd_loader_1_03_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_iterate_executables)(
+      hsa_status_t (*callback)(
+        hsa_executable_t executable,
+        void *data),
+      void *data);
+} hsa_ven_amd_loader_1_03_pfn_t;
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* HSA_VEN_AMD_LOADER_H */
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h
new file mode 100644
index 0000000000..019f0ea5c9
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h
@@ -0,0 +1,416 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_VEN_AMD_PC_SAMPLING_H
+#define HSA_VEN_AMD_PC_SAMPLING_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+
+/**
+ * @brief HSA AMD Vendor PC Sampling APIs
+ * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be
+ * modified extensively in the future
+ */
+
+/**
+ * @brief PC Sampling sample data for hosttrap sampling method
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;
+  uint32_t hw_id;
+  uint32_t reserved0;
+  uint64_t reserved1;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_hosttrap_v1_t;
+
+/**
+ * @brief PC Sampling sample data for stochastic sampling method
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;
+  uint32_t hw_id;
+  uint32_t perf_snapshot_data;
+  uint32_t perf_snapshot_data1;
+  uint32_t perf_snapshot_data2;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_snapshot_v1_t;
+
+/**
+ * @brief PC Sampling method kinds
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1,
+  HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1
+} hsa_ven_amd_pcs_method_kind_t;
+
+/**
+ * @brief PC Sampling interval unit type
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS,
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES,
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS
+} hsa_ven_amd_pcs_units_t;
+
+/**
+ * @brief HSA callback function to perform the copy onto a destination buffer
+ *
+ * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal
+ * buffers. Remaining contents of HSA internal buffers will be included in next
+ * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling
+ * hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] hsa_callback_data private data to pass back to HSA. Provided in
+ * hsa_ven_amd_pcs_data_ready_callback_t
+ *
+ * @param[in] data_size size of destination buffer in bytes.
+ * @param[in] destination destination buffer
+ * @retval    TBD: but could be used to indicate that there is no more data to be read.
+ * Or indicate an error and abort of current copy operations
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data,
+                                                             size_t data_size, void* destination);
+
+/**
+ * @brief HSA callback function to indicate that there is data ready to be copied
+ *
+ * When the client receives this callback, the client should call back @p data_copy_callback for HSA
+ * to perform the copy operation into an available buffer. @p data_copy_callback can be called back
+ * multiple times with smaller @p data_size to split the copy operation.
+ *
+ * This callback must not call ::hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] client_callback_data client private data passed in via
+ * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id
+ * @param[in] data_size size of data available to be copied
+ * @param[in] lost_sample_count number of lost samples since last call to
+ * hsa_ven_amd_pcs_data_ready_callback_t.
+ * @param[in] data_copy_callback callback function for HSA to perform the actual copy
+ * @param[in] hsa_callback_data private data to pass back to HSA
+ */
+typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)(
+    void* client_callback_data, size_t data_size, size_t lost_sample_count,
+    hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data);
+
+/**
+ * @brief Opaque handle representing a sampling session.
+ * Two sessions having same handle value represent the same session
+ */
+typedef struct {
+  uint64_t handle;
+} hsa_ven_amd_pcs_t;
+
+/**
+ * @brief PC Sampling configuration flag options
+ */
+typedef enum {
+  /* The interval for this sampling method has to be a power of 2 */
+  HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0)
+} hsa_ven_amd_pcs_configuration_flags_t;
+
+/**
+ * @brief PC Sampling method information
+ * Used to provide client with list of supported PC Sampling methods
+ */
+typedef struct {
+  hsa_ven_amd_pcs_method_kind_t method;
+  hsa_ven_amd_pcs_units_t units;
+  size_t min_interval;
+  size_t max_interval;
+  uint64_t flags;
+} hsa_ven_amd_pcs_configuration_t;
+
+/**
+ * @brief Callback function to iterate through list of supported PC Sampling configurations
+ *
+ * @param[in] configuration one entry for supported PC Sampling method and configuration options
+ * @param[in] callback_data client private callback data that was passed in when calling
+ * hsa_ven_amd_pcs_iterate_configuration
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)(
+    const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data);
+
+/**
+ * @brief Iterate through list of current supported PC Sampling configurations for this @p agent
+ *
+ * HSA will callback @p configuration_callback for each currently available PC Sampling
+ * configuration. The list of currently available configurations may not be the complete list of
+ * configurations supported on the @p agent. The list of currently available configurations may be
+ * reduced if the @p agent is currently handling other PC sampling sessions.
+ *
+ * @param[in] agent target agent
+ * @param[in] configuration_callback callback function to iterate through list of configurations
+ * @param[in] callback_data client private callback data
+ **/
+hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
+    hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+    void* callback_data);
+
+/**
+ * @brief  Create a PC Sampling session on @p agent
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, as described by the
+ * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of
+ * hsa_ven_amd_pcs_iterate_configuration for this @p agent.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size  - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                    size_t buffer_size,
+                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                    void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling);
+
+
+/**
+ * @brief  Creates a PC Sampling session on @p agent. Assumes that the caller provides the
+ * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling
+ * on the @p agent.
+ *
+ * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing
+ * PC sampling session that was previously created in the underlying driver.
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, and match the parameters that we used to create
+ * the underlying PC Sampling session in the underlying driver.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size  - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create_from_id(
+    uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+    hsa_ven_amd_pcs_t* pc_sampling);
+
+/**
+ * @brief  Free a PC Sampling session on @p agent
+ *
+ * Free all the resources allocated for a PC Sampling session on @p agent
+ * Internal buffers for this session will be lost.
+ * If the session was active, the session will be stopped before it is destroyed.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Start a PC Sampling session
+ *
+ * Activate a PC Sampling session that was previously created.
+ * The session will be in an active state after this call
+ * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session started successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Stop a PC Sampling session
+ *
+ * Stop a session that is currently active
+ * After a session is stopped HSA may still have some PC Sampling data in its internal buffers.
+ * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal
+ * buffers are not drained and the session is started again, the internal buffers will be available
+ * on the next data_ready_callback.
+ * If the session was already inactive, this will result in a no-op and will return
+ * HSA_STATUS_SUCCESS
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session stopped successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Flush internal buffers for a PC Sampling session
+ *
+ * Drain internal buffers for a PC Sampling session. If internal buffers have available data,
+ * this triggers a data_ready_callback.
+ *
+ * The function blocks until all PC samples associated with the @p pc_sampling session
+ * generated prior to the function call have been communicated by invocations of
+ * @p data_ready_callback having completed execution.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session flushed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling);
+
+#define hsa_ven_amd_pc_sampling_1_00
+
+/**
+ * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by
+ * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t {
+  hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)(
+      hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+      void* callback_data);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                         hsa_ven_amd_pcs_units_t units, size_t interval,
+                                         size_t latency, size_t buffer_size,
+                                         hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                         void* client_callback_data,
+                                         hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create_from_id)(
+      uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+      hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+      hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+      hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling);
+
+} hsa_ven_amd_pc_sampling_1_00_pfn_t;
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif /*__cplusplus*/
+
+#endif /* HSA_VEN_AMD_PC_SAMPLING_H */
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h
new file mode 100644
index 0000000000..44b7fb00aa
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h
@@ -0,0 +1,1090 @@
+#ifndef _WSL_INC_PM4_CMDS_H_
+#define _WSL_INC_PM4_CMDS_H_
+
+#include <cstdint>
+
+#define mmCOMPUTE_NUM_THREAD_X              0x2E07  // presumably the reg_addr values handed to GenerateSetShRegHeader (confirm); all lie above PERSISTENT_SPACE_START (0x2c00)
+#define mmCOMPUTE_PGM_LO                    0x2E0C
+#define mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO  0x2E10
+#define mmCOMPUTE_PGM_RSRC1                 0x2E12
+#define mmCOMPUTE_PGM_RSRC3                 0x2E28  // note: listed out of numeric order
+#define mmCOMPUTE_RESOURCE_LIMITS           0x2E15
+#define mmCOMPUTE_USER_DATA_0               0x2E40
+
+#define PM4_TYPE_SHIFT        30  // header bits [31:30]: packet type
+#define PM4_COUNT_SHIFT       16  // header bits [29:16]: DWORD count
+#define PM4_OPCODE_SHIFT      8   // header bits [15:8]: IT_* opcode
+#define PM4_SHADER_TYPE_SHIFT 1   // header bit 1: shader type selector
+
+#define PM4_GFX_SHADER     0
+#define PM4_COMPUTE_SHADER 1  // the only shader type PM4_TYPE3_HDR emits
+
+#define PM4_TYPE3_HDR(_opc_, _count_) /* uint32_t type-3 compute header for a packet of _count_ DWORDs in total; fully parenthesised so it is safe inside larger expressions */ \
+    ((uint32_t)(((uint32_t)3 << PM4_TYPE_SHIFT) |                  /* unsigned on purpose: 3 << 30 overflows a signed int */ \
+                (((uint32_t)(_count_) - 2u) << PM4_COUNT_SHIFT) |  /* stored count is total DWORDs minus 2 */ \
+                ((uint32_t)(_opc_) << PM4_OPCODE_SHIFT) |          /* IT_* opcode */ \
+                ((uint32_t)PM4_COMPUTE_SHADER << PM4_SHADER_TYPE_SHIFT)))  /* shader-type bit */
+
+union PM4_MEC_TYPE_3_HEADER {  // first DWORD of the PM4_MEC_* packet structs below, viewable as bit-fields or as one raw word
+    struct {
+        uint32_t reserved1 : 8; ///< reserved; note PM4_TYPE3_HDR still sets bit 1 (PM4_SHADER_TYPE_SHIFT) inside this byte
+        uint32_t opcode    : 8; ///< IT opcode
+        uint32_t count     : 14;///< number of DWORDs - 1 in the information body.
+        uint32_t type      : 2; ///< packet identifier. It should be 3 for type 3 packets
+    };
+    uint32_t u32All;  ///< all 32 header bits at once; the Generate*Header helpers assign this member
+};
+
+#define IT_DISPATCH_DIRECT 0x15  // IT_* values all fit the 8-bit opcode field; they are what callers pass as _opc_ to PM4_TYPE3_HDR
+#define IT_ATOMIC_MEM      0x1E
+#define IT_WRITE_DATA      0x37
+#define IT_INDIRECT_BUFFER  0x3F
+#define IT_COPY_DATA       0x40
+#define IT_EVENT_WRITE     0x46
+#define IT_RELEASE_MEM     0x49
+#define IT_ACQUIRE_MEM     0x58
+#define IT_SET_SH_REG      0x76  // used by GenerateSetShRegHeader
+
+struct PM4_MEC_SET_SH_REG {  // two-DWORD prefix (header + register offset); the register-group structs in this file place their values right after it
+  union {
+    PM4_MEC_TYPE_3_HEADER header;
+    uint32_t ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t reg_offset:16;  // GenerateSetShRegHeader stores reg_addr - PERSISTENT_SPACE_START here
+      uint32_t reserved1:16;
+    } bitfields2;
+    uint32_t ordinal2;  // raw view of the second DWORD
+  };
+};
+
+struct PM4_MEC_DISPATCH_DIRECT {  // five DWORDs: header, three dimension words, one initiator word
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  uint32_t dim_x;  // presumably the dispatch extent along x; units (threads vs. groups) are not visible here - confirm
+  uint32_t dim_y;
+  uint32_t dim_z;
+  uint32_t dispatch_initiator;  // opaque 32-bit word in this header; its bit layout is not declared here
+};
+
+// ------------------------------- MEC_EVENT_WRITE_event_index_enum -------------------------------
+enum MEC_EVENT_WRITE_event_index_enum {  // named for PM4_MEC_EVENT_WRITE::bitfields2.event_index (4 bits); the largest value here is 11
+  event_index__mec_event_write__other                         =  0,
+  event_index__mec_event_write__sample_pipelinestat           =  2,
+  event_index__mec_event_write__cs_partial_flush              =  4,
+  event_index__mec_event_write__sample_streamoutstats__GFX11  =  8,  // __GFX11 suffix: presumably valid on that generation only - confirm
+  event_index__mec_event_write__sample_streamoutstats1__GFX11 =  9,
+  event_index__mec_event_write__sample_streamoutstats2__GFX11 = 10,
+  event_index__mec_event_write__sample_streamoutstats3__GFX11 = 11,
+};
+
+enum VGT_EVENT_TYPE {  // event codes 0x00-0x3f; all fit a 6-bit field such as event_type in PM4_MEC_EVENT_WRITE / PM4_MEC_RELEASE_MEM (presumably where they are used)
+  Reserved_0x00                                      = 0x00000000,
+  SAMPLE_STREAMOUTSTATS1                             = 0x00000001,
+  SAMPLE_STREAMOUTSTATS2                             = 0x00000002,
+  SAMPLE_STREAMOUTSTATS3                             = 0x00000003,
+  CACHE_FLUSH_TS                                     = 0x00000004,
+  CONTEXT_DONE                                       = 0x00000005,
+  CACHE_FLUSH                                        = 0x00000006,
+  CS_PARTIAL_FLUSH                                   = 0x00000007,
+  VGT_STREAMOUT_SYNC                                 = 0x00000008,
+  VGT_STREAMOUT_RESET                                = 0x0000000a,
+  END_OF_PIPE_INCR_DE                                = 0x0000000b,
+  END_OF_PIPE_IB_END                                 = 0x0000000c,
+  RST_PIX_CNT                                        = 0x0000000d,
+  BREAK_BATCH                                        = 0x0000000e,
+  VS_PARTIAL_FLUSH                                   = 0x0000000f,
+  PS_PARTIAL_FLUSH                                   = 0x00000010,
+  FLUSH_HS_OUTPUT                                    = 0x00000011,
+  FLUSH_DFSM                                         = 0x00000012,
+  RESET_TO_LOWEST_VGT                                = 0x00000013,
+  CACHE_FLUSH_AND_INV_TS_EVENT                       = 0x00000014,
+  CACHE_FLUSH_AND_INV_EVENT                          = 0x00000016,
+  PERFCOUNTER_START                                  = 0x00000017,
+  PERFCOUNTER_STOP                                   = 0x00000018,
+  PIPELINESTAT_START                                 = 0x00000019,
+  PIPELINESTAT_STOP                                  = 0x0000001a,
+  PERFCOUNTER_SAMPLE                                 = 0x0000001b,
+  SAMPLE_PIPELINESTAT                                = 0x0000001e,
+  SO_VGTSTREAMOUT_FLUSH                              = 0x0000001f,
+  SAMPLE_STREAMOUTSTATS                              = 0x00000020,
+  RESET_VTX_CNT                                      = 0x00000021,
+  BLOCK_CONTEXT_DONE                                 = 0x00000022,
+  CS_CONTEXT_DONE                                    = 0x00000023,
+  VGT_FLUSH                                          = 0x00000024,
+  TGID_ROLLOVER                                      = 0x00000025,
+  SQ_NON_EVENT                                       = 0x00000026,
+  SC_SEND_DB_VPZ                                     = 0x00000027,
+  BOTTOM_OF_PIPE_TS                                  = 0x00000028,
+  FLUSH_SX_TS                                        = 0x00000029,
+  DB_CACHE_FLUSH_AND_INV                             = 0x0000002a,
+  FLUSH_AND_INV_DB_DATA_TS                           = 0x0000002b,
+  FLUSH_AND_INV_DB_META                              = 0x0000002c,
+  FLUSH_AND_INV_CB_DATA_TS                           = 0x0000002d,
+  FLUSH_AND_INV_CB_META                              = 0x0000002e,
+  CS_DONE                                            = 0x0000002f,
+  PS_DONE                                            = 0x00000030,
+  FLUSH_AND_INV_CB_PIXEL_DATA                        = 0x00000031,
+  SX_CB_RAT_ACK_REQUEST                              = 0x00000032,
+  THREAD_TRACE_START                                 = 0x00000033,
+  THREAD_TRACE_STOP                                  = 0x00000034,
+  THREAD_TRACE_MARKER                                = 0x00000035,
+  THREAD_TRACE_FINISH                                = 0x00000037,
+  PIXEL_PIPE_STAT_CONTROL                            = 0x00000038,
+  PIXEL_PIPE_STAT_DUMP                               = 0x00000039,
+  PIXEL_PIPE_STAT_RESET                              = 0x0000003a,
+  CONTEXT_SUSPEND                                    = 0x0000003b,
+  OFFCHIP_HS_DEALLOC                                 = 0x0000003c,
+  ENABLE_NGG_PIPELINE                                = 0x0000003d,
+  SET_FE_ID__GFX09                                   = 0x00000009,  // from here on the suffixed names fill the gaps left above (0x09, 0x15, 0x1c, 0x1d, 0x36, 0x3e, 0x3f)
+  Available_0x1c__GFX09                              = 0x0000001c,
+  Available_0x1d__GFX09                              = 0x0000001d,
+  THREAD_TRACE_FLUSH__GFX09                          = 0x00000036,
+  Reserved_0x3f__GFX09                               = 0x0000003f,
+  ZPASS_DONE__GFX09_10                               = 0x00000015,
+  ENABLE_LEGACY_PIPELINE__GFX09_10                   = 0x0000003e,
+  Reserved_0x09__GFX10PLUS                           = 0x00000009,  // same numeric values again under a different suffix: these enumerators alias the ones above
+  FLUSH_ES_OUTPUT__GFX10PLUS                         = 0x0000001c,
+  BIN_CONF_OVERRIDE_CHECK__GFX10PLUS                 = 0x0000001d,
+  THREAD_TRACE_DRAW__GFX10PLUS                       = 0x00000036,
+  DRAW_DONE__GFX10PLUS                               = 0x0000003f,
+  WAIT_SYNC__GFX11                                   = 0x00000015,
+  ENABLE_PIPELINE_NOT_USED__GFX11                    = 0x0000003e,
+};
+
+struct PM4_MEC_EVENT_WRITE {  // two DWORDs: header plus one control word
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t event_type:6;  // 6 bits, wide enough for every VGT_EVENT_TYPE value (max 0x3f)
+      uint32_t reserved1:2;
+      uint32_t event_index:4;  // 4 bits, wide enough for every MEC_EVENT_WRITE_event_index_enum value (max 11)
+      uint32_t reserved2:19;
+      uint32_t offload_enable:1;
+    } bitfields2;  // field widths add up to exactly 32 bits
+    uint32_t ordinal2;  // raw view of the control word
+  };
+};
+
+struct PM4_MEC_ATOMIC_MEM {  // nine DWORDs: header, control word, 64-bit address, 64-bit source, 64-bit compare value, loop control
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t atomic:7;  // 7 bits; presumably a TC_OP code (those span 0x00-0x7f) - confirm
+      uint32_t reserved1:1;
+      uint32_t command:4;  // named like MEC_ATOMIC_MEM_command_enum (values 0-3)
+      uint32_t reserved2:13;
+      uint32_t cache_policy:2;  // named like MEC_ATOMIC_MEM_cache_policy_enum (values 0-3)
+      uint32_t reserved3:5;
+    } bitfields2;
+    uint32_t ordinal2;  // raw view of the control word
+  };
+  uint32_t addr_lo;  // each 64-bit quantity is split into a _lo word followed by a _hi word
+  uint32_t addr_hi;
+  uint32_t src_data_lo;
+  uint32_t src_data_hi;
+  uint32_t cmp_data_lo;
+  uint32_t cmp_data_hi;
+  union {
+    struct {
+      uint32_t loop_interval:13;
+      uint32_t reserved4:19;
+    } bitfields9;
+    uint32_t ordinal9;  // raw view of the ninth DWORD
+  };
+};
+
+struct PM4_MEC_WRITE_DATA {  // header, control word, destination (two words), then a 64-bit payload
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t reserved1:8;
+      uint32_t dst_sel:4;  // named like MEC_WRITE_DATA_dst_sel_enum
+      uint32_t reserved2:4;
+      uint32_t addr_incr:1;  // named like MEC_WRITE_DATA_addr_incr_enum
+      uint32_t reserved3:2;
+      uint32_t resume_vf:1;
+      uint32_t wr_confirm:1;  // named like MEC_WRITE_DATA_wr_confirm_enum
+      uint32_t reserved4:4;
+      uint32_t cache_policy:2;  // named like MEC_WRITE_DATA_cache_policy_enum
+      uint32_t reserved5:5;
+    } bitfields2;
+    uint32_t ordinal2;  // raw view of the control word
+  };
+  union {  // three alternative views of the same destination DWORD; which one applies presumably follows dst_sel - confirm
+    struct {
+      uint32_t dst_mmreg_addr:18;
+      uint32_t reserved6:14;
+    } bitfields3a;
+    struct {
+      uint32_t dst_gds_addr:16;
+      uint32_t reserved7:16;
+    } bitfields3b;
+    struct {
+      uint32_t reserved8:2;  // low two address bits are not stored, so the memory address is in 4-byte units
+      uint32_t dst_mem_addr_lo:30;
+    } bitfields3c;
+    uint32_t ordinal3;
+  };
+  uint32_t dst_mem_addr_hi;
+  uint64_t write_data_value;  // preceded by four DWORDs, so it starts at byte offset 16
+};
+
+#define PERSISTENT_SPACE_START         0x00002c00
+/// @brief Stamps the SET_SH_REG header that opens *pm4 (sized for all of T) and targets register reg_addr.
+template <class T>
+void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
+  const uint32_t packet_dwords = static_cast<uint32_t>(sizeof(T) / sizeof(uint32_t));
+  pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, packet_dwords);
+  pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START;  // offset is relative to PERSISTENT_SPACE_START
+}
+
+template <class T> void GenerateCmdHeader(T* pm4, int op_code) {
+  const uint32_t packet_dwords = static_cast<uint32_t>(sizeof(T) / sizeof(uint32_t));  // whole of T, header DWORD included
+  pm4->header.u32All = PM4_TYPE3_HDR(op_code, packet_dwords);  // T must expose a PM4_MEC_TYPE_3_HEADER-like member named header
+}
+
+/// @brief Defines the Gpu command to dispatch a kernel. It embeds
+/// various Gpu hardware specific data structures for initialization
+/// and configuration before a dispatch begins to run
+struct DispatchTemplate {  // five SET_SH_REG register groups followed by one DISPATCH_DIRECT packet, in this order
+
+  /// @brief Structure used to initialize the group dimensions
+  /// of a kernel dispatch and if performance counters are enabled
+  struct DispatchDimensionRegs {
+    PM4_MEC_SET_SH_REG cmd_set_data;  // every register group starts with this member, which is what GenerateSetShRegHeader fills in
+    uint32_t compute_num_thread_x;
+    uint32_t compute_num_thread_y;
+    uint32_t compute_num_thread_z;
+  } dimension_regs;
+
+  struct DispatchProgramRegs {  // the two halves of a 64-bit program value (lo word, then hi word)
+    PM4_MEC_SET_SH_REG cmd_set_data;
+    uint32_t compute_pgm_lo;
+    uint32_t compute_pgm_hi;
+  } program_regs;
+
+  struct DispatchProgramResourceRegs {  // nested type; distinct from the namespace-scope struct of the same name declared after DispatchTemplate
+    PM4_MEC_SET_SH_REG cmd_set_data;
+    uint32_t compute_pgm_rsrc1;
+    uint32_t compute_pgm_rsrc2;
+  } program_resource_regs;
+
+  /// @brief Structure used to initialize parameters related to
+  /// thread management i.e. number of waves to issue and number
+  /// of Compute Units to use
+  struct DispatchResourceRegs {
+    PM4_MEC_SET_SH_REG cmd_set_data;
+    uint32_t compute_resource_limits;
+    uint32_t compute_static_thread_mgmt_se0;
+    uint32_t compute_static_thread_mgmt_se1;
+    uint32_t compute_tmpring_size;  // sits between se1 and se2; presumably this mirrors consecutive register addresses - do not reorder
+    uint32_t compute_static_thread_mgmt_se2;
+    uint32_t compute_static_thread_mgmt_se3;
+  } resource_regs;
+
+  /// @brief Structure used to pass handles of the Aql dispatch
+  /// packet, Aql queue, Kernel argument address block, Scratch
+  /// buffer
+  struct DispatchComputeUserDataRegs {
+    PM4_MEC_SET_SH_REG cmd_set_data;
+    uint32_t compute_user_data[16];  // sixteen consecutive DWORDs of user data
+  } compute_user_data_regs;
+
+  /// @brief Structure used to configure Cache flush policy
+  /// and dimensions of total work size
+  PM4_MEC_DISPATCH_DIRECT dispatch_direct;
+};
+
+struct DispatchProgramResourceRegs {  // namespace-scope group holding only rsrc3; DispatchTemplate has its own nested type with this name for rsrc1/rsrc2
+    PM4_MEC_SET_SH_REG cmd_set_data;  // SET_SH_REG prefix, as in the DispatchTemplate register groups
+    uint32_t compute_pgm_rsrc3;
+};
+
+
+/// @brief Structure used to issue a scratch-programming command for gfx11+
+struct SetScratchTemplate {
+  PM4_MEC_SET_SH_REG cmd_set_data;  // SET_SH_REG prefix followed by two register values
+  uint32_t scratch_lo;  // low word of a 64-bit scratch value - presumably the scratch base address, confirm
+  uint32_t scratch_hi;  // high word
+};
+
+/// @brief Structure used to issue a Gpu Barrier command
+struct BarrierTemplate {
+  PM4_MEC_EVENT_WRITE event_write;  // the barrier consists of this single two-DWORD EVENT_WRITE packet
+};
+
+//--------------------MEC_ATOMIC_MEM--------------------
+enum MEC_ATOMIC_MEM_command_enum {  // named for PM4_MEC_ATOMIC_MEM::bitfields2.command (4 bits); values 0-3
+  command__mec_atomic_mem__single_pass_atomic           = 0,
+  command__mec_atomic_mem__loop_until_compare_satisfied = 1,
+  command__mec_atomic_mem__wait_for_write_confirmation  = 2,
+  command__mec_atomic_mem__send_and_continue            = 3,
+};
+
+enum MEC_ATOMIC_MEM_cache_policy_enum {  // named for PM4_MEC_ATOMIC_MEM::bitfields2.cache_policy (2 bits); uses all four encodings
+  cache_policy__mec_atomic_mem__lru     = 0,
+  cache_policy__mec_atomic_mem__stream  = 1,
+  cache_policy__mec_atomic_mem__noa     = 2,
+  cache_policy__mec_atomic_mem__bypass  = 3,
+};
+
+enum TC_OP {  // 7-bit operation codes 0x00-0x7f; presumably the values for PM4_MEC_ATOMIC_MEM::bitfields2.atomic (also 7 bits) - confirm
+  TC_OP_READ                                         = 0x00000000,
+  TC_OP_ATOMIC_FCMPSWAP_RTN_32                       = 0x00000001,
+  TC_OP_ATOMIC_FMIN_RTN_32                           = 0x00000002,
+  TC_OP_ATOMIC_FMAX_RTN_32                           = 0x00000003,
+  TC_OP_RESERVED_FOP_RTN_32_0                        = 0x00000004,
+  TC_OP_RESERVED_FOP_RTN_32_2                        = 0x00000006,
+  TC_OP_ATOMIC_SWAP_RTN_32                           = 0x00000007,
+  TC_OP_ATOMIC_CMPSWAP_RTN_32                        = 0x00000008,
+  TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_32          = 0x00000009,
+  TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_32              = 0x0000000a,
+  TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_32              = 0x0000000b,
+  TC_OP_PROBE_FILTER                                 = 0x0000000c,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_2           = 0x0000000e,
+  TC_OP_ATOMIC_ADD_RTN_32                            = 0x0000000f,
+  TC_OP_ATOMIC_SUB_RTN_32                            = 0x00000010,
+  TC_OP_ATOMIC_SMIN_RTN_32                           = 0x00000011,
+  TC_OP_ATOMIC_UMIN_RTN_32                           = 0x00000012,
+  TC_OP_ATOMIC_SMAX_RTN_32                           = 0x00000013,
+  TC_OP_ATOMIC_UMAX_RTN_32                           = 0x00000014,
+  TC_OP_ATOMIC_AND_RTN_32                            = 0x00000015,
+  TC_OP_ATOMIC_OR_RTN_32                             = 0x00000016,
+  TC_OP_ATOMIC_XOR_RTN_32                            = 0x00000017,
+  TC_OP_ATOMIC_INC_RTN_32                            = 0x00000018,
+  TC_OP_ATOMIC_DEC_RTN_32                            = 0x00000019,
+  TC_OP_WBINVL1_VOL                                  = 0x0000001a,
+  TC_OP_WBINVL1_SD                                   = 0x0000001b,
+  TC_OP_RESERVED_NON_FLOAT_RTN_32_0                  = 0x0000001c,
+  TC_OP_RESERVED_NON_FLOAT_RTN_32_1                  = 0x0000001d,
+  TC_OP_RESERVED_NON_FLOAT_RTN_32_2                  = 0x0000001e,
+  TC_OP_RESERVED_NON_FLOAT_RTN_32_3                  = 0x0000001f,
+  TC_OP_WRITE                                        = 0x00000020,
+  TC_OP_ATOMIC_FCMPSWAP_RTN_64                       = 0x00000021,
+  TC_OP_ATOMIC_FMIN_RTN_64                           = 0x00000022,
+  TC_OP_ATOMIC_FMAX_RTN_64                           = 0x00000023,
+  TC_OP_RESERVED_FOP_RTN_64_0                        = 0x00000024,
+  TC_OP_RESERVED_FOP_RTN_64_1                        = 0x00000025,
+  TC_OP_RESERVED_FOP_RTN_64_2                        = 0x00000026,
+  TC_OP_ATOMIC_SWAP_RTN_64                           = 0x00000027,
+  TC_OP_ATOMIC_CMPSWAP_RTN_64                        = 0x00000028,
+  TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_64          = 0x00000029,
+  TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_64              = 0x0000002a,
+  TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_64              = 0x0000002b,
+  TC_OP_WBINVL2_SD                                   = 0x0000002c,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_0           = 0x0000002d,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_1           = 0x0000002e,
+  TC_OP_ATOMIC_ADD_RTN_64                            = 0x0000002f,
+  TC_OP_ATOMIC_SUB_RTN_64                            = 0x00000030,
+  TC_OP_ATOMIC_SMIN_RTN_64                           = 0x00000031,
+  TC_OP_ATOMIC_UMIN_RTN_64                           = 0x00000032,
+  TC_OP_ATOMIC_SMAX_RTN_64                           = 0x00000033,
+  TC_OP_ATOMIC_UMAX_RTN_64                           = 0x00000034,
+  TC_OP_ATOMIC_AND_RTN_64                            = 0x00000035,
+  TC_OP_ATOMIC_OR_RTN_64                             = 0x00000036,
+  TC_OP_ATOMIC_XOR_RTN_64                            = 0x00000037,
+  TC_OP_ATOMIC_INC_RTN_64                            = 0x00000038,
+  TC_OP_ATOMIC_DEC_RTN_64                            = 0x00000039,
+  TC_OP_WBL2_NC                                      = 0x0000003a,
+  TC_OP_WBL2_WC                                      = 0x0000003b,
+  TC_OP_RESERVED_NON_FLOAT_RTN_64_1                  = 0x0000003c,
+  TC_OP_RESERVED_NON_FLOAT_RTN_64_2                  = 0x0000003d,
+  TC_OP_RESERVED_NON_FLOAT_RTN_64_3                  = 0x0000003e,
+  TC_OP_RESERVED_NON_FLOAT_RTN_64_4                  = 0x0000003f,
+  TC_OP_WBINVL1                                      = 0x00000040,
+  TC_OP_ATOMIC_FCMPSWAP_32                           = 0x00000041,
+  TC_OP_ATOMIC_FMIN_32                               = 0x00000042,
+  TC_OP_ATOMIC_FMAX_32                               = 0x00000043,
+  TC_OP_RESERVED_FOP_32_0                            = 0x00000044,
+  TC_OP_RESERVED_FOP_32_2                            = 0x00000046,
+  TC_OP_ATOMIC_SWAP_32                               = 0x00000047,
+  TC_OP_ATOMIC_CMPSWAP_32                            = 0x00000048,
+  TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_32              = 0x00000049,
+  TC_OP_ATOMIC_FMIN_FLUSH_DENORM_32                  = 0x0000004a,
+  TC_OP_ATOMIC_FMAX_FLUSH_DENORM_32                  = 0x0000004b,
+  TC_OP_INV_METADATA                                 = 0x0000004c,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_32_2               = 0x0000004e,
+  TC_OP_ATOMIC_ADD_32                                = 0x0000004f,
+  TC_OP_ATOMIC_SUB_32                                = 0x00000050,
+  TC_OP_ATOMIC_SMIN_32                               = 0x00000051,
+  TC_OP_ATOMIC_UMIN_32                               = 0x00000052,
+  TC_OP_ATOMIC_SMAX_32                               = 0x00000053,
+  TC_OP_ATOMIC_UMAX_32                               = 0x00000054,
+  TC_OP_ATOMIC_AND_32                                = 0x00000055,
+  TC_OP_ATOMIC_OR_32                                 = 0x00000056,
+  TC_OP_ATOMIC_XOR_32                                = 0x00000057,
+  TC_OP_ATOMIC_INC_32                                = 0x00000058,
+  TC_OP_ATOMIC_DEC_32                                = 0x00000059,
+  TC_OP_INVL2_NC                                     = 0x0000005a,
+  TC_OP_NOP_RTN0                                     = 0x0000005b,
+  TC_OP_RESERVED_NON_FLOAT_32_1                      = 0x0000005c,
+  TC_OP_RESERVED_NON_FLOAT_32_2                      = 0x0000005d,
+  TC_OP_RESERVED_NON_FLOAT_32_3                      = 0x0000005e,
+  TC_OP_RESERVED_NON_FLOAT_32_4                      = 0x0000005f,
+  TC_OP_WBINVL2                                      = 0x00000060,
+  TC_OP_ATOMIC_FCMPSWAP_64                           = 0x00000061,
+  TC_OP_ATOMIC_FMIN_64                               = 0x00000062,
+  TC_OP_ATOMIC_FMAX_64                               = 0x00000063,
+  TC_OP_RESERVED_FOP_64_0                            = 0x00000064,
+  TC_OP_RESERVED_FOP_64_1                            = 0x00000065,
+  TC_OP_RESERVED_FOP_64_2                            = 0x00000066,
+  TC_OP_ATOMIC_SWAP_64                               = 0x00000067,
+  TC_OP_ATOMIC_CMPSWAP_64                            = 0x00000068,
+  TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_64              = 0x00000069,
+  TC_OP_ATOMIC_FMIN_FLUSH_DENORM_64                  = 0x0000006a,
+  TC_OP_ATOMIC_FMAX_FLUSH_DENORM_64                  = 0x0000006b,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_64_0               = 0x0000006c,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_64_1               = 0x0000006d,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_64_2               = 0x0000006e,
+  TC_OP_ATOMIC_ADD_64                                = 0x0000006f,
+  TC_OP_ATOMIC_SUB_64                                = 0x00000070,
+  TC_OP_ATOMIC_SMIN_64                               = 0x00000071,
+  TC_OP_ATOMIC_UMIN_64                               = 0x00000072,
+  TC_OP_ATOMIC_SMAX_64                               = 0x00000073,
+  TC_OP_ATOMIC_UMAX_64                               = 0x00000074,
+  TC_OP_ATOMIC_AND_64                                = 0x00000075,
+  TC_OP_ATOMIC_OR_64                                 = 0x00000076,
+  TC_OP_ATOMIC_XOR_64                                = 0x00000077,
+  TC_OP_ATOMIC_INC_64                                = 0x00000078,
+  TC_OP_ATOMIC_DEC_64                                = 0x00000079,
+  TC_OP_WBINVL2_NC                                   = 0x0000007a,
+  TC_OP_NOP_ACK                                      = 0x0000007b,
+  TC_OP_RESERVED_NON_FLOAT_64_1                      = 0x0000007c,
+  TC_OP_RESERVED_NON_FLOAT_64_2                      = 0x0000007d,
+  TC_OP_RESERVED_NON_FLOAT_64_3                      = 0x0000007e,
+  TC_OP_RESERVED_NON_FLOAT_64_4                      = 0x0000007f,
+  TC_OP_RESERVED_FOP_RTN_32_1__GFX09_10              = 0x00000005,  // suffixed entries fill the four gaps above (0x05, 0x0d, 0x45, 0x4d)
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d,
+  TC_OP_RESERVED_FOP_32_1__GFX09_10                  = 0x00000045,
+  TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10     = 0x0000004d,
+  TC_OP_RESERVED_FADD_RTN_32__GFX11                  = 0x00000005,  // the same four values again under the __GFX11 suffix: these alias the __GFX09_10 names
+  TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11       = 0x0000000d,
+  TC_OP_RESERVED_FADD_32__GFX11                      = 0x00000045,
+  TC_OP_ATOMIC_FADD_FLUSH_DENORM_32__GFX11           = 0x0000004d,
+};
+
+// Desc: Structure used to perform various atomic
+// operations - add, subtract, increment, etc
+struct AtomicTemplate {
+  PM4_MEC_ATOMIC_MEM atomic;  // the template is exactly one nine-DWORD ATOMIC_MEM packet
+};
+
+/// @brief PM4 command to write a 64-bit value into a memory
+/// location accessible to Gpu
+struct WriteDataTemplate {
+  PM4_MEC_WRITE_DATA write_data;  // the 64-bit value travels in write_data.write_data_value
+};
+
+// ---------------------------------- MEC_COPY_DATA_src_sel_enum ----------------------------------
+enum MEC_COPY_DATA_src_sel_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.src_sel (4 bits); the largest value here is 11
+  src_sel__mec_copy_data__mem_mapped_register     =  0,
+  src_sel__mec_copy_data__tc_l2_obsolete          =  1,
+  src_sel__mec_copy_data__tc_l2                   =  2,
+  src_sel__mec_copy_data__gds                     =  3,
+  src_sel__mec_copy_data__perfcounters            =  4,
+  src_sel__mec_copy_data__immediate_data          =  5,
+  src_sel__mec_copy_data__atomic_return_data      =  6,
+  src_sel__mec_copy_data__gds_atomic_return_data0 =  7,
+  src_sel__mec_copy_data__gds_atomic_return_data1 =  8,
+  src_sel__mec_copy_data__gpu_clock_count         =  9,
+  src_sel__mec_copy_data__system_clock_count      = 10,
+  src_sel__mec_copy_data__ext32perfcntr           = 11,
+};
+
+// ---------------------------------- MEC_COPY_DATA_dst_sel_enum ----------------------------------
+enum MEC_COPY_DATA_dst_sel_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.dst_sel (4 bits); numbering is not the same as the src_sel enum
+  dst_sel__mec_copy_data__mem_mapped_register =  0,
+  dst_sel__mec_copy_data__tc_l2               =  2,
+  dst_sel__mec_copy_data__gds                 =  3,
+  dst_sel__mec_copy_data__perfcounters        =  4,
+  dst_sel__mec_copy_data__tc_l2_obsolete      =  5,  // 5 here, whereas the src_sel tc_l2_obsolete value is 1
+  dst_sel__mec_copy_data__mem_mapped_reg_dc   =  6,
+  dst_sel__mec_copy_data__ext32perfcntr       = 11,
+};
+
+// ------------------------------ MEC_COPY_DATA_src_cache_policy_enum ------------------------------
+enum MEC_COPY_DATA_src_cache_policy_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.src_cache_policy (2 bits); uses all four encodings
+  src_cache_policy__mec_copy_data__lru    =  0,
+  src_cache_policy__mec_copy_data__stream =  1,
+  src_cache_policy__mec_copy_data__noa    =  2,
+  src_cache_policy__mec_copy_data__bypass =  3,
+};
+
+// --------------------------------- MEC_COPY_DATA_count_sel_enum ---------------------------------
+enum MEC_COPY_DATA_count_sel_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.count_sel (1 bit)
+  count_sel__mec_copy_data__32_bits_of_data =  0,
+  count_sel__mec_copy_data__64_bits_of_data =  1,
+};
+
+// --------------------------------- MEC_COPY_DATA_wr_confirm_enum ---------------------------------
+enum MEC_COPY_DATA_wr_confirm_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.wr_confirm (1 bit)
+  wr_confirm__mec_copy_data__do_not_wait_for_confirmation =  0,
+  wr_confirm__mec_copy_data__wait_for_confirmation        =  1,
+};
+
+// ------------------------------ MEC_COPY_DATA_dst_cache_policy_enum ------------------------------
+enum MEC_COPY_DATA_dst_cache_policy_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.dst_cache_policy (2 bits); same numbering as the src_cache_policy enum
+  dst_cache_policy__mec_copy_data__lru    =  0,
+  dst_cache_policy__mec_copy_data__stream =  1,
+  dst_cache_policy__mec_copy_data__noa    =  2,
+  dst_cache_policy__mec_copy_data__bypass =  3,
+};
+
+// ------------------------------- MEC_COPY_DATA_pq_exe_status_enum -------------------------------
+enum MEC_COPY_DATA_pq_exe_status_enum {  // named for PM4_MEC_COPY_DATA::bitfields2.pq_exe_status (1 bit)
+  pq_exe_status__mec_copy_data__default      =  0,
+  pq_exe_status__mec_copy_data__phase_update =  1,
+};
+
+// ------------------------------- MEC_WRITE_DATA_dst_sel_enum -------------------------------
+enum MEC_WRITE_DATA_dst_sel_enum {  // named for PM4_MEC_WRITE_DATA::bitfields2.dst_sel (4 bits); the largest value here is 6
+     dst_sel__mec_write_data__mem_mapped_register = 0,
+     dst_sel__mec_write_data__tc_l2 = 2,
+     dst_sel__mec_write_data__gds = 3,
+     dst_sel__mec_write_data__memory = 5,
+     dst_sel__mec_write_data__memory_mapped_adc_persistent_state = 6 };
+
+// ------------------------------- MEC_WRITE_DATA_addr_incr_enum -------------------------------
+enum MEC_WRITE_DATA_addr_incr_enum {  // named for PM4_MEC_WRITE_DATA::bitfields2.addr_incr (1 bit)
+     addr_incr__mec_write_data__increment_address = 0,  // note the polarity: 0 means increment
+     addr_incr__mec_write_data__do_not_increment_address = 1 };
+
+// ------------------------------- MEC_WRITE_DATA_wr_confirm_enum -------------------------------
+enum MEC_WRITE_DATA_wr_confirm_enum {  // named for PM4_MEC_WRITE_DATA::bitfields2.wr_confirm (1 bit)
+     wr_confirm__mec_write_data__do_not_wait_for_write_confirmation = 0,
+     wr_confirm__mec_write_data__wait_for_write_confirmation = 1 };
+
+// ------------------------------- MEC_WRITE_DATA_cache_policy_enum -------------------------------
+enum MEC_WRITE_DATA_cache_policy_enum {  // named for PM4_MEC_WRITE_DATA::bitfields2.cache_policy (2 bits); uses all four encodings
+     cache_policy__mec_write_data__lru = 0,
+     cache_policy__mec_write_data__stream = 1,
+     cache_policy__mec_write_data__noa    =  2,
+     cache_policy__mec_write_data__bypass =  3 };
+
+typedef struct PM4_MEC_COPY_DATA {  // six DWORDs: header, control word, source (two words), destination (two words)
+  union {
+    PM4_MEC_TYPE_3_HEADER header;  /// header
+    uint32_t ordinal1;
+  };
+  union {
+    struct {
+      uint32_t src_sel : 4;  // named like MEC_COPY_DATA_src_sel_enum
+      uint32_t reserved1 : 4;
+      uint32_t dst_sel : 4;  // named like MEC_COPY_DATA_dst_sel_enum
+      uint32_t reserved2 : 1;
+      uint32_t src_cache_policy : 2;
+      uint32_t reserved3 : 1;
+      uint32_t count_sel : 1;
+      uint32_t reserved4 : 3;
+      uint32_t wr_confirm : 1;
+      uint32_t reserved5 : 4;
+      uint32_t dst_cache_policy : 2;
+      uint32_t reserved6 : 2;
+      uint32_t pq_exe_status : 1;
+      uint32_t reserved7 : 2;
+    } bitfields2;  // field widths add up to exactly 32 bits
+    uint32_t ordinal2;
+  };
+  union {  // alternative views of the source DWORD; which one applies presumably follows src_sel - confirm
+    struct {
+      uint32_t src_reg_offset : 18;
+      uint32_t reserved8 : 14;
+    } bitfields3a;
+    struct {
+      uint32_t reserved9 : 2;  // two low bits dropped: address in 4-byte units
+      uint32_t src_32b_addr_lo : 30;
+    } bitfields3b;
+    struct {
+      uint32_t reserved10 : 3;  // three low bits dropped: address in 8-byte units
+      uint32_t src_64b_addr_lo : 29;
+    } bitfields3c;
+    struct {
+      uint32_t src_gds_addr_lo : 16;
+      uint32_t reserved11 : 16;
+    } bitfields3d;
+    uint32_t imm_data;
+    uint32_t ordinal3;
+  };
+  union {  // fourth DWORD: three names for the same 32 bits
+    uint32_t src_memtc_addr_hi;
+    uint32_t src_imm_data;
+    uint32_t ordinal4;
+  };
+  union {  // alternative views of the destination DWORD, mirroring the source views above
+    struct {
+      uint32_t dst_reg_offset : 18;
+      uint32_t reserved12 : 14;
+    } bitfields5a;
+    struct {
+      uint32_t reserved13 : 2;
+      uint32_t dst_32b_addr_lo : 30;
+    } bitfields5b;
+    struct {
+      uint32_t reserved14 : 3;
+      uint32_t dst_64b_addr_lo : 29;
+    } bitfields5c;
+    struct {
+      uint32_t dst_gds_addr_lo : 16;
+      uint32_t reserved15 : 16;
+    } bitfields5d;
+    uint32_t ordinal5;
+  };
+  uint32_t dst_addr_hi;
+} PM4MEC_COPY_DATA;  // the typedef name has no underscore after PM4; both spellings name this type
+namespace gfx9 {
+
+struct PM4_MEC_ACQUIRE_MEM {  // gfx9 layout, seven DWORDs: header, coher_cntl, size (lo + hi), base (lo + hi), poll interval
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t coher_cntl:31;
+      uint32_t reserved1:1;
+    } bitfields2;
+    uint32_t ordinal2;
+  };
+  uint32_t coher_size;
+  union {
+    struct {
+      uint32_t coher_size_hi:8;  // presumably the upper bits extending coher_size - confirm
+      uint32_t reserved2:24;
+    } bitfields4;
+    uint32_t ordinal4;
+  };
+  uint32_t coher_base_lo;
+  union {
+    struct {
+      uint32_t coher_base_hi:24;
+      uint32_t reserved3:8;
+    } bitfields6;
+    uint32_t ordinal6;
+  };
+  union {
+    struct {
+      uint32_t poll_interval:16;
+      uint32_t reserved4:16;
+    } bitfields7;
+    uint32_t ordinal7;
+  };
+};
+
+struct PM4_MEC_RELEASE_MEM {  // gfx9 layout, eight DWORDs: header, two control words, address (lo + hi), data (lo + hi), int_ctxid
+    union {
+        PM4_MEC_TYPE_3_HEADER   header;
+        uint32_t            ordinal1;  // raw view of the header DWORD
+    };
+    union {
+        struct {
+            uint32_t event_type:6;  // 6 bits, wide enough for every VGT_EVENT_TYPE value
+            uint32_t reserved1:2;
+            uint32_t event_index:4;
+            uint32_t tcl1_vol_action_ena:1;  // this and the following *_action_ena members are independent one-bit flags
+            uint32_t tc_vol_action_ena:1;
+            uint32_t reserved2:1;
+            uint32_t tc_wb_action_ena:1;
+            uint32_t tcl1_action_ena:1;
+            uint32_t tc_action_ena:1;
+            uint32_t reserved3:1;
+            uint32_t tc_nc_action_ena:1;
+            uint32_t tc_wc_action_ena:1;
+            uint32_t tc_md_action_ena:1;
+            uint32_t reserved4:3;
+            uint32_t cache_policy:2;
+            uint32_t reserved5:2;
+            uint32_t pq_exe_status:1;
+            uint32_t reserved6:2;
+        } bitfields2;  // field widths add up to exactly 32 bits
+        uint32_t ordinal2;
+    };
+    union {
+        struct {
+            uint32_t reserved7:16;
+            uint32_t dst_sel:2;
+            uint32_t reserved8:6;
+            uint32_t int_sel:3;
+            uint32_t reserved9:2;
+            uint32_t data_sel:3;
+        } bitfields3;
+        uint32_t ordinal3;
+    };
+    union {  // low address word: 32-bit-aligned view, 64-bit-aligned view, or unused
+        struct {
+            uint32_t reserved10:2;
+            uint32_t address_lo_32b:30;
+        } bitfields4a;
+        struct {
+            uint32_t reserved11:3;
+            uint32_t address_lo_64b:29;
+        } bitfields4b;
+        uint32_t reserved12;
+        uint32_t ordinal4;
+    };
+    union {
+        uint32_t address_hi;
+        uint32_t reserved13;
+        uint32_t ordinal5;
+    };
+    union {  // sixth DWORD: several names for the same 32 bits; which applies presumably follows data_sel - confirm
+        uint32_t data_lo;
+        uint32_t cmp_data_lo;
+        struct {
+            uint32_t dw_offset:16;
+            uint32_t num_dwords:16;
+        } bitfields6c;
+        uint32_t reserved14;
+        uint32_t ordinal6;
+    };
+    union {
+        uint32_t data_hi;
+        uint32_t cmp_data_hi;
+        uint32_t reserved15;
+        uint32_t reserved16;
+        uint32_t ordinal7;
+    };
+    uint32_t int_ctxid;
+};
+
+struct PM4_MEC_WAIT_REG_MEM64 {  // nine DWORDs: header, control word, poll address (two words), 64-bit reference, 64-bit mask, poll interval
+  union {
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;  // raw view of the header DWORD
+  };
+  union {
+    struct {
+      uint32_t function:3;
+      uint32_t reserved1:1;
+      uint32_t mem_space:2;
+      uint32_t operation:2;
+      uint32_t reserved2:24;
+    } bitfields2;
+    uint32_t ordinal2;
+  };
+  union {  // third DWORD: memory poll address (low part) or one of two register-address views
+    struct {
+      uint32_t reserved3:3;  // three low bits dropped: memory address in 8-byte units
+      uint32_t mem_poll_addr_lo:29;
+    } bitfields3a;
+    struct {
+      uint32_t reg_poll_addr:18;
+      uint32_t reserved4:14;
+    } bitfields3b;
+    struct {
+      uint32_t reg_write_addr1:18;
+      uint32_t reserved5:14;
+    } bitfields3c;
+    uint32_t ordinal3;
+  };
+  union {
+    uint32_t mem_poll_addr_hi;
+    struct {
+      uint32_t reg_write_addr2:18;
+      uint32_t reserved6:14;
+    } bitfields4b;
+    uint32_t ordinal4;
+  };
+  uint32_t reference;  // low word; reference_hi follows
+  uint32_t reference_hi;
+  uint32_t mask;  // low word; mask_hi follows
+  uint32_t mask_hi;
+  union {
+    struct {
+      uint32_t poll_interval:16;
+      uint32_t reserved7:16;
+    } bitfields9;
+    uint32_t ordinal9;
+  };
+};
+
+/// @brief Structure used to configure the flushing of
+/// various caches - instruction, constants, L1 and L2
+struct AcquireMemTemplate {
+  PM4_MEC_ACQUIRE_MEM acquire_mem;  // resolves to gfx9::PM4_MEC_ACQUIRE_MEM (seven DWORDs)
+};
+
+struct EndofKernelNotifyTemplate {  // Wrapper holding exactly one RELEASE_MEM packet.
+  PM4_MEC_RELEASE_MEM release_mem;  ///< The wrapped RELEASE_MEM packet.
+};
+
+/// @brief PM4 command to wait for a certain event before proceeding
+/// to process another command on the queue
+struct WaitRegMem64Template {
+  PM4_MEC_WAIT_REG_MEM64 wait_reg_mem;  ///< The single WAIT_REG_MEM64 packet this template wraps.
+};
+
+}  // gfx9 namespace
+
+namespace gfx10 {
+
+struct PM4_MEC_ACQUIRE_MEM {  // ACQUIRE_MEM packet image: eight dwords, some plain and some with a bitfield/ordinalN union.
+  union {  // ordinal1: typed packet header sharing storage with its raw value
+    PM4_MEC_TYPE_3_HEADER   header;
+    uint32_t            ordinal1;
+  };
+  uint32_t reserved1;  // second dword is entirely reserved in this layout
+  uint32_t coher_size;
+  union {  // ordinal4: 8-bit high part of coher_size + 24 reserved bits
+    struct {
+      uint32_t coher_size_hi:8;
+      uint32_t reserved2:24;
+    } bitfields4;
+    uint32_t ordinal4;
+  };
+  uint32_t coher_base_lo;
+  union {  // ordinal6: 24-bit high part of coher_base + 8 reserved bits
+    struct {
+      uint32_t coher_base_hi:24;
+      uint32_t reserved3:8;
+    } bitfields6;
+    uint32_t ordinal6;
+  };
+  union {  // ordinal7: 16-bit poll interval + 16 reserved bits
+    struct {
+      uint32_t poll_interval:16;
+      uint32_t reserved4:16;
+    } bitfields7;
+    uint32_t ordinal7;
+  };
+  union {  // ordinal8: 19-bit gcr_cntl + 13 reserved bits
+    struct {
+      uint32_t gcr_cntl:19;
+      uint32_t reserved4:13;  // same name as bitfields7.reserved4; legal because each lives in its own nested struct
+    } bitfields8;
+    uint32_t ordinal8;
+  };
+};
+
+struct PM4_MEC_RELEASE_MEM {  // RELEASE_MEM packet image: ordinal1..ordinal7 unions followed by int_ctxid.
+    union {  // ordinal1: typed packet header sharing storage with its raw value
+        PM4_MEC_TYPE_3_HEADER   header;
+        uint32_t            ordinal1;
+    };
+    union {  // ordinal2: widths 6+2+4+12+1+2+2+1+2 = 32
+        struct {
+            uint32_t event_type:6;
+            uint32_t reserved1:2;
+            uint32_t event_index:4;
+            uint32_t gcr_cntl:12;
+            uint32_t reserved2:1;
+            uint32_t cache_policy:2;
+            uint32_t reserved3:2;
+            uint32_t pq_exe_status:1;
+            uint32_t reserved4:2;
+        } bitfields2;
+        uint32_t ordinal2;
+    };
+    union {  // ordinal3: widths 16+2+2+2+2+3+2+3 = 32
+        struct {
+            uint32_t reserved7:16;
+            uint32_t dst_sel:2;
+            uint32_t reserved8:2;
+            uint32_t mes_intr_pipe:2;
+            uint32_t mes_action_id:2;
+            uint32_t int_sel:3;
+            uint32_t reserved9:2;
+            uint32_t data_sel:3;
+        } bitfields3;
+        uint32_t ordinal3;
+    };
+    union {  // ordinal4: low address in two alternative encodings (2 or 3 reserved leading bits)
+        struct {
+            uint32_t reserved10:2;
+            uint32_t address_lo_32b:30;
+        } bitfields4a;
+        struct {
+            uint32_t reserved11:3;
+            uint32_t address_lo_64b:29;
+        } bitfields4b;
+        uint32_t reserved12;
+        uint32_t ordinal4;
+    };
+    union {  // ordinal5: high address dword (three names for the same storage)
+        uint32_t address_hi;
+        uint32_t reserved13;
+        uint32_t ordinal5;
+    };
+    union {  // ordinal6: data_lo / cmp_data_lo, or a 16+16 dw_offset/num_dwords pair
+        uint32_t data_lo;
+        uint32_t cmp_data_lo;
+        struct {
+            uint32_t dw_offset:16;
+            uint32_t num_dwords:16;
+        } bitfields6c;
+        uint32_t reserved14;
+        uint32_t ordinal6;
+    };
+    union {  // ordinal7: data_hi / cmp_data_hi (all members alias the same dword)
+        uint32_t data_hi;
+        uint32_t cmp_data_hi;
+        uint32_t reserved15;
+        uint32_t reserved16;
+        uint32_t ordinal7;
+    };
+    uint32_t int_ctxid;  // final dword; plain field with no ordinal alias
+};
+
+struct PM4_MEC_WAIT_REG_MEM64 {  // WAIT_REG_MEM64 packet image; each ordinalN is the raw 32-bit view of its union.
+    union {  // ordinal1
+        PM4_MEC_TYPE_3_HEADER   header;            ///< Typed packet header sharing storage with ordinal1.
+        uint32_t            ordinal1;
+    };
+    union {  // ordinal2: widths 3+1+2+2+14+2+1+2+5 = 32
+        struct {
+            uint32_t function:3;
+            uint32_t reserved1:1;
+            uint32_t mem_space:2;
+            uint32_t operation:2;
+            uint32_t reserved2:14;
+            uint32_t mes_intr_pipe:2;
+            uint32_t mes_action:1;
+            uint32_t cache_policy:2;
+            uint32_t reserved3:5;
+        } bitfields2;
+        uint32_t ordinal2;
+    };
+    union {  // ordinal3: three alternative encodings of the same dword
+        struct {
+            uint32_t reserved4:3;
+            uint32_t mem_poll_addr_lo:29;  // 29-bit field placed after 3 reserved bits
+        } bitfields3a;
+        struct {
+            uint32_t reg_poll_addr:18;
+            uint32_t reserved5:14;
+        } bitfields3b;
+        struct {
+            uint32_t reg_write_addr1:18;
+            uint32_t reserved6:14;
+        } bitfields3c;
+        uint32_t ordinal3;
+    };
+    union {  // ordinal4: either a full 32-bit high address or an 18-bit register address
+        uint32_t mem_poll_addr_hi;
+        struct {
+            uint32_t reg_write_addr2:18;
+            uint32_t reserved7:14;
+        } bitfields4b;
+        uint32_t ordinal4;
+    };
+    uint32_t reference;  // dwords between ordinal4 and ordinal9 are plain 32-bit fields (no union view)
+    uint32_t reference_hi;
+    uint32_t mask;
+    uint32_t mask_hi;
+    union {  // ordinal9: widths 16+15+1 = 32
+        struct {
+            uint32_t poll_interval:16;
+            uint32_t reserved8:15;
+            uint32_t optimize_ace_offload_mode:1;  // single-bit flag declared last in the dword
+        } bitfields9;
+        uint32_t ordinal9;
+    };
+};
+
+/// @brief Structure used to configure the flushing of
+/// various caches - instruction, constants, L1 and L2
+struct AcquireMemTemplate {
+  PM4_MEC_ACQUIRE_MEM acquire_mem;  ///< The single gfx10 ACQUIRE_MEM packet this template wraps.
+};
+
+struct EndofKernelNotifyTemplate {  // Wrapper holding exactly one gfx10 RELEASE_MEM packet.
+  PM4_MEC_RELEASE_MEM release_mem;  ///< The wrapped RELEASE_MEM packet.
+};
+
+struct WaitRegMem64Template {  // Wrapper holding exactly one gfx10 WAIT_REG_MEM64 packet.
+  PM4_MEC_WAIT_REG_MEM64 wait_reg_mem;  ///< The wrapped WAIT_REG_MEM64 packet.
+};
+
+} // gfx10 namespace
+
+namespace gfx11 {
+
+struct PM4_MEC_RELEASE_MEM {  // RELEASE_MEM packet image: ordinal1..ordinal7 unions followed by int_ctxid.
+    union {  // ordinal1: typed packet header sharing storage with its raw value
+        PM4_MEC_TYPE_3_HEADER header;
+        uint32_t ordinal1;
+    };
+    union {  // ordinal2: widths 6+2+4+13+2+1+1+1+1+1 = 32
+        struct {
+            uint32_t event_type:6;
+            uint32_t reserved1:2;
+            uint32_t event_index:4;
+            uint32_t gcr_cntl:13;  // 13 bits here; the gfx10 layout in this file declares 12
+            uint32_t cache_policy:2;
+            uint32_t reserved2:1;
+            uint32_t pq_exe_status:1;
+            uint32_t reserved3:1;
+            uint32_t glk_inv:1;
+            uint32_t reserved4:1;
+        } bitfields2;
+        uint32_t ordinal2;
+    };
+    union {  // ordinal3: widths 16+2+2+2+2+3+2+3 = 32
+        struct {
+            uint32_t reserved5:16;
+            uint32_t dst_sel:2;
+            uint32_t reserved6:2;
+            uint32_t mes_intr_pipe:2;
+            uint32_t mes_action_id:2;
+            uint32_t int_sel:3;
+            uint32_t reserved7:2;
+            uint32_t data_sel:3;
+        } bitfields3;
+        uint32_t ordinal3;
+    };
+    union {  // ordinal4: low address in two alternative encodings (2 or 3 reserved leading bits)
+        struct {
+            uint32_t reserved8:2;
+            uint32_t address_lo_32b:30;
+        } bitfields4a;
+        struct {
+            uint32_t reserved9:3;
+            uint32_t address_lo_64b:29;
+        } bitfields4b;
+        uint32_t reserved10;
+        uint32_t ordinal4;
+    };
+    union {  // ordinal5: high address dword (three names for the same storage)
+        uint32_t address_hi;
+        uint32_t reserved11;
+        uint32_t ordinal5;
+    };
+    union {  // ordinal6: data_lo / cmp_data_lo, or a 16+16 dw_offset/num_dwords pair
+        uint32_t data_lo;
+        uint32_t cmp_data_lo;
+        struct {
+            uint32_t dw_offset:16;
+            uint32_t num_dwords:16;
+        } bitfields6c;
+        uint32_t reserved12;
+        uint32_t ordinal6;
+    };
+    union {  // ordinal7: data_hi / cmp_data_hi (all members alias the same dword)
+        uint32_t data_hi;
+        uint32_t cmp_data_hi;
+        uint32_t reserved13;
+        uint32_t reserved14;
+        uint32_t ordinal7;
+    };
+    uint32_t int_ctxid;  // final dword; plain field with no ordinal alias
+};
+
+struct EndofKernelNotifyTemplate {  // Wrapper holding exactly one gfx11 RELEASE_MEM packet.
+  PM4_MEC_RELEASE_MEM release_mem;  ///< The wrapped RELEASE_MEM packet.
+};
+
+} // gfx11 namespace
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/registers.h b/projects/rocr-runtime/libhsakmt/include/impl/registers.h
new file mode 100644
index 0000000000..4d430b41e4
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/registers.h
@@ -0,0 +1,363 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// This file is used only for open-source CMake builds. If the register values
+// were hardcoded in amd_aql_queue.cpp, this file would not be required. For
+// now we use this file, in which the register details are spelled out in the
+// structs/unions below.
+#ifndef _WSL_INC_REGISTERS_H_
+#define _WSL_INC_REGISTERS_H_
+
+typedef enum SQ_RSRC_BUF_TYPE {  // Resource-type codes 0..3; values 1..3 carry RSVD names.
+  SQ_RSRC_BUF        = 0x0,
+  SQ_RSRC_BUF_RSVD_1 = 0x1,
+  SQ_RSRC_BUF_RSVD_2 = 0x2,
+  SQ_RSRC_BUF_RSVD_3 = 0x3,
+} SQ_RSRC_BUF_TYPE;
+
+typedef enum BUF_DATA_FORMAT {  // Buffer data-format codes, contiguous from 0x0 to 0xf.
+  BUF_DATA_FORMAT_INVALID     = 0x0,
+  BUF_DATA_FORMAT_8           = 0x1,
+  BUF_DATA_FORMAT_16          = 0x2,
+  BUF_DATA_FORMAT_8_8         = 0x3,
+  BUF_DATA_FORMAT_32          = 0x4,
+  BUF_DATA_FORMAT_16_16       = 0x5,
+  BUF_DATA_FORMAT_10_11_11    = 0x6,
+  BUF_DATA_FORMAT_11_11_10    = 0x7,
+  BUF_DATA_FORMAT_10_10_10_2  = 0x8,
+  BUF_DATA_FORMAT_2_10_10_10  = 0x9,
+  BUF_DATA_FORMAT_8_8_8_8     = 0xa,
+  BUF_DATA_FORMAT_32_32       = 0xb,
+  BUF_DATA_FORMAT_16_16_16_16 = 0xc,
+  BUF_DATA_FORMAT_32_32_32    = 0xd,
+  BUF_DATA_FORMAT_32_32_32_32 = 0xe,
+  BUF_DATA_FORMAT_RESERVED_15 = 0xf,
+} BUF_DATA_FORMAT;
+
+typedef enum BUF_NUM_FORMAT {  // Buffer numeric-format codes 0x0..0x7; code 0x6 has two names.
+  BUF_NUM_FORMAT_UNORM             = 0x0,
+  BUF_NUM_FORMAT_SNORM             = 0x1,
+  BUF_NUM_FORMAT_USCALED           = 0x2,
+  BUF_NUM_FORMAT_SSCALED           = 0x3,
+  BUF_NUM_FORMAT_UINT              = 0x4,
+  BUF_NUM_FORMAT_SINT              = 0x5,
+  BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x6,  // shares its value with the next enumerator
+  BUF_NUM_FORMAT_RESERVED_6__VI    = 0x6,
+  BUF_NUM_FORMAT_FLOAT             = 0x7,
+} BUF_NUM_FORMAT;
+
+typedef enum BUF_FORMAT {  // Single-entry enum; only the 32_UINT code (0x14) is declared here.
+BUF_FORMAT_32_UINT                       = 0x00000014,
+} BUF_FORMAT;
+
+typedef enum SQ_SEL_XYZW01 {  // Selector codes: constants 0/1, two reserved codes, then X/Y/Z/W.
+  SQ_SEL_0          = 0x0,
+  SQ_SEL_1          = 0x1,
+  SQ_SEL_RESERVED_0 = 0x2,
+  SQ_SEL_RESERVED_1 = 0x3,
+  SQ_SEL_X          = 0x4,
+  SQ_SEL_Y          = 0x5,
+  SQ_SEL_Z          = 0x6,
+  SQ_SEL_W          = 0x7,
+} SQ_SEL_XYZW01;
+
+	union COMPUTE_TMPRING_SIZE {  // One 32-bit register value viewed as bitfields, unsigned, signed or float.
+	struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                           WAVES : 12;
+		unsigned int                        WAVESIZE : 13;
+		unsigned int                                 : 7;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                                 : 7;
+		unsigned int                        WAVESIZE : 13;
+		unsigned int                           WAVES : 12;
+#endif
+	} bitfields, bits;  // two member names of the same struct type; widths 12+13+7 = 32
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+        union COMPUTE_TMPRING_SIZE_GFX11 {  // GFX11 variant: WAVESIZE is 15 bits wide (13 in COMPUTE_TMPRING_SIZE).
+          struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int WAVES : 12;
+            unsigned int WAVESIZE : 15;
+            unsigned int : 5;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int : 5;
+            unsigned int WAVESIZE : 15;
+            unsigned int WAVES : 12;
+#endif
+          } bitfields, bits;  // two member names of the same struct type; widths 12+15+5 = 32
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+        union COMPUTE_TMPRING_SIZE_GFX12 {  // GFX12 variant: WAVESIZE is 18 bits wide (15 in the GFX11 variant).
+          struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int WAVES : 12;
+            unsigned int WAVESIZE : 18;
+            unsigned int : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int : 2;
+            unsigned int WAVESIZE : 18;
+            unsigned int WAVES : 12;
+#endif
+          } bitfields, bits;  // two member names of the same struct type; widths 12+18+2 = 32
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+        union SQ_BUF_RSRC_WORD0 {  // Buffer resource word 0: a single 32-bit BASE_ADDRESS field.
+	struct {  // both endianness branches are identical because the field spans the whole word
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                    BASE_ADDRESS : 32;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                    BASE_ADDRESS : 32;
+#endif
+	} bitfields, bits;  // two member names of the same struct type
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+
+	union SQ_BUF_RSRC_WORD1 {  // Buffer resource word 1: high base-address bits, stride and two 1-bit swizzle flags.
+	struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                 BASE_ADDRESS_HI : 16;
+		unsigned int                          STRIDE : 14;
+		unsigned int                   CACHE_SWIZZLE : 1;
+		unsigned int                  SWIZZLE_ENABLE : 1;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                  SWIZZLE_ENABLE : 1;
+		unsigned int                   CACHE_SWIZZLE : 1;
+		unsigned int                          STRIDE : 14;
+		unsigned int                 BASE_ADDRESS_HI : 16;
+#endif
+	} bitfields, bits;  // two member names of the same struct type; widths 16+14+1+1 = 32
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+        union SQ_BUF_RSRC_WORD1_GFX11 {  // GFX11 variant: no CACHE_SWIZZLE field; SWIZZLE_ENABLE is 2 bits wide.
+          struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int BASE_ADDRESS_HI : 16;
+            unsigned int STRIDE : 14;
+            unsigned int SWIZZLE_ENABLE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int SWIZZLE_ENABLE : 2;
+            unsigned int STRIDE : 14;
+            unsigned int BASE_ADDRESS_HI : 16;
+#endif
+          } bitfields, bits;  // two member names of the same struct type; widths 16+14+2 = 32
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+
+        union SQ_BUF_RSRC_WORD2 {  // Buffer resource word 2: a single 32-bit NUM_RECORDS field.
+	struct {  // both endianness branches are identical because the field spans the whole word
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                     NUM_RECORDS : 32;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                     NUM_RECORDS : 32;
+#endif
+	} bitfields, bits;  // two member names of the same struct type
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+
+	union SQ_BUF_RSRC_WORD3 {  // Buffer resource word 3 with separate NUM_FORMAT/DATA_FORMAT fields.
+	struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                      NUM_FORMAT : 3;
+                unsigned int                     DATA_FORMAT : 4;
+                unsigned int                    ELEMENT_SIZE : 2;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                     ATC__CI__VI : 1;
+                unsigned int                     HASH_ENABLE : 1;
+                unsigned int                            HEAP : 1;
+                unsigned int                   MTYPE__CI__VI : 3;
+                unsigned int                            TYPE : 2;
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;
+                unsigned int                   MTYPE__CI__VI : 3;
+                unsigned int                            HEAP : 1;
+                unsigned int                     HASH_ENABLE : 1;
+                unsigned int                     ATC__CI__VI : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                    ELEMENT_SIZE : 2;
+                unsigned int                     DATA_FORMAT : 4;
+                unsigned int                      NUM_FORMAT : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif
+	} bitfields, bits;  // two member names of the same struct type; widths 4*3+3+4+2+2+1+1+1+1+3+2 = 32
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+	union SQ_BUF_RSRC_WORD3_GFX10 {  // GFX10 variant: one 7-bit FORMAT field replaces NUM_FORMAT/DATA_FORMAT.
+	struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                            TYPE : 2;
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif
+        } bitfields, bits;  // two member names of the same struct type; widths 4*3+7+2+2+1+1+3+2+2 = 32
+        unsigned int u32All;
+        signed int i32All;
+        float f32All;
+        };
+
+        // From V# Table
+        union SQ_BUF_RSRC_WORD3_GFX11 {  // GFX11 variant: FORMAT is 6 bits and there is no RESOURCE_LEVEL field.
+          struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int DST_SEL_X : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_W : 3;
+            unsigned int FORMAT : 6;
+            unsigned int RESERVED1 : 3;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int RESERVED2 : 4;
+            unsigned int OOB_SELECT : 2;
+            unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int TYPE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int RESERVED2 : 4;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int RESERVED1 : 3;
+            unsigned int FORMAT : 6;
+            unsigned int DST_SEL_W : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_X : 3;
+#endif
+          } bitfields, bits;  // two member names of the same struct type; widths 4*3+6+3+2+1+4+2+2 = 32
+        unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+        };
+        // From V# Table
+        union SQ_BUF_RSRC_WORD3_GFX12 {  // GFX12 variant: three compression fields occupy the 4 bits GFX11 names RESERVED2.
+          struct {  // empty if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined (there is no #else branch)
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int DST_SEL_X : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_W : 3;
+            unsigned int FORMAT : 6;
+            unsigned int RESERVED1 : 3;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int WRITE_COMPRESS_ENABLE : 1;
+            unsigned int COMPRESSION_EN : 1;
+            unsigned int COMPRESSION_ACCESS_MODE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int TYPE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int COMPRESSION_ACCESS_MODE : 2;
+            unsigned int COMPRESSION_EN : 1;
+            unsigned int WRITE_COMPRESS_ENABLE : 1;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int RESERVED1 : 3;
+            unsigned int FORMAT : 6;
+            unsigned int DST_SEL_W : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_X : 3;
+#endif
+          } bitfields, bits;  // two member names of the same struct type; widths 4*3+6+3+2+1+1+1+2+2+2 = 32
+        unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+        };
+#endif  // header guard
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h
new file mode 100644
index 0000000000..d6bdce2451
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h
@@ -0,0 +1,122 @@
+#ifndef _WSL_INC_THUNK_PROXY_H_
+#define _WSL_INC_THUNK_PROXY_H_
+
+#include <vector>
+
+namespace thunk_proxy {
+enum AllocDomain {  // Allocation domains; kDomainCount is the number of real domains.
+  kSystem      = 0,
+  kLocal       = 1,
+  kUserMemory  = 2,
+  kUserQueue   = 3,
+  kDomainCount = 4,
+};
+
+enum MemFlag {  // Single-bit memory flags; values are distinct powers of two so they can be OR-ed together.
+  kFineGrain  = (1ULL << 0),  // bit 0
+  kKernarg    = (1ULL << 1),  // bit 1
+};
+
+enum EngineFlag {  // Single-bit engine flags. Names are upper-case K..., unlike the kCamelCase used by the sibling enums.
+  KCOMPUTE0   = (1ULL << 0),  // bit 0
+  KDRMDMA     = (1ULL << 1),  // bit 1
+  KDRMDMA1    = (1ULL << 2),  // bit 2
+};
+
+enum SchedLevel {  // Scheduling levels numbered 0, 1, 2 in declaration order.
+  kLow,     // 0
+  kNormal,  // 1; used as the default argument of FillinHwQueuePrivData
+  kHigh,    // 2
+};
+
+struct HwsInfo {  // Per-engine HWS enable bits plus a 64-bit engine-ordinal mask.
+  union {  // hwsMask bitfields share storage with the raw osHwsEnableFlags word
+    struct {
+      uint32_t gfxHwsEnabled     : 1;
+      uint32_t computeHwsEnabled : 1;
+      uint32_t dmaHwsEnabled     : 1;
+      uint32_t dma1HwsEnabled    : 1;
+      uint32_t reserved          : 28;  // pads the four flag bits out to 32
+    } hwsMask;
+    uint32_t osHwsEnableFlags;
+  };
+  uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS
+};
+
+typedef struct {  // Per-device property record. NOTE(review): presumably filled by ParseAdapterInfo() - confirm.
+  int major;
+  int minor;
+  int stepping;
+  bool is_dgpu;
+  char product_name[MAX_PATH];  // MAX_PATH is not provided by this header's own includes (only <vector>)
+  uint64_t uuid;  // the fixed-width integer types likewise rely on the includer having pulled in <stdint.h>
+  uint32_t family;
+  uint32_t device_id;
+  uint32_t wavefront_size;
+  uint32_t compute_unit_count;
+  uint32_t max_engine_clock_mhz;
+  uint32_t watch_points_num;
+  uint32_t pci_bus_addr;
+  uint32_t memory_bus_width;
+  uint32_t max_memory_clock_mhz;
+  uint64_t gpu_counter_frequency;
+  uint32_t wave_per_cu;
+  uint32_t simd_per_cu;
+  uint32_t max_scratch_slots_per_cu;
+  uint32_t num_shader_engine;
+  uint32_t shader_array_per_shader_engine;
+  uint32_t domain;
+  uint32_t num_gws;
+  uint32_t asic_revision;
+  uint64_t local_visible_heap_size;
+  uint64_t local_invisible_heap_size;
+  uint64_t non_local_heap_size;
+  uint64_t private_aperture_base;
+  uint64_t private_aperture_size;
+  uint64_t shared_aperture_base;
+  uint64_t shared_aperture_size;
+  uint32_t user_queue_size;
+  uint32_t lds_size;
+  uint32_t big_page_alignment_size;
+  uint32_t hw_big_page_min_alignment_size;
+  uint32_t hw_big_page_alignment_size;
+  bool enable_big_page_alignment;
+  uint32_t mec_fw_version;
+  uint32_t sdma_fw_version;
+  uint32_t l1_cache_size;
+  uint32_t l2_cache_size;
+  uint32_t l3_cache_size;
+  uint32_t gl2_cacheline_size;
+  uint32_t num_cp_queues;
+  HwsInfo hwsInfo;
+  std::vector<int> sdma_schedid;  // std::vector member: the struct is not trivially copyable, so never memset/memcpy it
+  uint32_t compute_schedid;
+  bool state_shadowing_by_cpfw;
+  bool platform_atomic_support;
+  void *adapter_info;  // untyped pointer; NOTE(review): ownership and pointee type are not visible here - confirm
+  uint32_t kmd_version;
+} DeviceInfo;
+
+int EngineOrdinal(int engine, DeviceInfo *device_info);  // declarations only; the definitions are not in this header
+bool GetHwsEnabled(int engine, DeviceInfo *device_info);
+bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info);
+bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info);  // D3DKMT_HANDLE must be declared by the includer
+bool QueryAdapterSupported(unsigned int device_id);
+
+uint32_t QueueEngine2EngineFlag(uint32_t queue_engine);  // NOTE(review): presumably returns an EngineFlag bit - confirm
+void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain,
+                      uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info);
+void GetAllocPrivDataSize(int *priv_drv_data_size, int *priv_alloc_data_size);  // two out-parameters, no return value
+void FillinAllocPrivDrvData(void *drv_priv, int priv_alloc_data_size);
+
+int GetSubmitPrivDataSize();  // NOTE(review): each Get*PrivDataSize presumably sizes the buffer for the matching Fillin* - confirm
+void FillinSubmitPrivData(void *priv_data, D3DKMT_HANDLE queue, uint64_t command_addr,
+                        uint64_t command_size, bool is_hw_queue);
+int GetHwQueuePrivDataSize();
+void FillinHwQueuePrivData(void *priv_data, bool FwManagedGfxState, SchedLevel level = kNormal);  // level defaults to kNormal
+int GetContextPrivDataSize();
+void FillinContextPrivData(void *priv_data, bool FwManagedGfxState);
+int GetPowerOptPrivDataSize();
+void FillinPowerOptPrivData(void *priv_data, bool restore);
+}
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h
new file mode 100644
index 0000000000..3fd3f69553
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h
@@ -0,0 +1,169 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+#define _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+
+#include <stdint.h>
+
+#include <no_sal2.h>
+
+typedef uint32_t UINT, *UINT_PTR;  // UINT_PTR is a pointer to uint32_t here. NOTE(review): the Windows SDK defines UINT_PTR as a pointer-sized integer - confirm intended
+typedef int32_t  INT32;
+typedef int32_t  LONG;  // fixed at 32 bits regardless of the host's native long
+typedef uint32_t ULONG, *ULONG_PTR;  // ULONG_PTR is a pointer to uint32_t here. NOTE(review): the Windows SDK defines it as a pointer-sized integer - confirm intended
+typedef int64_t  LONGLONG;
+typedef int64_t  LONG64;
+typedef uint64_t ULONGLONG;
+typedef uint64_t ULONG64, *ULONG64_PTR;
+typedef uint8_t  BYTE;
+typedef uint16_t WORD;
+typedef uint32_t DWORD;
+typedef int32_t  BOOL;  // 32-bit integer boolean, distinct from the 8-bit BOOLEAN below
+typedef int32_t  NTSTATUS;
+typedef uint16_t USHORT;
+typedef uint16_t UINT16;
+typedef uint32_t UINT32;
+typedef uint64_t UINT64;
+typedef int32_t  INT;
+typedef uint64_t SIZE_T;  // always 64 bits in this shim
+typedef void VOID;
+typedef float FLOAT;
+typedef char CHAR;
+typedef unsigned char UCHAR;
+typedef UCHAR BOOLEAN;
+typedef int16_t WCHAR;  // signed 16-bit here. NOTE(review): Windows WCHAR is wchar_t - confirm the signedness does not matter to users
+typedef void *HANDLE;
+typedef void *PVOID;
+typedef void *LPVOID;
+typedef const int16_t *PCWSTR;  // pointer to const 16-bit units, matching WCHAR above
+
+#define ULONG ULONG  // self-referential defines: the name still expands to itself but now satisfies #ifdef/#ifndef checks
+#define ULONG_PTR ULONG_PTR
+#define USHORT USHORT
+
+#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name  // declares `name` as a pointer to a unique dummy struct
+#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1]  // compile-time assert: a false `e` yields a negative array size
+
+DECLARE_HANDLE(HWND);  // the three handle types below are distinct, mutually incompatible pointer types
+DECLARE_HANDLE(HDC);
+DECLARE_HANDLE(PALETTEENTRY);
+
+typedef struct tagPOINT {  // Two-LONG coordinate pair.
+    LONG x;
+    LONG y;
+} POINT;
+
+typedef struct tagRECT {  // Four-LONG rectangle; same member list as RECTL.
+    LONG left;
+    LONG top;
+    LONG right;
+    LONG bottom;
+} RECT;
+
+typedef struct tagRECTL {  // Four-LONG rectangle; same member list as RECT but a distinct type.
+    LONG left;
+    LONG top;
+    LONG right;
+    LONG bottom;
+} RECTL;
+
+typedef union _LARGE_INTEGER {  // Signed 64-bit value (QuadPart) overlaid with two 32-bit halves (u).
+	struct {
+		DWORD LowPart;
+		DWORD HighPart;  // unsigned here. NOTE(review): the Windows SDK declares LARGE_INTEGER's HighPart as LONG - confirm intended
+	} u;
+	LONGLONG QuadPart;
+} LARGE_INTEGER;
+
+typedef LARGE_INTEGER *PLARGE_INTEGER;  // pointer alias
+
+typedef union _ULARGE_INTEGER {  // Unsigned 64-bit value (QuadPart) overlaid with two 32-bit halves.
+    struct {
+        ULONG LowPart;
+        ULONG HighPart;
+    } DUMMYSTRUCTNAME;  // a member literally named DUMMYSTRUCTNAME, unless a macro of that name is defined elsewhere
+    struct {
+        ULONG LowPart;
+        ULONG HighPart;
+    } u;  // second view with the same two halves
+    ULONGLONG QuadPart;
+} ULARGE_INTEGER;
+
+typedef ULARGE_INTEGER *PULARGE_INTEGER;  // pointer alias
+
+typedef struct _LUID {  // 64 bits of identifier split into an unsigned low half and a signed high half.
+    ULONG LowPart;
+    LONG HighPart;
+} LUID, *PLUID;
+
+typedef enum _DEVICE_POWER_STATE {  // Device power states numbered 0..5 in declaration order.
+    PowerDeviceUnspecified = 0,
+    PowerDeviceD0          = 1,
+    PowerDeviceD1          = 2,
+    PowerDeviceD2          = 3,
+    PowerDeviceD3          = 4,
+    PowerDeviceMaximum     = 5
+} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE;
+
+#define _Check_return_  // the empty macros in this group make Windows-only annotations/keywords expand to nothing
+#define APIENTRY
+#define CONST const
+#define IN
+#define OUT
+#define FAR
+#define MAX_PATH 260  // used as a char-array length by thunk_proxy::DeviceInfo::product_name
+#define __stdcall
+
+#ifndef GUID_DEFINED  // define GUID only once; NOTE(review): presumably the <guiddef.h> included below honours the same guard - confirm
+#define GUID_DEFINED
+typedef struct _GUID {  // 16 bytes total: 4 + 2 + 2 + 8
+    uint32_t Data1;
+    uint16_t Data2;
+    uint16_t Data3;
+    uint8_t  Data4[ 8 ];
+} GUID;
+#endif
+
+#include <guiddef.h>
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h
new file mode 100644
index 0000000000..f1e7d22d91
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+
+#ifndef _WSL_INC_WDDM_CMD_UTIL_H_
+#define _WSL_INC_WDDM_CMD_UTIL_H_
+
+#include <string.h>
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/amd_hsa_queue.h"
+#include "impl/hsa/amd_hsa_kernel_code.h"
+#include "impl/pm4_cmds.h"
+#include "util/utils.h"
+
+namespace wsl {
+namespace thunk {
+
+struct DispatchInfo {  // Parameter bundle passed to CmdUtil::BuildDispatch as pInfo
+  uint8_t                       major;                 // presumably the GFX major version (cf. BuildAcquireMem's `major`) - TODO confirm
+  hsa_kernel_dispatch_packet_t  *pPacket;              // AQL kernel dispatch packet this dispatch is built from
+  void                          *pEntry;               // NOTE(review): looks like the kernel entry address - confirm
+  const amd_kernel_code_t       *pKernelObject;        // kernel code object header (read-only)
+  uint32_t                      ldsBlks;               // presumably the LDS allocation in hardware blocks - TODO confirm
+  amd_queue_v2_t                *pAmdQueue;            // AMD queue descriptor associated with the dispatch
+  bool                          wave32;                // presumably true for wave32, false for wave64 - TODO confirm
+  uint32_t                      srd;                   // NOTE(review): a scratch resource descriptor word, judging by the name - confirm
+  void                          *pScratchBase;         // base pointer of the scratch memory
+  uint32_t                      scratchSizePerWave;    // scratch size per wave (units not visible here)
+  uint32_t                      scratchBaseOffset[2];  // up to two offsets; offsetCnt presumably says how many are valid - confirm
+  uint32_t                      offsetCnt;
+};
+
+class CmdUtil {  // Collection of static command-packet builders; the class itself holds no data members
+public:
+  CmdUtil() = default;   // nothing to set up; defaulted instead of an empty body followed by a stray ';'
+  ~CmdUtil() = default;
+
+  static size_t BuildCopyData(  // defaults: 64-bit GPU clock count -> TC L2 destination, waiting for write confirmation
+    uint64_t  *pDstAddr,
+    void      *pBuffer,
+    uint32_t  dstSel = dst_sel__mec_copy_data__tc_l2,
+    uint32_t  dstCachePolicy = dst_cache_policy__mec_copy_data__stream,
+    uint32_t  srcSel = src_sel__mec_copy_data__gpu_clock_count,
+    uint32_t  srcCachePolicy = src_cache_policy__mec_copy_data__lru,
+    uint32_t  countSel = count_sel__mec_copy_data__64_bits_of_data,
+    uint32_t  wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation);
+
+  static size_t BuildBarrier(  // defaults to a CS_PARTIAL_FLUSH event write
+    void      *pBuffer,
+    uint32_t  eventIndex = event_index__mec_event_write__cs_partial_flush,
+    uint32_t  eventType = CS_PARTIAL_FLUSH);
+
+  static size_t BuildWriteData64Command(  // takes a 64-bit value and the address it is meant for
+    void      *pBuffer,
+    uint64_t* write_addr,
+    uint64_t write_value);
+
+  static size_t BuildAcquireMem(  // `major` presumably selects the per-generation packet layout - TODO confirm
+    uint8_t major,
+    void    *pBuffer);
+
+  static size_t BuildScratch(  // takes the scratch base pointer
+    void  *pScratchBase,
+    void  *pBuffer);
+
+  static size_t BuildComputeShaderParams(  // no inputs other than the output buffer
+    void  *pBuffer);
+
+  static size_t BuildDispatch(  // all dispatch parameters arrive through *pInfo
+    struct DispatchInfo *pInfo,
+    void                *pBuffer);
+
+  static size_t BuildAtomicMem(  // `atomic` names the operation; srcData defaults to 1
+    uint64_t  *pAddr,
+    uint32_t  atomic,
+    void      *pBuffer,
+    uint32_t  cachePolicy = cache_policy__mec_atomic_mem__stream,
+    uint64_t  srcData = 1);  // NOTE(review): every builder returns size_t, presumably the size written to pBuffer - confirm units
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
\ No newline at end of file
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h
new file mode 100644
index 0000000000..15821b5483
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h
@@ -0,0 +1,246 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_DEVICE_H_
+#define _WSL_INC_WDDM_DEVICE_H_
+
+#include <cassert>
+#include <ntstatus.h>
+
+#include <atomic>
+#include <memory>
+#include <vector>
+
+#include "impl/wddm/types.h"
+#include "impl/thunk_proxy/thunk_proxy.h"
+#include "impl/wddm/va_mgr.h"
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+//class Queue;
+class WDDMQueue;
+
+// WSL2 hyperv GPADL protocol limitation
+#define MAX_USERPTR_BLOCK_SIZE 0xf0000000
+#define START_NON_CANONICAL_ADDR (1ULL << 47)            // 0x0000800000000000
+#define END_NON_CANONICAL_ADDR (~0ULL - (1ULL << 47))    // 0xFFFF7FFFFFFFFFFF; ULL keeps the 47-bit shift defined where long is 32 bits
+#define IS_OVERLAPPING(start1, size1, start2, size2) \
+  (((start1) < ((start2) + (size2))) && ((start2) < ((start1) + (size1))))  // half-open ranges; arguments parenthesized so expressions expand safely
+
+struct SegmentInfo {
+  // Each field starts out zero/false through the default member
+  // initializers below, exactly as the default constructor produced before.
+  uint32_t segment_id = 0;
+  uint32_t segment_type = 0;    // 0=aperture, 1=gpu memory, 2=system memory
+  bool aperture = false;
+  bool system_memory = false;
+  uint64_t commit_limit = 0;
+
+  SegmentInfo() = default;
+};
+
+class WDDMDevice {
+public:
+  static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30);   // 2 GB
+
+  WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id);
+  ~WDDMDevice();
+
+  int NodeId() const { return node_id_; }  // the accessors below only read node_id_/device_info_, so all are const-qualified
+  int Major() const { return device_info_.major; }
+  int Minor() const { return device_info_.minor; }
+  int Stepping() const { return device_info_.stepping; }
+  bool IsDgpu() const { return device_info_.is_dgpu; }
+  const char *ProductName() const { return device_info_.product_name; }
+  uint64_t Uuid() const { return device_info_.uuid; }
+  uint32_t GfxFamily() const { return device_info_.family; }
+  uint32_t DeviceId() const { return device_info_.device_id; }
+  uint32_t WavefrontSize() const { return device_info_.wavefront_size; }
+  uint32_t ComputeUnitCount() const { return device_info_.compute_unit_count; }
+  uint32_t MaxEngineClockMhz() const { return device_info_.max_engine_clock_mhz; }
+  uint32_t WatchPointsNum() const { return device_info_.watch_points_num; }
+  uint32_t PciBusAddr() const { return device_info_.pci_bus_addr; }
+
+  uint32_t MemoryBusWidth() const { return device_info_.memory_bus_width; }
+  uint32_t MaxMemoryClockMhz() const { return device_info_.max_memory_clock_mhz; }
+  uint32_t WavePerCu() const { return device_info_.wave_per_cu; }
+  uint32_t SimdPerCu() const { return device_info_.simd_per_cu; }
+  uint32_t MaxScratchSlotsPerCu() const { return device_info_.max_scratch_slots_per_cu; }
+  uint32_t NumShaderEngine() const { return device_info_.num_shader_engine; }
+  uint32_t ShaderArrayPerShaderEngine() const { return device_info_.shader_array_per_shader_engine; }
+  uint32_t NumSdmaEngine() const { return static_cast<uint32_t>(device_info_.sdma_schedid.size()); }  // one entry per SDMA engine
+  uint32_t Domain() const { return device_info_.domain; }
+  uint32_t NumGws() const { return device_info_.num_gws; }
+  uint32_t AsicRevision() const { return device_info_.asic_revision; }
+  uint64_t LocalHeapSize() const { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; }  // visible + invisible
+  uint64_t LocalVisibleHeapSize() const { return device_info_.local_visible_heap_size; }
+  uint64_t LocalInvisibleHeapSize() const { return device_info_.local_invisible_heap_size; }
+  uint64_t NonLocalHeapSize() const { return device_info_.non_local_heap_size; }
+  uint64_t PrivateApertureBase() const { return device_info_.private_aperture_base; }
+  uint64_t PrivateApertureSize() const { return device_info_.private_aperture_size; }
+  uint64_t SharedApertureBase() const { return device_info_.shared_aperture_base; }
+  uint64_t SharedApertureSize() const { return device_info_.shared_aperture_size; }
+  uint32_t LdsSize() const { return device_info_.lds_size; }
+  uint64_t GPUCounterFrequency() const { return device_info_.gpu_counter_frequency; }
+  uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; }
+  uint32_t GetMecFwVersion() const { return device_info_.mec_fw_version; }
+  uint32_t GetSdmaFwVersion() const { return device_info_.sdma_fw_version; }
+  uint32_t GetL1CacheSize() const { return device_info_.l1_cache_size; }
+  uint32_t GetL2CacheSize() const { return device_info_.l2_cache_size; }
+  uint32_t GetL3CacheSize() const { return device_info_.l3_cache_size; }
+  uint32_t Gl2CacheLineSize() const { return device_info_.gl2_cacheline_size; }
+  bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; }
+  bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; }
+  uint32_t GetSdmaEngine(uint32_t idx) const {  // scheduler id of SDMA engine idx
+    assert(idx < NumSdmaEngine());  // bounds are checked in debug builds only
+    return device_info_.sdma_schedid[idx];
+  }
+  uint32_t GetComputeEngine() const { return device_info_.compute_schedid; }
+
+  uint64_t VramAvail();
+
+  void GetClockCounters(uint64_t *gpu, uint64_t *cpu);
+  uint32_t GetNumCpQueues() { return device_info_.num_cp_queues; }
+
+  bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr);
+  void DestroySyncobj(D3DKMT_HANDLE handle);
+
+  bool CreateQueue(WDDMQueue *queue);
+  void DestroyQueue(WDDMQueue *queue);
+  bool CreateHwQueue(WDDMQueue *queue);
+  bool DestroyHwQueue(WDDMQueue *queue);
+  bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                      uint64_t command_size, uint64_t fence_value);
+  bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                      uint64_t command_size, uint64_t fence_value);
+
+  // True at once when the fence memory has already reached the recorded paging
+  // fence value; otherwise the result of GpuWait on the paging sync object.
+  bool WaitPagingFence(WDDMQueue *queue) {
+    uint64_t target = page_fence_value_;
+    if (*page_fence_addr_ >= target)
+      return true;
+
+    return GpuWait(queue, &page_syncobj_, &target, 1);
+  }
+
+  bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+	       uint64_t *values, int count);
+  bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+		  uint64_t *value, int count);
+  bool CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+	       int count, bool wait_any);
+  bool WaitOnPagingFenceFromCpu();
+
+  uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt);
+  uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; }
+  uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size_; }
+  static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; }
+
+  // Both legacy HWS and stage 1 HWS use KMD to allocate user queue memory,
+  // so this returns false by default.
+  bool AllocUserQueueMemFromUMD(void) const { return false; }
+
+  bool IsHwsEnabled(int engine) {
+    return thunk_proxy::GetHwsEnabled(engine, &device_info_);
+  }
+
+  void UpdatePageFence(uint64_t fence_value);
+
+  D3DKMT_HANDLE PagingQueue() const { return page_queue_; }
+  D3DKMT_HANDLE PagingFence() const { return page_syncobj_; }
+  D3DKMT_HANDLE DeviceHandle() const { return device_; }
+  LUID GetLuid() const { return adapter_luid_; }
+  D3DKMT_HANDLE GetAdapter() const { return adapter_; }
+
+  const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; }
+
+  ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr);
+
+private:
+  bool ParseDeviceInfo(void);
+  void DestroyDeviceInfo(void);
+  bool CreateDevice(void);
+  bool DestroyDevice(void);
+  bool CreatePagingQueue(void);
+  bool DestroyPagingQueue(void);
+  void *Lock(D3DKMT_HANDLE handle);
+  bool Unlock(D3DKMT_HANDLE handle);
+  bool CreateContext(int engine, D3DKMT_HANDLE *handle);
+  bool DestroyContext(D3DKMT_HANDLE handle);
+
+  void SetPowerOptimization(bool restore);
+  void InitCmdbufInfo(void);
+
+  bool QuerySegmentInfo();
+  bool GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, uint32_t &segment_id);
+
+  D3DKMT_HANDLE adapter_;
+  LUID adapter_luid_;
+  D3DKMT_HANDLE device_;
+
+  D3DKMT_HANDLE page_queue_;
+  D3DKMT_HANDLE page_syncobj_;
+  uint64_t *page_fence_addr_;
+  std::atomic<uint64_t> page_fence_value_;
+
+  uint32_t cmdbuf_size_;
+  uint32_t cmdbuf_aql_frame_size_;
+  static const uint32_t cmdbuf_aql_frame_num_;
+  uint32_t node_id_;
+  // device info
+  thunk_proxy::DeviceInfo device_info_;
+  std::vector<struct SegmentInfo> segment_infos_;
+  //CmdUtil cmd_util;
+};
+
+NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices);
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h
new file mode 100644
index 0000000000..9703a6d2c7
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h
@@ -0,0 +1,249 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_GPU_MEMORY_H_
+#define _WSL_INC_WDDM_GPU_MEMORY_H_
+
+#include <cstddef>
+#include <cstdint>
+#include "util/utils.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/thunks.h"
+#include "impl/thunk_proxy/thunk_proxy.h"
+
+namespace wsl {
+namespace thunk {
+
+class WDDMDevice;
+
+union GpuMemoryCreateFlags {  // creation-time options as 1-bit flags, overlaid on one 64-bit word
+  struct {
+    uint64_t virtual_alloc              : 1; // only allocate a virtual address, without a physical buffer
+    uint64_t physical_only              : 1; // only allocate a physical buffer, without a virtual address
+    uint64_t interprocess               : 1; // physical buffer needs to share info between exporter and importer
+    uint64_t locked                     : 1; // lock the virtual address space into RAM, preventing it from being paged to the swap area
+    uint64_t physical_contiguous        : 1; // contiguous physical pages
+    uint64_t sysmem_ipc_sig_importer         : 1; // allocate system memory for an IPC signal (importer side)
+    uint64_t sysmem_ipc_sig_exporter            : 1; // allocate system memory for an IPC signal, prepared for export
+    uint64_t alloc_va                   : 1; // allocate a VA; 0 for vmem import
+    uint64_t blit_kernel_object         : 1; // allocate an executable blit kernel object
+    uint64_t unused                     : 55; // pads the 9 flag bits above out to 64
+  };
+  uint64_t reserved; // all flag bits viewed as one integer (GpuMemoryCreateInfo clears the flags through it)
+};
+
+union GpuMemoryDescFlags {  // state bits of a GpuMemory object, overlaid on one 32-bit word
+  struct {
+    uint32_t is_virtual  : 1;             // reported by GpuMemory::IsVirtual()
+    uint32_t is_shared   : 1;             // reported by GpuMemory::IsShared()
+    uint32_t is_external : 1;             // reported by GpuMemory::IsExternal()
+    uint32_t is_physical_only : 1;        // reported by GpuMemory::IsPhysicalOnly()
+    uint32_t is_locked : 1;
+    uint32_t is_queue_referenced : 1;     // set by GetQueueReference(), cleared by PutQueueReference()
+    uint32_t is_physical_contiguous : 1;  // reported by GpuMemory::IsPhysicalContiguous()
+    uint32_t is_imported_sys_memfd : 1;     // 0 - ignored; 1 - va from system heap
+    uint32_t is_sysmem_exporter : 1; // allocate system memory for IPC signal, prepare to export
+    uint32_t is_va_required :1;           // reported by GpuMemory::IsVaAllocated()
+    uint32_t is_imported_vram_vmem	:1;
+    uint32_t is_imported_vram_ipc	:1;
+    uint32_t is_imported_from_same_process : 3; // imported from same process; used as a share count (3 bits, so at most 7)
+    uint32_t is_blit_kernel_object : 1; // blit kernel object
+    uint32_t unused : 16;                 // pads the 16 bits used above out to 32
+  };
+
+  uint32_t reserved;                      // all bits viewed as one integer; returned by GpuMemory::Flags()
+};
+
+struct GpuMemoryCreateInfo {
+  GpuMemoryCreateInfo()  // default request: local domain, zero size, no flags, no user pointer, no dmabuf
+      : domain(thunk_proxy::kLocal),
+        size(0),
+        alignment(0),
+        mem_flags(0),
+        engine_flag(0),
+        dmabuf_fd(-1),
+        user_ptr(nullptr),
+        va_hint(0) {
+    flags.reserved = 0;  // clears every creation flag bit in one store
+  }
+
+  GpuMemoryCreateFlags flags;
+  thunk_proxy::AllocDomain domain;
+  gpusize size;
+  gpusize alignment;
+  int mem_flags;
+  int engine_flag;
+  int dmabuf_fd; // dmabuf file descriptor to import from; -1 by default
+
+  void *user_ptr;
+  gpusize va_hint;
+};
+
+struct GpuMemoryDesc {
+  GpuMemoryDesc() {  // gives every member a defined value; domain and adapter_luid used to be left indeterminate
+    domain = thunk_proxy::kLocal;  // same default as GpuMemoryCreateInfo; NOTE(review): presumably overwritten during Init - confirm
+    adapter_luid.LowPart = 0;
+    adapter_luid.HighPart = 0;
+    gpu_addr = handle_ape_addr = 0;
+    cpu_addr = nullptr;
+    client_size = size = alignment = 0;
+    flags.reserved = 0;  // clears every descriptor flag bit in one store
+    mem_flags = engine_flag = 0;
+  }
+
+  thunk_proxy::AllocDomain domain;  // compared against kLocal/kSystem/kUserMemory/kUserQueue by the GpuMemory predicates
+  LUID adapter_luid;      // Where is the backing store location
+  gpusize gpu_addr;
+  void *cpu_addr;
+  gpusize client_size;    // user request size
+  gpusize size;
+  gpusize alignment;
+  gpusize handle_ape_addr;
+
+  GpuMemoryDescFlags flags;
+  int mem_flags;
+  int engine_flag;
+};
+
+struct SharedHandleInfo {  // NOTE(review): looks like the metadata exchanged for a shared allocation - confirm against export/import code
+  thunk_proxy::AllocDomain domain;
+  LUID adapter_luid;
+  gpusize client_size;    // user request size
+  uint64_t size;
+  uint32_t flags;         // presumably GpuMemoryDescFlags::reserved (same 32-bit width) - TODO confirm
+  int mem_flags;
+  pid_t pid;              // presumably the exporting process id - TODO confirm
+  gpusize gpu_addr;
+};
+
+using GpuMemoryHandle = void *;
+
+class GpuMemory {
+public:
+  static size_t CalcChunkNumbers(gpusize size);
+
+  ErrorCode Init(const GpuMemoryCreateInfo &create_info);
+
+  WDDMDevice *GetDevice() const { return device_; }
+  gpusize Size() const { return desc_.size; }
+  gpusize ClientSize() const { return desc_.client_size; }
+  uint64_t GpuAddress() const { return desc_.gpu_addr; }
+  void *CpuAddress() const { return desc_.cpu_addr; }
+  uint64_t HandleApeAddress() const { return desc_.handle_ape_addr; }
+
+  inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; }
+  inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; }
+  inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; }
+  inline bool IsSysMemFd() const { return desc_.flags.is_imported_sys_memfd; }
+  inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; }
+  inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; }
+  inline bool IsPhysicalContiguous() const { return desc_.flags.is_physical_contiguous; }
+  inline bool IsVirtual() const { return desc_.flags.is_virtual; }
+  inline bool IsShared() const { return desc_.flags.is_shared; }
+  inline bool IsExternal() const { return desc_.flags.is_external; }
+  inline bool IsVaAllocated() const { return desc_.flags.is_va_required; }
+  inline bool IsBlitKernelObject() const { return desc_.flags.is_blit_kernel_object; }
+
+  inline uint32_t Flags() const { return desc_.flags.reserved; }
+  inline int GetAllocInfo() const { return desc_.mem_flags; }
+  inline bool IsFineGrain() const { return (desc_.mem_flags & thunk_proxy::kFineGrain); }
+  inline bool IsSameAdapter(const LUID &luid) const {
+    return (desc_.adapter_luid.HighPart == luid.HighPart &&
+      desc_.adapter_luid.LowPart == luid.LowPart);
+  }
+  inline void GetQueueReference() { desc_.flags.is_queue_referenced = 1; }
+  inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; }
+  inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; }
+  inline void IncSharedReference() { desc_.flags.is_imported_from_same_process++; }  // NOTE(review): 3-bit field, wraps to 0 on the 8th increment
+  inline uint32_t DecSharedReference() { return (desc_.flags.is_imported_from_same_process == 0) ? 0 : --desc_.flags.is_imported_from_same_process; }  // never goes below 0; returns the new count
+  inline bool IsSharedFromSameProcess() const { return desc_.flags.is_imported_from_same_process > 0; }  // true while the count is non-zero
+
+  WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; }
+  size_t NumChunks() const { return num_allocations_; }
+
+  const GpuMemoryHandle GetGpuMemoryHandle() const {  // the opaque handle is simply this object's address
+    return static_cast<GpuMemoryHandle>(const_cast<GpuMemory *>(this));
+  }
+
+  static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast<GpuMemory *>(handle); }
+
+  ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment);
+  ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size);
+
+  ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0);
+  ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0);
+
+  ErrorCode MakeResident();
+  ErrorCode Evict();
+
+  ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS);
+  ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr = nullptr);
+  ~GpuMemory();
+protected:
+  explicit GpuMemory(WDDMDevice *device);
+private:
+  ErrorCode CreatePhysicalMemory();
+  ErrorCode FreePhysicalMemory();
+
+  uint64_t AdjustSize(gpusize size) const;
+private:
+  friend class WDDMDevice;
+
+  WDDMDevice *const device_;
+
+  GpuMemoryDesc desc_;
+
+  size_t num_allocations_;
+  WinAllocationHandle *alloc_handles_ptr_;
+  WinAllocationHandle alloc_handle_; // Optimization for num_allocations_ is 1
+
+  WinResourceHandle resource_;     // Handle to a resource object that wraps the allocation. Used for shared resources
+
+  int mem_fd_; // IPC sigal's sys mem fd
+
+  DISALLOW_COPY_AND_ASSIGN(GpuMemory);
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h
new file mode 100644
index 0000000000..0e936c5721
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h
@@ -0,0 +1,370 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+#ifndef _WSL_INC_WDDM_QUEUE_H_
+#define _WSL_INC_WDDM_QUEUE_H_
+
+#include <cinttypes>
+#include <condition_variable>
+#include <iostream>
+#include <queue>
+#include <utility>
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/gpu_memory.h"
+#include "impl/hsa/hsa_ext_amd.h"
+#include "impl/hsa/amd_hsa_queue.h"
+#include "impl/hsa/amd_hsa_signal.h"
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+class Queue;
+class WDDMDevice;
+
+class WDDMQueue {
+public:
+  // Stores the owning device, command-buffer address/size, engine and HWS
+  // choice; every other handle and address starts out null/zero.
+  WDDMQueue(WDDMDevice *device,
+            uint64_t cmdbuf_addr,
+            uint32_t cmdbuf_size,
+            uint32_t engine,
+            bool use_hws = true) :
+            device(device),
+            context(0), queue(0),
+            syncobj(0), sync_addr(nullptr),
+            cmdbuf(nullptr),
+            cmdbuf_addr(cmdbuf_addr),
+            cmdbuf_size(cmdbuf_size),
+            queue_mem(nullptr),   // previously left uninitialized
+            queue_addr(0),        // previously left uninitialized
+            queue_engine(engine),
+            use_hws(use_hws),
+            prio(thunk_proxy::kNormal) {}
+
+  virtual ~WDDMQueue() { }
+
+  virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; }
+  virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; }
+  virtual void RingDoorbell() { }
+  virtual void* GetHsaQueueAddr(void) const { return reinterpret_cast<void*>(GetCmdbufAddr()); }
+
+  hsa_status_t SwsInit(void);
+  hsa_status_t SwsFini(void);
+  hsa_status_t SwsSubmit(uint64_t command_addr,
+                         uint64_t command_size,
+                         uint64_t fence_value);
+
+  hsa_status_t HwsInit(void);
+  hsa_status_t HwsFini(void);
+  hsa_status_t HwsSubmit(uint64_t command_addr,
+                         uint64_t command_size,
+                         uint64_t fence_value);
+  hsa_status_t SetPriority(hsa_amd_queue_priority_t priority);
+
+  uint64_t *GetSyncAddr(void) const { return sync_addr; }
+  uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; }
+
+  // Maps an HSA queue priority onto a thunk-proxy scheduling level; NORMAL and
+  // any value not tested below both resolve to kNormal.
+  thunk_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const {
+    if (prio == HSA_AMD_QUEUE_PRIORITY_LOW) {
+      return thunk_proxy::kLow;
+    }
+    if (prio == HSA_AMD_QUEUE_PRIORITY_HIGH) {
+      return thunk_proxy::kHigh;
+    }
+    return thunk_proxy::kNormal;
+  }
+
+  WDDMDevice *device;
+
+  D3DKMT_HANDLE context;
+  D3DKMT_HANDLE queue;
+
+  D3DKMT_HANDLE syncobj;
+  uint64_t *sync_addr;
+
+  GpuMemoryHandle cmdbuf;
+  uint64_t cmdbuf_addr;
+  uint32_t cmdbuf_size;
+
+  GpuMemoryHandle queue_mem;
+  uint64_t queue_addr;
+
+  uint32_t queue_engine;
+
+  bool use_hws;
+  thunk_proxy::SchedLevel prio;
+};
+
+class ComputeQueue : public WDDMQueue {
+public:
+  ComputeQueue(WDDMDevice *device,
+               void *ring,
+               uint64_t ring_size,
+               std::atomic<uint64_t> *ring_wptr,
+               std::atomic<uint64_t> *ring_rptr,
+               volatile int64_t *error_addr,
+               uint32_t cmdbuf_size,
+               uint32_t engine,
+               bool use_hws = true);
+
+  ~ComputeQueue();
+
+  virtual hsa_status_t Init(void);
+  virtual hsa_status_t Fini(void);
+  virtual hsa_status_t Submit(void);
+
+  void* GetRing(void) const { return ring; }
+  uint64_t GetRingSize(void) const { return ring_size; }
+  std::atomic<uint64_t>* GetRingWptr(void) const { return ring_wptr; }
+  std::atomic<uint64_t>* GetRingRptr(void) const { return ring_rptr; }
+
+  uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; }
+  uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; }
+  void* GetHsaQueueAddr(void) const { return ring; }
+
+  bool IsInvalidPacket(void) const {  // true when the packet at the current AQL write index has type INVALID
+    const uint64_t slot = cmdbuf_aql_frame_write_index % ring_size;   // index wrapped into the ring
+    const uint16_t header = *(uint16_t *)((char *)ring + slot * 64);  // first 16 bits of the 64-byte slot
+    const int type_mask = (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+    return ((header >> HSA_PACKET_HEADER_TYPE) & type_mask) == HSA_PACKET_TYPE_INVALID;
+  }
+
+  hsa_status_t Process(void);
+  uint64_t * GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_value_; }
+  void RingDoorbell();
+private:
+  hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet);
+  hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false);
+
+  uint64_t CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet);
+  uint64_t CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, bool wave32);
+
+  struct amd_aql_pm4_ib {
+      uint16_t header;
+      uint16_t ven_hdr;
+      uint32_t ib_jump_cmd[4];
+      uint32_t dw_cnt_remain;
+      uint32_t reserved[8];
+      hsa_signal_t completion_signal;
+  };
+  hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet);
+  hsa_status_t SwitchAql2PM4(void);
+
+  hsa_status_t PreSubmit(void);
+  hsa_status_t EndSubmit(void);
+
+  void *ring;
+  uint64_t ring_size;
+  std::atomic<uint64_t> *ring_wptr;
+  std::atomic<uint64_t> *ring_rptr;
+
+  // ib_start_addr is the current ib start address
+  uint64_t ib_start_addr;
+
+  // ib_size is the current ib size.
+  uint64_t ib_size;
+
+  // record the last submitted aql frame write index
+  uint64_t sync_point;
+
+  uint64_t cmdbuf_aql_frame_write_index;
+  uint32_t cmdbuf_aql_frame_size;
+
+  uint64_t  *signal_addr_;
+  bool platform_atomic_support_;
+  bool needs_barrier;
+  bool ready_to_submit;
+
+  CmdUtil cmd_util;
+
+private:
+  bool EnableProfiling() {
+    return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING);
+  }
+  void HandleError(hsa_status_t status);
+  bool UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32);
+
+  uint32_t UpdateIndexStride(uint32_t srd, bool wave32);
+
+  void *ScratchBase() { return scratch_base_; }
+
+  void AppendCmdbufSratchBaseOffset(int offset) {
+      scratch_base_offset_array_.push_back(offset);
+  }
+
+  bool RelocateCmdbufScratchBase(uint64_t addr);
+
+  uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; }
+  uint64_t GetKernelObjAddr(uint64_t addr) const;
+  void InitScratchSRD();
+  GpuMemoryHandle amd_queue_mem_;
+  amd_queue_v2_t *amd_queue_;
+  amd_queue_v2_t *amd_queue_rocr_;
+  uint64_t doorbell_signal_value_;
+  volatile std::atomic<int64_t> *error_code_;
+  std::thread aql_to_pm4_thread_;
+  bool thread_stop_;
+  std::mutex thread_cond_lock_;
+  std::condition_variable thread_cond_;
+  static void AqlToPm4Thread(ComputeQueue *queue);
+
+  uint64_t max_scratch_waves_;
+  uint64_t dispatch_waves_;
+  uint64_t scratch_size_per_wave_;
+  uint64_t scratch_size_;
+  uint64_t total_scratch_size_;
+  void *scratch_base_;
+  uint32_t scratch_mem_alignment_size_;
+  GpuMemoryHandle scratch_mem_;
+
+  std::vector<int> scratch_base_offset_array_;
+};
+
+class SDMAQueue : public WDDMQueue {
+public:
+  SDMAQueue(WDDMDevice *device,
+            void *ring,
+            uint64_t cmdbuf_size,
+            uint32_t engine,
+            bool use_hws = true);
+
+  virtual ~SDMAQueue();
+
+  hsa_status_t Init(void);
+  hsa_status_t Fini(void);
+  hsa_status_t Submit(void);
+
+  int PreparePacket(uint32_t offset, uint64_t size);
+
+  void WaitQueue(void) {
+    device->CpuWait(&syncobj, &rptr_next, 1, false);
+  }
+
+  uint64_t * GetRingWptr(void) { return &wptr_next_; }
+  uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); }
+  uint64_t * GetDoorbellPtr() { return &doorbell_; }
+  void RingDoorbell();
+  void* GetHsaQueueAddr(void) const { return reinterpret_cast<void*>(GetCmdbufAddr()); }
+
+private:
+  uint64_t wptr_next_;
+  uint64_t wptr_pre_;
+  uint64_t rptr_next;
+  uint64_t doorbell_;
+  std::vector<std::pair<uint64_t, uint64_t>> wptr_queue_;
+  uint64_t ib_size;
+  uint64_t ib_start_addr;
+
+  std::thread thread_;
+  bool thread_stop_;
+  std::mutex thread_cond_lock_;
+  std::condition_variable thread_cond_;
+  static void SdmaThread(SDMAQueue *queue);
+
+  struct SDMA_PKT_POLL_REGMEM {
+    union {
+      struct {
+        unsigned int op : 8;
+        unsigned int sub_op : 8;
+        unsigned int reserved_0 : 10;
+        unsigned int hdp_flush : 1;
+        unsigned int reserved_1 : 1;
+        unsigned int func : 3;
+        unsigned int mem_poll : 1;
+      };
+      unsigned int DW_0_DATA;
+    } HEADER_UNION;
+
+    union {
+      struct {
+        unsigned int addr_31_0 : 32;
+      };
+      unsigned int DW_1_DATA;
+    } ADDR_LO_UNION;
+
+    union {
+      struct {
+        unsigned int addr_63_32 : 32;
+      };
+      unsigned int DW_2_DATA;
+    } ADDR_HI_UNION;
+
+    union {
+      struct {
+        unsigned int value : 32;
+      };
+      unsigned int DW_3_DATA;
+    } VALUE_UNION;
+
+    union {
+      struct {
+        unsigned int mask : 32;
+      };
+      unsigned int DW_4_DATA;
+    } MASK_UNION;
+
+    union {
+      struct {
+        unsigned int interval : 16;
+        unsigned int retry_count : 12;
+        unsigned int reserved_0 : 4;
+      };
+      unsigned int DW_5_DATA;
+    } DW5_UNION;
+  };
+  static constexpr unsigned int SDMA_OP_POLL_REGMEM = 8;  // opcode matched by IsPollPacket; class-level constant, no longer stored per instance
+  bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) {  // true when the header has op == SDMA_OP_POLL_REGMEM, mem_poll == 1 and func == 3
+    return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM &&
+          pkt->HEADER_UNION.mem_poll == 1 &&
+          pkt->HEADER_UNION.func == 3;
+  }
+  uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); }  // NOTE(review): the mask form assumes cmdbuf_size is a power of two - confirm
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h
new file mode 100644
index 0000000000..0efd9559fd
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h
@@ -0,0 +1,61 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_STATUS_H
+#define _WSL_INC_WDDM_STATUS_H
+
+enum class ErrorCode {  // Status codes returned by the WDDM thunk wrappers in impl/wddm/thunks.h
+  Success,  // TranslateNtStatus: STATUS_SUCCESS
+  DeviceLost,  // TranslateNtStatus: STATUS_DEVICE_REMOVED
+  UnSupported,
+  NotReady,  // TranslateNtStatus: STATUS_PENDING
+  OutOfMemory,  // TranslateNtStatus: STATUS_NO_MEMORY
+  OutOfGpuMemory,  // TranslateNtStatus: STATUS_GRAPHICS_NO_VIDEO_MEMORY
+  OutOfHandleApeMemory,
+  Timeout,  // TranslateNtStatus: STATUS_TIMEOUT
+  SyscallFail,
+  InvalidateParams,  // TranslateNtStatus: STATUS_INVALID_PARAMETER
+  SameProcessSameDevice,
+  Unknown,  // TranslateNtStatus: any NTSTATUS without an explicit mapping
+};
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h
new file mode 100644
index 0000000000..68f0015d6d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h
@@ -0,0 +1,233 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_THUNKS_H
+#define _WSL_INC_WDDM_THUNKS_H
+
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "dxcore_loader.h"
+
+namespace wsl {
+namespace thunk {
+
+// Maps an NTSTATUS returned by a D3DKMT call onto the thunk-level ErrorCode.
+// Statuses without an explicit case below are reported as ErrorCode::Unknown.
+// The mapping is a pure lookup: no logging and no other side effects.
+inline ErrorCode TranslateNtStatus(NTSTATUS status) {
+  ErrorCode translated = ErrorCode::Unknown;
+
+  switch (status) {
+  case STATUS_SUCCESS:                  translated = ErrorCode::Success;          break;
+  case STATUS_PENDING:                  translated = ErrorCode::NotReady;         break;
+  case STATUS_NO_MEMORY:                translated = ErrorCode::OutOfMemory;      break;
+  case STATUS_DEVICE_REMOVED:           translated = ErrorCode::DeviceLost;       break;
+  case STATUS_GRAPHICS_NO_VIDEO_MEMORY: translated = ErrorCode::OutOfGpuMemory;   break;
+  case STATUS_TIMEOUT:                  translated = ErrorCode::Timeout;          break;
+  case STATUS_INVALID_PARAMETER:        translated = ErrorCode::InvalidateParams; break;
+  default:
+    // Keep the Unknown fallback assigned above.
+    break;
+  }
+
+  return translated;
+}
+
+namespace d3dthunk {
+
+using CreateAllocationArgs = D3DKMT_CREATEALLOCATION;
+using CreateContextArgs = D3DKMT_CREATECONTEXT;
+using CreateContextVirtualArgs = D3DKMT_CREATECONTEXTVIRTUAL;
+using CreatePagingQueueArgs = D3DKMT_CREATEPAGINGQUEUE;
+using CreateSynchronizationObjectArgs = D3DKMT_CREATESYNCHRONIZATIONOBJECT;
+using CreateSynchronizationObject2Args = D3DKMT_CREATESYNCHRONIZATIONOBJECT2;
+using EscapeArgs = D3DKMT_ESCAPE;
+using EvictArgs = D3DKMT_EVICT;
+using FreeGpuVirtualAddressArgs = D3DKMT_FREEGPUVIRTUALADDRESS;
+using LockArgs = D3DKMT_LOCK;
+using Lock2Args = D3DKMT_LOCK2;
+using OpenResourceArgs = D3DKMT_OPENRESOURCE;
+using OpenResourceFromNtHandleArgs = D3DKMT_OPENRESOURCEFROMNTHANDLE;
+using QueryAdapterInfoArgs = D3DKMT_QUERYADAPTERINFO;
+using SignalSynchronizationObjectArgs = D3DKMT_SIGNALSYNCHRONIZATIONOBJECT;
+using SignalSynchronizationObject2Args = D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2;
+using SignalSynchronizationObjectFromCpuArgs = D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU;
+using SignalSynchronizationObjectFromGpuArgs = D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2;
+using SubmitCommandArgs = D3DKMT_SUBMITCOMMAND;
+using UnlockArgs = D3DKMT_UNLOCK;
+using Unlock2Args = D3DKMT_UNLOCK2;
+using UpdateGpuVirtualAddressArgs = D3DKMT_UPDATEGPUVIRTUALADDRESS;
+using WaitForSynchronizationObjectArgs = D3DKMT_WAITFORSYNCHRONIZATIONOBJECT;
+using WaitForSynchronizationObject2Args = D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2;
+using WaitForSynchronizationObjectFromCpuArgs = D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU;
+using WaitForSynchronizationObjectFromGpuArgs = D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU;
+using AcquireKeyedMutexArgs = D3DKMT_ACQUIREKEYEDMUTEX;
+using ReleaseKeyedMutexArgs = D3DKMT_RELEASEKEYEDMUTEX;
+using OpenKeyedMutexArgs = D3DKMT_OPENKEYEDMUTEX;
+using DestroyKeyedMutexArgs = D3DKMT_DESTROYKEYEDMUTEX;
+using QueryVideoMemoryInfoArgs = D3DKMT_QUERYVIDEOMEMORYINFO;
+using CreateHwQueueArgs = D3DKMT_CREATEHWQUEUE;
+using DestroyHwQueueArgs = D3DKMT_DESTROYHWQUEUE;
+using SubmitCommandToHwQueueArgs = D3DKMT_SUBMITCOMMANDTOHWQUEUE;
+using SubmitPresentToHwQueueArgs = D3DKMT_SUBMITPRESENTTOHWQUEUE;
+using SubmitSignalSyncObjectsToHwQueueArgs = D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE;
+using SubmitWaitForSyncObjectsToHwQueueArgs = D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE;
+using CreateSyncFileArgs = D3DKMT_CREATESYNCFILE;
+
+inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) {  // Thin wrapper: forwards args to D3DKMTMapGpuVirtualAddress via DXCORE_CALL
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTMapGpuVirtualAddress(args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode CreateAllocation(CreateAllocationArgs *args) {  // Thin wrapper: note that it calls the "2" variant, D3DKMTCreateAllocation2
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTCreateAllocation2(args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode DestroyAllocation(  // Destroys either a whole resource or an explicit allocation list via D3DKMTDestroyAllocation2
+            WinDeviceHandle device,
+            WinResourceHandle resource,  // when non-zero it takes precedence and the allocation list is ignored
+            size_t num_allocations,  // only used when resource is zero
+            const WinAllocationHandle *alloc_handles) {  // only used when resource is zero
+
+  D3DKMT_DESTROYALLOCATION2 args{};
+
+  memset(&args, 0, sizeof(args));  // zeroes the whole object again, byte for byte, on top of the value-initialization above
+  args.hDevice = device;
+  if (resource) {
+    args.hResource = resource;
+  } else {
+    args.phAllocationList = alloc_handles;
+    args.AllocationCount = num_allocations;
+  }
+
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTDestroyAllocation2(&args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) {  // Thin wrapper: forwards args to D3DKMTReserveGpuVirtualAddress; the overloads below build args and call this one
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTReserveGpuVirtualAddress(args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle,
+                                          gpusize size,
+                                          gpusize base_address,
+                                          gpusize *out_addr) {
+  // Reservation described by a caller-supplied BaseAddress and Size; all other fields stay zero.
+  D3DDDI_RESERVEGPUVIRTUALADDRESS request{};
+  request.hPagingQueue = handle;
+  request.BaseAddress = base_address;
+  request.Size = size;
+  const ErrorCode status = ReserveGpuVirtualAddress(&request);
+  if (status != ErrorCode::Success) return status;
+  *out_addr = request.VirtualAddress;  // only written on success
+  return ErrorCode::Success;
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle,
+                                          gpusize size,
+                                          gpusize minimum_address,
+                                          gpusize maximum_address,
+                                          gpusize *out_addr) {
+  // Reservation described by a [MinimumAddress, MaximumAddress] window and a Size; all other fields stay zero.
+  D3DDDI_RESERVEGPUVIRTUALADDRESS request{};
+  request.hPagingQueue = handle;
+  request.MinimumAddress = minimum_address;
+  request.MaximumAddress = maximum_address;
+  request.Size = size;
+  const ErrorCode status = ReserveGpuVirtualAddress(&request);
+  if (status != ErrorCode::Success) return status;
+  *out_addr = request.VirtualAddress;  // only written on success
+  return ErrorCode::Success;
+}
+
+inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) {  // Thin wrapper: forwards args to D3DKMTFreeGpuVirtualAddress via DXCORE_CALL
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTFreeGpuVirtualAddress(args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle,
+                                       gpusize base_address,
+                                       gpusize size) {
+  FreeGpuVirtualAddressArgs free_args{};  // value-initialized, then the three inputs are copied in
+  free_args.hAdapter = handle;
+  free_args.BaseAddress = base_address;
+  free_args.Size = size;
+  return FreeGpuVirtualAddress(&free_args);
+}
+
+inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) {  // Thin wrapper: forwards args to D3DKMTMakeResident via DXCORE_CALL
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTMakeResident(args)));  // NTSTATUS -> ErrorCode (STATUS_PENDING becomes NotReady)
+}
+
+inline ErrorCode Evict(EvictArgs *args) {  // Thin wrapper: forwards args to D3DKMTEvict via DXCORE_CALL
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTEvict(args)));  // NTSTATUS -> ErrorCode
+}
+
+// Shares `resource` through D3DKMTShareObjects and returns the resulting handle
+// as an int in *dmabuf_fd; *dmabuf_fd is set to -1 when the call fails.
+inline ErrorCode ShareObjects(size_t num_allocations,
+                               WinResourceHandle resource,
+                               uint32_t flags,
+                               int* dmabuf_fd) {
+  OBJECT_ATTRIBUTES obj_attr;
+  HANDLE nt_handle{};  // value-initialized so it is never read in an indeterminate state
+  InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr);
+  const ErrorCode ret = TranslateNtStatus(DXCORE_CALL(D3DKMTShareObjects(
+      num_allocations, &resource, &obj_attr, flags, &nt_handle)));
+  static_assert(sizeof(int) <= sizeof(HANDLE), "HANDLE must be able to carry an int");
+  *dmabuf_fd = -1;
+  if (ret == ErrorCode::Success)
+    // Copy the leading bytes rather than dereferencing an int* alias of the HANDLE (strict-aliasing UB).
+    memcpy(dmabuf_fd, &nt_handle, sizeof(*dmabuf_fd));
+  return ret;
+}
+
+inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) {  // Thin wrapper: forwards args to D3DKMTQueryResourceInfoFromNtHandle
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTQueryResourceInfoFromNtHandle(args)));  // NTSTATUS -> ErrorCode
+}
+
+inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) {  // Thin wrapper: forwards args to D3DKMTOpenResourceFromNtHandle
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTOpenResourceFromNtHandle(args)));  // NTSTATUS -> ErrorCode
+}
+
+} // namespace d3dthunk
+} // namespace thunk
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h
new file mode 100644
index 0000000000..0a3ca35ebc
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h
@@ -0,0 +1,101 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_TYPES_H_
+#define _WSL_INC_WDDM_TYPES_H_
+
+#include <cstdint>
+#include <ntstatus.h>
+#include "impl/thunk_proxy/wddm_types.h"
+// Windows wchar_t is 16 bits wide, but Linux wchar_t is 32 bits wide.
+// libdxcore (not dxgkrnl.ko) appears to convert the thunk's Windows wide
+// strings to Linux ones, so thunk arguments must use 32-bit wchar_t. Note that
+// driver private data structures still use 16-bit wide characters.
+#define WCHAR wchar_t
+#define PCWSTR const wchar_t *
+#include <d3dkmthk.h>
+#undef WCHAR
+#undef PCWSTR
+
+using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations
+using WinAllocationHandle = D3DKMT_HANDLE;
+using WinResourceHandle = D3DKMT_HANDLE;
+using WinContextHandle = D3DKMT_HANDLE;
+using WinDeviceHandle = D3DKMT_HANDLE;
+using WinAdapterHandle = D3DKMT_HANDLE;
+
+//reference dk/winnt.h
+#define STANDARD_RIGHTS_REQUIRED         (0x000F0000L)
+
+//reference dk/ntdef.h
+#define OBJ_INHERIT                      (0x00000002L)
+typedef WCHAR *PWCHAR, *LPWCH, *PWCH;  // the WCHAR macro was #undef'd above, so this resolves to whatever WCHAR type the included headers declare
+typedef struct _UNICODE_STRING {  // local declaration of the counted wide-string type used by OBJECT_ATTRIBUTES::ObjectName
+    USHORT Length;  // used size; a byte count according to the _Field_size_bytes_part_opt_ annotation on Buffer
+    USHORT MaximumLength;  // capacity; a byte count according to the same annotation
+#ifdef MIDL_PASS
+    [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer;
+#else // MIDL_PASS
+    _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH   Buffer;
+#endif // MIDL_PASS
+} UNICODE_STRING;
+typedef UNICODE_STRING *PUNICODE_STRING;
+typedef const UNICODE_STRING *PCUNICODE_STRING;
+
+typedef struct _OBJECT_ATTRIBUTES {  // filled by InitializeObjectAttributes(); passed to D3DKMTShareObjects by the thunk layer
+  ULONG           Length;  // InitializeObjectAttributes sets this to sizeof(OBJECT_ATTRIBUTES)
+  HANDLE          RootDirectory;
+  PUNICODE_STRING ObjectName;
+  ULONG           Attributes;  // flag bits such as OBJ_INHERIT
+  PVOID           SecurityDescriptor;
+  PVOID           SecurityQualityOfService;  // InitializeObjectAttributes always sets this to NULL
+} OBJECT_ATTRIBUTES;
+#define InitializeObjectAttributes( p, n, a, r, s ) {   /* fills every field of *p; expands to a braced block, not an expression */ \
+    (p)->Length = sizeof( OBJECT_ATTRIBUTES );          \
+    (p)->RootDirectory = r;                             \
+    (p)->Attributes = a;                                \
+    (p)->ObjectName = n;                                \
+    (p)->SecurityDescriptor = s;                        \
+    (p)->SecurityQualityOfService = NULL;               /* not configurable through the macro arguments */ \
+    }
+
+#endif
\ No newline at end of file
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h
new file mode 100644
index 0000000000..675bfc3e39
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h
@@ -0,0 +1,86 @@
+#ifndef _WSL_INC_WDDM_VA_MGR_H_
+#define _WSL_INC_WDDM_VA_MGR_H_
+
+#include <mutex>
+#include <map>
+#include "util/utils.h"
+
+namespace wsl {
+namespace thunk {
+
+class VaMgr {  // Virtual-address range allocator: tracks fragments by base address plus a size-ordered list of the free ones
+public:
+  VaMgr(uint64_t start, uint64_t size, uint64_t min_align);
+  ~VaMgr();
+
+  /* Allocate `bytes` of VA. If `align` is non-zero, the returned address is aligned to `align`.
+   * If `addr` is non-zero, make a best effort to allocate the VA at the fixed address `addr`.
+   */
+  uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0);
+
+  void Free(uint64_t addr);
+
+private:
+  uint64_t AllocImpl(uint64_t bytes, uint64_t align);
+
+  struct Fragment {  // one entry of frag_map_: a contiguous VA range that is either free or in use
+    using ptr = std::multimap<uint64_t, uint64_t>::iterator;
+    ptr free_list_entry_;  // position of this fragment in free_list_, or free_list_.end() when set by the helpers below
+
+    struct {
+      uint64_t size : 63;  // fragment length, packed with the flag below
+      bool is_free : 1;
+    };
+
+    Fragment() : size(0), is_free(false) {}  // free_list_entry_ is left default-constructed here
+    Fragment(ptr iterator, uint64_t len, bool is_free)
+        : free_list_entry_(iterator), size(len), is_free(is_free) {}
+  };
+
+  static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) {  // builds a free fragment tied to its free_list_ entry
+    return {iter, len, true};
+  }
+
+  inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; }  // builds an in-use fragment with no free_list_ entry
+
+  static inline bool is_free(const Fragment& f) { return f.is_free; }
+  void set_used(Fragment& f) {  // only updates the fragment; it does not erase anything from free_list_
+    f.is_free = false;
+    f.free_list_entry_ = free_list_.end();
+  }
+  static void set_free(Fragment& f, typename Fragment::ptr iter) {  // caller supplies the already-inserted free_list_ entry
+    f.free_list_entry_ = iter;
+    f.is_free = true;
+  }
+
+  inline void remove_free_list_entry(Fragment& frag) {  // erases the fragment's free_list_ entry, if it has one, and resets the iterator
+    if (frag.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(frag.free_list_entry_);
+      frag.free_list_entry_ = free_list_.end();
+    }
+  }
+
+  inline void add_free_fragment(uint64_t size, uint64_t base) {  // registers [base, base + size) in both containers as free
+    auto it = free_list_.insert(std::make_pair(size, base));
+    frag_map_[base] = make_fragment(it, size);
+  }
+
+  inline void add_used_fragment(uint64_t size, uint64_t base) {  // registers the range in frag_map_ only, marked in use
+    frag_map_[base] = make_fragment(size);
+  }
+  // Free fragments keyed by size; the mapped value is the fragment's base address
+  std::multimap<uint64_t, uint64_t> free_list_;
+  // All fragments keyed by base VA; fragments do not overlap
+  std::map<uint64_t, Fragment> frag_map_;
+
+  uint64_t min_align_;
+
+  std::mutex lock_;  // Mutex protecting allocation and freeing of VA
+
+
+  DISALLOW_COPY_AND_ASSIGN(VaMgr);
+};
+
+} // namespace thunk
+} // namespace wsl
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/librocdxg.pc.in b/projects/rocr-runtime/libhsakmt/librocdxg.pc.in
new file mode 100755
index 0000000000..d9b362399d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/librocdxg.pc.in
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../../..
+exec_prefix=${prefix}
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: librocdxg
+Description: HSA Kernel Mode Thunk library for WSL support
+Version: @LIB_VERSION_STRING@
+
+Libs: -L${libdir} -lrocdxg
+Cflags: -I${includedir}
diff --git a/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in b/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in
new file mode 100644
index 0000000000..5f2ab41f37
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in
@@ -0,0 +1,14 @@
+@PACKAGE_INIT@
+
+include( CMakeFindDependencyMacro )
+
+# Locate dependent packages here.  Finding them propagates usage requirements,
+# if any, to our clients and ensures that their target names are in scope for
+# the build.  rocdxg has no CMake project dependencies, so there is nothing to
+# find.  If we switch to using find_package for library dependencies external
+# to ROCm (e.g. libnuma), those packages should be located here using
+# find_dependency, as shown below.
+#find_dependency(Bar 2.0)
+
+include( "${CMAKE_CURRENT_LIST_DIR}/@ROCDXG_TARGET@Targets.cmake" )
+
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp
new file mode 100644
index 0000000000..e32c28b1d4
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes,
+					      HSAint32 fd,
+					      HSAint64 file_offset,
+					      HsaAisFlags AisFlags,
+					      HSAuint64 *SizeCopiedInBytes,
+					      HSAint32 *status)
+{
+	CHECK_DXG_OPEN();
+
+	pr_warn_once("not implemented\n");	/* stub: no parameter is read and neither output pointer is written */
+	return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp
new file mode 100644
index 0000000000..2b4425599a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cassert>
+#include <cstring>
+
+
+static uint32_t runtime_capabilities_mask = 0;  // value reported by hsaKmtGetRuntimeCapabilities(); nothing in the visible part of this file modifies it
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) {  // stub: NodeId is ignored
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) {  // stub: NodeId is ignored
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(
+    HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode,
+    HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: every argument is ignored
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(
+    HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[],
+    void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: every argument is ignored
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) {  // used by hsaKmtRuntimeEnable/Disable as their support probe
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // no success path exists in this function body
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) {  // both parameters are unused
+  HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport();
+
+  if (result)  // any non-zero status from the support check is returned unchanged
+    return result;
+
+  assert(false);  // only reached if the support check returns a zero status, for which no enable path is implemented
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) {
+  HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport();
+
+  if (result)  // unlike hsaKmtRuntimeEnable, a non-zero status is reported as success here (NOTE(review): presumably intentional, confirm)
+    return HSAKMT_STATUS_SUCCESS;
+
+  assert(false);  // only reached if the support check returns a zero status, for which no disable path is implemented
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) {  // caps_mask is dereferenced without a null check
+  CHECK_DXG_OPEN();
+  *caps_mask = runtime_capabilities_mask;  // copies the file-static mask, which is initialized to 0
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
+                                        HSAuint32 *data_size) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: neither output pointer is written
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) {  // stub counterpart of hsaKmtDbgEnable
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
+                                               HSAuint32 *n_entries,
+                                               HSAuint32 *entry_size) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: none of the output pointers is written
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries,
+                                              HSAuint32 *entry_size,
+                                              bool suspend_queues) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: no output pointer is written and suspend_queues is ignored
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues,
+                     HSAuint64 *DebugReturn) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // stub: args is not inspected and DebugReturn is not written
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp
new file mode 100644
index 0000000000..5d38d69c8d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#include "dxcore_loader.h"
+#include "librocdxg.h"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <ntstatus.h>
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+DxcoreLoader::DxcoreLoader()  // Starts unloaded: null library handle and every D3DKMT function pointer null until Initialize() runs
+    : dxcore_handle_(nullptr)
+    , init_flag_()
+    , pfn_D3DKMTCreateAllocation2(nullptr)
+    , pfn_D3DKMTDestroyAllocation2(nullptr)
+    , pfn_D3DKMTMapGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTReserveGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTFreeGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTCreateDevice(nullptr)
+    , pfn_D3DKMTDestroyDevice(nullptr)
+    , pfn_D3DKMTEnumAdapters2(nullptr)
+    , pfn_D3DKMTQueryAdapterInfo(nullptr)
+    , pfn_D3DKMTCreateContextVirtual(nullptr)
+    , pfn_D3DKMTDestroyContext(nullptr)
+    , pfn_D3DKMTSubmitCommand(nullptr)
+    , pfn_D3DKMTCreateSynchronizationObject2(nullptr)
+    , pfn_D3DKMTDestroySynchronizationObject(nullptr)
+    , pfn_D3DKMTQueryStatistics(nullptr)
+    , pfn_D3DKMTEscape(nullptr)
+    , pfn_D3DKMTLock2(nullptr)
+    , pfn_D3DKMTUnlock2(nullptr)
+    , pfn_D3DKMTCreatePagingQueue(nullptr)
+    , pfn_D3DKMTDestroyPagingQueue(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr)
+    , pfn_D3DKMTQueryClockCalibration(nullptr)
+    , pfn_D3DKMTMakeResident(nullptr)
+    , pfn_D3DKMTEvict(nullptr)
+    , pfn_D3DKMTShareObjects(nullptr)
+    , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr)
+    , pfn_D3DKMTOpenResourceFromNtHandle(nullptr)
+    , pfn_D3DKMTCreateHwQueue(nullptr)
+    , pfn_D3DKMTDestroyHwQueue(nullptr)
+    , pfn_D3DKMTSubmitCommandToHwQueue(nullptr) {  // same set of symbols, in the same order, that LoadDxcoreApis() resolves
+}
+
+DxcoreLoader::~DxcoreLoader() {
+    Shutdown();  // closes the library handle if one is still open; a no-op otherwise
+}
+
+bool DxcoreLoader::Initialize() {  // Opens libdxcore.so and resolves every required entry point; returns false on any failure
+    dlerror(); // Clear error
+    dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY);  // resolved through the dynamic loader's normal search path
+
+    if (!dxcore_handle_) {
+        pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror());
+        return false;
+    }
+
+    pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n");
+    if (!LoadDxcoreApis()) {
+        // If API loading failed, close the handle to indicate failure
+        dlclose(dxcore_handle_);  // return value deliberately not checked on this error path
+        dxcore_handle_ = nullptr;  // pfn_* members resolved before the failure keep their now-stale values
+        return false;
+    }
+
+    return IsLoaded();
+}
+
+void DxcoreLoader::Shutdown() {  // Closes the library handle if open; safe to call repeatedly because the handle is nulled afterwards
+    if (dxcore_handle_) {
+        if (dlclose(dxcore_handle_) != 0) {
+            pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror());
+        } else {
+            pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n");
+        }
+        dxcore_handle_ = nullptr;  // cleared even when dlclose() failed; the pfn_* members are not reset here
+    }
+}
+
+bool DxcoreLoader::LoadDxcoreApis() {  // Resolves every D3DKMT entry point with dlsym(); all-or-nothing, returns false at the first missing symbol
+    if (!dxcore_handle_) {
+        pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n");
+        return false;
+    }
+
+    dlerror(); // Clear error
+
+    // Load all D3DKMT functions: the helper macro resolves one symbol and jumps to ERROR as soon as a lookup returns null
+    #define LOAD_DXCORE_API(func_name) \
+        DXCORE_PFN(func_name) = (DXCORE_DEF(func_name)*)dlsym(dxcore_handle_, #func_name); \
+        if (!DXCORE_PFN(func_name)) { \
+            pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \
+            goto ERROR; \
+        }
+
+    LOAD_DXCORE_API(D3DKMTCreateAllocation2);
+    LOAD_DXCORE_API(D3DKMTDestroyAllocation2);
+    LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTCreateDevice);
+    LOAD_DXCORE_API(D3DKMTDestroyDevice);
+    LOAD_DXCORE_API(D3DKMTEnumAdapters2);
+    LOAD_DXCORE_API(D3DKMTQueryAdapterInfo);
+    LOAD_DXCORE_API(D3DKMTCreateContextVirtual);
+    LOAD_DXCORE_API(D3DKMTDestroyContext);
+    LOAD_DXCORE_API(D3DKMTSubmitCommand);
+    LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2);
+    LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject);
+    LOAD_DXCORE_API(D3DKMTQueryStatistics);
+    LOAD_DXCORE_API(D3DKMTEscape);
+    LOAD_DXCORE_API(D3DKMTLock2);
+    LOAD_DXCORE_API(D3DKMTUnlock2);
+    LOAD_DXCORE_API(D3DKMTCreatePagingQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyPagingQueue);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu);
+    LOAD_DXCORE_API(D3DKMTQueryClockCalibration);
+    LOAD_DXCORE_API(D3DKMTMakeResident);
+    LOAD_DXCORE_API(D3DKMTEvict);
+    LOAD_DXCORE_API(D3DKMTShareObjects);
+    LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle);
+    LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle);
+    LOAD_DXCORE_API(D3DKMTCreateHwQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyHwQueue);
+    LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue);
+
+    #undef LOAD_DXCORE_API
+
+    pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n");
+    return true;
+ERROR:  // reached via the goto inside LOAD_DXCORE_API; pointers resolved before the failure are left as they are
+    pr_err("[DxcoreLoader] Failed to load DXCore APIs\n");
+    return false;
+}
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h
new file mode 100644
index 0000000000..3f649a4da0
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBROCDXG_DXCORE_LOADER_H
+#define LIBROCDXG_DXCORE_LOADER_H
+
+#include "impl/wddm/types.h"
+#include <dlfcn.h>
+#include <mutex>
+
+#define DXCORE_CALL(function_name)  wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+/**
+ * @brief DxcoreLoader class for dynamic loading of libdxcore.so
+ * 
+ * This class provides a singleton loader for the DXCore library, allowing
+ * optional loading based on environment variable LIBROCDXG_ENABLE_DXCORE.
+ * Supported values: "1", "true", "yes" (case-sensitive).
+ * If the variable is unset or has any other value, the loader falls back to stub implementations.
+ * 
+ * Thread-safe initialization using std::call_once.
+ */
+
+// Macro definitions mimicking HSAKMT design
+#define DXCORE_DEF(function_name)   PFN##function_name
+#define DXCORE_PFN(function_name)   pfn_##function_name
+
+class DxcoreLoader {  // holds one function-pointer member per D3DKMT entry point listed below
+public:
+    // D3DKMT function types: DXCORE_DEF(X) expands to PFNX, a function type (not a pointer type).
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommand))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle);  // the only entry declared with a typed argument list
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args);
+
+    static DxcoreLoader& Instance() {  // accessor used by the DXCORE_CALL macro
+        static DxcoreLoader* instance = new DxcoreLoader();  // created on the first call and never deleted
+        return (*instance);
+    }
+
+    bool Initialize();  // defined out of line; presumably runs once via init_flag_ — confirm in the .cpp
+    void Shutdown();  // defined out of line
+    bool IsLoaded() const { return dxcore_handle_ != nullptr; }  // true while dxcore_handle_ is non-null
+
+    // Function pointers: DXCORE_PFN(X) expands to pfn_X, the member that DXCORE_CALL(X) reads.
+    DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2);
+    DXCORE_DEF(D3DKMTDestroyAllocation2)* DXCORE_PFN(D3DKMTDestroyAllocation2);
+    DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice);
+    DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice);
+    DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2);
+    DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo);
+    DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual);
+    DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext);
+    DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand);
+    DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2);
+    DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject);
+    DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics);
+    DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape);
+    DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2);
+    DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2);
+    DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue);
+    DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu);
+    DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration);
+    DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident);
+    DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict);
+    DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects);
+    DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle);
+    DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle);
+    DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue);
+    DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue);
+    DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue);
+
+private:
+    DxcoreLoader();  // private: Instance() is the only visible place that constructs one
+    ~DxcoreLoader();
+
+    bool LoadDxcoreApis();  // defined out of line
+
+    void* dxcore_handle_;  // non-null means loaded (see IsLoaded); presumably the dlopen handle — confirm
+    std::once_flag init_flag_;  // For thread-safe initialization
+
+    // Disable copy
+    DxcoreLoader(const DxcoreLoader&) = delete;
+    DxcoreLoader& operator=(const DxcoreLoader&) = delete;
+};
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
+
+#endif // LIBROCDXG_DXCORE_LOADER_H
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp
new file mode 100644
index 0000000000..1a360832de
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdio>
+#include <cassert>
+#include <thread>
+#include <chrono>
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
+                                          bool ManualReset, bool IsSignaled,
+                                          HsaEvent **Event) {  // stub: no event is created
+  CHECK_DXG_OPEN();  // macro defined elsewhere; presumably bails out when the dxg runtime is not open
+  pr_warn_once("not supported\n");
+  assert(false);  // aborts unless NDEBUG is defined; with NDEBUG execution continues below
+  return HSAKMT_STATUS_SUCCESS;  // NOTE(review): reports success although *Event is never written
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) {  // stub: nothing is destroyed
+  CHECK_DXG_OPEN();
+  if (!Event)  // a null event is accepted silently, before the warning and the assert
+    return HSAKMT_STATUS_SUCCESS;
+
+  pr_warn_once("not supported\n");
+  assert(false);  // compiled out under NDEBUG, in which case success is returned
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) {  // stub: the event is not signalled
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // logged before the null check, so also for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // aborts unless NDEBUG is defined
+  return HSAKMT_STATUS_SUCCESS;  // reached only when the assert is compiled out
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) {  // stub: the event is not reset
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // logged before the null check, so also for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // aborts unless NDEBUG is defined
+  return HSAKMT_STATUS_SUCCESS;  // reached only when the assert is compiled out
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) {  // stub: no state is inspected
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");  // logged before the null check, so also for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // aborts unless NDEBUG is defined
+  return HSAKMT_STATUS_SUCCESS;  // reached only when the assert is compiled out
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
+                                          HSAuint32 Milliseconds) {  // convenience form without event_age
+  return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL);  // forwards with a null event_age
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
+                                              HSAuint32 Milliseconds,
+                                              uint64_t *event_age) {  // single-event wrapper over the multi-event wait
+  if (!Event)  // a null event is rejected here, so the callee never sees a null single entry from this path
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds,  // one-element array, WaitOnAll = true
+                                        event_age);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
+                                                   HSAuint32 NumEvents,
+                                                   bool WaitOnAll,
+                                                   HSAuint32 Milliseconds) {  // convenience form without event_age
+  return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll,
+                                        Milliseconds, NULL);  // forwards with a null event_age
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
+                                                       HSAuint32 NumEvents,
+                                                       bool WaitOnAll,
+                                                       HSAuint32 Milliseconds,
+                                                       uint64_t *event_age) {  // WaitOnAll, Milliseconds and event_age are not read
+  CHECK_DXG_OPEN();  // macro defined elsewhere; presumably bails out when the dxg runtime is not open
+
+  if (!Events)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  if (NumEvents == 1 && Events[0] == nullptr) {  // a single null entry is treated as a short delay
+    std::this_thread::sleep_for(std::chrono::microseconds(20));  // fixed 20 us, independent of Milliseconds
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  assert(false);  // every other wait is unimplemented: aborts unless NDEBUG is defined
+  return HSAKMT_STATUS_SUCCESS;  // reached only when the assert is compiled out; nothing was waited on
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) {  // stub: no SMI descriptor is opened
+  CHECK_DXG_OPEN();  // macro defined elsewhere; presumably bails out when the dxg runtime is not open
+  pr_debug("node id %u\n", NodeId);  // NodeId is an unsigned 32-bit value, so it is printed with %u
+  assert(false);  // unimplemented: aborts unless NDEBUG is defined
+  return HSAKMT_STATUS_SUCCESS;  // NOTE(review): *fd is never written on this path
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp
new file mode 100755
index 0000000000..431e7bb91a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp
@@ -0,0 +1,137 @@
+#include <dlfcn.h>
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/hsa_ven_amd_loader.h"
+
+static std::mutex* lock_ = new std::mutex();
+
+#if 1
+#define _HSAKMT_LOOKUP_SYMS(_sym)                                              \
+if (fn_##_sym == nullptr) {                                                    \
+    std::lock_guard<std::mutex> gard(*lock_);                                  \
+    if (fn_##_sym == nullptr) {                                                \
+      fn_##_sym =                                                              \
+        reinterpret_cast<decltype(fn_##_sym)>(dlsym(RTLD_DEFAULT, #_sym));     \
+      if (!fn_##_sym) {                                                        \
+        pr_err("%s not found - %s\n", #_sym, dlerror());                       \
+      }                                                                        \
+    }                                                                          \
+}
+
+#define _HSAKMT_EXEC_API(_sym, ...) /* return fn_<sym>(args...) when resolved */ \
+do { \
+    if (fn_##_sym != nullptr) {    \
+        return fn_##_sym(__VA_ARGS__);   \
+    } \
+} while (0) /* no trailing ';': the call site supplies it */
+
+bool hsakmt_hsa_loader_init() {  // reports whether libhsa-runtime64.so can be dlopen'ed
+  void *hsa_loader_handle = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);  // RTLD_GLOBAL: symbols become visible to dlsym(RTLD_DEFAULT, ...)
+  if (hsa_loader_handle == nullptr) {
+    pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror());
+    return false;
+  }
+  dlclose(hsa_loader_handle);  // handle is dropped at once; presumably the library stays loaded through its other users — confirm
+  return true;
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {  // forwards to hsa_signal_load_relaxed looked up at run time
+  static hsa_signal_value_t (*fn_hsa_signal_load_relaxed)(hsa_signal_t signal) = nullptr;  // kept across calls
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed);  // dlsym lookup while the pointer is still null
+  _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal);  // returns from this function when the pointer is set
+
+  return 0;  // reached only when the symbol could not be resolved
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {  // forwards to hsa_signal_wait_relaxed looked up at run time
+static hsa_signal_value_t (*fn_hsa_signal_wait_relaxed)(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) = nullptr;  // kept across calls
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed);  // dlsym lookup while the pointer is still null
+  _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value,
+                   timeout_hint, wait_state_hint);  // returns from this function when the pointer is set
+
+  return 0;  // reached only when the symbol could not be resolved; no wait took place
+}
+
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value){  // forwards to hsa_signal_store_screlease looked up at run time
+static void (*fn_hsa_signal_store_screlease)(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value) = nullptr;  // kept across calls
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease);  // dlsym lookup while the pointer is still null
+  _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value);  // if unresolved, the store is silently skipped
+}
+
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {  // forwards to the AMD loader extension entry point
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;  // cached after the first successful lookup
+
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> guard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {  // re-checked while holding lock_
+      hsa_status_t (*fn_hsa_system_get_extension_table)(
+      uint16_t extension, uint16_t version_major, uint16_t version_minor, void *table);
+      fn_hsa_system_get_extension_table =
+        reinterpret_cast<decltype(fn_hsa_system_get_extension_table)>(dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table"));
+      if (fn_hsa_system_get_extension_table == nullptr) {
+        pr_err("%s not found - %s\n", "hsa_system_get_extension_table", dlerror());
+        return HSA_STATUS_ERROR;
+      }
+
+      hsa_ven_amd_loader_1_03_pfn_t table = {};  // zeroed so a failed query cannot leave a garbage pointer behind
+      if (fn_hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table) == HSA_STATUS_SUCCESS)
+        fn_hsa_ven_amd_loader_query_host_address =
+            table.hsa_ven_amd_loader_query_host_address;  // cached only on success; a failure is retried on the next call
+    }
+  }
+
+  _HSAKMT_EXEC_API(hsa_ven_amd_loader_query_host_address, device_address, host_address);
+  return HSA_STATUS_ERROR;  // the extension entry point is unavailable
+}
+
+#else
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {  // direct-call variant; in the #else of the `#if 1` above, so not compiled as written
+  return hsa_signal_load_relaxed(signal);
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {  // direct-call variant; in the #else of the `#if 1` above, so not compiled as written
+  return hsa_signal_wait_relaxed(signal, condition, compare_value, timeout_hint,
+                                 wait_state_hint);
+}
+
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value) {  // direct-call variant; in the #else of the `#if 1` above, so not compiled as written
+  hsa_signal_store_screlease(hsa_signal, value);
+}
+
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {  // direct-link variant; in the #else of the `#if 1` above
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;  // cached after the first successful lookup
+
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> guard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {  // re-checked while holding lock_
+      hsa_ven_amd_loader_1_03_pfn_t table = {};  // zeroed so a failed query cannot leave a garbage pointer behind
+      if (hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table) == HSA_STATUS_SUCCESS)
+        fn_hsa_ven_amd_loader_query_host_address =
+            table.hsa_ven_amd_loader_query_host_address;  // cached only on success; a failure is retried on the next call
+    }
+  }
+
+  if (fn_hsa_ven_amd_loader_query_host_address)
+    return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address);
+
+  return HSA_STATUS_ERROR;  // the extension entry point is unavailable
+}
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp
new file mode 100644
index 0000000000..6799f5d891
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp
@@ -0,0 +1,31 @@
+/*
+* Copyright © 2025 Advanced Micro Devices, Inc.
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including
+* the next paragraph) shall be included in all copies or substantial
+* portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*/
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable)  // always reports the model as disabled
+{
+  *enable = false;  // enable is dereferenced unconditionally, so it must be non-null
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp
new file mode 100644
index 0000000000..2e125dfb3e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp
@@ -0,0 +1,182 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+#include <cstdint>
+
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(
+    HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) {  // hands out the node's WDDMDevice pointer as the handle
+  CHECK_DXG_OPEN();  // macro defined elsewhere; presumably bails out when the dxg runtime is not open
+
+  wsl::thunk::WDDMDevice *pDevice = get_wddmdev(NodeId);
+  if (pDevice != nullptr) {
+    *DeviceHandle = reinterpret_cast<HsaAMDGPUDeviceHandle>(pDevice);  // the handle is the device pointer itself
+    return HSAKMT_STATUS_SUCCESS;
+  }
+  return HSAKMT_STATUS_ERROR;  // no device for NodeId; *DeviceHandle is left untouched
+}
+
+HSAKMTAPI int amdgpu_device_initialize(int fd,
+                                       uint32_t *major_version,
+                                       uint32_t *minor_version,
+                                       amdgpu_device_handle *device_handle) {  // no-op stub
+  return 0;  // always 0; none of the output parameters is written
+}
+
+HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle) {  // no-op stub; device_handle is ignored
+  return 0;
+}
+
+HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev,
+                                    struct amdgpu_gpu_info *info) {  // fills only gpu_counter_freq
+  wsl::thunk::WDDMDevice *pDevice =
+    reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);  // dev is treated as a WDDMDevice pointer; it is not null-checked
+  memset(info, 0, sizeof(*info));  // every other field is reported as zero
+  info->gpu_counter_freq = pDevice->GPUCounterFrequency() / 1000ull;  // presumably Hz -> kHz; confirm units
+  return 0;
+}
+
+HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) {  // dev is ignored
+  return dxg_runtime->dxg_fd;  // same value for every device handle
+}
+
+HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) {  // bo is treated as a GpuMemory pointer
+  wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+  if (gpu_mem->IsSysMemFd())
+    *cpu = gpu_mem->CpuAddress();
+  return 0;  // NOTE(review): also 0 when IsSysMemFd() is false, in which case *cpu was not written
+}
+
+HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle) {  // frees the GpuMemory behind the handle via hsaKmtFreeMemory
+  wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast<wsl::thunk::GpuMemory *>(buf_handle);
+  void *MemoryAddress = gpu_mem->IsVaAllocated() ? (void*)gpu_mem->GpuAddress() : (void*)gpu_mem->HandleApeAddress();  // GpuAddress() when a VA was allocated, otherwise HandleApeAddress()
+  auto ret = hsaKmtFreeMemory((void*)MemoryAddress, gpu_mem->Size());
+  return ret == HSAKMT_STATUS_SUCCESS ? 0 : -1;  // HSAKMT status is collapsed to 0 / -1
+}
+
+HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t *shared_handle) {  // stub: bo and type are ignored
+  *shared_handle = 0;  // the exported handle is always 0
+  return 0;
+}
+
+HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t shared_handle,
+                               struct amdgpu_bo_import_result *output) {  // imports a dma-buf fd; returns 0 or -1
+  if (type != amdgpu_bo_handle_type_dma_buf_fd) {  // every other handle type is rejected
+    pr_err("not implemented\n");
+    return -1;
+  }
+
+
+  wsl::thunk::WDDMDevice *pDevice = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);  // dev is treated as a WDDMDevice pointer
+  wsl::thunk::GpuMemoryHandle mem_handle;
+  bool is_ipc_memfd = is_ipc_sysmemfd(shared_handle);
+  bool alloc_va = is_ipc_memfd;  // alloc_va is passed as true exactly when the fd is an IPC sysmem fd
+
+  HSAKMT_STATUS ret = import_dmabuf_fd(shared_handle, pDevice->NodeId(),
+                                        alloc_va, is_ipc_memfd, &mem_handle);
+  if (ret == HSAKMT_STATUS_SUCCESS) {
+    // The GpuMemory object handle doubles as the DRM buffer handle.
+    output->buf_handle = reinterpret_cast<amdgpu_bo_handle>(mem_handle);  // buf_handle is the only field of *output written here
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
+                              uint64_t offset,
+                              uint64_t size,
+                              uint64_t addr,
+                              uint64_t flags,
+                              uint32_t ops) {  // map/unmap shim over GpuMemory; flags is not read; returns 0 or -1
+  wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+  assert(gpu_mem != nullptr);  // only checked when NDEBUG is not defined
+
+  switch(ops) {  // NOTE(review): no default case, so any other op value falls through to `return 0`
+    case AMDGPU_VA_OP_MAP:
+      {
+        if (gpu_mem->GpuAddress() == addr) {  // already at the requested address: nothing to do
+          pr_info("bo is mapped already\n");
+          return 0;
+        } else if (gpu_mem->GpuAddress()) {  // non-zero but different address: refuse to remap
+          pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n",
+                 reinterpret_cast<void *>(gpu_mem->GpuAddress()), reinterpret_cast<void *>(addr));
+          return -1;
+        }
+        auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+        if (code != ErrorCode::Success)
+          return -1;
+
+        code = gpu_mem->MakeResident();
+        if (code != ErrorCode::Success)
+          return -1;  // NOTE(review): the mapping made just above is not undone on this path — confirm intended
+      }
+      break;
+    case AMDGPU_VA_OP_UNMAP:
+      {
+        auto code = gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+        if (code != ErrorCode::Success)
+          return -1;
+        gpu_mem->Evict();  // the result of Evict() is not checked
+      }
+      break;
+  }
+  return 0;
+}
+
+HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) {  // no-op stub
+  return 0;  // always 0; *info is not written
+}
+
+HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) {  // no-op stub
+  return 0;  // always 0; the metadata is not stored anywhere
+}
+
+HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
+                                  void *data, unsigned long size) {  // no-op stub; no command is issued
+  return 0;  // always 0; *data is neither read nor written
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h
new file mode 100644
index 0000000000..02826b22b0
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIBHSAKMT_H_INCLUDED
+#define LIBHSAKMT_H_INCLUDED
+
+#include <pthread.h>
+#include <stdint.h>
+#include <limits.h>
+#include "hsakmt/hsakmt.h"
+#include "hsakmt/hsakmt_drm.h"
+
+#include "impl/wddm/va_mgr.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "dxcore_loader.h"
+
+wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id);
+uint32_t get_num_wddmdev();
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress);
+
+/* Log levels; hsakmt_print() emits a message when its level is <= hsakmtRuntime::hsakmt_debug_level. */
+#define HSAKMT_DEBUG_LEVEL_ERR      (-1)  /* parenthesized so the sign cannot fuse with a preceding operator */
+#define HSAKMT_DEBUG_LEVEL_DEFAULT  3
+#define HSAKMT_DEBUG_LEVEL_WARNING  4
+#define HSAKMT_DEBUG_LEVEL_INFO     6
+#define HSAKMT_DEBUG_LEVEL_DEBUG    7
+
+/*
+ * Process-wide state of the dxg thunk: the /dev/dxg file descriptor and open
+ * count, fork tracking, tunables, and the address ranges plus VaMgr objects
+ * for the local heap, system heap and handle aperture.
+ * A single instance is reachable through the global dxg_runtime pointer.
+ */
+struct hsakmtRuntime {
+  /*
+   * The initializer list follows member declaration order (the order in which
+   * C++ actually initializes members), and page_size/page_shift start at 0
+   * instead of being left indeterminate.
+   */
+  hsakmtRuntime()
+    : hsakmt_mutex(PTHREAD_MUTEX_INITIALIZER),
+    page_size(0),
+    page_shift(0),
+    dxg_fd(-1),
+    parent_pid(getpid()),
+    is_forked(false),
+    hsakmt_debug_level(HSAKMT_DEBUG_LEVEL_DEFAULT),
+    dxg_open_count(0),
+    hsakmt_is_dgpu(false),
+    is_svm_api_supported(false),
+    zfb_support(0),
+    vendor_packet_process(0),
+    check_avail_sysram(false),
+    max_single_alloc_size(0),
+    enable_thunk_sub_allocator(0),
+    default_node(1),
+    local_heap_space_start_(0),
+    local_heap_space_size_(0),
+    system_heap_space_start_(0),
+    system_heap_space_size_(0),
+    handle_aperture_start_(0),
+    handle_aperture_size_(0) {}
+
+  void HeapInit();
+  void HeapFini();
+  bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align);
+  bool FreeSvmSpace(uint64_t &base, uint64_t &size);
+  bool ReserveLocalHeapSpace();
+  bool FreeLocalHeapSpace();
+  void InitLocalHeapMgr();
+  bool ReserveSystemHeapSpace();
+  /* Size of the reserved system heap range (system_heap_space_size_). */
+  uint64_t SystemHeapSize() { return system_heap_space_size_; }
+  bool FreeSystemHeapSpace();
+  bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock);
+  bool DecommitSystemHeapSpace(void* addr, int64_t size);
+  void InitSystemHeapMgr();
+  ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize hit_base_addr, gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment, bool lock);
+  ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize gpu_addr, gpusize size);
+  bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false);
+  bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd);
+  ErrorCode ReserveIPCSysMem(gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment,
+          int &memfd, bool lock);
+  ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd);
+  bool InitHandleApertureSpace();
+  void InitHandleApertureMgr();
+  ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr);
+  void HandleApertureFree(gpusize gpu_addr);
+
+  pthread_mutex_t hsakmt_mutex;
+  const char *dxg_device_name = "/dev/dxg";
+  long page_size;   /* 0 until set; used as the divisor in CHECK_PAGE_MULTIPLE and by PAGE_ALIGN_UP */
+  int page_shift;   /* 0 until set */
+  int dxg_fd = -1;
+  pid_t parent_pid = -1;   /* the constructor overrides this with getpid() */
+  bool is_forked = false;  /* when true, CHECK_DXG_OPEN() rejects API calls */
+  int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;
+  unsigned long dxg_open_count;  /* 0 means not opened (see CHECK_DXG_OPEN) */
+  bool hsakmt_is_dgpu;
+  bool is_svm_api_supported;
+  int zfb_support;
+  int vendor_packet_process;
+  bool check_avail_sysram;
+  size_t max_single_alloc_size;
+  int enable_thunk_sub_allocator;
+  uint32_t default_node;  /* node used when a caller passes node 0; starts at 1 */
+
+  /* local heap means bo's backend is vram of all GPUs */
+  uint64_t local_heap_space_start_;
+  uint64_t local_heap_space_size_;
+
+  /* manage the reserved local heap space which shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> local_heap_mgr_;
+
+  /* system heap means bo's backend is system ram */
+  uint64_t system_heap_space_start_;
+  uint64_t system_heap_space_size_;
+
+  /* manage the reserved system heap space which shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> system_heap_mgr_;
+
+  /* address range and manager of the handle aperture */
+  uint64_t handle_aperture_start_;
+  uint64_t handle_aperture_size_;
+  std::unique_ptr<wsl::thunk::VaMgr> handle_aperture_mgr_;
+};
+
+extern hsakmtRuntime *dxg_runtime;
+
+#undef HSAKMTAPI
+#define HSAKMTAPI __attribute__((visibility ("default")))
+
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define SANITIZER_AMDGPU 1
+#endif
+#endif
+
+/*Avoid pointer-to-int-cast warning*/
+#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
+
+/*Avoid int-to-pointer-cast warning*/
+#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
+
+#define CHECK_DXG_OPEN() \
+	do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
+
+/* 64KB BigK fragment size for TLB efficiency */
+#define GPU_BIGK_PAGE_SIZE (1 << 16)
+
+/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
+#define GPU_HUGE_PAGE_SIZE (2 << 20)
+
+#define CHECK_PAGE_MULTIPLE(x) \
+	do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
+
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size)
+#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
+#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
+
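+/* HSA Thunk logging helpers.
+ * pr_err() always prints to stderr. pr_warn()/pr_info()/pr_debug() print to
+ * stdout only when NDEBUG is not defined and the message level is <=
+ * dxg_runtime->hsakmt_debug_level. Every message is prefixed with the pid,
+ * the thread id and the calling function name. */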
+#define get_thread_id()                                                                                                          \
+    ([]() -> std::string {                                                                                                       \
+        std::stringstream str_thrd_id;                                                                                           \
+        str_thrd_id << std::hex << std::this_thread::get_id();                                                                   \
+        return str_thrd_id.str();                                                                                                \
+    })()
+#define hsakmt_print_common(stream, fmt, ...)                                                                                    \
+    do {                                                                                                                         \
+        fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__);            \
+        fflush(stream);                                                                                                          \
+    } while (false)
+#ifdef NDEBUG
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do { } while (false)
+#else
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do {                                                                                                                         \
+        if (level <= dxg_runtime->hsakmt_debug_level) {                                                                          \
+            hsakmt_print_common(stdout, fmt, ##__VA_ARGS__);                                                                     \
+        }                                                                                                                        \
+    } while (false)
+#endif
+
+#define pr_err(fmt, ...) \
+	hsakmt_print_common(stderr, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
+#define pr_err_once(fmt, ...)                   \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_err(fmt, ##__VA_ARGS__);     \
+        }                                       \
+})
+#define pr_warn_once(fmt, ...)                  \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_warn(fmt, ##__VA_ARGS__);    \
+        }                                       \
+})
+
+/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */
+#define HSA_GET_GFX_VERSION_FULL(ui32) \
+	(((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
+
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
+bool prefer_ats(HSAuint32 node_id);
+uint16_t get_device_id_by_node_id(HSAuint32 node_id);
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+uint32_t get_direct_link_cpu(uint32_t gpu_node);
+
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props);
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+				      HsaNodeProperties *NodeProperties);
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
+					HSAuint32 NumIoLinks,
+					HsaIoLinkProperties *IoLinkProperties);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
+
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
+
+uint32_t get_num_sysfs_nodes(void);
+
+bool is_forked_child(void);
+
+void clear_allocation_map(void);
+
+class BlockAllocator {  // Block-granular allocator; alloc() and free() are defined out of line
+private:
+    static const size_t block_size_ = 128 * 1024 * 1024;  // 128MB blocks.
+
+public:
+    void* alloc(size_t request_size, size_t& allocated_size) const;  // allocated_size is an output parameter
+    void free(void* ptr, size_t length) const;
+    size_t block_size() const { return block_size_; }  // block granularity in bytes
+};
+
+void reset_suballocator(void);
+void trim_suballocator(void);
+
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                            HSAuint64 SizeInBytes,
+                                            HSAuint64 Alignment,
+                                            HsaMemFlags MemFlags,
+                                            void **MemoryAddress,
+                                            bool SkipSubAlloc = false);
+
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                    HSAuint64 SizeInBytes,
+                                    bool SkipSubAlloc = false);
+
+bool queue_acquire_buffer(void *MemoryAddress);
+bool queue_release_buffer(void *MemoryAddress);
+/* Calculate VGPR and SGPR register file size per CU */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
+#define SGPR_SIZE_PER_CU 0x4000
+
+bool is_ipc_sysmemfd(int fd);
+
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle);
+
+bool hsakmt_hsa_loader_init();
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
new file mode 100644
index 0000000000..d91b29ec90
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
@@ -0,0 +1,113 @@
+HSAKMT_1
+{
+global:
+hsaKmtOpenKFD;
+hsaKmtCloseKFD;
+hsaKmtGetVersion;
+hsaKmtAcquireSystemProperties;
+hsaKmtReleaseSystemProperties;
+hsaKmtGetNodeProperties;
+hsaKmtGetNodeMemoryProperties;
+hsaKmtGetNodeCacheProperties;
+hsaKmtGetNodeIoLinkProperties;
+hsaKmtCreateEvent;
+hsaKmtDestroyEvent;
+hsaKmtSetEvent;
+hsaKmtResetEvent;
+hsaKmtQueryEventState;
+hsaKmtWaitOnEvent;
+hsaKmtWaitOnMultipleEvents;
+hsaKmtCreateQueue;
+hsaKmtCreateQueueExt;
+hsaKmtUpdateQueue;
+hsaKmtDestroyQueue;
+hsaKmtSetQueueCUMask;
+hsaKmtSetMemoryPolicy;
+hsaKmtAllocMemory;
+hsaKmtAllocMemoryAlign;
+hsaKmtFreeMemory;
+hsaKmtAvailableMemory;
+hsaKmtRegisterMemory;
+hsaKmtRegisterMemoryToNodes;
+hsaKmtRegisterMemoryWithFlags;
+hsaKmtRegisterGraphicsHandleToNodes;
+hsaKmtRegisterGraphicsHandleToNodesExt;
+hsaKmtShareMemory;
+hsaKmtRegisterSharedHandle;
+hsaKmtRegisterSharedHandleToNodes;
+hsaKmtProcessVMRead;
+hsaKmtProcessVMWrite;
+hsaKmtDeregisterMemory;
+hsaKmtMapMemoryToGPU;
+hsaKmtMapMemoryToGPUNodes;
+hsaKmtUnmapMemoryToGPU;
+hsaKmtDbgRegister;
+hsaKmtDbgUnregister;
+hsaKmtDbgWavefrontControl;
+hsaKmtDbgAddressWatch;
+hsaKmtDbgEnable;
+hsaKmtDbgDisable;
+hsaKmtDbgGetDeviceData;
+hsaKmtDbgGetQueueData;
+hsaKmtGetClockCounters;
+hsaKmtPmcGetCounterProperties;
+hsaKmtPmcRegisterTrace;
+hsaKmtPmcUnregisterTrace;
+hsaKmtPmcAcquireTraceAccess;
+hsaKmtPmcReleaseTraceAccess;
+hsaKmtPmcStartTrace;
+hsaKmtPmcQueryTrace;
+hsaKmtPmcStopTrace;
+hsaKmtMapGraphicHandle;
+hsaKmtUnmapGraphicHandle;
+hsaKmtSetTrapHandler;
+hsaKmtGetTileConfig;
+hsaKmtQueryPointerInfo;
+hsaKmtSetMemoryUserData;
+hsaKmtGetQueueInfo;
+hsaKmtAllocQueueGWS;
+hsaKmtRuntimeEnable;
+hsaKmtRuntimeDisable;
+hsaKmtCheckRuntimeDebugSupport;
+hsaKmtGetRuntimeCapabilities;
+hsaKmtDebugTrapIoctl;
+hsaKmtSPMAcquire;
+hsaKmtSPMRelease;
+hsaKmtSPMSetDestBuffer;
+hsaKmtSVMSetAttr;
+hsaKmtSVMGetAttr;
+hsaKmtSetXNACKMode;
+hsaKmtGetXNACKMode;
+hsaKmtOpenSMI;
+hsaKmtExportDMABufHandle;
+hsaKmtGetMemoryHandle;
+hsaKmtWaitOnEvent_Ext;
+hsaKmtWaitOnMultipleEvents_Ext;
+hsaKmtReplaceAsanHeaderPage;
+hsaKmtReturnAsanHeaderPage;
+hsaKmtGetAMDGPUDeviceHandle;
+hsaKmtPcSamplingQueryCapabilities;
+hsaKmtPcSamplingCreate;
+hsaKmtPcSamplingDestroy;
+hsaKmtPcSamplingStart;
+hsaKmtPcSamplingStop;
+hsaKmtPcSamplingSupport;
+hsaKmtAisReadWriteFile;
+hsaKmtModelEnabled;
+hsaKmtQueueRingDoorbell;
+amdgpu_device_initialize;
+amdgpu_device_deinitialize;
+amdgpu_query_gpu_info;
+amdgpu_bo_import;
+amdgpu_bo_va_op;
+amdgpu_device_get_fd;
+amdgpu_bo_cpu_map;
+amdgpu_bo_free;
+amdgpu_bo_export;
+amdgpu_bo_query_info;
+amdgpu_bo_set_metadata;
+drmCommandWriteRead;
+
+local: *;
+};
+
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp
new file mode 100644
index 0000000000..b6ef48cf29
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp
@@ -0,0 +1,989 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "impl/wddm/gpu_memory.h"
+#include "util/simple_heap.h"
+
+/* Book-keeping record stored in allocation_map_ for one tracked address. */
+struct Allocation {
+  /* Empty record: zero handle/addresses/sizes and no dma-buf fd (-1). */
+  Allocation()
+      : handle(0),
+        cpu_addr(0),
+        gpu_addr(0),
+        userptr(false),
+        size(0),
+        user_data(nullptr),
+        size_requested(0),
+        node_id(0),
+        mem_flags_value(0),
+        dmabuf_fd(-1),
+        rocr_userdata(nullptr) {}
+
+  /* Populated record; dmabuf_fd and rocr_userdata always start as -1 / nullptr. */
+  Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
+             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
+             void *user_data_arg = nullptr, size_t user_size_arg = 0,
+             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
+      : handle(handle_arg),
+        cpu_addr(cpu_addr_arg),
+        gpu_addr(gpu_addr_arg),
+        userptr(userptr_arg),
+        size(size_arg),
+        user_data(user_data_arg),
+        size_requested(user_size_arg),
+        node_id(node_id_arg),
+        mem_flags_value(mem_flags_value_arg),
+        dmabuf_fd(-1),
+        rocr_userdata(nullptr) {}
+
+  wsl::thunk::GpuMemoryHandle handle; /* converted to a GpuMemory via GpuMemory::Convert() */
+  void *cpu_addr;
+  uint64_t gpu_addr;
+  bool userptr;
+  size_t size; /* actual size = align_up(size_requested, granularity) */
+  void *user_data;
+  size_t size_requested; /* size requested by user */
+  HSAuint32 node_id;
+  HSAuint32 mem_flags_value; /* raw HsaMemFlags::Value used for the allocation */
+  int dmabuf_fd; /* -1 when no fd is attached; closed when the allocation is freed */
+  void *rocr_userdata;
+};
+
+static std::map<const void *, Allocation>* allocation_map_ = new std::map<const void *, Allocation>();
+static std::mutex* allocation_map_lock_ = new std::mutex();
+
+void clear_allocation_map(void)  // Replaces both the map lock and the allocation map with fresh, empty objects.
+{
+  // The previous mutex is not deleted here (it is leaked). NOTE(review): presumably because it may still be held, e.g. in a forked child -- confirm.
+  allocation_map_lock_ = new std::mutex();
+  std::lock_guard<std::mutex> lock(*allocation_map_lock_);
+  delete allocation_map_;  // drops every Allocation record; the records hold handles only, so no GpuMemory object is deleted here
+  allocation_map_ = new std::map<const void *, Allocation>();
+}
+
+/*
+ * Unimplemented entry point. After the open check it warns once, trips
+ * assert(false) when assertions are enabled, and otherwise reports success
+ * without using any of its arguments.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+                                              HSAuint32 DefaultPolicy,
+                                              HSAuint32 AlternatePolicy,
+                                              void *MemoryAddressAlternate,
+                                              HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  (void)Node;
+  (void)DefaultPolicy;
+  (void)AlternatePolicy;
+  (void)MemoryAddressAlternate;
+  (void)MemorySizeInBytes;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Translate an HSA_PAGE_SIZE_* selector into a page size in bytes.
+ * Unknown selectors trip assert(false) when assertions are enabled and
+ * otherwise fall back to 4KB.
+ */
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
+  const HSAuint32 kKiB = 1024;
+  HSAuint32 bytes = 4 * kKiB;  /* 4KB: also the fallback value */
+
+  switch (pageSizeFlags) {
+  case HSA_PAGE_SIZE_4KB:
+    break;
+  case HSA_PAGE_SIZE_64KB:
+    bytes = 64 * kKiB;
+    break;
+  case HSA_PAGE_SIZE_2MB:
+    bytes = 2 * kKiB * kKiB;
+    break;
+  case HSA_PAGE_SIZE_1GB:
+    bytes = kKiB * kKiB * kKiB;
+    break;
+  default:
+    assert(false);
+    break;
+  }
+
+  return bytes;
+}
+
+/* Convenience wrapper: hsaKmtAllocMemoryAlign() with an alignment argument of 0. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+                                          HSAuint64 SizeInBytes,
+                                          HsaMemFlags MemFlags,
+                                          void **MemoryAddress) {
+  const HSAuint64 alignment = 0;
+
+  return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, alignment,
+                                MemFlags, MemoryAddress);
+}
+
+#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 1 : 0) /* 1 if x is a non-zero power of two; argument parenthesized so expressions such as a|b expand correctly */
+
+/*
+ * Report whether at least SizeInBytes of system RAM is currently free
+ * according to sysinfo(2). Returns false if sysinfo() itself fails.
+ */
+bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
+  struct sysinfo info;
+  if (sysinfo(&info) != 0)
+    return false;
+
+  /* sysinfo() reports memory fields in units of mem_unit bytes, so scale
+   * freeram before comparing it against a byte count. */
+  const uint64_t free_bytes =
+      static_cast<uint64_t>(info.freeram) * static_cast<uint64_t>(info.mem_unit);
+  return SizeInBytes <= free_bytes;
+}
+
+/*
+ * Allocate one backing block for the sub-allocator.
+ * allocated_size receives request_size rounded up to a multiple of
+ * block_size(). The block is requested on node 1 with the CoarseGrain and
+ * NoSubstitute flags and with the sub-allocator bypassed (SkipSubAlloc = true).
+ * Returns the block address, or nullptr if the allocation fails.
+ */
+void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
+  HsaMemFlags flags;
+  flags.Value = 0;
+  flags.ui32.CoarseGrain = 1;
+  flags.ui32.NoSubstitute = 1;
+
+  allocated_size = wsl::AlignUp(request_size, block_size());
+
+  void *block;
+  const HSAKMT_STATUS status =
+      hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, flags, &block, true);
+  if (status != HSAKMT_STATUS_SUCCESS)
+    return nullptr;
+
+  return block;
+}
+
+/*
+ * Return a backing block through hsaKmtFreeMemoryInternal() with the
+ * sub-allocator bypassed. A failure is only logged; nothing is reported to
+ * the caller.
+ */
+void BlockAllocator::free(void* ptr, size_t length) const {
+  const HSAKMT_STATUS status = hsaKmtFreeMemoryInternal(ptr, length, true);
+  if (status == HSAKMT_STATUS_SUCCESS)
+    return;
+
+  pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length);
+}
+
+static wsl::SimpleHeap<BlockAllocator> fragment_allocator_;
+
+void reset_suballocator(void) {  // Forwards to reset() on the file-local fragment_allocator_.
+  fragment_allocator_.reset();
+}
+
+void trim_suballocator(void) {  // Forwards to trim() on the file-local fragment_allocator_.
+  fragment_allocator_.trim();
+}
+
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,  // 0 selects dxg_runtime->default_node
+                                             HSAuint64 SizeInBytes,  // requested size; becomes create_info.size
+                                             HSAuint64 Alignment,  // forwarded as create_info.alignment
+                                             HsaMemFlags MemFlags,  // selects domain, granularity and VA/physical behaviour
+                                             void **MemoryAddress,  // in: fixed address (read only when FixedAddress is set); out: resulting address or handle
+                                             bool SkipSubAlloc) {  // true bypasses the fragment sub-allocator
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (MemFlags.ui32.FixedAddress) {
+    if (*MemoryAddress == nullptr)  // a fixed-address request must supply the address
+      return HSAKMT_STATUS_INVALID_PARAMETER;
+  } else
+    *MemoryAddress = nullptr;  // no hint: va_hint below becomes 0
+
+  uint32_t node = (PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode;
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(node);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.size = SizeInBytes;
+
+  /* Host-accessible scratch requests larger than 0x80000000 bytes (GpuAgent scratch pool initialization) are treated as an address-only (SVM) reservation */
+  if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000)
+    MemFlags.ui32.OnlyAddress = 1;
+
+  create_info.alignment = Alignment;
+  create_info.va_hint = reinterpret_cast<gpusize>(*MemoryAddress);
+  if ((PreferredNode == 0 && MemFlags.ui32.HostAccess)
+    || dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) {  // system-memory path
+    if (SizeInBytes > dxg_runtime->max_single_alloc_size)
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes))
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    /* A VRAM (NonPaged) request under ZFB mode is forced to coarse-grain */
+    if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1)
+      MemFlags.ui32.CoarseGrain = 1;
+
+    // AllocateNonPaged == AllocateIPC
+    create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
+
+    create_info.domain = thunk_proxy::AllocDomain::kSystem;
+  } else {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+  }
+
+  if (!MemFlags.ui32.CoarseGrain)
+    create_info.mem_flags = thunk_proxy::kFineGrain;
+
+  // In hsa-runtime, only the kernarg region sets Uncached.
+  if (MemFlags.ui32.Uncached)
+    create_info.mem_flags |= thunk_proxy::kKernarg;
+
+  create_info.flags.physical_only = MemFlags.ui32.NoAddress;
+  create_info.flags.alloc_va = !create_info.flags.physical_only;
+  create_info.flags.interprocess = MemFlags.ui32.NoAddress;
+  create_info.flags.interprocess |= MemFlags.ui32.Contiguous;
+  create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous;
+  create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned
+  create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress;
+  create_info.flags.blit_kernel_object =
+      (MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess &&
+      (create_info.domain == thunk_proxy::AllocDomain::kSystem));
+  /* Virtual-only, physical-only and contiguous requests are VMM allocations: force the local domain and bypass the sub-allocator */
+  if (create_info.flags.virtual_alloc || create_info.flags.physical_only
+        || create_info.flags.physical_contiguous) {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+    SkipSubAlloc = true;
+  }
+
+  /* Only allow using the suballocator for ordinary VRAM. */
+  bool trim_safe = false;
+  if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) {
+    /* Quickly skip the sub-allocator (SA) if the rounded size is not below the SA block size. */
+    gpusize real_size;
+    if (create_info.size > GPU_HUGE_PAGE_SIZE)
+      real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE);
+    else
+      real_size = wsl::AlignUp(create_info.size, getpagesize());
+
+    if (real_size < fragment_allocator_.default_block_size()) {
+      *MemoryAddress = fragment_allocator_.alloc(real_size);
+      if (*MemoryAddress)  // served from a fragment: returns without adding an allocation_map_ entry
+        return HSAKMT_STATUS_SUCCESS;
+    }
+
+    /* The SA might keep a lot of free blocks as *cache*.
+       * We can trim them if the direct allocation below fails the first time.
+       */
+    trim_safe = true;
+  }
+
+after_trim:
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    /* For physical-only allocations, use the GpuMemory object's handle-aperture address as the thunk handle. NOTE(review): create_info.dmabuf_fd is never assigned in this function -- confirm the second condition is reachable. */
+    if (create_info.flags.physical_only || create_info.dmabuf_fd > 0)
+      *MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+    else
+      *MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+
+    (*allocation_map_)[*MemoryAddress] = Allocation(
+        gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
+        create_info.size, false, nullptr, SizeInBytes,
+        MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);  // GTT allocations are recorded under node 0
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (trim_safe) {
+    /* Release cached blocks from the sub-allocator and retry exactly once. */
+    fragment_allocator_.trim();
+    trim_safe = false;
+    goto after_trim;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/*
+ * Public allocation entry point. Forwards to
+ * hsaKmtAllocMemoryAlignInternal(), bypassing the fragment sub-allocator
+ * unless dxg_runtime->enable_thunk_sub_allocator is non-zero.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+                                               HSAuint64 SizeInBytes,
+                                               HSAuint64 Alignment,
+                                               HsaMemFlags MemFlags,
+                                               void **MemoryAddress) {
+  const bool skip_sub_alloc = !dxg_runtime->enable_thunk_sub_allocator;
+
+  return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, Alignment,
+                                        MemFlags, MemoryAddress, skip_sub_alloc);
+}
+
+/*
+ * Free memory previously returned by hsaKmtAllocMemoryAlignInternal().
+ *
+ * Unless SkipSubAlloc is set, the address is first offered to the fragment
+ * sub-allocator. Otherwise the address is looked up in allocation_map_:
+ *   - unknown address, or memory still referenced by a queue -> ERROR;
+ *   - imported VRAM IPC memory for which DecSharedReference() returns
+ *     non-zero -> SUCCESS, with the map entry and object kept;
+ *   - otherwise the attached dma-buf fd (if any) is closed, the map entry is
+ *     erased and the GpuMemory object is deleted after the lock is released.
+ * SizeInBytes is not used.
+ */
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                       HSAuint64 SizeInBytes,
+                                       bool SkipSubAlloc) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (!SkipSubAlloc && fragment_allocator_.free(MemoryAddress))
+    return HSAKMT_STATUS_SUCCESS;
+
+  wsl::thunk::GpuMemory *victim = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+
+    auto entry = allocation_map_->find(MemoryAddress);
+    if (entry == allocation_map_->end())
+      return HSAKMT_STATUS_ERROR;
+
+    Allocation &record = entry->second;
+    victim = wsl::thunk::GpuMemory::Convert(record.handle);
+    if (victim->IsQueueReferenced())
+      return HSAKMT_STATUS_ERROR;
+
+    wsl::thunk::GpuMemoryDescFlags desc_flags;
+    desc_flags.reserved = victim->Flags();
+    if (desc_flags.is_imported_vram_ipc && victim->DecSharedReference()) {
+      pr_info("memory is still referenced\n");
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    if (record.dmabuf_fd >= 0) {
+      close(record.dmabuf_fd);
+      record.dmabuf_fd = -1;
+    }
+    allocation_map_->erase(entry);
+  }
+
+  /* Destroy the object only after the map lock has been dropped. */
+  delete victim;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Public free entry point: forwards to hsaKmtFreeMemoryInternal() with the sub-allocator enabled. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+                                         HSAuint64 SizeInBytes) {
+  const bool skip_sub_alloc = false;
+
+  return hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes, skip_sub_alloc);
+}
+
+/*
+ * Take a queue reference on the allocation registered at MemoryAddress.
+ *
+ * Returns true when the reference was taken, false when MemoryAddress is
+ * null, is not present in allocation_map_, or does not convert to a
+ * GpuMemory object.
+ */
+bool queue_acquire_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  /* Must be false: HSAKMT_STATUS_ERROR is non-zero and would convert to true. */
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Check the pointer before it is dereferenced. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->GetQueueReference();
+  return true;
+}
+
+/*
+ * Drop a queue reference on the allocation registered at MemoryAddress.
+ *
+ * Returns true when the reference was dropped, false when MemoryAddress is
+ * null, is not present in allocation_map_, or does not convert to a
+ * GpuMemory object.
+ */
+bool queue_release_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  /* Must be false: HSAKMT_STATUS_ERROR is non-zero and would convert to true. */
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Check the pointer before it is dereferenced. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->PutQueueReference();
+  return true;
+}
+
+/*
+ * Look up MemoryAddress in allocation_map_ (under the map lock) and return
+ * the GpuMemory object for its handle, or nullptr if the address is unknown.
+ */
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) {
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+
+  const auto entry = allocation_map_->find(MemoryAddress);
+  if (entry != allocation_map_->end())
+    return wsl::thunk::GpuMemory::Convert(entry->second.handle);
+
+  return nullptr;
+}
+
+/*
+ * Store the value of VramAvail() for the device of Node in *AvailableBytes.
+ * Returns INVALID_PARAMETER for a null output pointer and ERROR when no
+ * device is found for Node.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
+                                              HSAuint64 *AvailableBytes) {
+  CHECK_DXG_OPEN();
+
+  if (AvailableBytes == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  wsl::thunk::WDDMDevice *device = get_wddmdev(Node);
+  if (device != nullptr) {
+    *AvailableBytes = device->VramAvail();
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/*
+ * Unimplemented entry point: warns once, trips assert(false) when assertions
+ * are enabled, and otherwise reports success without using its arguments.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)MemorySizeInBytes;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Unimplemented entry point: trips assert(false) when assertions are enabled
+ * and otherwise reports success without using its arguments.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+                                                    HSAuint64 MemorySizeInBytes,
+                                                    HSAuint64 NumberOfNodes,
+                                                    HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)MemorySizeInBytes;
+  (void)NumberOfNodes;
+  (void)NodeArray;
+
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Validate a host-memory registration request. No registration work is done
+ * here; the function only checks its arguments:
+ *   - null address, or ExtendedCoherent together with CoarseGrain
+ *       -> INVALID_PARAMETER
+ *   - memory that is not host-accessible paged memory, or a runtime where
+ *     hsakmt_is_dgpu is false -> NOT_SUPPORTED
+ *   - otherwise SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  const bool conflicting_coherency =
+      MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain;
+  if (conflicting_coherency)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  // Registered memory should be ordinary paged host memory.
+  const bool paged_host_memory =
+      (MemFlags.ui32.HostAccess == 1) && (MemFlags.ui32.NonPaged != 1);
+  if (!paged_host_memory)
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  /* TODO: support mixed APU and dGPU configurations */
+  if (!dxg_runtime->hsakmt_is_dgpu)
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Return true when the target of the /proc/self/fd/<fd> symlink contains the
+ * substring "rocr4wsl_gtt". Returns false if the link cannot be read. Link
+ * targets longer than 255 bytes are truncated before the search.
+ */
+bool is_ipc_sysmemfd(int fd) {
+  const std::string proc_link = "/proc/self/fd/" + std::to_string(fd);
+
+  char target[256];
+  const ssize_t len = readlink(proc_link.c_str(), target, sizeof(target) - 1);
+  if (len == -1)
+    return false;
+
+  /* readlink() does not NUL-terminate its output. */
+  target[len] = '\0';
+
+  const char *match = strstr(target, "rocr4wsl_gtt");
+  return match != nullptr;
+}
+
+/* Convenience wrapper: hsaKmtRegisterGraphicsHandleToNodesExt() with all register flags cleared. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
+                                                            HsaGraphicsResourceInfo *GraphicsResourceInfo,
+                                                            HSAuint64 NumberOfNodes,
+                                                            HSAuint32 *NodeArray) {
+  HSA_REGISTER_MEM_FLAGS no_flags;
+  no_flags.Value = 0;
+
+  return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
+                                                GraphicsResourceInfo,
+                                                NumberOfNodes,
+                                                NodeArray,
+                                                no_flags);
+}
+
+/*
+ * Import a dma-buf file descriptor and describe the resulting allocation.
+ *
+ * GraphicsResourceHandle  fd to import (passed on as an int).
+ * GraphicsResourceInfo    out: node id, client size and address of the
+ *                         imported memory.  Must not be NULL.
+ * NumberOfNodes/NodeArray target nodes; only NodeArray[0] is used.  When
+ *                         NumberOfNodes is 0 the runtime's default node is
+ *                         used and requiresVAddr is forced to 0.
+ * RegisterFlags           requiresVAddr selects whether the reported address
+ *                         is the GPU address or the handle-aperture address.
+ *
+ * If the fd is recognised by is_ipc_sysmemfd() nothing is imported: only
+ * NodeId is filled in and success is returned.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL GraphicsResourceInfo or
+ * a NULL NodeArray with a non-zero NumberOfNodes, otherwise the status of
+ * import_dmabuf_fd().
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
+                                                               HsaGraphicsResourceInfo *GraphicsResourceInfo,
+                                                               HSAuint64 NumberOfNodes,
+                                                               HSAuint32 *NodeArray,
+                                                               HSA_REGISTER_MEM_FLAGS RegisterFlags) {
+  CHECK_DXG_OPEN();
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  /* Every success path below writes through this pointer. */
+  if (!GraphicsResourceInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (is_ipc_sysmemfd(GraphicsResourceHandle)) {
+    GraphicsResourceInfo->NodeId = dxg_runtime->default_node;
+    pr_info("skip register sysmemfd. It would be released in next step\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  if (NumberOfNodes == 0) {
+    /* No node given: fall back to the default node, without a GPU VA. */
+    RegisterFlags.ui32.requiresVAddr = 0;
+    NumberOfNodes = 1;
+    NodeArray = (HSAuint32*)&(dxg_runtime->default_node);
+  } else if (!NodeArray) {
+    /* NodeArray[0] is dereferenced below. */
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  pr_debug("number of nodes %lu\n", NumberOfNodes);
+  wsl::thunk::GpuMemoryHandle mem_handle;
+  ret = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0],
+                          RegisterFlags.ui32.requiresVAddr,
+                          false, &mem_handle);
+  if (ret != HSAKMT_STATUS_SUCCESS) {
+    pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, "
+           "GraphicsResourceHandle: %lu, NodeId: %u\n",
+           GraphicsResourceHandle, NodeArray[0]);
+    return ret;
+  }
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle);
+  GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId();
+  GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize();
+  GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ?
+                                          reinterpret_cast<void *>(gpu_mem->GpuAddress()):
+                                          reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  return ret;
+}
+
+/*
+ * Export the allocation containing MemoryAddress as a dma-buf fd.
+ *
+ * The allocation is the map entry with the greatest key not above
+ * MemoryAddress.  The physical handle is exported once and cached in the
+ * entry's dmabuf_fd; each call hands out a dup() of the cached descriptor,
+ * which the caller owns.
+ *
+ * MemoryAddress      address inside the allocation to export.
+ * MemorySizeInBytes  not used by this implementation.
+ * DMABufFd           out: duplicated file descriptor.  Must not be NULL.
+ * Offset             out: MemoryAddress minus the entry's gpu_addr.
+ *                    Must not be NULL.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for NULL out-pointers,
+ * HSAKMT_STATUS_ERROR when no entry is found, the address lies below the
+ * entry's gpu_addr, or the export or dup() fails, else
+ * HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+                                                 HSAuint64 MemorySizeInBytes,
+                                                 int *DMABufFd,
+                                                 HSAuint64 *Offset) {
+  CHECK_DXG_OPEN();
+
+  if (!DMABufFd || !Offset)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+  auto it = allocation_map_->upper_bound(MemoryAddress);
+  if (it == allocation_map_->begin())
+    return HSAKMT_STATUS_ERROR;
+  --it;
+
+  /* Reject addresses that would make the unsigned offset wrap around. */
+  const uint64_t addr = reinterpret_cast<uint64_t>(MemoryAddress);
+  if (addr < it->second.gpu_addr)
+    return HSAKMT_STATUS_ERROR;
+
+  if (it->second.dmabuf_fd == -1) {
+    /* Export into a local so a failure leaves the caller's fd untouched. */
+    int exported_fd = -1;
+    auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    auto code = gpu_mem->ExportPhysicalHandle(&exported_fd);
+    if (code != ErrorCode::Success)
+      return HSAKMT_STATUS_ERROR;
+    it->second.dmabuf_fd = exported_fd;
+  }
+
+  /* dup() can fail (e.g. fd table full); do not report -1 as a success. */
+  const int dup_fd = dup(it->second.dmabuf_fd);
+  if (dup_fd < 0)
+    return HSAKMT_STATUS_ERROR;
+
+  *DMABufFd = dup_fd;
+  *Offset = addr - it->second.gpu_addr;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Obtaining a shared memory handle is not supported here: after the
+ * CHECK_DXG_OPEN() guard the call always yields HSAKMT_STATUS_NOT_SUPPORTED
+ * and never writes to SharedMemoryHandle.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes,
+                      uint64_t *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Import a dma-buf (or IPC memfd) into the device behind NodeId and record
+ * it in the allocation map.
+ *
+ * DMABufFd      descriptor to import.
+ * NodeId        node whose WDDM device performs the import.
+ * alloc_va      true: key and report the GPU address; false: use the
+ *               handle-aperture address instead.
+ * is_ipc_memfd  true: the fd may be a system-memory IPC memfd; if its size
+ *               is page-aligned, at least 4096 and below the system heap
+ *               size, the import is flagged as sysmem_ipc_sig_importer and
+ *               that size is passed along.
+ * GpuMemHandle  out: handle of the imported (or pre-existing) memory; set to
+ *               nullptr on failure.  Must not be NULL.
+ *
+ * When the device reports ErrorCode::SameProcessSameDevice the existing map
+ * entry at the returned VA is reused and its shared reference is bumped.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL GpuMemHandle,
+ * HSAKMT_STATUS_ERROR when no device exists for NodeId or the import fails,
+ * else HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle) {
+  CHECK_DXG_OPEN();
+
+  if (!GpuMemHandle)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  *GpuMemHandle = nullptr;
+  wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId);
+  if (!dev) {
+    /* get_wddmdev() may return nullptr for an unknown node. */
+    pr_err("no device for node %u\n", NodeId);
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.dmabuf_fd = DMABufFd;
+  create_info.flags.alloc_va = alloc_va;
+
+  if (is_ipc_memfd) {
+    struct stat st;
+    /* Only trust st when fstat() succeeded; otherwise treat the size as 0
+     * so the fd is imported through the regular path below. */
+    uint64_t sz = 0;
+    if (fstat(DMABufFd, &st) == 0)
+      sz = st.st_size;
+    else
+      pr_err("fail to fstat DMABufFd %d\n", DMABufFd);
+
+    if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
+      pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
+      create_info.flags.sysmem_ipc_sig_importer = 1;        // set to 1 when backend is system memory
+      create_info.size = st.st_size;
+    }
+  }
+
+  gpusize gpu_va = 0;
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va);
+  if (code == ErrorCode::SameProcessSameDevice) {
+    /* Unit_hipMemPoolExportToShareableHandle_SameProc */
+    pr_info("imported from same process, use the old one\n");
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->find((void*)gpu_va);
+    if (it == allocation_map_->end()) {
+      /* Report the VA that was actually looked up. */
+      pr_err("where's the conflict buffer? va %#lx\n", (unsigned long)gpu_va);
+      return HSAKMT_STATUS_ERROR;
+    }
+    wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    conflict_mem->IncSharedReference();
+    *GpuMemHandle = it->second.handle;
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (code != ErrorCode::Success) {
+    pr_err("fail to import fd, ret %d\n", (int)code);
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  void *MemoryAddress;
+  if (alloc_va)
+    MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+  else
+    MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  *GpuMemHandle = gpu_mem->GetGpuMemoryHandle();
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+  /*
+   * the gpu_mem->Flags() need convert back from GpuMemoryCreateFlags to
+   * HsaMemFlags, reference hsaKmtAllocMemoryAlign
+   * */
+  (*allocation_map_)[MemoryAddress] = Allocation(
+    *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress,
+    gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(),
+    NodeId, gpu_mem->Flags());
+
+  return HSAKMT_STATUS_SUCCESS;
+
+}
+
+
+/*
+ * Placeholder: sharing memory is not implemented.  Emits a one-time warning,
+ * trips assert(false) and, when assertions are compiled out, returns
+ * HSAKMT_STATUS_SUCCESS without writing to SharedMemoryHandle.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(
+    void *MemoryAddress, HSAuint64 SizeInBytes,
+    HsaSharedMemoryHandle *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Placeholder: registering a shared handle is not implemented.  Emits a
+ * one-time warning, trips assert(false) and, when assertions are compiled
+ * out, returns HSAKMT_STATUS_SUCCESS without writing to MemoryAddress or
+ * SizeInBytes.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle(
+    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
+    HSAuint64 *SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Placeholder: registering a shared handle to specific nodes is not
+ * implemented.  Emits a one-time warning, trips assert(false) and, when
+ * assertions are compiled out, returns HSAKMT_STATUS_SUCCESS without
+ * touching any output argument.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
+    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
+    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Deprecated cross-process read.  Emits a one-time warning, trips
+ * assert(false) and, when assertions are compiled out, returns
+ * HSAKMT_STATUS_NOT_IMPLEMENTED.  No argument is read or written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(
+    HSAuint32 Pid, HsaMemoryRange *LocalMemoryArray,
+    HSAuint64 LocalMemoryArrayCount, HsaMemoryRange *RemoteMemoryArray,
+    HSAuint64 RemoteMemoryArrayCount, HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Deprecated cross-process write.  Emits a one-time warning, trips
+ * assert(false) and, when assertions are compiled out, returns
+ * HSAKMT_STATUS_NOT_IMPLEMENTED.  No argument is read or written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(
+    HSAuint32 Pid, HsaMemoryRange *LocalMemoryArray,
+    HSAuint64 LocalMemoryArrayCount, HsaMemoryRange *RemoteMemoryArray,
+    HSAuint64 RemoteMemoryArrayCount, HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Drop the registration recorded for MemoryAddress.
+ *
+ * Only two kinds of map entries are actually released:
+ *   - imported IPC VRAM whose shared reference count drops to zero, and
+ *   - userptr mappings, which occupy two map slots (one keyed by the address
+ *     passed in, one keyed by the GPU address); both are removed.
+ * Any other entry, or an unknown address, is left alone.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL address and
+ * HSAKMT_STATUS_SUCCESS in every other case.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    auto it = allocation_map_->find(MemoryAddress);
+    if (it == allocation_map_->end()) {
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    wsl::thunk::GpuMemoryDescFlags flags;
+    flags.reserved = gpu_mem->Flags();
+    // IPC mem(vram)
+    if (flags.is_imported_vram_ipc &&
+      gpu_mem->DecSharedReference() == 0) {
+      allocation_map_->erase(it);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+    if (it->second.userptr) {
+      /* Read the second key before erasing: erase(it) invalidates 'it', so
+       * it->second must not be touched afterwards. */
+      void *gpu_key = (void *)it->second.gpu_addr;
+      allocation_map_->erase(it);
+      allocation_map_->erase(gpu_key);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Map a memory range on the runtime's default node.
+ *
+ * Thin wrapper over hsaKmtMapMemoryToGPUNodes() that supplies a single-entry
+ * node list holding dxg_runtime->default_node and an all-zero HsaMemMapFlags
+ * value.  The status of the wrapped call is returned as is.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes,
+                                             HSAuint64 *AlternateVAGPU) {
+  HSAuint32 default_nodes[] = {dxg_runtime->default_node};
+  const HSAuint64 node_count = 1;
+
+  HsaMemMapFlags no_flags;
+  no_flags.Value = 0;
+
+  return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes,
+                                   AlternateVAGPU, no_flags, node_count,
+                                   default_nodes);
+}
+/*
+ * Map a memory range for GPU access and report the GPU-side address.
+ *
+ * The range is widened to 4096-byte boundaries and handled in this order:
+ *   1. fragment_allocator_.block_base() recognises the aligned address:
+ *      return success immediately.
+ *   2. A map entry exists at the aligned address:
+ *        - imported IPC VRAM: map its GPU VA, make it resident and wait on
+ *          the paging fence;
+ *        - non-userptr entry: succeed when the entry is large enough and
+ *          report MemoryAddress itself as the GPU address, else fail.
+ *   3. A userptr entry exists at MemoryAddress and is large enough: report
+ *      its GPU address plus the offset from the recorded CPU address.
+ *   4. Otherwise create a new kUserMemory GPU memory object on the device of
+ *      NodeArray[0] and record it under two keys (MemoryAddress and the GPU
+ *      address).
+ *
+ * MemMapFlags and NumberOfNodes are not consulted.
+ * NOTE(review): paths 1 and 2 (IPC) return success without writing
+ * *AlternateVAGPU, and NodeArray[0] is read without checking NodeArray or
+ * NumberOfNodes -- confirm callers tolerate this.
+ *
+ * Returns HSAKMT_STATUS_ERROR for NULL MemoryAddress/AlternateVAGPU, for a
+ * too-small non-userptr entry, for a missing device, or when any device
+ * operation fails; HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
+    HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress || !AlternateVAGPU) {
+    pr_err("FIXME: mapping NULL pointer\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* Expand [MemoryAddress, MemoryAddress + size) to 4096-byte boundaries. */
+  uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096);
+  uint64_t end =
+      wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
+
+  void *aligned_ptr = (void *)start;
+  size_t aligned_size = end - start;
+
+  {
+    /* Addresses known to the fragment allocator need no further work here. */
+    if (nullptr != fragment_allocator_.block_base(aligned_ptr))
+      return HSAKMT_STATUS_SUCCESS;
+  }
+
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->find(aligned_ptr);
+    if (it != allocation_map_->end()) {
+      wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+      wsl::thunk::GpuMemoryDescFlags flags;
+      flags.reserved = gpu_mem->Flags();
+      // IPC mem
+      if (flags.is_imported_vram_ipc) {
+
+        auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size());
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        code = gpu_mem->MakeResident();
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice();
+        if (!dev->WaitOnPagingFenceFromCpu())
+          return HSAKMT_STATUS_ERROR;
+
+        /* *AlternateVAGPU is left untouched on this path. */
+        return HSAKMT_STATUS_SUCCESS;
+      }
+
+      if (!it->second.userptr) {
+      // GTT/Local mem
+        if (it->second.size >= MemorySizeInBytes) {
+          *AlternateVAGPU = (uint64_t)MemoryAddress;
+          return HSAKMT_STATUS_SUCCESS;
+        } else {
+          return HSAKMT_STATUS_ERROR;
+        }
+      }
+    }
+
+    // userptr mem
+    /* Userptr entries are keyed by the caller's (unaligned) address. */
+    it = allocation_map_->find(MemoryAddress);
+    if (it != allocation_map_->end()) {
+      if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
+        *AlternateVAGPU =
+            (uintptr_t)it->second.gpu_addr +
+            ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
+        return HSAKMT_STATUS_SUCCESS;
+      }
+    }
+  }
+
+  // map userptr
+  /* The map lock is not held while the GPU memory object is created. */
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryHandle handle = 0;
+  uint64_t addr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.domain = thunk_proxy::kUserMemory;
+  create_info.size = aligned_size;
+  create_info.user_ptr = aligned_ptr;
+
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    addr = gpu_mem->GpuAddress();
+    handle = gpu_mem->GetGpuMemoryHandle();
+  } else {
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  {
+    /* Record the mapping twice: once under the caller's address and once
+     * under the GPU address; both entries share the same handle. */
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+   (*allocation_map_)[MemoryAddress] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
+                   MemorySizeInBytes);
+    (*allocation_map_)[(void *)addr] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
+                   MemorySizeInBytes);
+  }
+
+  /* Re-apply the caller's offset within the first page. */
+  *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Undo a GPU mapping for MemoryAddress.
+ *
+ * A NULL address and addresses recognised by
+ * fragment_allocator_.block_base() succeed without doing anything.  For
+ * other addresses the exact map entry must exist and must not be queue
+ * referenced, otherwise HSAKMT_STATUS_ERROR is returned.  Actual unmapping
+ * (GPU VA unmap followed by Evict()) only happens for imported IPC VRAM that
+ * is not shared from the same process; every other entry just reports
+ * success.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr) {
+    /* Workaround for runtime bug */
+    pr_err("FIXME: Unmapping NULL pointer\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  if (fragment_allocator_.block_base(MemoryAddress) != nullptr)
+    return HSAKMT_STATUS_SUCCESS;
+
+  /* Held until return; nothing below runs outside the lock. */
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+  auto entry = allocation_map_->find(MemoryAddress);
+  if (entry == allocation_map_->end())
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *mem = wsl::thunk::GpuMemory::Convert(entry->second.handle);
+  if (mem->IsQueueReferenced())
+    return HSAKMT_STATUS_ERROR;
+
+  // IPC mem
+  wsl::thunk::GpuMemoryDescFlags desc;
+  desc.reserved = mem->Flags();
+  const bool foreign_ipc_vram =
+      desc.is_imported_vram_ipc && !mem->IsSharedFromSameProcess();
+  if (!foreign_ipc_vram)
+    return HSAKMT_STATUS_SUCCESS;
+
+  if (mem->UnmapGpuVirtualAddress(mem->GpuAddress(), mem->Size()) != ErrorCode::Success)
+    return HSAKMT_STATUS_ERROR;
+
+  /* The result of Evict() is intentionally not inspected. */
+  mem->Evict();
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Legacy graphics-handle mapping entry point; always reports
+ * HSAKMT_STATUS_NOT_IMPLEMENTED after a one-time warning.
+ *
+ * This API was only ever implemented in KFD for Kaveri and was never
+ * upstreamed. There are no open-source users of this interface. It has been
+ * superseded by RegisterGraphicsHandleToNodes.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(
+    HSAuint32 NodeId, HSAuint64 GraphicDeviceHandle,
+    HSAuint64 GraphicResourceHandle, HSAuint64 GraphicResourceOffset,
+    HSAuint64 GraphicResourceSize, HSAuint64 *FlatMemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Placeholder: unmapping a graphics handle is not implemented.  Emits a
+ * one-time warning, trips assert(false) and, when assertions are compiled
+ * out, returns HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(
+    HSAuint32 NodeId, HSAuint64 FlatMemoryAddress, HSAuint64 SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Placeholder: tile configuration queries are not implemented.  Emits a
+ * one-time warning, trips assert(false) and, when assertions are compiled
+ * out, returns HSAKMT_STATUS_SUCCESS without filling in 'config'.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(
+    HSAuint32 NodeId, HsaGpuTileConfig *config) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Describe the allocation that contains Pointer.
+ *
+ * PointerInfo is zeroed first.  The candidate is the map entry with the
+ * greatest key not above Pointer; it matches when Pointer lies inside
+ * [key, key + size_requested).  On a match the entry is copied under the
+ * map lock and the output is filled from that copy:
+ *   - userptr entry          -> HSA_POINTER_REGISTERED_USER, SizeInBytes = size
+ *   - backing IsVirtual()    -> HSA_POINTER_RESERVED_ADDR (size stays 0)
+ *   - anything else          -> HSA_POINTER_ALLOCATED, SizeInBytes = size_requested
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for NULL arguments,
+ * HSAKMT_STATUS_ERROR (with Type = HSA_POINTER_UNKNOWN) when nothing
+ * matches, else HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+                                               HsaPointerInfo *PointerInfo) {
+  CHECK_DXG_OPEN();
+
+  if (Pointer == nullptr || PointerInfo == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("pointer %p\n", Pointer);
+
+  memset(PointerInfo, 0, sizeof(HsaPointerInfo));
+
+  wsl::thunk::GpuMemory *backing = nullptr;
+  Allocation match;
+  bool hit = false;
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto pos = allocation_map_->upper_bound(Pointer);
+    if (pos != allocation_map_->begin()) {
+      /* Step back to the last entry whose key is <= Pointer. */
+      --pos;
+      hit = (Pointer >= pos->first) &&
+            (Pointer < reinterpret_cast<const uint8_t*>(pos->first) + pos->second.size_requested);
+      if (hit) {
+        match = pos->second;
+        backing = wsl::thunk::GpuMemory::Convert(pos->second.handle);
+      }
+    }
+  }
+
+  if (!hit) {
+    pr_debug("can't found allocation for %p\n", Pointer);
+    PointerInfo->Type = HSA_POINTER_UNKNOWN;
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  if (match.userptr) {
+    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
+    PointerInfo->SizeInBytes = match.size;
+  } else if (backing->IsVirtual()) {
+    PointerInfo->Type = HSA_POINTER_RESERVED_ADDR;
+  } else {
+    PointerInfo->Type = HSA_POINTER_ALLOCATED;
+    PointerInfo->SizeInBytes = match.size_requested;
+  }
+
+  PointerInfo->Node = match.node_id;
+  PointerInfo->MemFlags.Value = match.mem_flags_value;
+  PointerInfo->CPUAddress = match.cpu_addr;
+  PointerInfo->GPUAddress = match.gpu_addr;
+  PointerInfo->UserData = match.rocr_userdata;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Attach an opaque user-data pointer to an allocation.
+ *
+ * Pointer is rounded down to a 4096-byte boundary and looked up as an exact
+ * key in the allocation map.  On a hit the entry's rocr_userdata is
+ * overwritten and HSAKMT_STATUS_SUCCESS is returned; otherwise
+ * HSAKMT_STATUS_ERROR.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+                                                void *UserData) {
+  CHECK_DXG_OPEN();
+
+  const uint64_t page_base = wsl::AlignDown((uint64_t)Pointer, 4096);
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+  auto entry = allocation_map_->find((void *)page_base);
+  if (entry == allocation_map_->end())
+    return HSAKMT_STATUS_ERROR;
+
+  entry->second.rocr_userdata = UserData;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * ASAN header page replacement is not supported here.  Emits a one-time
+ * warning and trips assert(false).  With assertions compiled out the result
+ * depends on the build: HSAKMT_STATUS_SUCCESS when SANITIZER_AMDGPU is
+ * defined, HSAKMT_STATUS_NOT_SUPPORTED otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not supported\n");
+  assert(false);
+
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
+
+/*
+ * Returning an ASAN header page is not supported here.  Emits a one-time
+ * warning and trips assert(false).  With assertions compiled out the result
+ * depends on the build: HSAKMT_STATUS_SUCCESS when SANITIZER_AMDGPU is
+ * defined, HSAKMT_STATUS_NOT_SUPPORTED otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not supported\n");
+  assert(false);
+
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp
new file mode 100644
index 0000000000..eb22a13aae
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp
@@ -0,0 +1,626 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <cstring>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstdio>
+#include <strings.h>
+#include <cassert>
+
+
+/* Process-wide runtime state object, heap-allocated during static
+ * initialisation of this translation unit.  No matching delete appears in
+ * this part of the file. */
+hsakmtRuntime *dxg_runtime = new hsakmtRuntime();
+
+/*
+ * Set up all address-space bookkeeping for the runtime.
+ *
+ * First the three address ranges are established (local heap, system heap,
+ * handle aperture), then one VaMgr per range is created.  The managers are
+ * constructed from the start/size members the first three calls fill in, so
+ * the order matters.  The bool results of the reserve/init-space calls are
+ * not inspected here.
+ */
+void hsakmtRuntime::HeapInit() {
+    ReserveLocalHeapSpace();
+    ReserveSystemHeapSpace();
+    InitHandleApertureSpace();
+    InitLocalHeapMgr();
+    InitSystemHeapMgr();
+    InitHandleApertureMgr();
+}
+
+/*
+ * Release the system and local heap reservations made by HeapInit().
+ * The handle aperture is not released here, and the bool results of both
+ * calls are ignored.
+ */
+void hsakmtRuntime::HeapFini() {
+    FreeSystemHeapSpace();
+    FreeLocalHeapSpace();
+}
+
+/*
+ * Reserve one address range that is valid both as CPU virtual memory and as
+ * GPU virtual address space on every adapter.
+ *
+ * base   out: start of the reserved range (0 on failure).
+ * size   in: requested size; set to 0 on failure.
+ * align  slack added to the CPU reservation so the GPU reservation can land
+ *        anywhere inside it.
+ *
+ * Up to 16 attempts are made.  Each attempt mmap()s size + align bytes of
+ * PROT_NONE memory and then asks every adapter to reserve 'size' bytes of
+ * GPU VA inside that window (for the first adapter) or at the address the
+ * previous adapter returned (for the following ones).  An attempt succeeds
+ * when all adapters succeed; the unused head and tail of the CPU mapping
+ * are then unmapped and the CPU mappings of the earlier, failed attempts
+ * are released.
+ *
+ * NOTE(review): GPU VA reserved by some adapters during a failed attempt is
+ * not freed here, and 'base' keeps the value from such a partial success
+ * when the next attempt starts -- confirm this is intended.
+ * NOTE(review): device pointers from get_wddmdev() are used without a NULL
+ * check.
+ *
+ * Returns true when a common range was found.
+ */
+bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) {
+    uint64_t sys_va[16] = {0};
+    uint64_t local_va;
+    uint64_t sys_va_size;
+    int match_index = -1;
+    void* ptr = NULL;
+
+    wsl::thunk::WDDMDevice* device;
+    size_t num_adapters = get_num_wddmdev();
+
+    base = 0;
+    sys_va_size = size + align;
+
+    /* it will retry 16 times to find the available range. */
+    for (int i = 0; i < 16; i++) {
+        local_va = 0;
+        ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+        if (ptr == MAP_FAILED) {
+            pr_err("fail to reserve cpu va in %d time!\n", i);
+            break;
+        }
+
+        /* Remember every CPU mapping so the unused ones can be released. */
+        sys_va[i] = (uint64_t)ptr;
+
+        int match_cnt = 0;
+        for (uint32_t j = 0; j < num_adapters; j++) {
+            /* get_wddmdev() is called with 1-based indices here. */
+            device = get_wddmdev(j+1);
+            /* Before any adapter has succeeded the whole CPU window is
+             * offered; afterwards the range is pinned to the chosen base. */
+            uint64_t start = (base == 0) ? (uint64_t)ptr : base;
+            uint64_t end = start + ((base == 0) ? sys_va_size : size) + 1;
+
+            if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress(
+                        device->GetAdapter(), size,
+                        start,
+                        end, &local_va) == ErrorCode::Success) {
+
+                match_cnt++;
+                base = local_va;
+                pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n",
+                        local_va, ptr, i);
+            } else {
+                pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n",
+                        __FUNCTION__, ptr, i);
+            }
+        }
+
+        if (match_cnt == num_adapters) {
+                match_index = i;
+                break;
+        }
+    }
+
+    if (match_index >= 0) {
+        /* release cpu unused ranges*/
+        /* left: gap between the CPU mapping start and the GPU VA;
+         * right: what remains of the 'align' slack after the range. */
+        uint64_t left_size = local_va - sys_va[match_index];
+        uint64_t right_size = align - left_size;
+        if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size))
+            pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size);
+        if ((right_size > 0) && munmap((void*)(local_va + size), right_size))
+            pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size);
+    } else {
+        pr_err("fail to reserve Local Heap Space!\n");
+        base = 0;
+        size = 0;
+    }
+
+    /* free match fail address for cpu va */
+    /* On success only the attempts before the match are released; on
+     * failure every recorded mapping is. */
+    int free = match_index >= 0 ? match_index : 16;
+    for (int j = 0; j < free; j++) {
+        if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) {
+            pr_err("fail to unmap %d %lx\n", j, sys_va[j]);
+        }
+    }
+
+    return match_index >= 0;
+}
+
+/*
+ * Find one range that is available both in the CPU virtual address space
+ * and in the GPU virtual address space of every adapter, and reserve it
+ * for the local heap.
+ *
+ * The CPU reservation is made 1G larger than the GPU reservation,
+ * otherwise ReserveGpuVirtualAddress will return an error.
+ *
+ * The size requested is four times the largest per-adapter heap size,
+ * rounded up to 1G.  Returns false when an adapter cannot be looked up or
+ * when ReserveSvmSpace() fails.
+ */
+bool hsakmtRuntime::ReserveLocalHeapSpace() {
+    wsl::thunk::WDDMDevice* device;
+    uint64_t total_local_size = 0;
+    uint64_t align = 0x40000000; /* 1G */
+    size_t num_adapters = get_num_wddmdev();
+
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        device = get_wddmdev(j+1);
+        /* This function returns bool: a literal -1 would convert to true
+         * and report success, so fail explicitly. */
+        if (device == nullptr)
+            return false;
+        /*
+         * For APU, use non local memory(shared GPU memory) as GPU memory,
+         * because it has small local memory
+        */
+        if (device->IsDgpu())
+          total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size);
+        else
+          total_local_size = wsl::Max(device->LocalHeapSize(), device->NonLocalHeapSize(), total_local_size);
+    }
+
+    total_local_size = wsl::AlignUp(total_local_size, align) * 4;
+    local_heap_space_start_ = 0;
+    local_heap_space_size_ = total_local_size;
+
+    return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align);
+}
+
+/*
+ * Release a range previously set up by ReserveSvmSpace().
+ *
+ * The GPU virtual address range is freed on every adapter, then the CPU
+ * mapping is unmapped.  base and size are reset to 0 once the unmap has
+ * been attempted.
+ *
+ * Returns false when an adapter cannot be looked up (nothing further is
+ * released and base/size are left untouched) or when munmap() fails.
+ */
+bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) {
+    wsl::thunk::WDDMDevice* device;
+    size_t num_adapters = get_num_wddmdev();
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        device = get_wddmdev(j+1);
+        /* bool return type: -1 would be converted to true (success). */
+        if (device == nullptr)
+            return false;
+        wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size);
+    }
+
+    void *cpu = (void *)base;
+    auto r = (munmap(cpu, size) == 0);
+    base = 0;
+    size = 0;
+    return r;
+}
+
+/* Release the local heap range via FreeSvmSpace(), which also zeroes the
+ * start/size members passed by reference. */
+bool hsakmtRuntime::FreeLocalHeapSpace() {
+    const bool released =
+        FreeSvmSpace(local_heap_space_start_, local_heap_space_size_);
+    return released;
+}
+
+/* Create the VaMgr that hands out addresses from the local heap range,
+ * using DEFAULT_GPU_PAGE_SIZE as its third constructor argument. */
+void hsakmtRuntime::InitLocalHeapMgr() {
+  auto mgr = std::make_unique<wsl::thunk::VaMgr>(
+      local_heap_space_start_, local_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
+  local_heap_mgr_ = std::move(mgr);
+}
+
+/*
+ * Reserve the address range used for the system heap.
+ *
+ * The size is twice the installed RAM rounded up to 4G, capped at 1T; the
+ * minimum reserve size is therefore 8G.  The range itself is obtained
+ * through ReserveSvmSpace() with a 4G alignment slack.
+ *
+ * Returns false when the amount of RAM cannot be queried (start and size
+ * are then set to 0) or when ReserveSvmSpace() fails.
+ */
+bool hsakmtRuntime::ReserveSystemHeapSpace() {
+    const uint64_t max_ram = 0x10000000000;   /* 1T */
+    const uint64_t alignment = 0x100000000;   /* 4G */
+
+    struct sysinfo info;
+    if (sysinfo(&info) != 0) {
+        /* Do not size the heap from an uninitialised struct. */
+        pr_err("fail to query system memory info\n");
+        system_heap_space_start_ = 0;
+        system_heap_space_size_ = 0;
+        return false;
+    }
+
+    /* sysinfo() reports totalram in units of mem_unit bytes. */
+    const uint64_t total_ram = static_cast<uint64_t>(info.totalram) * info.mem_unit;
+
+    // minimum of reserve size is 8G, maximum of reserve size is 1T.
+    system_heap_space_size_ =
+        std::min<uint64_t>(wsl::AlignUp(total_ram, alignment) * 2, max_ram);
+
+    return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment);
+}
+
+/* Release the system heap range via FreeSvmSpace(), which also zeroes the
+ * start/size members passed by reference. */
+bool hsakmtRuntime::FreeSystemHeapSpace(void) {
+    const bool released =
+        FreeSvmSpace(system_heap_space_start_, system_heap_space_size_);
+    return released;
+}
+
+/*
+ * Back [addr, addr + size) with anonymous private memory.
+ *
+ * The range is remapped in place (MAP_FIXED) with read/write/execute
+ * permission; MAP_LOCKED is added when 'lock' is set.  After a successful
+ * mapping the range is marked MADV_DONTFORK; a failure of that step is only
+ * logged.
+ *
+ * Returns false when mmap() fails, true otherwise.
+ */
+bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) {
+    const int32_t prot = PROT_READ | PROT_WRITE | PROT_EXEC;
+    int32_t flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
+                    MAP_NORESERVE | MAP_UNINITIALIZED;
+    flags |= lock ? MAP_LOCKED : 0;
+
+    void* mapped = mmap(addr, size, prot, flags, -1, 0);
+    if (mapped == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, mapped);
+        return false;
+    }
+    assert(addr == mapped);
+
+    /*if (!Runtime::runtime_singleton_->PinWARequired())
+      return true;*/
+
+    /*
+     * Do not make the pages in this range available to the child
+     * after a fork(2).  This is useful to prevent copy-on-write
+     * semantics from changing the physical location of a page if
+     * the parent writes to it after a fork(2).  (Such page
+     * relocations cause problems for hardware that DMAs into the
+     * page.)
+     *
+     * https://man7.org/linux/man-pages/man2/madvise.2.html
+     */
+    const int advise_rc = madvise(addr, size, MADV_DONTFORK);
+    if (advise_rc != 0)
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+
+    return true;
+}
+
+/*
+ * Turn [addr, addr + size) back into an inaccessible reservation by mapping
+ * fresh PROT_NONE anonymous memory over it (MAP_FIXED).
+ *
+ * Returns false when mmap() fails, true otherwise.
+ */
+bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) {
+    const int32_t flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
+                          MAP_NORESERVE | MAP_UNINITIALIZED;
+
+    void* mapped = mmap(addr, size, PROT_NONE, flags, -1, 0);
+    if (mapped == MAP_FAILED) {
+        pr_err("fail to decommit addr = %p, paddr = %p\n", addr, mapped);
+        return false;
+    }
+
+    assert(addr == mapped);
+    return true;
+}
+
+/* Create the VaMgr that hands out addresses from the system heap range,
+ * using DEFAULT_GPU_PAGE_SIZE as its third constructor argument. */
+void hsakmtRuntime::InitSystemHeapMgr() {
+  auto mgr = std::make_unique<wsl::thunk::VaMgr>(
+      system_heap_space_start_, system_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
+  system_heap_mgr_ = std::move(mgr);
+}
+
+/*
+ * Allocate an address range from the system or the local heap.
+ *
+ * domain             kSystem selects the system heap (the range is also
+ *                    committed as CPU memory); anything else selects the
+ *                    local heap.
+ * hit_base_addr      address hint passed to the heap manager.
+ * size               number of bytes to allocate.
+ * out_gpu_virt_addr  out: allocated address, or 0 on failure.
+ * alignment          requested alignment; 0 means 64K.  Requests of at
+ *                    least GPU_HUGE_PAGE_SIZE are aligned to that size.
+ * lock               forwarded to CommitSystemHeapSpace() (system heap only).
+ *
+ * Returns ErrorCode::OutOfMemory / OutOfGpuMemory when the heap manager has
+ * no space, ErrorCode::SyscallFail when committing fails, else
+ * ErrorCode::Success.
+ */
+ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize hit_base_addr, gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) {
+    gpusize gpu_addr = 0;
+    ErrorCode code = ErrorCode::Success;
+
+    uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment
+    if (size >= GPU_HUGE_PAGE_SIZE)
+        align = GPU_HUGE_PAGE_SIZE;
+
+    if (domain == thunk_proxy::kSystem) {
+        gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0) {
+            /* Nothing was allocated: do not try to commit (or later free)
+             * address 0, and keep the out-of-memory error code. */
+            code = ErrorCode::OutOfMemory;
+        } else if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) {
+            system_heap_mgr_->Free(gpu_addr);
+            code = ErrorCode::SyscallFail;
+        }
+    } else {
+        gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0)
+            code = ErrorCode::OutOfGpuMemory;
+    }
+
+    *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0;
+    return code;
+}
+
+/*
+ * Return an address range to the heap it came from.
+ *
+ * For the system domain the CPU backing is decommitted first (its result is
+ * not inspected) and the address is then freed in the system heap manager;
+ * for any other domain only the local heap manager is involved.  Always
+ * returns ErrorCode::Success.
+ */
+ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize gpu_addr, gpusize size) {
+    if (domain != thunk_proxy::kSystem) {
+        local_heap_mgr_->Free(gpu_addr);
+        return ErrorCode::Success;
+    }
+
+    DecommitSystemHeapSpace((void *)gpu_addr, size);
+    system_heap_mgr_->Free(gpu_addr);
+    return ErrorCode::Success;
+}
+
+/*
+ * Back [addr, addr + size) with a shared mapping of a memfd so the memory
+ * can be handed to another process.
+ *
+ * addr   start of the range; mapped in place (MAP_FIXED).
+ * size   length of the range and, for a new memfd, the file size.
+ * memfd  in/out: -1 asks for a new "rocr4wsl_gtt" memfd, which is stored
+ *        here on success; any other value is used as the file to map.
+ * lock   adds MAP_LOCKED to the mapping.
+ *
+ * A memfd created by this call is closed again on every failure path; a
+ * caller-supplied one is never closed here.  Failure to set MADV_DONTFORK
+ * is only logged.
+ *
+ * Returns false when memfd_create(), ftruncate() or mmap() fails.
+ */
+bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
+    int fd = -1;
+
+    if (memfd == -1) {
+        fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
+        if (fd < 0) {
+            pr_err("memfd_create failed\n");
+            return false;
+        }
+
+        /* An unsized file would make the mapping fault on first access,
+         * so treat a failed resize as a failed commit. */
+        if (ftruncate(fd, size) != 0) {
+            pr_err("ftruncate failed\n");
+            close(fd);
+            return false;
+        }
+    } else {
+        fd = memfd;
+    }
+
+    int32_t protFlags = PROT_READ | PROT_WRITE;
+    int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
+        MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);
+
+    void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
+    if (paddr == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
+        /* Only close descriptors created by this call. */
+        if (memfd == -1)
+            close(fd);
+        return false;
+    }
+    assert(addr == paddr);
+
+    memfd = fd;
+
+    if (madvise(addr, size, MADV_DONTFORK))
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+
+    return true;
+}
+
+/*
+ * Tear down a mapping made by CommitSystemHeapSpaceIPC().
+ *
+ * The range is unmapped; only if that succeeds is the memfd closed and
+ * reset to -1.  Returns false (leaving memfd untouched) when munmap()
+ * fails.
+ */
+bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
+    const int unmap_rc = munmap(addr, size);
+    if (unmap_rc != 0) {
+        pr_err("fail to unmap = %p \n", addr);
+        return false;
+    }
+
+    close(memfd);
+    memfd = -1;
+
+    return true;
+}
+
+/*
+ * Allocate a system-heap range and back it with a shareable memfd mapping.
+ *
+ * out_gpu_virt_addr receives the allocated address on success and 0 when
+ * committing fails; it is not written when the heap manager has no space.
+ * memfd and lock are forwarded to CommitSystemHeapSpaceIPC().
+ *
+ * Returns ErrorCode::OutOfMemory, ErrorCode::SyscallFail or
+ * ErrorCode::Success.
+ */
+ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment,
+        int &memfd, bool lock) {
+    const gpusize va = system_heap_mgr_->Alloc(size, alignment, 0);
+    if (va == 0)
+        return ErrorCode::OutOfMemory;
+
+    const bool committed = CommitSystemHeapSpaceIPC((void*)va, size, memfd, lock);
+    if (!committed) {
+        /* Give the address back before reporting the failure. */
+        system_heap_mgr_->Free(va);
+        *out_gpu_virt_addr = 0;
+        return ErrorCode::SyscallFail;
+    }
+
+    *out_gpu_virt_addr = va;
+    return ErrorCode::Success;
+}
+
+/*
+ * Release memory obtained from ReserveIPCSysMem(): decommit the mapping
+ * (its result is not inspected) and return the address to the system heap
+ * manager.  Always returns ErrorCode::Success.
+ */
+ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
+    void *cpu_addr = (void *)gpu_addr;
+    DecommitSystemHeapSpaceIPC(cpu_addr, size, memfd);
+
+    system_heap_mgr_->Free(gpu_addr);
+
+    return ErrorCode::Success;
+}
+
+/*
+ * Pick the address window used as the handle aperture.
+ *
+ * Candidate windows are 2^47 bytes long and start at
+ * START_NON_CANONICAL_ADDR, advancing in 2^47 steps.  The first candidate
+ * that overlaps neither the private nor the shared aperture of any adapter
+ * is selected.  Every candidate is checked against all adapters, and the
+ * search stops once the candidate start reaches END_NON_CANONICAL_ADDR - 1.
+ *
+ * Returns true when a window was found.  Returns false when an adapter
+ * cannot be looked up or when no candidate is free (handle_aperture_start_
+ * is then reset to 0).
+ */
+bool hsakmtRuntime::InitHandleApertureSpace() {
+    wsl::thunk::WDDMDevice* device;
+    size_t num_adapters = get_num_wddmdev();
+    handle_aperture_start_ = START_NON_CANONICAL_ADDR;
+    handle_aperture_size_ = 1ULL << 47;
+
+    while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) {
+        bool overlaps = false;
+
+        for (uint32_t j = 0; j < num_adapters && !overlaps; j++) {
+            device = get_wddmdev(j+1);
+            /* bool return type: -1 would be converted to true (success). */
+            if (device == nullptr)
+                return false;
+
+            if (device->PrivateApertureBase() &&
+                    IS_OVERLAPPING(device->PrivateApertureBase(),
+                        device->PrivateApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_)) {
+                overlaps = true;
+            } else if (device->SharedApertureBase() &&
+                    IS_OVERLAPPING(device->SharedApertureBase(),
+                        device->SharedApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_)) {
+                overlaps = true;
+            }
+        }
+
+        if (!overlaps) {
+            pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_);
+            return true;
+        }
+
+        /* Try the next window and re-check it against every adapter. */
+        handle_aperture_start_ += (1ULL << 47);
+    }
+
+    handle_aperture_start_ = 0;
+    pr_err("fail\n");
+
+    return false;
+}
+
+/* Creates the VA manager for the handle aperture from the start/size chosen
+ * earlier (handle_aperture_start_ / handle_aperture_size_), passing
+ * DEFAULT_GPU_PAGE_SIZE as the third constructor argument.
+ * NOTE(review): presumably must run after InitHandleApertureSpace() succeeded
+ * - confirm against the caller.
+ */
+void hsakmtRuntime::InitHandleApertureMgr() {
+  handle_aperture_mgr_ = std::make_unique<wsl::thunk::VaMgr>(handle_aperture_start_,
+                                                 handle_aperture_size_,
+                                                 DEFAULT_GPU_PAGE_SIZE);
+}
+
+/* Allocates `size` bytes of address space from the handle aperture.
+ *
+ * Requests of at least GPU_HUGE_PAGE_SIZE are aligned to the huge page size,
+ * everything else to DEFAULT_GPU_PAGE_SIZE. The result (0 on failure) is
+ * always stored in *out_gpu_virt_addr.
+ */
+ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
+    uint64_t align;
+    if (size < GPU_HUGE_PAGE_SIZE)
+        align = DEFAULT_GPU_PAGE_SIZE;
+    else
+        align = GPU_HUGE_PAGE_SIZE;
+
+    *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, align);
+
+    const bool exhausted = (*out_gpu_virt_addr == 0);
+    return exhausted ? ErrorCode::OutOfHandleApeMemory : ErrorCode::Success;
+}
+
+/* Returns an address previously handed out from the handle aperture to its
+ * VA manager.
+ */
+void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) {
+    handle_aperture_mgr_->Free(gpu_addr);
+}
+
+/* Reports whether this process is a fork of the one that initialised the
+ * runtime. pthread_atfork alone is not sufficient because a process may fork
+ * through clone() or a raw system call without going through libc's fork(),
+ * so the recorded pid is compared against getpid() as well.
+ *
+ * Once a fork has been observed the is_forked flag stays set and the recorded
+ * pid is updated to the current one.
+ */
+bool is_forked_child(void) {
+  if (!dxg_runtime->is_forked) {
+    const pid_t self = getpid();
+    if (dxg_runtime->parent_pid == self)
+      return false;
+
+    /* pid changed behind our back: remember the fork and adopt the new pid */
+    dxg_runtime->is_forked = true;
+    dxg_runtime->parent_pid = self;
+  }
+
+  return true;
+}
+
+/* Callbacks from pthread_atfork (registered once in hsaKmtOpenKFD). */
+/* prepare: takes hsakmt_mutex so the lock is in a known state across fork(). */
+static void prepare_fork_handler(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); }
+/* parent: releases the lock taken by the prepare handler. */
+static void parent_fork_handler(void) { pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); }
+/* child: re-initialises the mutex instead of unlocking it and flags the
+ * runtime as forked so that is_forked_child() reports true.
+ */
+static void child_fork_handler(void) {
+  pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL);
+  dxg_runtime->is_forked = true;
+}
+
+/* Call this from the child process after fork. This clears all data
+ * duplicated from the parent process that is not valid in the child.
+ * The comment inherited from the original code states that topology
+ * information duplicated from the parent stays valid and is not cleared.
+ *
+ * NOTE(review): hsaKmtOpenKFD calls this while holding
+ * dxg_runtime->hsakmt_mutex. The runtime object (and with it that mutex) is
+ * deleted and re-created below, so the caller later unlocks the mutex of the
+ * *new* object - confirm that hsakmtRuntime's constructor/destructor make
+ * this safe.
+ */
+static void clear_after_fork(void) {
+  reset_suballocator();
+  clear_allocation_map();
+
+  /* Drop the descriptor inherited from the parent before discarding the
+   * runtime object that tracks it.
+   */
+  if (dxg_runtime->dxg_fd >= 0) {
+    close(dxg_runtime->dxg_fd);
+    dxg_runtime->dxg_fd = -1;
+  }
+  /* Start over with a freshly constructed runtime state. */
+  delete dxg_runtime;
+  dxg_runtime = new hsakmtRuntime();
+
+}
+
+/* Caches the system page size and its log2 (index of the lowest set bit,
+ * obtained through ffs()) in the runtime state.
+ */
+static inline void init_page_size(void) {
+  auto *rt = dxg_runtime;
+
+  rt->page_size = sysconf(_SC_PAGESIZE);
+  rt->page_shift = ffs(rt->page_size) - 1;
+}
+
+/* Reads the runtime tunables from the environment into dxg_runtime.
+ *
+ * Variables that are not set leave the corresponding field untouched.
+ * Always returns HSAKMT_STATUS_SUCCESS; malformed values are ignored rather
+ * than treated as errors.
+ */
+static HSAKMT_STATUS init_vars_from_env(void) {
+  char *envvar;
+
+  /* Normally libraries don't print messages. For debugging purpose, we'll
+   * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set.
+   */
+  envvar = getenv("HSAKMT_DEBUG_LEVEL");
+  if (envvar) {
+    dxg_runtime->hsakmt_debug_level = atoi(envvar);
+  }
+
+  /* Check whether to support Zero frame buffer */
+  envvar = getenv("HSA_ZFB");
+  if (envvar)
+    dxg_runtime->zfb_support = atoi(envvar);
+
+  /* Check whether to handle vendor specific aql packet */
+  envvar = getenv("WSLKMT_VENDOR_PACKET");
+  if (envvar)
+    dxg_runtime->vendor_packet_process = atoi(envvar);
+
+  /* Decide whether to check available system memory before allocation */
+  envvar = getenv("WSL_CHECK_AVAIL_SYSRAM");
+  if (envvar)
+    dxg_runtime->check_avail_sysram = !strcmp(envvar, "1");
+
+  envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR");
+  if (envvar)
+    dxg_runtime->enable_thunk_sub_allocator = atoi(envvar);
+
+  /* The first decimal number found in ROCR_VISIBLE_DEVICES selects the
+   * default node (stored 1-based).
+   */
+  envvar = getenv("ROCR_VISIBLE_DEVICES");
+  if (envvar) {
+    std::string devices(envvar);
+    size_t first_num_pos = devices.find_first_of("0123456789");
+    if (first_num_pos != std::string::npos) {
+      /* std::stoi throws when the number does not fit in an int. This
+       * function is reached from the C entry point hsaKmtOpenKFD, so the
+       * exception must not escape: an unusable value keeps the default.
+       */
+      try {
+        dxg_runtime->default_node = std::stoi(devices.substr(first_num_pos)) + 1;
+      } catch (...) {
+        pr_err("ignoring invalid ROCR_VISIBLE_DEVICES value\n");
+      }
+    }
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Opens the connection to the DXG device (reference counted).
+ *
+ * The first successful call reads the environment, opens the device node,
+ * loads dxcore, initialises the page size and installs the atfork handlers.
+ * It returns HSAKMT_STATUS_SUCCESS. Further calls only bump the reference
+ * count and return HSAKMT_STATUS_KERNEL_ALREADY_OPENED.
+ *
+ * Errors: HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the device node
+ * cannot be opened, HSAKMT_STATUS_ERROR when dxcore cannot be loaded. In both
+ * cases the reference count stays 0 and no descriptor is left open.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) {
+  HSAKMT_STATUS result;
+  int fd = -1;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  /* If the process has forked, the child process must re-initialize
+   * its connection to DXG. Any references tracked by dxg_open_count
+   * belong to the parent
+   */
+  if (is_forked_child())
+    clear_after_fork();
+
+  if (dxg_runtime->dxg_open_count == 0) {
+    static bool atfork_installed = false;
+
+    result = init_vars_from_env();
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto open_failed;
+
+    if (dxg_runtime->dxg_fd < 0) {
+      fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC);
+
+      if (fd == -1) {
+        result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+        goto open_failed;
+      }
+
+      dxg_runtime->dxg_fd = fd;
+    }
+    if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) {
+        pr_err("Failed to load libdxcore.so\n");
+        result = HSAKMT_STATUS_ERROR;
+        goto dxcore_loader_failed;
+    }
+
+    hsakmt_hsa_loader_init();
+    init_page_size();
+
+    /* The trailing "&& false" forces the flag off regardless of HSA_USE_SVM. */
+    char *useSvmStr = getenv("HSA_USE_SVM");
+    dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false;
+
+    dxg_runtime->dxg_open_count = 1;
+
+    if (!atfork_installed) {
+      /* Atfork handlers cannot be uninstalled and
+       * must be installed only once. Otherwise
+       * prepare will deadlock when trying to take
+       * the same lock multiple times.
+       */
+      pthread_atfork(prepare_fork_handler, parent_fork_handler,
+                     child_fork_handler);
+      atfork_installed = true;
+    }
+  } else {
+    dxg_runtime->dxg_open_count++;
+    result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
+  }
+
+  reset_suballocator();
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return result;
+dxcore_loader_failed:
+  /* Close through the runtime's copy and invalidate it: leaving dxg_fd set to
+   * a closed descriptor would make the next open attempt skip open() and use
+   * a stale fd. This also avoids close(-1) when the descriptor was already
+   * open on entry.
+   */
+  if (dxg_runtime->dxg_fd >= 0) {
+    close(dxg_runtime->dxg_fd);
+    dxg_runtime->dxg_fd = -1;
+  }
+open_failed:
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
+
+/* Drops one reference on the DXG connection. When the last reference goes
+ * away the device descriptor is closed and dxcore is shut down.
+ * Returns HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when nothing was open.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) {
+  HSAKMT_STATUS result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (dxg_runtime->dxg_open_count > 0) {
+    const bool last_reference = (--dxg_runtime->dxg_open_count == 0);
+    if (last_reference) {
+      close(dxg_runtime->dxg_fd);
+      dxg_runtime->dxg_fd = -1;
+      wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown();
+    }
+
+    result = HSAKMT_STATUS_SUCCESS;
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp
new file mode 100644
index 0000000000..6c6a9e2a04
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Stub: PC sampling is not available in this backend. After the
+ * CHECK_DXG_OPEN() guard it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: all arguments are ignored and no output is written. Always returns
+ * HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
+                                  HSAuint32 sample_info_sz, HSAuint32 *size) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: no trace is created and *traceId is left untouched. Always returns
+ * HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId,
+                                               HsaPcSamplingInfo *sample_info,
+                                               HsaPcSamplingTraceId *traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: arguments are ignored. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId,
+                                                HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: arguments are ignored. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId,
+                                              HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: arguments are ignored. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId,
+                                             HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp
new file mode 100644
index 0000000000..9189d2dafa
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Stub: performance counters are not available in this backend.
+ * *CounterProperties is left untouched; always returns
+ * HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
+    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Registers a set of (HW) counters to be used for tracing/profiling.
+ * Stub in this backend: nothing is registered, *TraceRoot is left untouched
+ * and HSAKMT_STATUS_NOT_SUPPORTED is returned.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
+                                               HSAuint32 NumberOfCounters,
+                                               HsaCounter *Counters,
+                                               HsaPmcTraceRoot *TraceRoot) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Unregisters a set of (HW) counters used for tracing/profiling.
+ * Stub in this backend: always returns HSAKMT_STATUS_NOT_SUPPORTED.
+ */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
+                                                 HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: arguments are ignored. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: arguments are ignored. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Starts tracing operation on a previously established set of performance
+ * counters.
+ * Stub in this backend: the trace buffer is not used and
+ * HSAKMT_STATUS_NOT_SUPPORTED is returned.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
+                                            void *TraceBuffer,
+                                            HSAuint64 TraceBufferSizeBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Forces an update of all the counters that a previously started trace
+ * operation has registered.
+ * Stub in this backend: always returns HSAKMT_STATUS_NOT_SUPPORTED.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stops tracing operation on a previously established set of performance
+ * counters.
+ * Stub in this backend: always returns HSAKMT_STATUS_NOT_SUPPORTED.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp
new file mode 100644
index 0000000000..edaaea9d1a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <cinttypes>
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+#include "impl/hsa/amd_hsa_signal.h"
+
+/* Returns the per-CU VGPR size for the given engine id: 0x60000 for GFX
+ * versions 0x1100, 0x1101, 0x1151, 0x1200 and 0x1201, 0x40000 for every
+ * other version.
+ */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) {
+  const uint32_t gfx_version = HSA_GET_GFX_VERSION_FULL(id.ui32);
+
+  switch (gfx_version) {
+  case 0x1100:
+  case 0x1101:
+  case 0x1151:
+  case 0x1200:
+  case 0x1201:
+    return 0x60000;
+  default:
+    return 0x40000;
+  }
+}
+
+/* Legacy queue-creation entry point without an SDMA engine id.
+ *
+ * HSA_QUEUE_SDMA_BY_ENG_ID is rejected with HSAKMT_STATUS_ERROR because this
+ * signature cannot carry the engine id. Every other type is forwarded to
+ * hsaKmtCreateQueueExt with an engine id of 0.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
+					  HSA_QUEUE_TYPE Type,
+					  HSAuint32 QueuePercentage,
+					  HSA_QUEUE_PRIORITY Priority,
+					  void *QueueAddress,
+					  HSAuint64 QueueSizeInBytes,
+					  HsaEvent *Event,
+					  HsaQueueResource *QueueResource)
+{
+	const bool needs_engine_id = (Type == HSA_QUEUE_SDMA_BY_ENG_ID);
+
+	if (!needs_engine_id) {
+		return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage,
+					    Priority, 0, QueueAddress,
+					    QueueSizeInBytes, Event,
+					    QueueResource);
+	}
+
+	return HSAKMT_STATUS_ERROR;
+}
+
+/* Creates a compute (AQL) or SDMA queue on the given node.
+ *
+ * Parameters:
+ *   NodeId           - node whose WDDM device hosts the queue.
+ *   Type             - HSA_QUEUE_COMPUTE_AQL, HSA_QUEUE_SDMA or
+ *                      HSA_QUEUE_SDMA_BY_ENG_ID.
+ *   QueuePercentage  - not used by this implementation.
+ *   Priority         - must lie within [HSA_QUEUE_PRIORITY_MINIMUM,
+ *                      HSA_QUEUE_PRIORITY_MAXIMUM].
+ *   SdmaEngineId     - only logged for now; engine 0 is always used.
+ *   QueueAddress     - ring buffer; a reference is taken through
+ *                      queue_acquire_buffer().
+ *   QueueSizeInBytes - ring size (AQL: divided by 64 to get the packet count).
+ *   Event            - must be null (asserted).
+ *   QueueResource    - in/out descriptor; receives queue id and doorbell.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_INVALID_PARAMETER for a bad
+ * priority, null QueueResource, unknown node, a buffer that cannot be
+ * acquired, or an unsupported queue type.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
+					     HSA_QUEUE_TYPE Type,
+					     HSAuint32 QueuePercentage,
+					     HSA_QUEUE_PRIORITY Priority,
+					     HSAuint32 SdmaEngineId,
+					     void *QueueAddress,
+					     HSAuint64 QueueSizeInBytes,
+					     HsaEvent *Event,
+					     HsaQueueResource *QueueResource) {
+  CHECK_DXG_OPEN();
+  assert(Event == nullptr);
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (QueueResource == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  assert(device_);
+  /* assert() compiles away in release builds; do not dereference null there */
+  if (device_ == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (queue_acquire_buffer(QueueAddress) == false)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  switch (Type) {
+  case HSA_QUEUE_COMPUTE_AQL: {
+    assert(QueueResource->ErrorReason == nullptr);
+    uint64_t pkg_num = QueueSizeInBytes / 64;
+    uint32_t cmdbuf_size = device_->GetCmdbufSize();
+    uint32_t queue_engine = device_->GetComputeEngine();
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::ComputeQueue(
+        device_, QueueAddress, pkg_num,
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_write_ptr_aql),
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_read_ptr_aql),
+        QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);
+
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    // for doorbell_signal.hardware_doorbell_ptr
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+  } break;
+  case HSA_QUEUE_SDMA:
+  case HSA_QUEUE_SDMA_BY_ENG_ID: {
+    pr_debug("create sdma queue in engine %d\n", SdmaEngineId);
+    uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::SDMAQueue(
+		device_, QueueAddress, QueueSizeInBytes,
+		queue_engine, use_hws);
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+    QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr();
+    QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr();
+  } break;
+  default:
+    assert(false);
+    QueueResource->QueueId = 0;
+    QueueResource->Queue_DoorBell = nullptr;
+    /* No queue was created: drop the buffer reference taken above and report
+     * the failure instead of returning success with a null queue id.
+     */
+    queue_release_buffer(QueueAddress);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validates the arguments of a queue update. No queue state is changed by
+ * this implementation: a priority outside the supported range or a null
+ * queue id yields HSAKMT_STATUS_INVALID_PARAMETER, anything else succeeds.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
+    HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
+    void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+
+  const bool priority_in_range = Priority >= HSA_QUEUE_PRIORITY_MINIMUM &&
+                                 Priority <= HSA_QUEUE_PRIORITY_MAXIMUM;
+  if (!priority_in_range)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  const auto *target = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  return (target == nullptr) ? HSAKMT_STATUS_INVALID_PARAMETER
+                             : HSAKMT_STATUS_SUCCESS;
+}
+
+/* Destroys a queue created by hsaKmtCreateQueue/hsaKmtCreateQueueExt and
+ * releases the reference held on its ring buffer.
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null queue id.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+  /* Validate before touching the object: the ring address used to be read
+   * through the pointer ahead of this check.
+   */
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Fetch the ring address first; it is needed after the queue is gone. */
+  void *QueueAddress = queue_->GetHsaQueueAddr();
+
+  delete queue_;
+  queue_release_buffer(QueueAddress);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validates a CU mask request; the mask itself is not applied yet.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null queue id, a null mask,
+ * or a bit count that is zero or not a multiple of 32. Otherwise logs
+ * "not implemented" and returns HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
+                                             HSAuint32 CUMaskCount,
+                                             HSAuint32 *QueueCUMask) {
+  CHECK_DXG_OPEN();
+
+  if (reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId) == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  const bool mask_ok = CUMaskCount != 0 && QueueCUMask != nullptr &&
+                       (CUMaskCount % 32) == 0;
+  if (!mask_ok)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_warn_once("not implemented\n");
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: zero-fills *QueueInfo and then hits assert(false).
+ * With assertions compiled out the caller receives HSAKMT_STATUS_SUCCESS
+ * together with an all-zero HsaQueueInfo.
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when QueueInfo is NULL.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
+                                           HsaQueueInfo *QueueInfo) {
+  CHECK_DXG_OPEN();
+
+  if (QueueInfo == NULL)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  memset(QueueInfo, 0, sizeof(*QueueInfo));
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: all arguments are ignored, a "not implemented" warning is
+ * logged through pr_warn_once, and HSAKMT_STATUS_SUCCESS is returned.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
+                                             void *TrapHandlerBaseAddress,
+                                             HSAuint64 TrapHandlerSizeInBytes,
+                                             void *TrapBufferBaseAddress,
+                                             HSAuint64 TrapBufferSizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Not implemented: after rejecting a null queue id it hits assert(false).
+ * With assertions compiled out it returns HSAKMT_STATUS_SUCCESS without
+ * writing *firstGWS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
+                                            HSAuint32 *firstGWS) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Rings the doorbell of the queue identified by QueueId.
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null queue id.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto *target = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+  if (target != nullptr) {
+    target->RingDoorbell();
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return HSAKMT_STATUS_INVALID_PARAMETER;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp
new file mode 100644
index 0000000000..14b0faf1f8
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Stub: SPM is not available in this backend. Always returns
+ * HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: none of the output parameters (timeout, SizeCopied, isSPMDataLoss)
+ * is written. Always returns HSAKMT_STATUS_NOT_SUPPORTED once the
+ * CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(
+    HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout,
+    HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: always returns HSAKMT_STATUS_NOT_SUPPORTED once the
+ * CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp
new file mode 100644
index 0000000000..f2f8a10f68
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Helper functions for calling KFD SVM ioctl */
+
+/* Stub: SVM attributes are not supported in this backend. Arguments are
+ * ignored; always returns HSAKMT_STATUS_NOT_SUPPORTED once the
+ * CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: attrs is not filled in. Always returns HSAKMT_STATUS_NOT_SUPPORTED
+ * once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: the requested mode is ignored. Always returns
+ * HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN() guard has passed.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Reports the XNACK mode, which is always "disabled" in this backend.
+ *
+ * Writes false to *enable and returns HSAKMT_STATUS_SUCCESS (a "not
+ * supported" warning is still logged through pr_warn_once).
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when enable is null.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) {
+  CHECK_DXG_OPEN();
+
+  /* Guard the output pointer instead of dereferencing it blindly. */
+  if (enable == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_warn_once("not supported\n");
+  *enable = false;
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a
new file mode 100644
index 0000000000..3b21eb936d
Binary files /dev/null and b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a differ
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp
new file mode 100644
index 0000000000..a28bb29215
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <iostream>
+#include <ctime>
+#include <cstring>
+#include <cassert>
+#include "impl/wddm/device.h"
+
+/* Fills *Counters with the current GPU, CPU and system clock values.
+ *
+ * The GPU/CPU counters come from the node's WDDM device. The system counter
+ * is CLOCK_MONOTONIC_RAW expressed in nanoseconds, so
+ * SystemClockFrequencyHz is reported as 1 GHz. If clock_gettime() fails the
+ * system counter stays 0.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when Counters is null or the node
+ * has no device, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
+                                               HsaClockCounters *Counters) {
+  CHECK_DXG_OPEN();
+
+  if (Counters == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  std::memset(Counters, 0, sizeof(*Counters));
+
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  assert(device_);
+  /* assert() compiles away in release builds; do not dereference null there */
+  if (device_ == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter);
+
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0) {
+    /* Pure integer arithmetic: the former "tv_sec * 1e9" went through double
+     * and loses nanosecond precision once the value exceeds 2^53.
+     */
+    Counters->SystemClockCounter =
+        static_cast<HSAuint64>(ts.tv_sec) * 1000000000ULL +
+        static_cast<HSAuint64>(ts.tv_nsec);
+  }
+  Counters->SystemClockFrequencyHz = 1000000000;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp
new file mode 100644
index 0000000000..2db712e341
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp
@@ -0,0 +1,1463 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cctype>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <assert.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+/* Number of memory banks added by thunk on top of topology
+ * This only includes static heaps like LDS, scratch and SVM,
+ * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported
+ * dynamically based on whether mmio aperture was mapped
+ * successfully on this node.
+ */
+#define NUM_OF_IGPU_HEAPS 3
+#define NUM_OF_DGPU_HEAPS 3
+
+typedef struct { /* everything recorded for one topology node */
+  HsaNodeProperties node;
+  std::vector<HsaMemoryProperties> mem; /* indexed up to node.NumMemoryBanks */
+  std::vector<HsaCacheProperties> cache; /* CPU caches are appended by get_cpu_cache_info() */
+  std::vector<HsaIoLinkProperties> link; /* read up to node.NumIOLinks entries */
+} node_props_t;
+
+struct _topology_props { /* process-wide topology state, reached through dxg_topology */
+  HsaSystemProperties *g_system = nullptr;
+  std::vector<node_props_t> g_props;
+  std::vector<wsl::thunk::WDDMDevice *> wdevices_; /* GPU node N is wdevices_[N - 1] */
+  uint32_t wdevice_num_ = 0;
+  uint32_t num_sysfs_nodes = 0; /* adapter count + 1 (node 0 is the CPU) */
+  int processor_vendor = -1; /* SUPPORTED_PROCESSOR_VENDORS value, -1 until parsed */
+  double freq_max_ = 0.0; /* highest CPU frequency seen; presumably MHz - TODO confirm */
+};
+
+static _topology_props* dxg_topology = new _topology_props(); /* heap-allocated at static-init time */
+
+/* Supported System Vendors */
+enum SUPPORTED_PROCESSOR_VENDORS {
+  GENUINE_INTEL = 0,
+  AUTHENTIC_AMD,
+  IBM_POWER
+};
+/* Vendor strings matched by topology_search_processor_vendor(); indexed by the enum above */
+static const char *supported_processor_vendor_name[] = {
+  "GenuineIntel",
+  "AuthenticAMD",
+  "" // POWER requires a different search method
+};
+
+static HSAKMT_STATUS topology_take_snapshot(void);
+static void topology_drop_snapshot(void);
+
+/* One processor's entry filled from /proc/cpuinfo by topology_parse_cpuinfo() */
+struct proc_cpuinfo {
+  uint32_t proc_num;                     /* processor */
+  uint32_t apicid;                       /* value of the "apicid" line */
+  char model_name[HSA_PUBLIC_NAME_SIZE]; /* value of the "model name" line */
+};
+
+/* Per-CPU bookkeeping used while collecting cache information: which CPU is
+ * being scanned and how many caches end up attributed to it.
+ */
+typedef struct cpu_cacheinfo {
+  int32_t proc_num;    /* this cpu's processor number */
+  uint32_t num_caches; /* in: index dirs found; out: caches listed under this cpu */
+} cpu_cacheinfo_t;
+
+/* num_subdirs - count the entries of a directory whose names match a prefix
+ *	@dirpath - directory to scan
+ *	@prefix - only names starting with prefix are counted; "" counts all.
+ *	Return - number of matching entries ("." and ".." are never counted);
+ *		0 when the directory cannot be opened. Entries are matched by
+ *		name only: the entry type is not checked.
+ */
+static int num_subdirs(char *dirpath, const char *prefix) {
+  const int wanted_len = strlen(prefix);
+  int matches = 0;
+  DIR *handle = opendir(dirpath);
+  if (!handle)
+    return matches;
+
+  for (struct dirent *entry = readdir(handle); entry; entry = readdir(handle)) {
+    const char *name = entry->d_name;
+    const bool is_dot = !strcmp(name, ".") || !strcmp(name, "..");
+    const bool has_prefix = !wanted_len || !strncmp(name, prefix, wanted_len);
+    if (!is_dot && has_prefix)
+      matches++;
+  }
+  closedir(handle);
+
+  return matches;
+}
+
+/* fscanf_dec - read an unsigned decimal number from the start of a file
+ *      @file [IN ] path of the file to read
+ *      @num [OUT] parsed value; not written when parsing fails
+ * Return: INVALID_PARAMETER if the file cannot be opened, ERROR if it does
+ *         not start with a decimal number, SUCCESS otherwise
+ */
+static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) {
+  FILE *stream = fopen(file, "r");
+  if (stream == NULL) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  const bool parsed = (fscanf(stream, "%u", num) == 1);
+  if (!parsed)
+    pr_err("Failed to parse %s as a decimal.\n", file);
+
+  /* the stream is closed on the success and the parse-failure path alike */
+  fclose(stream);
+  return parsed ? HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_ERROR;
+}
+
+/* fscanf_str - read the first whitespace-delimited token of a file
+ *      @file [IN ] path of the file to read
+ *      @str [OUT] token read; the buffer must hold at least 256 bytes
+ * Return: INVALID_PARAMETER if the file cannot be opened, ERROR if no token
+ *         could be read, SUCCESS otherwise
+ */
+static HSAKMT_STATUS fscanf_str(char *file, char *str) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  FILE *fd = fopen(file, "r");
+  if (!fd) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+  /* Bound the read: a bare %s can overrun the callers' 256-byte buffers */
+  if (fscanf(fd, "%255s", str) != 1) {
+    pr_err("Failed to parse %s as a string.\n", file);
+    ret = HSAKMT_STATUS_ERROR;
+  }
+  fclose(fd);
+  return ret;
+}
+
+/* fscanf_size - read a file whose content is a size such as "512", "32K" or "8M"
+ *      @file [IN ] file to read
+ *      @bytes [OUT] size in bytes; K, M and G scale by 2^10, 2^20 and 2^30
+ * Return: INVALID_PARAMETER if the file cannot be opened, ERROR if no number
+ *         is found or the unit is not K/M/G, SUCCESS otherwise
+ */
+static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  char unit = '\0';
+
+  FILE *fd = fopen(file, "r");
+  if (!fd) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  const int n = fscanf(fd, "%u%c", bytes, &unit);
+  if (n < 1) {
+    pr_err("Failed to parse %s\n", file);
+    ret = HSAKMT_STATUS_ERROR;
+  }
+  /* A number followed only by whitespace (e.g. "512\n") has no unit; without
+   * this check %c would hand us the newline and report a bogus error. */
+  if (n == 2 && !isspace((unsigned char)unit)) {
+    switch (unit) {
+    case 'K':
+      *bytes <<= 10;
+      break;
+    case 'M':
+      *bytes <<= 20;
+      break;
+    case 'G':
+      *bytes <<= 30;
+      break;
+    default:
+      ret = HSAKMT_STATUS_ERROR;
+      break;
+    }
+  }
+  fclose(fd);
+  return ret;
+}
+
+/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into
+ *		      SiblingMap in cache
+ *	@shared_cpu_map [IN ] shared_cpu_map string; tokenized in place by strtok
+ *	@cpuinfo [IN ] cpuinfo to get apicid, indexed by processor number
+ *	@this_cache [OUT] CPU cache to fill in SiblingMap
+ */
+static void cpumap_to_cpu_ci(char *shared_cpu_map,
+                             const std::vector<struct proc_cpuinfo>& cpuinfo,
+                             HsaCacheProperties *this_cache) {
+  uint32_t proc, apicid, mask;
+
+  /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x
+   * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually
+   * procs), it's presented in X1. The next 32 is in X2, and so on.
+   */
+  int num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */
+  char *ch_ptr = strtok(shared_cpu_map, ",");
+  while (num_hexs-- > 0 && ch_ptr) { /* stop if there are fewer tokens than estimated */
+    mask = strtoul(ch_ptr, NULL, 16); /* each X */
+    for (int bit = 0; bit < 32; bit++) {
+      if (!((1u << bit) & mask)) /* unsigned: 1 << 31 overflows a signed int */
+        continue;
+      proc = num_hexs * 32 + bit;
+      if (proc >= cpuinfo.size()) /* the map may name CPUs cpuinfo has no entry for */
+        continue;
+      apicid = cpuinfo[proc].apicid;
+      if (apicid >= HSA_CPU_SIBLINGS) {
+        pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS);
+        continue;
+      }
+      this_cache->SiblingMap[apicid] = 1;
+    }
+    ch_ptr = strtok(NULL, ",");
+  }
+}
+
+/* get_cpu_cache_info - get specified CPU's cache information from sysfs
+ *     @prefix [IN] sysfs path for target cpu cache,
+ *                  /sys/devices/system/node/nodeX/cpuY/cache
+ *     @cpuinfo [IN] /proc/cpuinfo data to get apicid
+ *     @cache [OUT] every cache attributed to this cpu is appended here
+ *     @cpu_ci: CPU specified. This parameter is an input and also an output.
+ *             [IN] cpu_ci.proc_num: processor number of this cpu
+ *             [IN] cpu_ci.num_caches: number of index dirs
+ *             [OUT] cpu_ci.num_caches: reduces when shared with other cpu(s)
+ * Return: number of cache reported from this cpu
+ */
+static int get_cpu_cache_info(const char *prefix,
+                              const std::vector<struct proc_cpuinfo>& cpuinfo,
+                              std::vector<HsaCacheProperties>& cache,
+                              cpu_cacheinfo_t& cpu_ci) {
+  uint32_t first_cpu;
+  char path[256], str[256];
+  const bool is_power9 = dxg_topology->processor_vendor == IBM_POWER &&
+                         !cpuinfo.empty() &&
+                         strcmp(cpuinfo[0].model_name, "POWER9") == 0;
+
+  HsaCacheProperties this_cache;
+  int num_idx = cpu_ci.num_caches;
+  for (int idx = 0; idx < num_idx; idx++) {
+    memset(&this_cache, 0, sizeof(this_cache));
+    /* If this cache is shared by multiple CPUs, we only need
+     * to list it in the first CPU.
+     */
+    if (is_power9) {
+      // POWER9 has SMT4
+      if (cpu_ci.proc_num & 0x3) {
+        /* proc is not 0,4,8,etc.  Skip and reduce the cache count. */
+        --cpu_ci.num_caches;
+        continue;
+      }
+    } else {
+      snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx);
+      /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4...
+       * For both cases, this cache is listed to proc n1 only.
+       * An unreadable list, or a proc cpuinfo does not cover, is skipped
+       * too instead of using an uninitialized value or a bad index.
+       */
+      if (fscanf_dec(path, &first_cpu) != HSAKMT_STATUS_SUCCESS ||
+          (uint32_t)cpu_ci.proc_num != first_cpu ||
+          first_cpu >= cpuinfo.size()) {
+        /* proc is not n1. Skip and reduce the cache count. */
+        --cpu_ci.num_caches;
+        continue;
+      }
+      this_cache.ProcessorIdLow = cpuinfo[cpu_ci.proc_num].apicid;
+    }
+
+    /* CacheLevel */
+    snprintf(path, 256, "%s/index%d/level", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLevel);
+    /* CacheType */
+    snprintf(path, 256, "%s/index%d/type", prefix, idx);
+
+    memset(str, 0, sizeof(str));
+    fscanf_str(path, str);
+    if (!strcmp(str, "Data"))
+      this_cache.CacheType.ui32.Data = 1;
+    if (!strcmp(str, "Instruction"))
+      this_cache.CacheType.ui32.Instruction = 1;
+    if (!strcmp(str, "Unified")) {
+      this_cache.CacheType.ui32.Data = 1;
+      this_cache.CacheType.ui32.Instruction = 1;
+    }
+    this_cache.CacheType.ui32.CPU = 1;
+    /* CacheSize */
+    snprintf(path, 256, "%s/index%d/size", prefix, idx);
+    fscanf_size(path, &this_cache.CacheSize);
+    /* CacheLineSize */
+    snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLineSize);
+    /* CacheAssociativity */
+    snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheAssociativity);
+    /* CacheLinesPerTag */
+    snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLinesPerTag);
+    /* CacheSiblings */
+    snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx);
+    if (fscanf_str(path, str) == HSAKMT_STATUS_SUCCESS) /* on failure str still holds the type */
+      cpumap_to_cpu_ci(str, cpuinfo, &this_cache);
+
+    cache.push_back(this_cache);
+  }
+
+  return cpu_ci.num_caches;
+}
+
+/* Map a GPU node id (1-based; node 0 is the CPU) to its WDDM device; when the
+ * id is out of range, set @device to nullptr and return HSAKMT_STATUS_ERROR. */
+static HSAKMT_STATUS topology_map_node_id(uint32_t node_id,
+                                          wsl::thunk::WDDMDevice *&device) {
+  device = nullptr;
+  if (!node_id || node_id >= dxg_topology->num_sysfs_nodes ||
+      node_id > dxg_topology->wdevices_.size()) /* never index past the vector itself */
+    return HSAKMT_STATUS_ERROR;
+  device = dxg_topology->wdevices_[node_id - 1];
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Re-enumerate the WDDM adapters and report the node count in @props.
+ * Devices from an earlier call are deleted first. NumNodes is the adapter
+ * count plus one (node 0 is the CPU); fails when no adapter is found. */
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props) {
+  auto& devices = dxg_topology->wdevices_;
+  std::memset(&props, 0, sizeof(props));
+  /* Discard the previous enumeration; heaps are finalized before that */
+  dxg_runtime->HeapFini();
+  for (wsl::thunk::WDDMDevice *stale : devices)
+    delete stale;
+  devices.clear();
+
+  WDDMCreateDevices(devices);
+  const int adapter_count = devices.size();
+  if (!adapter_count) {
+    pr_err("No WDDM adapters found.\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  dxg_topology->num_sysfs_nodes = adapter_count + 1;
+  dxg_runtime->HeapInit();
+  props.NumNodes = dxg_topology->num_sysfs_nodes;
+  /* Clamp default_node to the highest valid node id */
+  if (dxg_runtime->default_node > adapter_count)
+    dxg_runtime->default_node = adapter_count;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+void topology_setup_is_dgpu_param(HsaNodeProperties *props) {
+  /* A node with GPU compute cores and no CPU cores marks the whole system
+   * as dGPU (the runtime also treats some APUs as dGPU); never cleared here */
+  const bool gpu_only = props->NumFComputeCores && !props->NumCPUCores;
+  if (gpu_only) dxg_runtime->hsakmt_is_dgpu = true;
+}
+
+/* Copy the model name of the cpuinfo entry whose apicid equals props.CComputeIdLo:
+ * into AMDName for CPU-only nodes, into MarketingName always. ERROR if no match. */
+static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties& props,
+                                                 const std::vector<proc_cpuinfo>& cpuinfo) {
+  for (const proc_cpuinfo& cpu : cpuinfo) {
+    if (props.CComputeIdLo != cpu.apicid) continue;
+    if (!props.DeviceId) { /* CPU-only node */
+      strncpy((char *)props.AMDName, cpu.model_name, sizeof(props.AMDName) - 1);
+      props.AMDName[sizeof(props.AMDName) - 1] = '\0'; /* strncpy may not terminate */
+    }
+    /* Convert from UTF8 to UTF16 (element-wise copy; the bound is tested first) */
+    int j;
+    for (j = 0; j < HSA_PUBLIC_NAME_SIZE - 1 && cpu.model_name[j] != '\0'; j++)
+      props.MarketingName[j] = cpu.model_name[j];
+    props.MarketingName[j] = '\0';
+    return HSAKMT_STATUS_SUCCESS;
+  }
+  return HSAKMT_STATUS_ERROR;
+}
+
+/* Map a vendor string to its SUPPORTED_PROCESSOR_VENDORS index, or -1.
+ * POWER9 is recognized by its own string rather than through the name table. */
+static int topology_search_processor_vendor(const std::string& processor_name) {
+  if (processor_name == "POWER9, altivec supported")
+    return IBM_POWER;
+  for (unsigned int v = 0; v < ARRAY_LEN(supported_processor_vendor_name); v++)
+    if (processor_name == supported_processor_vendor_name[v]) return v;
+  return -1;
+}
+
+/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required
+ *			topology information
+ * cpuinfo [OUT]: one entry per processor, sized before the call; a processor
+ *		  number >= cpuinfo.size() aborts with HSAKMT_STATUS_NO_MEMORY
+ */
+static HSAKMT_STATUS topology_parse_cpuinfo(std::vector<proc_cpuinfo>& cpuinfo) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  uint32_t num_procs = cpuinfo.size();
+
+  std::ifstream cpuinfo_max_freq(
+      "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
+  if (cpuinfo_max_freq) {
+    std::string line;
+    if (std::getline(cpuinfo_max_freq, line) && !line.empty()) /* stod("") would throw */
+      dxg_topology->freq_max_ = static_cast<uint32_t>(std::stod(line) / 1000);
+  }
+
+  std::ifstream cpuinfo_file("/proc/cpuinfo");
+  if (!cpuinfo_file) {
+    pr_err("Failed to open /proc/cpuinfo. Unable to get CPU information");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  std::string line;
+  uint32_t proc = 0;
+  while (std::getline(cpuinfo_file, line)) {
+    if (line.substr(0, 9) == "processor") {
+      proc = std::stoi(line.substr(line.find(':') + 2));
+      if (proc >= num_procs) {
+        pr_err("cpuinfo contains processor %d larger than %u\n", proc, num_procs);
+        return HSAKMT_STATUS_NO_MEMORY;
+      }
+      continue;
+    }
+
+    if (line.substr(0, 9) == "vendor_id" && dxg_topology->processor_vendor == -1) {
+      std::string vendor = line.substr(line.find(':') + 2);
+      dxg_topology->processor_vendor = topology_search_processor_vendor(vendor.c_str());
+      continue;
+    }
+
+    if (line.substr(0, 10) == "model name") {
+      std::string model_name = line.substr(line.find(':') + 2);
+      /* strncpy() adds no NUL when the source fills the buffer: cap and terminate */
+      std::strncpy(cpuinfo[proc].model_name, model_name.c_str(), HSA_PUBLIC_NAME_SIZE - 1);
+      cpuinfo[proc].model_name[HSA_PUBLIC_NAME_SIZE - 1] = '\0';
+      continue;
+    }
+
+    if (line.substr(0, 6) == "apicid") {
+      cpuinfo[proc].apicid = std::stoi(line.substr(line.find(':') + 2));
+      continue;
+    }
+
+    if (!cpuinfo_max_freq) { /* no usable sysfs maximum: track the largest "cpu MHz" */
+      if (line.substr(0, 7) == "cpu MHz") {
+        double freq = std::stod(line.substr(line.find(':') + 2));
+        if (freq > dxg_topology->freq_max_) {
+          dxg_topology->freq_max_ = freq;
+        }
+        continue;
+      }
+    }
+  }
+
+  if (dxg_topology->processor_vendor < 0) {
+    pr_err("Failed to get Processor Vendor. Setting to %s", supported_processor_vendor_name[GENUINE_INTEL]);
+    dxg_topology->processor_vendor = GENUINE_INTEL;
+  }
+
+  return ret;
+}
+
+static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
+                                                   HsaNodeProperties& props,
+                                                   bool& p2p_links,
+                                                   uint32_t& num_p2pLinks) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  memset(&props, 0, sizeof(props));
+  p2p_links = false; /* both p2p outputs are only ever reset in this function */
+  num_p2pLinks = 0;
+
+  props.MaxEngineClockMhzCCompute = dxg_topology->freq_max_;
+
+  if (node_id == 0) {
+    /* CPU node */
+    props.NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN);
+    props.NumMemoryBanks = 1;
+    props.KFDGpuID = 0;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  /* gpu node */
+  wsl::thunk::WDDMDevice *device;
+  ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  props.NumCPUCores = 0;
+  props.NumFComputeCores = device->SimdPerCu() * device->ComputeUnitCount();
+  props.NumMemoryBanks = 1;
+  props.NumCaches = 3; /* fixed counts, not queried from the device */
+  props.NumIOLinks = 1;
+  props.CComputeIdLo = 0;
+  props.FComputeIdLo = 0;
+  props.Capability.ui32.ASICRevision = device->AsicRevision();
+  props.Capability.ui32.WatchPointsTotalBits =
+      std::log2(device->WatchPointsNum()); /* NOTE(review): assumes WatchPointsNum() > 0 */
+  props.MaxWavesPerSIMD = device->WavePerCu() / device->SimdPerCu(); /* NOTE(review): divisor not checked for 0 */
+  props.LDSSizeInKB = device->LdsSize() / 1024;
+  props.GDSSizeInKB = 0;
+  props.WaveFrontSize = device->WavefrontSize();
+  props.NumShaderBanks = device->NumShaderEngine();
+  props.NumArrays = device->ShaderArrayPerShaderEngine();
+  props.NumCUPerArray = device->ComputeUnitCount() / props.NumArrays; /* NOTE(review): divisor not checked for 0 */
+  props.NumSIMDPerCU = device->SimdPerCu();
+  props.MaxSlotsScratchCU = device->MaxScratchSlotsPerCu();
+  props.VendorId = 0x1002;
+  props.DeviceId = device->DeviceId();
+  props.LocationId = device->PciBusAddr();
+  props.LocalMemSize = 0;
+  props.MaxEngineClockMhzFCompute = device->MaxEngineClockMhz();
+  props.DrmRenderMinor = node_id;
+
+  {
+    /* Copy the product name element by element, truncated to fit MarketingName */
+    int i;
+    const char *name = device->ProductName();
+    for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++)
+      props.MarketingName[i] = name[i];
+    props.MarketingName[i] = '\0';
+  }
+  props.uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion();
+  props.DebugProperties.Value = 0;
+  props.HiveID = 0;
+  props.NumSdmaEngines = device->NumSdmaEngine();
+  props.NumSdmaXgmiEngines = 0;
+  props.NumSdmaQueuesPerEngine = 6; // TODO
+  props.NumCpQueues = device->GetNumCpQueues();
+  props.NumGws = 0;
+  /*
+   * In native Linux this value is 1 when the ASIC is an APU and 0 when it
+   * is a dGPU. clr uses it to set hostUnifiedMemory_, but WSL does not
+   * support that feature yet, so force the value to 0 for now.
+   */
+  props.Integrated = 0;
+  props.Domain = device->Domain();
+  props.UniqueID = device->Uuid();
+  props.NumXcc = 1;
+  props.KFDGpuID = device->DeviceId(); // TODO
+  props.FamilyID = device->GfxFamily();
+
+  props.EngineId.ui32.uCode = device->GetMecFwVersion();
+  char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION");
+  if (envvar) {
+    char dummy = '\0';
+    uint32_t major = 0, minor = 0, step = 0;
+    /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping */
+    /* Exactly three fields are accepted: a trailing character makes sscanf return 4 */
+    if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) ||
+        (major > 63 || minor > 255 || step > 255)) {
+      pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar);
+      return HSAKMT_STATUS_ERROR;
+    }
+    props.OverrideEngineId.ui32.Major = major & 0x3f;
+    props.OverrideEngineId.ui32.Minor = minor & 0xff;
+    props.OverrideEngineId.ui32.Stepping = step & 0xff;
+  }
+  /* EngineId always comes from the device; the override above only fills OverrideEngineId */
+  props.EngineId.ui32.Major = device->Major();
+  props.EngineId.ui32.Minor = device->Minor();
+  props.EngineId.ui32.Stepping = device->Stepping();
+
+  snprintf((char *)props.AMDName, sizeof(props.AMDName) - 1, "GFX%06x",
+           HSA_GET_GFX_VERSION_FULL(props.EngineId.ui32));
+
+  if (!dxg_runtime->is_svm_api_supported)
+    props.Capability.ui32.SVMAPISupported = 0;
+  props.Capability.ui32.DoorbellType = 2;
+
+  /* Get VGPR/SGPR size in byte per CU */
+  props.SGPRSizePerCU = SGPR_SIZE_PER_CU;
+  props.VGPRSizePerCU = get_vgpr_size_per_cu(props.EngineId);
+
+  if (props.NumFComputeCores)
+    assert(props.EngineId.ui32.Major &&
+           "HSA_OVERRIDE_GFX_VERSION may be needed");
+
+  return ret;
+}
+
+/* Describe memory bank @mem_id of @node_id. Node 0 reports total system RAM;
+ * a GPU node reports its local (dGPU) or non-local heap. @mem_id is unused. */
+static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id,
+                                                  uint32_t mem_id,
+                                                  HsaMemoryProperties& props) {
+  std::memset(&props, 0, sizeof(props));
+  if (node_id == 0) {
+    /* CPU node */
+    props.HeapType = HSA_HEAPTYPE_SYSTEM;
+
+    struct sysinfo info;
+    if (sysinfo(&info) != 0)
+      return HSAKMT_STATUS_ERROR;
+    /* sysinfo() reports totalram in units of mem_unit bytes */
+    const uint64_t total_bytes = (uint64_t)info.totalram * info.mem_unit;
+    props.SizeInBytes = total_bytes;
+
+    /* props.SizeInBytes is the actual physical system
+     * memory size. Reserve 1/16th for WSL system usage. */
+    dxg_runtime->max_single_alloc_size = total_bytes - (total_bytes >> 4);
+
+    props.Flags.MemoryProperty = 0;
+    /* TODO: sudo dmidecode --type memory doesn't work on wsl */
+    props.Width = 64;
+    props.MemoryClockMax = 2133;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  wsl::thunk::WDDMDevice *device;
+  HSAKMT_STATUS ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  props.HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE;
+  if (device->IsDgpu())
+    props.SizeInBytes = device->LocalHeapSize();
+  else
+    props.SizeInBytes = device->NonLocalHeapSize();
+
+  props.Width = device->MemoryBusWidth();
+  props.MemoryClockMax = device->MaxMemoryClockMhz();
+  return ret;
+}
+
+/* topology_get_cpu_cache_props - Read CPU cache information from sysfs
+ *	@node [IN] CPU node number
+ *	@cpuinfo [IN] /proc/cpuinfo data
+ *	@tbl [OUT] the node table to fill up (tbl.cache and tbl.node.NumCaches)
+ * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure
+ */
+static HSAKMT_STATUS topology_get_cpu_cache_props(int node,
+                                                  const std::vector<proc_cpuinfo>& cpuinfo,
+                                                  node_props_t& tbl) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  /* Get max path size from /sys/devices/system/node/node%d/%s/cache
+   * below, which will max out according to the largest filename,
+   * which can be present twice in the string above. 29 is for the prefix
+   * and the +6 is for the cache suffix
+   */
+#ifndef MAXNAMLEN
+/* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but
+ * not musl */
+#define MAXNAMLEN NAME_MAX
+#endif
+  constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6);
+  char path[MAXPATHSIZE], node_dir[MAXPATHSIZE];
+  int max_cpus;
+  int cache_cnt = 0;
+  DIR *dirp = NULL;
+  struct dirent *dir;
+
+  /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */
+  int node_real = node;
+  if (dxg_topology->processor_vendor == IBM_POWER && !cpuinfo.empty() &&
+      !strcmp(cpuinfo[0].model_name, "POWER9"))
+    node_real = node * 8;
+  snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real);
+  /* Other than cpuY folders, this dir also has cpulist and cpumap */
+  max_cpus = num_subdirs(node_dir, "cpu");
+  if (max_cpus <= 0) {
+    /* If CONFIG_NUMA is not enabled in the kernel,
+     * /sys/devices/system/node doesn't exist.
+     */
+    if (node) { /* CPU node must be 0 or something is wrong */
+      pr_err("Fail to get cpu* dirs under %s.", node_dir);
+      ret = HSAKMT_STATUS_ERROR;
+      goto exit;
+    }
+    /* Fall back to use /sys/devices/system/cpu */
+    snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu");
+    max_cpus = num_subdirs(node_dir, "cpu");
+    if (max_cpus <= 0) {
+      pr_err("Fail to get cpu* dirs under %s\n", node_dir);
+      ret = HSAKMT_STATUS_ERROR;
+      goto exit;
+    }
+  }
+
+  dirp = opendir(node_dir);
+  if (!dirp) { /* readdir() must not be handed a NULL stream */
+    pr_err("Fail to open %s\n", node_dir);
+    ret = HSAKMT_STATUS_ERROR;
+    goto exit;
+  }
+  while ((dir = readdir(dirp)) != 0) {
+    if (strncmp(dir->d_name, "cpu", 3))
+      continue;
+    if (!isdigit(dir->d_name[3])) /* ignore files like cpulist */
+      continue;
+    if (strlen(node_dir) + strlen(dir->d_name) + strlen("/cache") + 2 < MAXPATHSIZE) {
+      std::string path_str = std::string(node_dir) + "/" + dir->d_name + "/cache";
+      strncpy(path, path_str.c_str(), MAXPATHSIZE);
+      path[MAXPATHSIZE - 1] = '\0';
+    } else {
+      pr_err("Path is too long and was truncated.\n");
+      goto exit;
+    }
+    cpu_cacheinfo_t cpu_ci;
+    cpu_ci.num_caches = num_subdirs(path, "index");
+    cpu_ci.proc_num= atoi(dir->d_name+3);
+    cache_cnt += get_cpu_cache_info(path, cpuinfo, tbl.cache, cpu_ci);
+  }
+  assert(cache_cnt == tbl.cache.size());
+  tbl.node.NumCaches = cache_cnt;
+
+exit:
+  if (dirp)
+    closedir(dirp);
+  return ret;
+}
+
+/* For GPU node @node_id, describe its IO link: a PCI Express link to node 0
+ * with weight 20. Nothing is read from sysfs here; the 32/64-bit atomics
+ * flags are derived from the device's SupportPlatformAtomic(), and
+ * @iolink_id and @p2pLink are not used.
+ *
+ * @props is always zeroed first. If @node_id does not map to a WDDM device,
+ * the error from topology_map_node_id() is returned and @props stays zeroed;
+ * otherwise HSAKMT_STATUS_SUCCESS is returned.
+ */
+static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
+                                                     uint32_t iolink_id,
+                                                     HsaIoLinkProperties& props,
+                                                     bool p2pLink) {
+  std::memset(&props, 0, sizeof(props));
+  wsl::thunk::WDDMDevice *device;
+  HSAKMT_STATUS ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS) /* device is nullptr here: do not dereference it */
+    return ret;
+  props.IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS;
+  props.VersionMajor = props.VersionMinor = 0;
+  props.NodeFrom = node_id;
+  props.NodeTo = 0;
+  props.Weight = 20;
+  props.Flags.ui32.Override = 1;
+  props.Flags.ui32.NonCoherent = 1;
+  props.Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic());
+  props.Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic());
+  props.RecSdmaEngIdMask = 0;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* topology_get_free_io_link_slot_for_node - For the given node_id, find the
+ * next available free slot to add an io_link; NULL if there is none
+ */
+static HsaIoLinkProperties *
+topology_get_free_io_link_slot_for_node(uint32_t node_id,
+                                        const HsaSystemProperties& sys_props,
+                                        std::vector<node_props_t>& node_props) {
+  /* Validate node_id before it is used to index node_props */
+  if (node_id >= sys_props.NumNodes || node_id >= node_props.size()) {
+    pr_err("Invalid node [%d]\n", node_id);
+    return NULL;
+  }
+  std::vector<HsaIoLinkProperties>& props = node_props[node_id].link;
+  if (!props.size()) {
+    pr_err("No io_link reported for Node [%d]\n", node_id);
+    return NULL;
+  }
+
+  /* The slot must also exist in the link table itself */
+  if (node_props[node_id].node.NumIOLinks >= sys_props.NumNodes - 1 ||
+      node_props[node_id].node.NumIOLinks >= props.size()) {
+    pr_err("No more space for io_link for Node [%d]\n", node_id);
+    return NULL;
+  }
+  return &props[node_props[node_id].node.NumIOLinks];
+}
+
+/* topology_add_io_link_for_node - If a free slot is available,
+ * fill it with an io_link from @node_from to @node_to and bump the
+ * node's NumIOLinks; HSAKMT_STATUS_NO_MEMORY is returned otherwise.
+ * TODO: Add other members of HsaIoLinkProperties
+ */
+static HSAKMT_STATUS topology_add_io_link_for_node(
+    uint32_t node_from, const HsaSystemProperties& sys_props,
+    std::vector<node_props_t>& node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to,
+    uint32_t Weight) {
+  HsaIoLinkProperties *slot =
+      topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props);
+  if (slot == NULL)
+    return HSAKMT_STATUS_NO_MEMORY;
+
+  /* Only these four members are written; the rest keep their prior values */
+  slot->Weight = Weight;
+  slot->NodeTo = node_to;
+  slot->NodeFrom = node_from;
+  slot->IoLinkType = IoLinkType;
+  ++node_props[node_from].node.NumIOLinks;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Find the CPU that this GPU (gpu_node) directly connects to, or -1 */
+static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node,
+                                       const std::vector<node_props_t>& node_props) {
+  const node_props_t& gpu = node_props[gpu_node];
+  const uint32_t link_count = gpu.node.NumIOLinks;
+  /* Nodes with KFDGpuID == 0 and nodes without links have no answer */
+  if (gpu.node.KFDGpuID == 0 || gpu.link.empty() || link_count == 0)
+    return -1;
+
+  for (uint32_t n = 0; n < link_count; n++) {
+    const HsaIoLinkProperties& candidate = gpu.link[n];
+    const bool is_pcie = candidate.IoLinkType == HSA_IOLINKTYPE_PCIEXPRESS;
+    if (is_pcie && candidate.Weight <= 20) /* >20 is GPU->CPU->GPU */
+      return candidate.NodeTo;
+  }
+  return -1;
+}
+
+/* Get node1->node2 IO link information. This should be a direct link that has
+ * been created in the kernel. @weight and @type are optional (may be NULL).
+ */
+static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2,
+                                            const std::vector<node_props_t>& node_props,
+                                            HSAuint32 *weight,
+                                            HSA_IOLINKTYPE *type) {
+  const node_props_t& from = node_props[node1];
+  if (from.link.empty())
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  for (uint32_t n = 0; n < from.node.NumIOLinks; n++) {
+    const HsaIoLinkProperties& candidate = from.link[n];
+    if (candidate.NodeTo != node2)
+      continue;
+    /* First match wins; only the requested outputs are written */
+    if (weight != NULL)
+      *weight = candidate.Weight;
+    if (type != NULL)
+      *type = candidate.IoLinkType;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+  return HSAKMT_STATUS_INVALID_PARAMETER;
+}
+
+static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2,
+                                              const std::vector<node_props_t>& node_props,
+                                              HSAuint32 *weight,
+                                              HSA_IOLINKTYPE *type) {
+  int32_t dir_cpu1 = -1, dir_cpu2 = -1; /* CPU directly linked to node1 / node2, -1 if none */
+  HSAKMT_STATUS ret;
+  uint32_t i;
+
+  *weight = 0; /* both outputs are required: they are written unconditionally */
+  *type = HSA_IOLINKTYPE_UNDEFINED;
+
+  if (node1 == node2)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* CPU->CPU is not an indirect link */
+  if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  /* Two nodes sharing the same non-zero HiveID are rejected as well */
+  if (node_props[node1].node.HiveID && node_props[node2].node.HiveID &&
+      node_props[node1].node.HiveID == node_props[node2].node.HiveID)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (node_props[node1].node.KFDGpuID)
+    dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props);
+  if (node_props[node2].node.KFDGpuID)
+    dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props);
+
+  if (dir_cpu1 < 0 && dir_cpu2 < 0)
+    return HSAKMT_STATUS_ERROR;
+
+  /* If node2 (dst) is a GPU it must expose a public frame buffer (large BAR) for host access */
+  if (node_props[node2].node.KFDGpuID) {
+    for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i)
+      if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)
+        break;
+    if (i >= node_props[node2].node.NumMemoryBanks) /* no public frame buffer found */
+      return HSAKMT_STATUS_ERROR;
+  }
+  /* Possible topology:
+   *   GPU --(weight1) -- CPU -- (weight2) -- GPU
+   *   GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- GPU
+   *   GPU --(weight1) -- CPU -- (weight2) -- CPU
+   *   CPU -- (weight2) -- CPU -- (weight3) -- GPU
+   */
+  HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0; /* hops that do not exist stay 0 */
+  if (dir_cpu1 >= 0) { /* GPU->CPU ... */
+    if (dir_cpu2 >= 0) {
+      if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ {
+        ret =
+            get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        ret =
+            get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type);
+      } else /* GPU->CPU->CPU->GPU*/ {
+        ret =
+            get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2,
+                                     type);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        /* On QPI interconnection, GPUs can't access
+         * each other if they are attached to different
+         * CPU sockets. CPU<->CPU weight larger than 20
+         * means the two CPUs are in different sockets.
+         */
+        if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20)
+          return HSAKMT_STATUS_NOT_SUPPORTED;
+        ret =
+            get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL);
+      }
+    } else /* GPU->CPU->CPU */ {
+      ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+      if (ret != HSAKMT_STATUS_SUCCESS)
+        return ret;
+      ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type);
+    }
+  } else { /* CPU->CPU->GPU */
+    ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type);
+    if (ret != HSAKMT_STATUS_SUCCESS)
+      return ret;
+    ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL);
+  }
+
+  if (ret != HSAKMT_STATUS_SUCCESS) /* status of the last hop queried above */
+    return ret;
+
+  *weight = weight1 + weight2 + weight3; /* indirect weight is the sum of its hops */
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Try to add an indirect IO link for every ordered pair of distinct nodes.
+ *
+ * For each unordered pair (i, j) with i < j the direction i->j is attempted
+ * first and j->i second. A direction is skipped when
+ * get_indirect_iolink_info() reports a zero weight for it. A failure to
+ * store a link is logged and does not stop the remaining pairs.
+ *
+ * @sys_props:  system properties; NumNodes bounds the node indices used
+ * @node_props: per-node properties, read for link lookup and updated with
+ *              the links that are added
+ */
+static void
+topology_create_indirect_gpu_links(const HsaSystemProperties& sys_props,
+                                   std::vector<node_props_t>& node_props) {
+  const uint32_t num_nodes = sys_props.NumNodes;
+
+  /* Written as "i + 1 < num_nodes" instead of "i < num_nodes - 1": the
+   * unsigned subtraction wraps around when NumNodes is 0, which made the
+   * outer loop spin through the whole 32-bit range.
+   */
+  for (uint32_t i = 0; i + 1 < num_nodes; i++) {
+    for (uint32_t j = i + 1; j < num_nodes; j++) {
+      /* Forward direction first, then the reverse one. */
+      const uint32_t directions[2][2] = {{i, j}, {j, i}};
+
+      for (const auto &dir : directions) {
+        const uint32_t from = dir[0];
+        const uint32_t to = dir[1];
+        HSAuint32 weight = 0;
+        HSA_IOLINKTYPE type = HSA_IOLINKTYPE_UNDEFINED;
+
+        /* The status is intentionally ignored: a zero weight is the signal
+         * that no indirect link exists in this direction.
+         */
+        get_indirect_iolink_info(from, to, node_props, &weight, &type);
+        if (!weight)
+          continue;
+
+        if (topology_add_io_link_for_node(from, sys_props, node_props, type,
+                                          to, weight) != HSAKMT_STATUS_SUCCESS)
+          pr_err("Fail to add IO link %d->%d\n", from, to);
+      }
+    }
+  }
+}
+
+/* Build the topology snapshot.
+ *
+ * Queries the system properties and, for every node, its node, memory,
+ * cache and IO-link properties, storing them in dxg_topology->g_props.
+ * When no p2p links were reported, indirect links are added afterwards.
+ * On success the system properties are copied into dxg_topology->g_system
+ * (allocated here if it does not exist yet).
+ *
+ * Returns HSAKMT_STATUS_SUCCESS, the first error reported by a helper,
+ * HSAKMT_STATUS_ERROR when a node reports caches but has no device to query,
+ * or HSAKMT_STATUS_NO_MEMORY when g_system cannot be allocated.
+ */
+HSAKMT_STATUS topology_take_snapshot(void) {
+  uint32_t i, mem_id;
+  HsaSystemProperties sys_props;
+  std::vector<node_props_t>& temp_props = dxg_topology->g_props;
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN);
+  std::vector<proc_cpuinfo> cpuinfo(num_procs);
+  uint32_t num_ioLinks;
+  bool p2p_links = false;
+  uint32_t num_p2pLinks = 0;
+
+  topology_parse_cpuinfo(cpuinfo);
+
+  ret = topology_sysfs_get_system_props(sys_props);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    goto err;
+  if (sys_props.NumNodes > 0) {
+    temp_props.resize(sys_props.NumNodes);
+
+    for (i = 0; i < sys_props.NumNodes; i++) {
+      /* Start from a known value so that a node without a device can be
+       * detected below instead of dereferencing an indeterminate pointer.
+       */
+      wsl::thunk::WDDMDevice *device_ = nullptr;
+      topology_map_node_id(i, device_);
+
+      ret = topology_sysfs_get_node_props(i, temp_props[i].node, p2p_links,
+                                          num_p2pLinks);
+      if (ret != HSAKMT_STATUS_SUCCESS) {
+        goto err;
+      }
+
+      topology_setup_is_dgpu_param(&temp_props[i].node);
+
+      if (temp_props[i].node.NumCPUCores)
+        topology_get_cpu_model_name(temp_props[i].node, cpuinfo);
+
+      if (temp_props[i].node.NumMemoryBanks) {
+        temp_props[i].mem.resize(temp_props[i].node.NumMemoryBanks);
+
+        for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) {
+          ret = topology_sysfs_get_mem_props(i, mem_id,
+                                             temp_props[i].mem[mem_id]);
+          if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+        }
+      }
+
+      if (temp_props[i].node.NumCaches) {
+        temp_props[i].cache.resize(temp_props[i].node.NumCaches);
+
+        /* The cache sizes are read from the device; fail cleanly when the
+         * node has none rather than dereferencing a null pointer.
+         */
+        if (!device_) {
+          ret = HSAKMT_STATUS_ERROR;
+          goto err;
+        }
+
+        /* Up to three cache levels are described. Clamp to NumCaches so the
+         * writes never go past the entries allocated just above.
+         */
+        const uint32_t num_levels = temp_props[i].node.NumCaches < 3
+                                        ? temp_props[i].node.NumCaches
+                                        : 3;
+        for (uint32_t j = 0; j < num_levels; j++) {
+          temp_props[i].cache[j].CacheType.ui32.Data = 1;
+          temp_props[i].cache[j].CacheType.ui32.HSACU = 1;
+          temp_props[i].cache[j].CacheLevel = j + 1;
+        }
+        /* Device sizes are divided by 1024 before being stored. */
+        if (num_levels > 0)
+          temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024;
+        if (num_levels > 1)
+          temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024;
+        if (num_levels > 2)
+          temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024;
+      } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */
+        ret = topology_get_cpu_cache_props(i, cpuinfo, temp_props[i]);
+        if (ret != HSAKMT_STATUS_SUCCESS) {
+          goto err;
+        }
+      }
+
+      /* To simplify, allocate maximum needed memory for io_links for each node.
+       * This removes the need for realloc when indirect and QPI links are added
+       * later
+       */
+      temp_props[i].link.resize(sys_props.NumNodes - 1);
+      num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks;
+      uint32_t link_id = 0;
+
+      if (num_ioLinks) {
+        uint32_t sys_link_id = 0;
+
+        /* Parse all the sysfs specified io links. Skip the ones where the
+         * remote node (node_to) is not accessible
+         */
+        while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) {
+          ret = topology_sysfs_get_iolink_props(
+              i, sys_link_id++, temp_props[i].link[link_id], false);
+          if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
+            /* Unsupported link: reuse the same slot for the next one. */
+            ret = HSAKMT_STATUS_SUCCESS;
+            continue;
+          } else if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+          link_id++;
+        }
+        /* sysfs specifies all the io links. Limit the number to valid ones */
+        temp_props[i].node.NumIOLinks = link_id;
+      }
+
+      if (num_p2pLinks) {
+        uint32_t sys_link_id = 0;
+
+        /* Parse all the sysfs specified p2p links. They are appended after
+         * the io links, continuing from the current link_id.
+         */
+        while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) {
+          ret = topology_sysfs_get_iolink_props(
+              i, sys_link_id++, temp_props[i].link[link_id], true);
+          if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
+            ret = HSAKMT_STATUS_SUCCESS;
+            continue;
+          } else if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+          link_id++;
+        }
+        temp_props[i].node.NumIOLinks = link_id;
+      }
+    }
+  }
+
+  if (!p2p_links) {
+    /* All direct IO links are created in the kernel. Here we need to
+     * connect GPU<->GPU or GPU<->CPU indirect IO links.
+     */
+    topology_create_indirect_gpu_links(sys_props, temp_props);
+  }
+
+  if (!dxg_topology->g_system) {
+    dxg_topology->g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties));
+    if (!dxg_topology->g_system) {
+      ret = HSAKMT_STATUS_NO_MEMORY;
+      goto err;
+    }
+  }
+
+  *dxg_topology->g_system = sys_props;
+err:
+  return ret;
+}
+
+/* Drop the snapshot of the HSA topology information. Assume lock is held.
+ *
+ * Clears the per-node properties, frees the system properties, calls
+ * trim_suballocator() and deletes every device in wdevices_. A warning is
+ * logged first if only one of g_system / g_props is populated.
+ */
+void topology_drop_snapshot(void) {
+  const bool have_system = dxg_topology->g_system != NULL;
+  const bool have_props = dxg_topology->g_props.size() != 0;
+
+  if (have_system != have_props)
+    pr_warn("Probably inconsistency?\n");
+
+  dxg_topology->g_props.clear();
+
+  /* free(NULL) is a no-op, so no guard is needed. */
+  free(dxg_topology->g_system);
+  dxg_topology->g_system = NULL;
+
+  trim_suballocator();
+
+  for (size_t idx = 0; idx < dxg_topology->wdevices_.size(); ++idx)
+    delete dxg_topology->wdevices_[idx];
+  dxg_topology->wdevices_.clear();
+}
+
+/* Check that nodeid names a node of the current snapshot.
+ *
+ * @nodeid: node index to validate
+ * @gpu_id: optional output; when non-NULL receives the node's KFDGpuID
+ *
+ * Returns HSAKMT_STATUS_INVALID_NODE_UNIT when there is no snapshot or the
+ * index is out of range, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) {
+  const bool have_snapshot =
+      !dxg_topology->g_props.empty() && dxg_topology->g_system != NULL;
+
+  /* g_system is only dereferenced once it is known to be non-NULL. */
+  if (!have_snapshot || nodeid >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  if (gpu_id != NULL)
+    *gpu_id = dxg_topology->g_props[nodeid].node.KFDGpuID;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Translate a gpu_id into the index of the node that carries it.
+ *
+ * @gpu_id:  KFDGpuID to look up
+ * @node_id: output; receives the index of the first matching node
+ *
+ * Returns HSAKMT_STATUS_SUCCESS when a node matches and
+ * HSAKMT_STATUS_INVALID_NODE_UNIT when none does or no snapshot exists.
+ */
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) {
+  /* Without a snapshot g_system is NULL and g_props is empty; bail out
+   * instead of dereferencing them (same guard as get_device_id_by_gpu_id).
+   */
+  if (dxg_topology->g_props.empty() || !dxg_topology->g_system)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  for (uint32_t node_idx = 0; node_idx < dxg_topology->g_system->NumNodes;
+       node_idx++) {
+    if (dxg_topology->g_props[node_idx].node.KFDGpuID == gpu_id) {
+      *node_id = node_idx;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+
+  return HSAKMT_STATUS_INVALID_NODE_UNIT;
+}
+
+/* Return the system properties, taking the topology snapshot on first use.
+ *
+ * @SystemProperties: output; filled with the snapshot's system properties
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL output pointer, the
+ * error from topology_take_snapshot() if the snapshot cannot be taken, and
+ * HSAKMT_STATUS_SUCCESS otherwise. The work is done under hsakmt_mutex.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) {
+  CHECK_DXG_OPEN();
+
+  if (!SystemProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  /* An existing snapshot is reused; taking it again would be a double
+   * initialization that leaks memory.
+   */
+  if (!dxg_topology->g_system) {
+    err = topology_take_snapshot();
+    if (err == HSAKMT_STATUS_SUCCESS)
+      assert(dxg_topology->g_system);
+    /* Process aperture and doorbell initialization are not performed here;
+     * the corresponding calls are disabled in this implementation, so no
+     * further step can fail once the snapshot exists.
+     */
+  }
+
+  if (err == HSAKMT_STATUS_SUCCESS)
+    *SystemProperties = *dxg_topology->g_system;
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Drop the topology snapshot while holding hsakmt_mutex.
+ * Always returns HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) {
+  pthread_mutex_t *const lock = &dxg_runtime->hsakmt_mutex;
+
+  pthread_mutex_lock(lock);
+  topology_drop_snapshot();
+  pthread_mutex_unlock(lock);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Copy the snapshot's properties of one node.
+ *
+ * @NodeId:         index of the node to read
+ * @NodeProperties: output; receives a copy of the node properties
+ *
+ * Returns HSAKMT_STATUS_ERROR when there is no snapshot or NodeId is out of
+ * range, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+                                      HsaNodeProperties *NodeProperties) {
+  if (!dxg_topology->g_system)
+    return HSAKMT_STATUS_ERROR;
+
+  if (dxg_topology->g_props.empty() ||
+      NodeId >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_ERROR;
+
+  *NodeProperties = dxg_topology->g_props[NodeId].node;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Return the properties of one node.
+ *
+ * For a node with a non-zero gpu_id, NumMemoryBanks is increased by
+ * NUM_OF_IGPU_HEAPS when the node is marked Integrated and by
+ * NUM_OF_DGPU_HEAPS otherwise. CPU-only nodes are returned unchanged.
+ *
+ * @NodeId:         index of the node to query
+ * @NodeProperties: output; receives the node properties
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL output pointer, or the
+ * status of validate_nodeid() / topology_get_node_props().
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) {
+  HSAKMT_STATUS err;
+  uint32_t gpu_id;
+
+  if (!NodeProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  err = validate_nodeid(NodeId, &gpu_id);
+  if (err == HSAKMT_STATUS_SUCCESS)
+    err = topology_get_node_props(NodeId, NodeProperties);
+
+  /* gpu_id is only read once validate_nodeid() has filled it in. */
+  if (err == HSAKMT_STATUS_SUCCESS && gpu_id) {
+    if (NodeProperties->Integrated)
+      NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
+    else
+      NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
+    // TODO: for apu -- no extra bank is counted for an MMIO aperture here.
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Return the memory banks of one node.
+ *
+ * The output array is zeroed, then filled with the banks recorded in the
+ * snapshot (at most NumBanks of them). For a node that has a WDDM device,
+ * an LDS bank and a SCRATCH bank are appended while room remains.
+ *
+ * @NodeId:           index of the node to query
+ * @NumBanks:         number of entries available in MemoryProperties
+ * @MemoryProperties: output array of NumBanks entries
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL output pointer,
+ * HSAKMT_STATUS_INVALID_NODE_UNIT when there is no snapshot or NodeId is
+ * out of range, and HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks,
+                              HsaMemoryProperties *MemoryProperties) {
+  HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
+  uint32_t i;
+  /* Declared up front so that no "goto out" jumps past an initialization. */
+  wsl::thunk::WDDMDevice *device_ = nullptr;
+
+  if (!MemoryProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties));
+
+  /* Snapshot protocol violation or bad node: refuse before indexing
+   * g_props, as the cache and io-link queries do.
+   */
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) {
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+    goto out;
+  }
+
+  for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) {
+    assert(dxg_topology->g_props[NodeId].mem.size());
+    MemoryProperties[i] = dxg_topology->g_props[NodeId].mem[i];
+  }
+
+  /* The following memory banks does not apply to CPU only node */
+  device_ = get_wddmdev(NodeId);
+  if (device_ == nullptr)
+    goto out;
+
+  /*Add LDS*/
+  if (i < NumBanks) {
+    MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS;
+    MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase();
+    MemoryProperties[i].SizeInBytes = dxg_topology->g_props[NodeId].node.LDSSizeInKB * 1024;
+    i++;
+  }
+
+  /* Add SCRATCH */
+  if (i < NumBanks) {
+    MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH;
+    MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase();
+    MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize();
+    i++;
+  }
+
+out:
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Return the cache descriptions of one node.
+ *
+ * @NodeId:          index of the node to query
+ * @ProcessorId:     not used by this implementation
+ * @NumCaches:       number of entries requested; must not exceed the
+ *                   node's NumCaches
+ * @CacheProperties: output array of NumCaches entries
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL output pointer or a
+ * too-large NumCaches, HSAKMT_STATUS_INVALID_NODE_UNIT when there is no
+ * snapshot or NodeId is out of range, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(
+    HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches,
+    HsaCacheProperties *CacheProperties) {
+  HSAKMT_STATUS err;
+
+  if (!CacheProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) {
+    /* KFD ADD page 18, snapshot protocol violation */
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+  } else if (NumCaches > dxg_topology->g_props[NodeId].node.NumCaches) {
+    err = HSAKMT_STATUS_INVALID_PARAMETER;
+  } else {
+    for (uint32_t idx = 0;
+         idx < wsl::Min(dxg_topology->g_props[NodeId].node.NumCaches, NumCaches);
+         idx++) {
+      assert(dxg_topology->g_props[NodeId].cache.size());
+      CacheProperties[idx] = dxg_topology->g_props[NodeId].cache[idx];
+    }
+    err = HSAKMT_STATUS_SUCCESS;
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Copy the first NumIoLinks io-link entries of a node from the snapshot.
+ *
+ * @NodeId:           index of the node to read
+ * @NumIoLinks:       number of entries to copy
+ * @IoLinkProperties: output array of at least NumIoLinks entries
+ *
+ * Returns HSAKMT_STATUS_ERROR when there is no snapshot or NodeId is out of
+ * range, HSAKMT_STATUS_INVALID_PARAMETER when more entries are requested
+ * than the node's link storage holds, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks,
+                                        HsaIoLinkProperties *IoLinkProperties) {
+  if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_ERROR;
+
+  /* Never read past the end of the stored links. */
+  if (NumIoLinks > dxg_topology->g_props[NodeId].link.size())
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Nothing to copy; also avoids handing memcpy the possibly null data()
+   * pointer of an empty vector.
+   */
+  if (NumIoLinks == 0)
+    return HSAKMT_STATUS_SUCCESS;
+
+  memcpy(IoLinkProperties, dxg_topology->g_props[NodeId].link.data(),
+         NumIoLinks * sizeof(*IoLinkProperties));
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Return the io-link descriptions of one node.
+ *
+ * @NodeId:           index of the node to query
+ * @NumIoLinks:       number of entries requested; must not exceed the
+ *                    node's NumIOLinks
+ * @IoLinkProperties: output array of NumIoLinks entries
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL output pointer or a
+ * too-large NumIoLinks, HSAKMT_STATUS_INVALID_NODE_UNIT when there is no
+ * snapshot or NodeId is out of range, otherwise the status of
+ * topology_get_iolink_props().
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks,
+                              HsaIoLinkProperties *IoLinkProperties) {
+  HSAKMT_STATUS err;
+
+  if (!IoLinkProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) {
+    /* KFD ADD page 18, snapshot protocol violation */
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+  } else if (NumIoLinks > dxg_topology->g_props[NodeId].node.NumIOLinks) {
+    err = HSAKMT_STATUS_INVALID_PARAMETER;
+  } else {
+    assert(dxg_topology->g_props[NodeId].link.size());
+    err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties);
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Return the DeviceId of a node, or 0 when there is no snapshot or node_id
+ * is out of range.
+ */
+uint16_t get_device_id_by_node_id(HSAuint32 node_id) {
+  const bool have_snapshot =
+      !dxg_topology->g_props.empty() && dxg_topology->g_system != NULL;
+
+  if (have_snapshot && node_id < dxg_topology->g_system->NumNodes)
+    return dxg_topology->g_props[node_id].node.DeviceId;
+
+  return 0;
+}
+
+/* Return true when the node reports HSAMMUPresent and has both CPU cores
+ * and compute cores. node_id is not range-checked here.
+ */
+bool prefer_ats(HSAuint32 node_id) {
+  const auto &node = dxg_topology->g_props[node_id].node;
+
+  if (!node.Capability.ui32.HSAMMUPresent)
+    return false;
+  if (!node.NumCPUCores)
+    return false;
+  return node.NumFComputeCores ? true : false;
+}
+
+/* Return the DeviceId of the first node whose KFDGpuID equals gpu_id, or 0
+ * when there is no snapshot or no node matches.
+ */
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) {
+  if (dxg_topology->g_props.empty() || !dxg_topology->g_system)
+    return 0;
+
+  const unsigned int node_count = dxg_topology->g_system->NumNodes;
+
+  for (unsigned int idx = 0; idx < node_count; ++idx) {
+    const auto &node = dxg_topology->g_props[idx].node;
+
+    if (node.KFDGpuID == gpu_id)
+      return node.DeviceId;
+  }
+
+  return 0;
+}
+
+/* Return the CPU node directly linked to gpu_node.
+ *
+ * INVALID_NODEID is returned when gpu_get_direct_link_cpu() reports -1 or
+ * when the memory banks of the linked CPU node add up to zero bytes.
+ */
+uint32_t get_direct_link_cpu(uint32_t gpu_node) {
+  const int32_t cpu_node =
+      gpu_get_direct_link_cpu(gpu_node, dxg_topology->g_props);
+
+  if (cpu_node == -1)
+    return INVALID_NODEID;
+
+  assert(dxg_topology->g_props[cpu_node].mem.size());
+
+  /* Sum the sizes of all memory banks of the CPU node. */
+  HSAuint64 total_bytes = 0;
+  for (HSAuint32 bank = 0;
+       bank < dxg_topology->g_props[cpu_node].node.NumMemoryBanks; bank++)
+    total_bytes += dxg_topology->g_props[cpu_node].mem[bank].SizeInBytes;
+
+  if (!total_bytes)
+    return INVALID_NODEID;
+
+  return (uint32_t)cpu_node;
+}
+
+/* Validate an array of node ids and translate it into gpu_ids.
+ *
+ * @gpu_id_array:  output; on success receives a malloc'ed array of
+ *                 NumberOfNodes gpu_ids that the caller must free. On a
+ *                 validation failure the array is freed and the pointer is
+ *                 reset to NULL.
+ * @NumberOfNodes: number of entries in NodeArray; must be non-zero
+ * @NodeArray:     node ids to validate
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for bad arguments,
+ * HSAKMT_STATUS_NO_MEMORY when the allocation fails, otherwise the status
+ * of the first failing validate_nodeid() call or HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
+                                    uint32_t NumberOfNodes,
+                                    uint32_t *NodeArray) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  unsigned int i;
+
+  if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Translate Node IDs to gpu_ids */
+  *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t));
+  if (!(*gpu_id_array))
+    return HSAKMT_STATUS_NO_MEMORY;
+  for (i = 0; i < NumberOfNodes; i++) {
+    ret = validate_nodeid(NodeArray[i], *gpu_id_array + i);
+    if (ret != HSAKMT_STATUS_SUCCESS) {
+      free(*gpu_id_array);
+      /* Do not hand a dangling pointer back to the caller. */
+      *gpu_id_array = NULL;
+      break;
+    }
+  }
+
+  return ret;
+}
+
+/* Return the num_sysfs_nodes value stored in the topology state. */
+uint32_t get_num_sysfs_nodes(void) {
+  const uint32_t count = dxg_topology->num_sysfs_nodes;
+  return count;
+}
+
+/* Return the WDDM device that backs a node.
+ *
+ * Node n is served by wdevices_[n - 1]. nullptr is returned for node 0,
+ * when no devices are registered, and for a node_id outside either
+ * num_sysfs_nodes or the device list.
+ */
+wsl::thunk::WDDMDevice *get_wddmdev(uint32_t node_id) {
+  if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes))
+    return nullptr;
+
+  /* The lookup uses node_id - 1, so also bound it by the container itself
+   * in case the device list is shorter than num_sysfs_nodes - 1.
+   */
+  if (node_id > dxg_topology->wdevices_.size())
+    return nullptr;
+
+  return dxg_topology->wdevices_[node_id - 1];
+}
+
+/* Return the number of registered WDDM devices. */
+uint32_t get_num_wddmdev() {
+  const uint32_t count = dxg_topology->wdevices_.size();
+  return count;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h
new file mode 100644
index 0000000000..4b7f8b0362
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h
@@ -0,0 +1,519 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+  Helpers to use native types with C++11 atomic operations.
+  Fixes GCC builtin functionality for x86 with respect to WC and non-temporal
+  stores.
+*/
+#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+
+#include <atomic>
+#include "utils.h"
+
+//ALWAYS_CONSERVATIVE will very likely overfence your code.
+//For use as a debugging aid only.
+#define ALWAYS_CONSERVATIVE 0
+
+#if !ALWAYS_CONSERVATIVE
+#if defined(__x86_64__) || defined(_M_X64)
+#define X64_ORDER_WC 1
+#endif
+#if X64_ORDER_WC
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace wsl {
+namespace atomic {
+
+/// @brief: Map a C++11 memory order to the flag passed to the __atomic
+/// builtins. With ALWAYS_CONSERVATIVE or X64_ORDER_WC the result is always
+/// __ATOMIC_RELAXED; ordering is then provided by PreFence/PostFence.
+/// @param: order(Input), C++11 memory order.
+/// @return: int, the matching __ATOMIC_* constant (__ATOMIC_SEQ_CST for any
+/// value not listed).
+static constexpr int c11ToBuiltInFlags(std::memory_order order)
+{
+#if ALWAYS_CONSERVATIVE
+  return __ATOMIC_RELAXED;
+#elif X64_ORDER_WC
+  return __ATOMIC_RELAXED;
+#else
+  // Single return expression keeps this usable as a C++11 constexpr.
+  return (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL :
+    (order == std::memory_order_consume) ? __ATOMIC_CONSUME :
+    (order == std::memory_order_release) ? __ATOMIC_RELEASE :
+    (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE :
+    (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED :
+    __ATOMIC_SEQ_CST;
+#endif
+}
+
+/// @brief: Fence issued before an atomic operation. For release, seq_cst
+/// and acq_rel orders it emits a seq_cst thread fence (ALWAYS_CONSERVATIVE)
+/// or _mm_sfence() (X64_ORDER_WC); otherwise it does nothing.
+/// @param: order(Input), memory order of the operation that follows.
+/// @return: void.
+static __forceinline void PreFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  if (order == std::memory_order_release ||
+      order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel) {
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+  }
+#elif X64_ORDER_WC
+  if (order == std::memory_order_release ||
+      order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel) {
+    _mm_sfence();
+  }
+#endif
+}
+
+/// @brief: Fence issued after an atomic operation. With ALWAYS_CONSERVATIVE
+/// a seq_cst thread fence follows seq_cst, acq_rel and acquire orders. With
+/// X64_ORDER_WC, seq_cst gets _mm_mfence() and acq_rel/acquire get
+/// _mm_lfence(). Otherwise it does nothing.
+/// @param: order(Input), memory order of the operation that preceded.
+/// @return: void.
+static __forceinline void PostFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  if (order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel ||
+      order == std::memory_order_acquire) {
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+  }
+#elif X64_ORDER_WC
+  if (order == std::memory_order_seq_cst) {
+    _mm_mfence();
+  } else if (order == std::memory_order_acq_rel ||
+             order == std::memory_order_acquire) {
+    _mm_lfence();
+  }
+#endif
+}
+
+/// @brief: Stand-alone memory fence. With ALWAYS_CONSERVATIVE it is always
+/// a seq_cst thread fence. With X64_ORDER_WC, seq_cst/acq_rel map to
+/// _mm_mfence(), acquire to _mm_lfence(), release to _mm_sfence() and other
+/// orders emit nothing. Otherwise std::atomic_thread_fence(order) is used.
+/// @param: order(Input), memory order of the fence, seq_cst by default.
+/// @return: void.
+static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) {
+#if ALWAYS_CONSERVATIVE
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif X64_ORDER_WC
+  if (order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel) {
+    _mm_mfence();
+  } else if (order == std::memory_order_acquire) {
+    _mm_lfence();
+  } else if (order == std::memory_order_release) {
+    _mm_sfence();
+  }
+#else
+  std::atomic_thread_fence(order);
+#endif
+}
+
+/// @brief: Compile-time check that objects of sizeof(T) are always
+/// lock-free for the __atomic builtins.
+/// @param: ptr(Input), pointer to type T; only used for type deduction.
+/// @return: void.
+template <class T>
+static __forceinline void BasicCheck(const T* ptr) {
+  static_assert(__atomic_always_lock_free(sizeof(T), 0),
+                "Atomic type may not be compatible with peripheral atomics.");
+}
+
+/// @brief: Function overloading for volatile pointers: compile-time check
+/// that objects of sizeof(T) are always lock-free for the __atomic builtins.
+/// @param: ptr(Input), pointer to volatile type T; only used for type
+/// deduction.
+/// @return: void.
+template <class T>
+static __forceinline void BasicCheck(const volatile T* ptr) {
+  static_assert(__atomic_always_lock_free(sizeof(T), 0),
+                "Atomic type may not be compatible with peripheral atomics.");
+}
+
+/// @brief: Atomically read a value of type T, bracketed by PreFence and
+/// PostFence for the requested memory order.
+/// @param: ptr(Input), a pointer to type T.
+/// @param: order(Input), memory order of the load, relaxed by default.
+/// @return: T, the value that was read.
+template <class T>
+static __forceinline T
+    Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T loaded;
+  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return loaded;
+}
+
+/// @brief: Function overloading for volatile pointers: atomically read a
+/// value of type T, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to volatile type T.
+/// @param: order(Input), memory order of the load, relaxed by default.
+/// @return: T, the value that was read.
+template <class T>
+static __forceinline T
+    Load(const volatile T* ptr,
+         std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T loaded;
+  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return loaded;
+}
+
+/// @brief: Atomically write a value of type T, bracketed by PreFence and
+/// PostFence for the requested memory order.
+/// @param: ptr(Input), a pointer to the object that is written.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order of the store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  __atomic_store(ptr, &val, flags);
+  PostFence(order);
+}
+
+/// @brief: Function overloading for volatile pointers: atomically write a
+/// value of type T, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the volatile object that is written.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order of the store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    volatile T* ptr, T val,
+    std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  __atomic_store(ptr, &val, flags);
+  PostFence(order);
+}
+
+/// @brief: Strong compare-and-swap: store val into *ptr if *ptr equals
+/// expected. The failure ordering passed to the builtin is always relaxed.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value to be stored if the comparison succeeds.
+/// @param: expected(Input), value the variable is expected to hold.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed in *ptr (equal to expected on success).
+template <class T>
+static __forceinline T
+    Cas(T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T observed = expected;
+  PreFence(order);
+  // On failure the builtin overwrites `observed` with the current value.
+  __atomic_compare_exchange(ptr, &observed, &val, false,
+                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return observed;
+}
+
+/// @brief: Function overloading for volatile pointers: strong
+/// compare-and-swap that stores val into *ptr if *ptr equals expected. The
+/// failure ordering passed to the builtin is always relaxed.
+/// @param: ptr(Input), a pointer to the volatile variable operated on.
+/// @param: val(Input), value to be stored if the comparison succeeds.
+/// @param: expected(Input), value the variable is expected to hold.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed in *ptr (equal to expected on success).
+template <class T>
+static __forceinline T
+    Cas(volatile T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T observed = expected;
+  PreFence(order);
+  // On failure the builtin overwrites `observed` with the current value.
+  __atomic_compare_exchange(ptr, &observed, &val, false,
+                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return observed;
+}
+
+/// @brief: Atomically replace the value of a variable and return what it
+/// held before, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T previous;
+  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Function overloading for volatile pointers: atomically replace
+/// the value of a variable and return what it held before.
+/// @param: ptr(Input), a pointer to the volatile variable operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(volatile T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T previous;
+  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-add, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_add(ptr, val, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-subtract, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_sub(ptr, val, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-AND, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value which is ANDed with the variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_and(ptr, val, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-OR, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value which is ORed with the variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the operation.
+template <class T>
+static __forceinline T
+    Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_or(ptr, val, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-XOR, bracketed by PreFence and PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: val(Input), value which is XORed with the variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_xor(ptr, val, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically add 1 to a variable, bracketed by PreFence and
+/// PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_add(ptr, 1, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically subtract 1 from a variable, bracketed by PreFence and
+/// PostFence.
+/// @param: ptr(Input), a pointer to the variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  const int flags = c11ToBuiltInFlags(order);
+  PreFence(order);
+  T previous = __atomic_fetch_sub(ptr, 1, flags);
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically add a value to a variable with the specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically subtract a value from a variable with the specified
+/// memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically AND a value into a variable with the specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ANDed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically OR a value into a variable with the specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T Or(volatile T* ptr, T val,
+                          std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically XOR a value into a variable with the specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is XORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically add one to a variable with the specified memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically subtract one from a variable with the specified memory
+/// order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+}   //  namespace atomic
+}   //  namespace wsl
+
+#ifdef X64_ORDER_WC
+#undef X64_ORDER_WC
+#endif
+
+#ifdef ALWAYS_CONSERVATIVE
+#undef ALWAYS_CONSERVATIVE
+#endif
+
+#endif  // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h
new file mode 100644
index 0000000000..b5817af40d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h
@@ -0,0 +1,155 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+
+#include <memory>
+#include <utility>
+#include <functional>
+
+#include "core/util/locks.h"
+#include "core/util/utils.h"
+
+namespace wsl {
+
+/*
+ * Wrapper for a std::unique_ptr that initializes its object at first use.
+ *
+ * The object is produced by a user-supplied factory (`func`). The factory is
+ * run at most once, under `lock`, the first time the pointer is dereferenced
+ * (or when touch() wins the lock). A cleared factory marks "already created".
+ */
+template <typename T> class lazy_ptr {
+ public:
+  // Empty pointer with no factory: reports created() and empty().
+  lazy_ptr() {}
+
+  // Defers construction to Constructor, which returns an owning raw pointer.
+  explicit lazy_ptr(std::function<T*()> Constructor) { reset(Constructor); }
+
+  // Takes over rhs's object and factory. The lock itself is not moved.
+  lazy_ptr(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+  }
+
+  // Move assignment. Returns *this (the original fell off the end of a
+  // non-void function, which is undefined behaviour).
+  lazy_ptr& operator=(lazy_ptr&& rhs) {
+    if (this != &rhs) {
+      obj = std::move(rhs.obj);
+      func = std::move(rhs.func);
+    }
+    return *this;
+  }
+
+  lazy_ptr(lazy_ptr&) = delete;
+  lazy_ptr& operator=(lazy_ptr&) = delete;
+
+  // Drops any existing object and installs a new factory (or none).
+  void reset(std::function<T*()> Constructor = nullptr) {
+    obj.reset();
+    func = Constructor;
+  }
+
+  // Adopts ptr immediately and clears the factory.
+  void reset(T* ptr) {
+    obj.reset(ptr);
+    func = nullptr;
+  }
+
+  // Pointer comparisons look at the current object only; they never trigger
+  // construction.
+  bool operator==(T* rhs) const { return obj.get() == rhs; }
+  bool operator!=(T* rhs) const { return obj.get() != rhs; }
+
+  // Member access: constructs on first use (blocking) and asserts non-null.
+  const std::unique_ptr<T>& operator->() const {
+    make(true);
+    assert(obj != nullptr && "Null dereference through lazy_ptr.");
+    return obj;
+  }
+
+  // Dereference yields the owning unique_ptr, constructing on first use.
+  // Unlike operator->, the result is not asserted to be non-null.
+  std::unique_ptr<T>& operator*() {
+    make(true);
+    return obj;
+  }
+
+  const std::unique_ptr<T>& operator*() const {
+    make(true);
+    return obj;
+  }
+
+  /*
+   * Ensures that the object is created or is being created.
+   * This is useful when early construction of the object is required.
+   * Does not block: if another thread holds the lock this returns at once.
+   */
+  void touch() const { make(false); }
+
+  // Tells if the lazy object has been constructed or not.
+  // Construction may fail silently (return nullptr).
+  bool created() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return func == nullptr;
+  }
+
+  // Tells if the lazy object exists or not.
+  bool empty() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return obj == nullptr;
+  }
+
+ private:
+  mutable std::unique_ptr<T> obj;
+  mutable std::function<T*(void)> func;
+  mutable KernelMutex lock;
+
+  // Separated from make to improve inlining.
+  // Slow path: take the lock (or give up if !block and it is contended),
+  // re-check the factory, run it, publish the object, then clear the factory.
+  void make_body(bool block) const {
+    if (block) {
+      lock.Acquire();
+    } else if (!lock.Try()) {
+      return;
+    }
+    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    // Another thread may have finished construction while we waited.
+    if (func == nullptr) return;
+    T* ptr = func();
+    obj.reset(ptr);
+    // Order the object store before the factory is cleared.
+    std::atomic_thread_fence(std::memory_order_release);
+    func = nullptr;
+  }
+
+  // Fast path: skip the lock entirely once the factory has been cleared.
+  __forceinline void make(bool block) const {
+    if (!created()) {
+      make_body(block);
+    }
+  }
+
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp
new file mode 100644
index 0000000000..020ca10b28
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp
@@ -0,0 +1,769 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __linux__
+#include "core/util/os.h"
+#include "core/util/utils.h"
+
+#include <link.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sched.h>
+#include <sys/sysinfo.h>
+#include <sys/time.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <errno.h>
+#include <cstring>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+#include <semaphore.h>
+#include "core/inc/runtime.h"
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
+namespace wsl {
+namespace os {
+
+// Start-up parameters handed to ThreadTrampoline through pthread_create.
+struct ThreadArgs {
+  void* entry_args;            // opaque argument forwarded to entry_function
+  ThreadEntry entry_function;  // user routine the new thread runs
+};
+
+// pthread start routine: copies the entry point and its argument out of the
+// ThreadArgs packet, deletes the packet, then runs the entry point.
+// Always returns nullptr.
+void* __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* packet = static_cast<ThreadArgs*>(arg);
+  void* const user_data = packet->entry_args;
+  const ThreadEntry user_entry = packet->entry_function;
+  // The packet is released before the user code runs.
+  delete packet;
+  user_entry(user_data);
+  return nullptr;
+}
+
+// Thread container allows multiple waits and separate close (destroy).
+//
+// Construction starts the thread. On any failure the object is left with
+// thread == 0 (and possibly lock == nullptr), so Valid() reports false.
+class os_thread {
+ public:
+  // Starts `function(threadArgument)` on a new pthread.
+  // stackSize == 0 keeps the default stack size; otherwise the size is raised
+  // to at least PTHREAD_STACK_MIN and aligned up to 4096 bytes.
+  explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize)
+      : thread(0), lock(nullptr), state(RUNNING) {
+    int err;
+    // Owned here until pthread_create succeeds; the trampoline frees it after.
+    std::unique_ptr<ThreadArgs> args(new ThreadArgs);
+    lock = CreateMutex();
+    if (lock == nullptr) return;
+
+    args->entry_args = threadArgument;
+    args->entry_function = function;
+
+    pthread_attr_t attrib;
+    err = pthread_attr_init(&attrib);
+    if (err != 0) {
+      pr_err("pthread_attr_init failed: %s\n", strerror(err));
+      return;
+    }
+
+    // From here on every exit path must destroy the attribute object exactly
+    // once, and must not touch it afterwards.
+    auto destroy_attrib = [&attrib]() {
+      int rc = pthread_attr_destroy(&attrib);
+      if (rc != 0) {
+        pr_err("pthread_attr_destroy failed: %s\n", strerror(rc));
+      }
+    };
+
+    if (stackSize != 0) {
+      stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize);
+      stackSize = AlignUp(stackSize, 4096);
+      err = pthread_attr_setstacksize(&attrib, stackSize);
+      if (err != 0) {
+        pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+        // Bail out unconditionally: continuing would use a destroyed attrib.
+        destroy_attrib();
+        return;
+      }
+    }
+
+    int cores = 0;
+    cpu_set_t* cpuset = nullptr;
+
+    if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) {
+      // Allow the new thread to run on every configured processor.
+      cores = get_nprocs_conf();
+      cpuset = CPU_ALLOC(cores);
+      if (cpuset == nullptr) {
+        pr_err("CPU_ALLOC failed: %s\n", strerror(errno));
+        destroy_attrib();
+        return;
+      }
+      CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset);
+      for (int i = 0; i < cores; i++) {
+        CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset);
+      }
+      err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset);
+      CPU_FREE(cpuset);
+      if (err != 0) {
+        pr_err("pthread_setaffinity_np failed: %s\n", strerror(err));
+        destroy_attrib();
+        return;
+      }
+    }
+
+    err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
+
+    // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN
+    // Attempt to grow the stack within reason.
+    if ((err == EINVAL) && stackSize != 0) {
+      while (stackSize < 20 * 1024 * 1024) {
+        stackSize *= 2;
+        err = pthread_attr_setstacksize(&attrib, stackSize);
+        if (err != 0) {
+          pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+          thread = 0;
+          destroy_attrib();
+          return;
+        }
+        err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
+        if (err != EINVAL) break;
+        pr_debug("pthread_create returned EINVAL, doubling stack size\n");
+      }
+    }
+
+    if (err == 0)
+      args.release();  // the running thread now owns (and deletes) the args
+    else
+      thread = 0;      // marks the object invalid; args is freed by unique_ptr
+
+    destroy_attrib();
+  }
+
+  // Transfers the thread handle and lock; rhs is left empty so its destructor
+  // neither detaches the thread nor destroys the lock.
+  os_thread(os_thread&& rhs) {
+    thread = rhs.thread;
+    lock = rhs.lock;
+    state = int(rhs.state);
+    rhs.thread = 0;
+    rhs.lock = nullptr;
+  }
+
+  os_thread(os_thread&) = delete;
+
+  // Destroys the lock and detaches the thread if it was never joined.
+  ~os_thread() {
+    if (lock != nullptr) DestroyMutex(lock);
+    if ((state == RUNNING) && (thread != 0)) {
+      int err = pthread_detach(thread);
+      if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err));
+    }
+  }
+
+  // True when both the lock and the thread were created successfully.
+  bool Valid() { return (lock != nullptr) && (thread != 0); }
+
+  // Joins the thread. May be called repeatedly and from several threads: the
+  // join itself happens once under `lock`, later calls see FINISHED and
+  // return true immediately. Returns false if pthread_join fails.
+  bool Wait() {
+    if (state == FINISHED) return true;
+    AcquireMutex(lock);
+    // Re-check under the lock: another waiter may have joined already.
+    if (state == FINISHED) {
+      ReleaseMutex(lock);
+      return true;
+    }
+    int err = pthread_join(thread, NULL);
+    bool success = (err == 0);
+    if (success) state = FINISHED;
+    ReleaseMutex(lock);
+    return success;
+  }
+
+ private:
+  pthread_t thread;
+  Mutex lock;
+  std::atomic<int> state;
+  enum { FINISHED = 0, RUNNING = 1 };
+};
+
+// The functions in this file convert between the opaque OS-abstraction
+// handle types and native pointers by reinterpreting the handle's storage,
+// so each handle type must be exactly pointer-sized.
+static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch");
+static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch");
+
+// Opens a shared library with lazy symbol binding. Logs the dlerror() text on
+// failure and returns the (null) handle in that case.
+LibHandle LoadLib(std::string filename) {
+  void* handle = dlopen(filename.c_str(), RTLD_LAZY);
+  if (handle == nullptr) {
+    pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror());
+  }
+  return *(LibHandle*)&handle;
+}
+
+// Resolves export_name, but only if the symbol lives in `lib` itself.
+//
+// dlsym searches the given library and all the library's load dependencies.
+// To match the Windows lookup pattern, the hit is accepted only when the
+// object that contains the address has the same file name as the library's
+// link map entry. Returns NULL when the symbol is missing, belongs to another
+// object, or when dlinfo/dladdr fail.
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  void* const handle = *(void**)&lib;
+
+  void* symbol = dlsym(handle, export_name.c_str());
+  if (symbol == NULL) return symbol;
+
+  link_map* lib_map;
+  if (dlinfo(handle, RTLD_DI_LINKMAP, &lib_map) == -1) {
+    pr_err("dlinfo failed: %s\n", dlerror());
+    return nullptr;
+  }
+
+  Dl_info symbol_info;
+  if (dladdr(symbol, &symbol_info) == 0) {
+    pr_err("dladdr failed.\n");
+    return nullptr;
+  }
+
+  const bool same_object = (strcmp(symbol_info.dli_fname, lib_map->l_name) == 0);
+  return same_object ? symbol : nullptr;
+}
+
+// Releases a handle obtained from LoadLib. The dlclose result is ignored.
+void CloseLib(LibHandle lib) {
+  void* handle = *(void**)&lib;
+  dlclose(handle);
+}
+
+/*
+ * @brief dl_iterate_phdr() callback. Scans the dynamic string table of one
+ * loaded shared object for the string "HSA_AMD_TOOL_PRIORITY" and, if found,
+ * records the object's name.
+ *
+ * @param[in]: info A dl_phdr_info struct pointer, which contains information
+ * about library's load address, header, and name.
+ *
+ * @param[in]: size integer size of dl_phdr_info struct (unused)
+ *
+ * @param[in,out]: data copy of the data argument to the dl_iterate_phdr call;
+ * must point to a std::vector<std::string> that receives matching names.
+ *
+ * @retval:: Always 0, so dl_iterate_phdr() keeps visiting the remaining
+ * shared objects. (A non-zero return would stop the iteration.)
+ */
+
+static int callback(struct dl_phdr_info* info, size_t size, void* data) {
+  std::vector<std::string>* loadedToolsLib = (std::vector<std::string>*)data;
+  assert(loadedToolsLib != nullptr);
+  /*
+   * Check if lib name is not empty and its not a "vdso.so" lib,
+   * The vDSO is a special shared object file that is built into the Linux kernel.
+   * It is not a regular shared library and thus does not have all the properties
+   * of regular shared libraries. The way the vDSO is loaded and organized in memory
+   * is different from regular shared libraries and it's not guaranteed that it
+   * will have a specific segment or section. Hence its skipped.
+   */
+
+  if ((info) && (info->dlpi_name[0] != '\0')) {
+    if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0;
+
+    /*
+     * Iterate through the program headers of the loaded lib and check for PT_DYNAMIC program
+     * header. If the PT_DYNAMIC program header is found, use dlpi_addr and dlpi_phdr members
+     * of dl_phdr_info struct to get the address of the dynamic section of the loaded
+     * library in memory
+     */
+
+    for (int i = 0; i < info->dlpi_phnum; i++) {
+      if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) {
+        // NOTE(review): Elf64_* types are hard-coded, so this presumably
+        // assumes a 64-bit process - confirm for other targets.
+        Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
+
+        char* strings = nullptr;
+        Elf64_Xword limit = 0;
+
+        /*
+         * The dynamic section is searched for DT_STRTAB (address of string table),
+         * and DT_STRSZ (size of string table)
+         * DT_NULL - Marks the end of the _DYNAMIC array
+         */
+
+        for (int j = 0;; j++) {
+          if (dyn_section[j].d_tag == DT_NULL) break;
+
+          // NOTE(review): d_ptr is used as a ready-to-dereference address,
+          // i.e. it is assumed to be already relocated - verify.
+          if (dyn_section[j].d_tag == DT_STRTAB) strings = (char*)(dyn_section[j].d_un.d_ptr);
+
+          if (dyn_section[j].d_tag == DT_STRSZ) limit = dyn_section[j].d_un.d_val;
+        }
+
+        if (strings == nullptr) pr_debug("String table not found\n");
+
+        /*
+         * Hacky lookup, if string and symbol tables are found,
+         * iterate through the strings in string table and check if
+         * any string matches "HSA_AMD_TOOL_PRIORITY".
+         * If yes, then add the name of the library to the vector of
+         * lib names
+         */
+        if (strings != nullptr) {
+          // Walk the table one NUL-terminated string at a time, up to
+          // DT_STRSZ bytes past its start.
+          char* end = strings + limit;
+          while (strings < end) {
+            if (strcmp(strings, "HSA_AMD_TOOL_PRIORITY") == 0) {
+              loadedToolsLib->push_back(info->dlpi_name);
+              return 0;
+            }
+            strings += (strlen(strings) + 1);
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// Returns a handle for every loaded shared object that `callback` flagged.
+// Each handle comes from LoadLib, so it may be null if re-opening the
+// library fails.
+std::vector<LibHandle> GetLoadedToolsLib() {
+  // Collect the names of matching libraries among all loaded objects.
+  std::vector<std::string> tool_names;
+  dl_iterate_phdr(callback, &tool_names);
+
+  std::vector<LibHandle> handles;
+  for (const std::string& lib_name : tool_names) {
+    handles.push_back(LoadLib(lib_name));
+  }
+  return handles;
+}
+
+// Returns the l_name recorded in the library's link map, or an empty string
+// if dlinfo cannot provide the link map.
+std::string GetLibraryName(LibHandle lib) {
+  link_map* lib_map;
+  const bool have_map = (dlinfo(lib, RTLD_DI_LINKMAP, &lib_map) == 0);
+  if (!have_map) {
+    return "";
+  }
+  return lib_map->l_name;
+}
+
+// Allocates an unnamed, process-private semaphore with an initial count of 0.
+// The sem_init result is not checked.
+Semaphore CreateSemaphore() {
+  sem_t* native = new sem_t;
+  sem_init(native, /*pshared=*/0, /*value=*/0);
+  return *(Semaphore*)&native;
+}
+
+// Blocks until the semaphore is acquired, retrying when interrupted by a
+// signal (EINTR). Returns false on any other sem_wait error.
+bool WaitSemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  for (;;) {
+    if (sem_wait(native) == 0) return true;
+    if (errno != EINTR) return false;
+  }
+}
+
+// Increments the semaphore. A sem_post failure trips an assert.
+void PostSemaphore(Semaphore sem) {
+  const int status = sem_post(*(sem_t**)&sem);
+  if (status != 0) {
+    assert(false && "Failed to post semaphore");
+  }
+}
+
+// Destroys and frees a semaphore created by CreateSemaphore.
+void DestroySemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  sem_destroy(native);
+  delete native;
+}
+
+// Allocates a pthread mutex with default attributes. The pthread_mutex_init
+// result is not checked.
+Mutex CreateMutex() {
+  pthread_mutex_t* native = new pthread_mutex_t;
+  pthread_mutex_init(native, NULL);
+  return *(Mutex*)&native;
+}
+
+// Non-blocking lock attempt; true only if the mutex was acquired.
+bool TryAcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  const int status = pthread_mutex_trylock(native);
+  return status == 0;
+}
+
+// Blocking lock; true if pthread_mutex_lock reported success.
+bool AcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  const int status = pthread_mutex_lock(native);
+  return status == 0;
+}
+
+// Unlocks the mutex. The pthread_mutex_unlock result is ignored.
+void ReleaseMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_unlock(native);
+}
+
+// Destroys and frees a mutex created by CreateMutex.
+void DestroyMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_destroy(native);
+  delete native;
+}
+
+// Suspends the calling thread for the given number of milliseconds.
+void Sleep(int delay_in_millisec) {
+  const int delay_in_microsec = delay_in_millisec * 1000;
+  usleep(delay_in_microsec);
+}
+
+// Suspends the calling thread for the given number of microseconds.
+void uSleep(int delayInUs) {
+  usleep(delayInUs);
+}
+
+// Gives up the processor via sched_yield().
+void YieldThread() {
+  sched_yield();
+}
+
+// Starts a new thread running function(threadArgument).
+// Returns an opaque handle, or nullptr if the thread could not be started.
+Thread CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) {
+  os_thread* created = new os_thread(function, threadArgument, stackSize);
+  if (created->Valid()) {
+    return reinterpret_cast<Thread>(created);
+  }
+
+  // Start-up failed: discard the half-built container.
+  delete created;
+  return nullptr;
+}
+
+// Deletes the os_thread container behind a handle from CreateThread.
+void CloseThread(Thread thread) {
+  os_thread* container = reinterpret_cast<os_thread*>(thread);
+  delete container;
+}
+
+// Waits for the thread to finish; forwards the result of os_thread::Wait().
+bool WaitForThread(Thread thread) {
+  os_thread* container = reinterpret_cast<os_thread*>(thread);
+  return container->Wait();
+}
+
+// Waits on each of the threadCount handles in order.
+// Individual wait results are discarded; the function always returns true.
+bool WaitForAllThreads(Thread* threads, uint threadCount) {
+  uint index = 0;
+  while (index < threadCount) {
+    WaitForThread(threads[index]);
+    ++index;
+  }
+  return true;
+}
+
+// True if the environment variable exists (even when its value is empty).
+bool IsEnvVarSet(std::string env_var_name) {
+  const char* value = getenv(env_var_name.c_str());
+  return value != NULL;
+}
+
+// Sets the environment variable, replacing any existing value.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  const char* name = env_var_name.c_str();
+  const char* value = env_var_value.c_str();
+  setenv(name, value, /*overwrite=*/1);
+}
+
+// Returns the calling process's id as reported by getpid().
+int GetProcessId() {
+  const int pid = ::getpid();
+  return pid;
+}
+
+// Returns the variable's value, or an empty string when it is not set.
+std::string GetEnvVar(std::string env_var_name) {
+  const char* value = getenv(env_var_name.c_str());
+  if (value == NULL) {
+    return std::string();
+  }
+  return std::string(value);
+}
+
+// Size of the user-mode virtual address space assumed by the runtime.
+size_t GetUserModeVirtualMemorySize() {
+#ifdef _LP64
+  // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt :
+  // user space is 0000000000000000 - 00007fffffffffff (=47 bits)
+  const size_t user_space_bytes = static_cast<size_t>(1) << 47;
+#else
+  const size_t user_space_bytes = (size_t)(0xffffffff);  // ~4GB
+#endif
+  return user_space_bytes;
+}
+
+// Total RAM reported by sysinfo(), capped at the user-mode address-space
+// size. Returns 0 if sysinfo() fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  struct sysinfo mem_info = {0};
+  if (sysinfo(&mem_info) != 0) return 0;
+
+  // totalram is expressed in units of mem_unit bytes.
+  const size_t total_ram = static_cast<size_t>(mem_info.totalram * mem_info.mem_unit);
+  const size_t address_space = GetUserModeVirtualMemorySize();
+  return (total_ram < address_space) ? total_ram : address_space;
+}
+
+// Lowest user-mode virtual address; always 0 on this platform.
+uintptr_t GetUserModeVirtualMemoryBase() {
+  return static_cast<uintptr_t>(0);
+}
+
+// Os event implementation
+// An event is a boolean flag guarded by a mutex, with a condition variable
+// that waiters block on.
+typedef struct EventDescriptor_ {
+  pthread_cond_t event;   // waited on in WaitForOsEvent, signalled in SetOsEvent
+  pthread_mutex_t mutex;  // guards `state`
+  bool state;             // true = signaled
+  bool auto_reset;        // if true, a successful wait clears `state`
+} EventDescriptor;
+
+// Allocates an event object.
+//   auto_reset - when true, a successful wait clears the signaled state.
+//   init_state - initial signaled state.
+// Returns the event handle, or a null handle if the allocation fails (the
+// other event functions reject a null handle with -1).
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  EventDescriptor* eventDescrp;
+  eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor));
+
+  // Only initialise storage we actually got; malloc may return NULL.
+  if (eventDescrp != NULL) {
+    pthread_mutex_init(&eventDescrp->mutex, NULL);
+    pthread_cond_init(&eventDescrp->event, NULL);
+    eventDescrp->auto_reset = auto_reset;
+    eventDescrp->state = init_state;
+  }
+
+  EventHandle handle = reinterpret_cast<EventHandle>(eventDescrp);
+
+  return handle;
+}
+
+// Destroys an event created by CreateOsEvent and frees its storage.
+// Returns -1 for a null handle, otherwise the OR of the condition-variable
+// and mutex destroy results (0 when both succeed).
+int DestroyOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+  const int cond_status = pthread_cond_destroy(&descriptor->event);
+  const int mutex_status = pthread_mutex_destroy(&descriptor->mutex);
+  free(descriptor);
+  return cond_status | mutex_status;
+}
+
+// Waits for an event to become signaled.
+//   event         - handle from CreateOsEvent.
+//   milli_seconds - maximum wait; 0 polls without blocking.
+// Returns:
+//   -1       null handle
+//    0       event was signaled (state is cleared first for auto-reset events)
+//    1       poll (milli_seconds == 0) found the event unsignaled or the
+//            mutex busy
+//    0x14003 timed wait expired
+//    other   error code from pthread_cond_timedwait
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+
+  // A poll must not block, so it only tries the mutex. If the try succeeds
+  // we already own the mutex and must NOT lock it again: relocking a default
+  // mutex from the owning thread deadlocks.
+  bool locked = false;
+  if (milli_seconds == 0) {
+    int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex);
+    if (tmp_ret == EBUSY) {
+      // Timeout
+      return 1;
+    }
+    locked = (tmp_ret == 0);
+  }
+  if (!locked) {
+    pthread_mutex_lock(&eventDescrp->mutex);
+  }
+
+  int ret_code = 0;
+  if (!eventDescrp->state) {
+    if (milli_seconds == 0) {
+      ret_code = 1;
+    } else {
+      // Build the absolute deadline pthread_cond_timedwait expects.
+      struct timespec ts;
+      struct timeval tp;
+
+      gettimeofday(&tp, NULL);
+      ts.tv_sec = tp.tv_sec + milli_seconds / 1000;
+      ts.tv_nsec = tp.tv_usec * 1000 + long(milli_seconds % 1000) * 1000000;
+
+      // tv_nsec must stay below one second; carry the overflow into tv_sec.
+      if (ts.tv_nsec >= 1000000000) {
+        ts.tv_sec += 1;
+        ts.tv_nsec = ts.tv_nsec - 1000000000;
+      }
+
+      // Condition waits can wake spuriously, so re-check the state and keep
+      // waiting against the same deadline until it is set or an error/timeout
+      // is reported.
+      while (!eventDescrp->state && ret_code == 0) {
+        ret_code =
+            pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts);
+      }
+
+      // Time out
+      if (ret_code == ETIMEDOUT) {
+        ret_code = 0x14003;  // timeout code reported for timed waits
+      }
+
+      if (ret_code == 0 && eventDescrp->auto_reset) {
+        eventDescrp->state = false;
+      }
+    }
+  } else if (eventDescrp->auto_reset) {
+    eventDescrp->state = false;
+  }
+  pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Marks the event signaled and wakes one waiter.
+// Returns -1 for a null handle, otherwise the OR of the mutex-unlock and
+// condition-signal results (the lock result is not reported).
+int SetOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+
+  pthread_mutex_lock(&descriptor->mutex);
+  descriptor->state = true;
+  int status = pthread_mutex_unlock(&descriptor->mutex);
+
+  // The signal is issued after the mutex has been released.
+  status |= pthread_cond_signal(&descriptor->event);
+  return status;
+}
+
+// Clears the event's signaled state.
+// Returns -1 for a null handle, otherwise the mutex-unlock result (the lock
+// result is not reported).
+int ResetOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+
+  pthread_mutex_lock(&descriptor->mutex);
+  descriptor->state = false;
+  const int status = pthread_mutex_unlock(&descriptor->mutex);
+  return status;
+}
+
+// Reciprocal of the clock period in nanoseconds; 0.0 until
+// AccurateClockFrequency() has filled it in.
+static double invPeriod = 0.0;
+
+// Reads CLOCK_MONOTONIC_RAW and scales the nanosecond reading by invPeriod.
+// Aborts the process if the clock cannot be read.
+uint64_t ReadAccurateClock() {
+  // First use: let AccurateClockFrequency() compute the scale factor.
+  if (invPeriod == 0.0) {
+    AccurateClockFrequency();
+  }
+
+  timespec now;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
+    pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno));
+    abort();
+  }
+
+  const uint64_t nanoseconds = uint64_t(now.tv_sec) * 1000000000ull + uint64_t(now.tv_nsec);
+  return nanoseconds * invPeriod;
+}
+
+// Returns the frequency (ticks per second) of the accurate clock, derived
+// from the resolution reported by clock_getres(). Also initialises invPeriod,
+// the scale factor ReadAccurateClock() applies. Aborts if the resolution
+// cannot be queried or is one second or coarser.
+uint64_t AccurateClockFrequency() {
+  static clockid_t clock = CLOCK_MONOTONIC;
+  static std::atomic<bool> first(true);
+  // Check kernel version - not a concurrency concern.
+  // use non-RAW for getres due to bug in older 2.6.x kernels
+  if (first.load(std::memory_order_acquire)) {
+    utsname kernelInfo;
+    if (uname(&kernelInfo) == 0) {
+      try {
+        std::string ver = kernelInfo.release;
+        size_t idx;
+        int major = std::stoi(ver, &idx);
+        int minor = std::stoi(ver.substr(idx + 1));
+        // Kernel 4.4 or newer. Compare as a (major, minor) pair: testing both
+        // components with >= would wrongly reject 5.0-5.3, 6.0-6.3 and so on.
+        if ((major > 4) || ((major == 4) && (minor >= 4))) {
+          clock = CLOCK_MONOTONIC_RAW;
+        }
+      } catch (...) {
+        // Kernel version string doesn't conform to the standard pattern.
+        // Keep using the "safe" (non-RAW) clock.
+      }
+    }
+    first.store(false, std::memory_order_release);
+  }
+  timespec time;
+  int err = clock_getres(clock, &time);
+  if (err != 0) {
+    pr_err("clock_getres failed %s\n", strerror(errno));
+    abort();
+  }
+  if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) {
+    pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n");
+    abort();
+  }
+  // Computed once; later calls keep the first value.
+  if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
+  return 1000000000ull / uint64_t(time.tv_nsec);
+}
+
+// Creates a reader/writer lock configured with
+// PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP.
+// Returns the lock handle, or nullptr on failure. All intermediate resources
+// (the attribute object and the lock storage) are released on every path.
+SharedMutex CreateSharedMutex() {
+  pthread_rwlockattr_t attrib;
+  int err = pthread_rwlockattr_init(&attrib);
+  if (err != 0) {
+    pr_err("rw lock attribute init failed: %s\n", strerror(err));
+    return nullptr;
+  }
+
+#ifdef __GLIBC__
+  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#else
+  err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#endif
+  if (err != 0) {
+    pr_err("Set rw lock attribute failure: %s\n", strerror(err));
+    pthread_rwlockattr_destroy(&attrib);
+    return nullptr;
+  }
+
+  pthread_rwlock_t* lock = new pthread_rwlock_t;
+  err = pthread_rwlock_init(lock, &attrib);
+  // The attribute object is no longer needed whether or not init succeeded.
+  pthread_rwlockattr_destroy(&attrib);
+  if (err != 0) {
+    pr_err("rw lock init failed: %s\n", strerror(err));
+    delete lock;
+    return nullptr;
+  }
+
+  return lock;
+}
+
+// Non-blocking attempt to take the lock for writing (exclusive).
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_trywrlock(rwlock) == 0;
+}
+
+// Blocks until the lock is held for writing (exclusive).
+bool AcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_wrlock(rwlock) == 0;
+}
+
+// Releases a write (exclusive) hold. Aborts the process if the unlock fails.
+void ReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  const int status = pthread_rwlock_unlock(rwlock);
+  if (status == 0) return;
+
+  pr_err("SharedMutex unlock failed: %s\n", strerror(status));
+  abort();
+}
+
+// Non-blocking attempt to take the lock for reading (shared).
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_tryrdlock(rwlock) == 0;
+}
+
+// Blocks until the lock is held for reading (shared).
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_rdlock(rwlock) == 0;
+}
+
+// Releases a read (shared) hold. Aborts the process if the unlock fails.
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  const int status = pthread_rwlock_unlock(rwlock);
+  if (status == 0) return;
+
+  pr_err("SharedMutex unlock failed: %s\n", strerror(status));
+  abort();
+}
+
+// Destroys and frees a lock created by CreateSharedMutex.
+void DestroySharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* rwlock = *(pthread_rwlock_t**)&lock;
+  pthread_rwlock_destroy(rwlock);
+  delete rwlock;
+}
+
+// CLOCK_BOOTTIME resolution in nanoseconds; 0 until SystemClockFrequency()
+// has been called.
+static uint64_t sys_clock_period_ = 0;
+
+// Reads CLOCK_BOOTTIME and returns it in units of the clock's resolution
+// (nanoseconds divided by sys_clock_period_).
+uint64_t ReadSystemClock() {
+  // The period starts out as 0; query it on first use so the division below
+  // cannot divide by zero when this is called before SystemClockFrequency().
+  if (sys_clock_period_ == 0) SystemClockFrequency();
+
+  struct timespec ts;
+  clock_gettime(CLOCK_BOOTTIME, &ts);
+  uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+  // A period of 1 ns needs no scaling; a period still at 0 means the
+  // resolution is unknown, so fall back to raw nanoseconds.
+  if (sys_clock_period_ > 1)
+    return time / sys_clock_period_;
+  else
+    return time;
+}
+
+// Queries the CLOCK_BOOTTIME resolution, caches it (in nanoseconds) in
+// sys_clock_period_, and returns the matching frequency in ticks per second.
+// The clock_getres result is not checked.
+uint64_t SystemClockFrequency() {
+  struct timespec resolution;
+  clock_getres(CLOCK_BOOTTIME, &resolution);
+
+  const uint64_t period_ns =
+      uint64_t(resolution.tv_sec) * 1000000000 + uint64_t(resolution.tv_nsec);
+  sys_clock_period_ = period_ns;
+  return 1000000000 / period_ns;
+}
+
+// Fills *cpuinfo from the CPUID instruction.
+// On x86/x86-64: zeroes the structure, reads the manufacturer ID string and,
+// for "AuthenticAMD" parts, the mwaitx capability bit. Returns false if a
+// CPUID query is rejected. On other architectures it returns false without
+// touching *cpuinfo.
+bool ParseCpuID(cpuid_t* cpuinfo) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t eax, ebx, ecx, edx, max_eax = 0;
+  memset(cpuinfo, 0, sizeof(*cpuinfo));
+
+  /* Make sure current CPU supports at least EAX 4 */
+  // NOTE(review): __get_cpuid_max is documented to take 0 or 0x80000000 as
+  // its first argument; confirm that passing 0x80000004 is intended.
+  if (!__get_cpuid_max(0x80000004, NULL)) return false;
+
+  // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX.
+  // The output pointers are passed in (eax, ebx, ecx, edx) order, so EBX
+  // lands at offset 0, ECX at offset 8 and EDX at offset 4.
+  if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0],
+                   (uint32_t*)&cpuinfo->ManufacturerID[8],
+                   (uint32_t*)&cpuinfo->ManufacturerID[4])) {
+    return false;
+  }
+
+  // The strcmp relies on the memset above to terminate the 12-byte string
+  // (assumes ManufacturerID has room for the terminator - TODO confirm).
+  if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) {
+    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
+      // Bit 29 of ECX from leaf 0x80000001.
+      cpuinfo->mwaitx = !!((ecx >> 29) & 0x1);
+    }
+  }
+  return true;
+#else
+  return false;
+#endif
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
new file mode 100644
index 0000000000..a17fa09593
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
@@ -0,0 +1,290 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Library of synchronization primitives - to be added to as needed.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+
+#include "utils.h"
+#include "os.h"
+
+namespace wsl {
+
+/// @brief: a mutex that spins before it blocks.
+/// Acquire() first retries with _mm_pause(), then retries while yielding the
+/// thread, and finally waits on a semaphore that Release() posts.
+class HybridMutex {
+ public:
+  HybridMutex() : lock_(0) { sem_ = os::CreateSemaphore(); }
+
+  ~HybridMutex() { os::DestroySemaphore(sem_); }
+
+  /// @brief: Makes a single attempt to take the lock.
+  /// @return: bool, true if the lock was taken.
+  bool Try() {
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+
+  /// @brief: Takes the lock, waiting for as long as it is held elsewhere.
+  /// @return: bool, always true.
+  bool Acquire() {
+    const int yieldBudget = static_cast<int>(maxSpinIterYield);
+    const int spinBudget = static_cast<int>(maxSpinIterPause) + yieldBudget;
+    int cnt = spinBudget;
+
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1)) {
+      cnt--;
+      if (cnt > yieldBudget) {
+        // Pause phase: lasts while more than the yield budget remains.
+        _mm_pause();
+      } else if (cnt > 0) {
+        // Yield phase: the previous test (cnt-- > maxSpinIterYield) could never
+        // hold once the pause phase was over, so this branch was unreachable.
+        os::YieldThread();
+      } else {
+        // Budget exhausted: block until a Release() posts, then start over.
+        os::WaitSemaphore(sem_);
+        cnt = spinBudget;
+      }
+      // A failed compare_exchange_strong overwrote old with the observed value.
+      old = 0;
+    }
+    return true;
+  }
+
+  /// @brief: Clears the lock and posts the semaphore if the lock was held.
+  void Release() {
+    int old = 1;
+    if (lock_.compare_exchange_strong(old, 0))
+      os::PostSemaphore(sem_);
+  }
+
+ private:
+  std::atomic<int> lock_;
+  os::Semaphore sem_;
+  const uint32_t maxSpinIterPause = 55;
+  const uint32_t maxSpinIterYield = 55;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(HybridMutex);
+};
+
+
+/// @brief: mutex backed by an os::Mutex handle.
+/// Waiting threads are kept off the scheduler by the kernel until the lock is
+/// released (best for long waits, though anything using a kernel object is a
+/// long wait).
+class KernelMutex {
+ public:
+  KernelMutex() : lock_(os::CreateMutex()) {}
+
+  ~KernelMutex() {
+    os::DestroyMutex(lock_);
+  }
+
+  /// @brief: Single attempt to take the mutex; forwards to os::TryAcquireMutex.
+  bool Try() {
+    return os::TryAcquireMutex(lock_);
+  }
+
+  /// @brief: Takes the mutex; forwards to os::AcquireMutex.
+  bool Acquire() {
+    return os::AcquireMutex(lock_);
+  }
+
+  /// @brief: Gives the mutex back; forwards to os::ReleaseMutex.
+  void Release() {
+    os::ReleaseMutex(lock_);
+  }
+
+ private:
+  os::Mutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
+};
+
+/// @brief: a spin lock.
+/// Meant for very short hold durations, on the order of the thread scheduling
+/// quanta or less.
+class SpinMutex {
+ public:
+  SpinMutex() : lock_(0) {}
+
+  /// @brief: Single attempt to flip the lock word from 0 to 1.
+  bool Try() {
+    int expected = 0;
+    return lock_.compare_exchange_strong(expected, 1);
+  }
+
+  /// @brief: Retries the 0 -> 1 exchange, yielding the thread after each miss.
+  /// @return: bool, always true.
+  bool Acquire() {
+    for (;;) {
+      int expected = 0;
+      if (lock_.compare_exchange_strong(expected, 1)) return true;
+      os::YieldThread();
+    }
+  }
+
+  /// @brief: Stores 0 into the lock word.
+  void Release() { lock_.store(0); }
+
+ private:
+  std::atomic<int> lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
+};
+
+/// @brief: wrapper around an os::EventHandle.
+/// The event is created with auto_reset = true and init_state = true.
+class KernelEvent {
+ public:
+  KernelEvent() : evt_(os::CreateOsEvent(true, true)) {}
+
+  ~KernelEvent() {
+    os::DestroyOsEvent(evt_);
+  }
+
+  /// @brief: Waits with a timeout of 0; true when that wait returns 0.
+  bool IsSet() {
+    const unsigned int timeout_ms = 0;
+    return 0 == os::WaitForOsEvent(evt_, timeout_ms);
+  }
+
+  /// @brief: Waits with a timeout of 0xFFFFFFFF; true when that wait returns 0.
+  bool WaitForSet() {
+    const unsigned int timeout_ms = 0xFFFFFFFF;
+    return 0 == os::WaitForOsEvent(evt_, timeout_ms);
+  }
+
+  void Set() {
+    os::SetOsEvent(evt_);
+  }
+
+  void Reset() {
+    os::ResetOsEvent(evt_);
+  }
+
+ private:
+  os::EventHandle evt_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
+};
+
+/// @brief: a yielding shared mutex (aka read/write mutex) backed by an
+/// os::SharedMutex handle.
+class KernelSharedMutex {
+ public:
+  /// @brief: Adapter exposing the shared-mode operations under the plain
+  /// Try/Acquire/Release names, so ScopedAcquire can drive them.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+
+    bool Try() {
+      return lock_->TryShared();
+    }
+    bool Acquire() {
+      return lock_->AcquireShared();
+    }
+    void Release() {
+      lock_->ReleaseShared();
+    }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() : lock_(os::CreateSharedMutex()) {}
+
+  ~KernelSharedMutex() {
+    os::DestroySharedMutex(lock_);
+  }
+
+  // Exclusive mode operations
+  bool Try() {
+    return os::TryAcquireSharedMutex(lock_);
+  }
+  bool Acquire() {
+    return os::AcquireSharedMutex(lock_);
+  }
+  void Release() {
+    os::ReleaseSharedMutex(lock_);
+  }
+
+  // Shared mode operations
+  bool TryShared() {
+    return os::TrySharedAcquireSharedMutex(lock_);
+  }
+  bool AcquireShared() {
+    return os::SharedAcquireSharedMutex(lock_);
+  }
+  void ReleaseShared() {
+    os::SharedReleaseSharedMutex(lock_);
+  }
+
+  /// @brief: Returns a Shared adapter bound to this mutex.
+  Shared shared() {
+    return Shared(this);
+  }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+/// @brief: Type trait identifying mutex types. `value` is false in general and
+/// true for each lock class specialized below.
+template <class T> class isMutex { public: enum { value = false }; };
+
+template <> class isMutex<HybridMutex> { public: enum { value = true }; };
+
+template <> class isMutex<KernelMutex> { public: enum { value = true }; };
+
+template <> class isMutex<SpinMutex> { public: enum { value = true }; };
+
+template <> class isMutex<KernelSharedMutex> { public: enum { value = true }; };
+
+/// @brief: Scope guard for a lock. Constructing the guard acquires the lock;
+/// when the guard is destroyed the lock is released, unless Release() was
+/// already called on the guard.
+template <class LockType> class ScopedAcquire {
+ private:
+  /// @brief: Holder used when LockType is a mutex class: keeps a pointer to it.
+  template <class M, bool IsMutexClass> class container {
+   public:
+    container(M* lock) : target_(lock) {}
+    __forceinline bool Acquire() { return target_->Acquire(); }
+    __forceinline void Release() { target_->Release(); }
+
+   private:
+    M* target_;
+  };
+
+  /// @brief: Holder used for every other LockType: keeps the object by value.
+  template <class M> class container<M, false> {
+   public:
+    container(M lock) : target_(lock) {}
+    __forceinline bool Acquire() { return target_.Acquire(); }
+    __forceinline void Release() { target_.Release(); }
+
+   private:
+    M target_;
+  };
+
+ public:
+  /// @brief: Acquires on construction.
+  /// @param: lock(Input), pointer to an existing mutex object.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+
+  /// @brief: Acquires on construction.
+  /// @param: lock(Input), a non-mutex lock object, stored by value.
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: Releases the lock if it has not been released early.
+  ~ScopedAcquire() {
+    if (doRelease) {
+      lock_.Release();
+    }
+  }
+
+  /// @brief: Releases the lock early and disarms the destructor.  Avoid using
+  /// when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  container<LockType, isMutex<LockType>::value> lock_;
+  bool doRelease;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
new file mode 100644
index 0000000000..2f40cd1581
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Minimal operating system abstraction interfaces.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
+#define HSA_RUNTIME_CORE_UTIL_OS_H_
+
+#include <string>
+#include <vector>
+#include "utils.h"
+
+namespace wsl {
+namespace os {
+typedef void* LibHandle;
+typedef void* Semaphore;
+typedef void* Mutex;
+typedef void* SharedMutex;
+typedef void* Thread;
+typedef void* EventHandle;
+
+enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
+/// @brief: Converts an os_t enumerator to its underlying integer value.
+static __forceinline std::underlying_type<os_t>::type os_index(os_t val) {
+  typedef std::underlying_type<os_t>::type index_t;
+  return static_cast<index_t>(val);
+}
+
+#ifdef _WIN32
+static const os_t current_os = os_t::OS_WIN;
+#elif __linux__
+static const os_t current_os = os_t::OS_LINUX;
+#else
+static_assert(false, "Operating System not detected!");
+#endif
+
+/// @brief: Loads dynamic library based on file name. Return value will be NULL
+/// if failed.
+/// @param: filename(Input), file name of the library.
+/// @return: LibHandle.
+LibHandle LoadLib(std::string filename);
+
+/// @brief: Gets the address of exported symbol. Returns NULL if failed.
+/// @param: lib(Input), library handle which exporting from.
+/// @param: export_name(Input), the name of the exported symbol.
+/// @return: void*.
+void* GetExportAddress(LibHandle lib, std::string export_name);
+
+/// @brief: Unloads the dynamic library.
+/// @param: lib(Input), library handle which will be unloaded.
+void CloseLib(LibHandle lib);
+
+/// @brief: Lists loaded tool libraries that contain
+/// symbol HSA_AMD_TOOL_PRIORITY
+/// @return: List of library handles
+std::vector<LibHandle> GetLoadedToolsLib();
+
+/// @brief: Returns the library's path name.
+/// @param: lib(Input), library handle
+/// @return: Path name of library
+std::string GetLibraryName(LibHandle lib);
+
+/// @brief: Creates a Semaphore, will return NULL if failed.
+/// @param: void.
+/// @return: Semaphore.
+Semaphore CreateSemaphore();
+
+/// @brief: Waits for the semaphore. This is a blocking wait.
+/// If the Semaphore is signalled, this function will return.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: bool.
+bool WaitSemaphore(Semaphore sem);
+
+/// @brief: Post/Signal/Wake-up the semaphore
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void PostSemaphore(Semaphore sem);
+
+/// @brief: Destroys the semaphore.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void DestroySemaphore(Semaphore sem);
+
+/// @brief: Creates a mutex, will return NULL if failed.
+/// @param: void.
+/// @return: Mutex.
+Mutex CreateMutex();
+
+/// @brief: Tries to acquire the mutex once, if successful, returns true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TryAcquireMutex(Mutex lock);
+
+/// @brief: Acquires the mutex, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireMutex(Mutex lock);
+
+/// @brief: Releases the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseMutex(Mutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroyMutex(Mutex lock);
+
+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once, if successful, returns true.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once, if successful, returns true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode, if the mutex is in exclusive mode, it will wait until it
+/// is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInMs(Input), time in millisecond for sleeping.
+/// @return: void.
+void Sleep(int delayInMs);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInUs(Input), time in microseconds for sleeping.
+/// @return: void.
+void uSleep(int delayInUs);
+
+/// @brief: Yields current thread.
+/// @param: void.
+/// @return: void.
+void YieldThread();
+
+typedef void (*ThreadEntry)(void*);
+
+/// @brief: Creates a thread will return NULL if failed.
+/// @param: entry_function(Input), a pointer to the function which the thread
+/// starts from.
+/// @param: entry_argument(Input), a pointer to the argument of the thread
+/// function.
+/// @param: stack_size(Input), size of the thread's stack, 0 by default.
+/// @return: Thread, a handle to thread created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size = 0);
+
+/// @brief: Destroys the thread.
+/// @param: thread(Input), thread handle to what will be destroyed.
+/// @return: void.
+void CloseThread(Thread thread);
+
+/// @brief: Waits for specific thread to finish, if successful, return true.
+/// @param: thread(Input), handle to waiting thread.
+/// @return: bool.
+bool WaitForThread(Thread thread);
+
+/// @brief: Waits for multiple threads to finish, if successful, return true.
+/// @param: threads(Input), a pointer to a list of thread handle.
+/// @param: thread_count(Input), number of threads to be waited on.
+/// @return: bool.
+bool WaitForAllThreads(Thread* threads, uint thread_count);
+
+/// @brief: Determines if environment key is set.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: bool, true for binding any value to environment key,
+/// including an empty string. False otherwise
+bool IsEnvVarSet(std::string env_var_name);
+
+/// @brief: Sets the environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @param: env_var_value(Input), value of the environment variable.
+/// @return: void.
+void SetEnvVar(std::string env_var_name, std::string env_var_value);
+
+/// @brief: Gets the value of environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: std::string, value of the environment value, returned as string.
+std::string GetEnvVar(std::string env_var_name);
+
+/// @brief: Gets the process ID.
+/// @param: void
+/// @return: int, process ID returned as int.
+int GetProcessId();
+
+/// @brief: Gets the max virtual memory size accessible to the application.
+/// @param: void.
+/// @return: size_t, size of the accessible memory to the application.
+size_t GetUserModeVirtualMemorySize();
+
+/// @brief: Gets the max physical host system memory size.
+/// @param: void.
+/// @return: size_t, size of the physical host system memory.
+size_t GetUsablePhysicalHostMemorySize();
+
+/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
+/// @param: void.
+/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase();
+
+/// @brief os event api, create an event
+/// @param: auto_reset whether an event can reset the status automatically
+/// @param: init_state initial state of the event
+/// @return: event handle
+EventHandle CreateOsEvent(bool auto_reset, bool init_state);
+
+/// @brief os event api, destroy an event
+/// @param: event handle
+/// @return: whether destroy is correct
+int DestroyOsEvent(EventHandle event);
+
+/// @brief os event api, wait on event
+/// @param: event Event handle
+/// @param: milli_seconds wait time
+/// @return: Indicate success or timeout
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
+
+/// @brief os event api, set event state
+/// @param: event Event handle
+/// @return: Whether event set is correct
+int SetOsEvent(EventHandle event);
+
+/// @brief os event api, reset event state
+/// @param: event Event handle
+/// @return: Whether event reset is correct
+int ResetOsEvent(EventHandle event);
+
+/// @brief reads a clock which is deemed to be accurate for elapsed time
+/// measurements, though not necessarily fast to query
+/// @return clock counter value
+uint64_t ReadAccurateClock();
+
+/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
+/// It does not necessarily reflect the resolution of the clock, but is the
+/// value needed to convert a difference in the clock's counter value to elapsed
+/// seconds.  This frequency does not change at runtime.
+/// @return returns the frequency
+uint64_t AccurateClockFrequency();
+
+/// @brief read the system clock which serves as the HSA system clock
+/// counter in KFD.
+uint64_t ReadSystemClock();
+
+/// @brief read the system clock frequency
+uint64_t SystemClockFrequency();
+
+typedef struct cpuid_s {
+  char ManufacturerID[13];  // 12 char, NULL terminated
+  bool mwaitx;  // Set by ParseCpuID for "AuthenticAMD" parts from CPUID leaf 0x80000001, ECX bit 29
+} cpuid_t;
+
+/// @brief parse CPUID
+/// @param: cpuinfo struct to be filled
+bool ParseCpuID(cpuid_t* cpuinfo);
+
+}   //  namespace os
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_OS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
new file mode 100644
index 0000000000..1fb992eb63
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
@@ -0,0 +1,394 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple best fit memory allocator with eager compaction.  Manages block sub-allocation.
+// For use when memory efficiency is more important than allocation speed.
+// O(log n) time.
+
+#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+
+#include <map>
+#include <deque>
+#include <utility>
+
+
+namespace wsl {
+
+template <typename Allocator> class SimpleHeap {
+ private:
+  struct Fragment_T {  // Per-fragment record stored in block_list_.
+    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
+    ptr_t free_list_entry_;  // This fragment's entry in free_list_, or free_list_.end() when it has none.
+    struct {
+      size_t size : 62;  // Fragment length (alloc() stores byte counts here).
+      bool discard : 1;  // Set by discard(); free() then keeps the fragment off free_list_.
+      bool free : 1;     // True while the fragment is unallocated (see isFree()).
+    };
+
+    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
+        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
+    Fragment_T() = default;  // Needed because records are created through std::map::operator[].
+  };
+
+  // Base address and length of one whole block, as stored in block_cache_.
+  struct Block {
+    uintptr_t base_ptr_;
+    size_t length_;
+
+    Block() = default;
+    Block(uintptr_t base, size_t length)
+        : base_ptr_(base),
+          length_(length) {}
+  };
+
+  Allocator block_allocator_;  // Supplies and takes back whole blocks (alloc / free / block_size).
+
+  std::multimap<size_t, uintptr_t> free_list_;  // Free fragments: size -> start address.
+  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;  // Block base -> (fragment start -> record).
+  std::deque<Block> block_cache_;  // Wholly free blocks kept for reuse by alloc().
+
+  // Size of blocks that are at least partially in use.
+  size_t in_use_size_;
+  // Total size of block cache
+  size_t cache_size_;
+
+  // True when the fragment is currently marked free.
+  __forceinline bool isFree(const Fragment_T& node) {
+    return node.free;
+  }
+
+  // Marks the fragment as in use and forgets its free_list_ position
+  // (the free_list_ entry itself is not erased here).
+  __forceinline void setUsed(Fragment_T& node) {
+    node.free_list_entry_ = free_list_.end();
+    node.free = false;
+  }
+
+  // Marks the fragment free and records where its free_list_ entry lives.
+  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
+    node.free = true;
+    node.free_list_entry_ = Iterator;
+  }
+
+  // Builds the record for an in-use fragment of Len bytes.
+  __forceinline Fragment_T makeFragment(size_t Len) {
+    Fragment_T used(free_list_.end(), Len, false);
+    return used;
+  }
+
+  // Builds the record for a free fragment of Len bytes whose free_list_ entry is Iterator.
+  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
+    Fragment_T unused(Iterator, Len, true);
+    return unused;
+  }
+
+  // Erases the fragment's free_list_ entry, if it has one.
+  __forceinline void removeFreeListEntry(Fragment_T& node) {
+    if (node.free_list_entry_ == free_list_.end()) return;
+
+    free_list_.erase(node.free_list_entry_);
+    node.free_list_entry_ = free_list_.end();
+  }
+
+  // Takes the fragment off the free list and flags it for discard.
+  __forceinline void discard(Fragment_T& node) {
+    removeFreeListEntry(node);
+    node.discard = true;
+  }
+
+ public:
+  // Builds an empty heap that obtains and returns blocks through a copy of BlockAllocator.
+  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
+      : block_allocator_(BlockAllocator),
+        in_use_size_(0),
+        cache_size_(0) {}
+
+  // Hands every cached block back to the allocator; blocks still in use are not touched.
+  ~SimpleHeap() {
+    // Leak here may be due to the user.  Check is for debugging only.
+    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
+    trim();
+  }
+
+  // Neither copyable nor movable.
+  SimpleHeap(const SimpleHeap&) = delete;
+  SimpleHeap(SimpleHeap&&) = delete;
+  SimpleHeap& operator=(const SimpleHeap&) = delete;
+  SimpleHeap& operator=(SimpleHeap&&) = delete;
+
+  void* alloc(size_t bytes) {
+    // Returns `bytes` of heap memory, or nullptr if the block allocator returns nullptr. Find best fit.
+    uintptr_t base;
+    size_t size;
+    // Requests of at least GPU_HUGE_PAGE_SIZE are aligned to it; smaller ones to DEFAULT_GPU_PAGE_SIZE.
+    size_t align_bytes = bytes;  // Size used for the free-list lookup; grows by `align` on a retry.
+    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;  // Extra lookup passes allowed.
+    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;
+
+    for (int i = 0; i <= retry; i++) {
+      auto free_fragment = free_list_.lower_bound(align_bytes);  // Smallest free fragment >= align_bytes.
+      if (free_fragment == free_list_.end()) break;
+
+      uintptr_t addr = free_fragment->second;
+      size = free_fragment->first;
+
+      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");
+
+      // Find the containing block and fragment
+      auto it = block_list_.upper_bound(addr);
+      it--;
+      auto& frag_map = it->second;
+      const auto& fragment = frag_map.find(addr);
+
+      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
+      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");
+
+      size_t delta = addr & (align - 1);  // Offset of addr past the previous aligned address.
+      if (!delta) {
+        // The fragment already starts at an aligned address.
+        base = addr;
+        free_list_.erase(free_fragment);
+        // Sub-allocate from fragment.
+        fragment->second.size = bytes;
+        setUsed(fragment->second);
+        // Record remaining free space.
+        if (size > bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
+        }
+      } else {
+        // On the first pass, if the fragment cannot hold `bytes` once the start is moved up to the
+        // next aligned address, look again for a fragment that is `align` bytes larger.
+        if (i == 0 && size < bytes + align - delta) {
+          align_bytes += align;
+          continue;
+        }
+
+        uintptr_t aligned_base = addr + align - delta;
+        base = aligned_base;
+
+        // Erase the old free list
+        free_list_.erase(free_fragment);
+
+        // fragment 1 - free (the gap in front of the aligned address)
+        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
+        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);
+
+        // fragment 2 - used (the range handed to the caller)
+        frag_map[base] = makeFragment(bytes);
+
+        // fragment 3 - free (whatever is left behind the used range)
+        if (size > aligned_base - addr + bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
+          frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
+        }
+      }
+      return reinterpret_cast<void*>(base);
+    }
+
+    // No usable fragment, check block cache
+    if (bytes < default_block_size() && !block_cache_.empty()) {
+      const auto& block = block_cache_.back();
+      base = block.base_ptr_;
+      size = block.length_;
+      block_cache_.pop_back();  // `block` dangles from here on; base and size were copied above.
+      cache_size_ -= size;
+    } else {  // Alloc new block - new block may be larger than default.
+      void* ptr = block_allocator_.alloc(bytes, size);  // The allocator reports the block length through `size`.
+      if (ptr == nullptr) {
+        fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n");
+        return nullptr;
+      }
+      base = reinterpret_cast<uintptr_t>(ptr);
+    }
+
+    in_use_size_ += size;
+    assert(size >= bytes && "Alloc exceeds block size.");
+    // Sub alloc and insert free region.
+    if (size > bytes) {
+      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
+    }
+    // Track used region
+    block_list_[base][base] = makeFragment(bytes);
+
+    // Disallow multiple suballocation from large blocks.
+    // Prevents a small allocation from retaining a large block.
+    if (bytes > default_block_size()) {
+      bool err = discardBlock(reinterpret_cast<void*>(base));  // True means success, despite the name.
+      assert(err && "Large block discard failed.");
+    }
+
+    return reinterpret_cast<void*>(base);
+  }
+
+  /* Looks up the block that holds ptr. Returns that block's base address when ptr is the
+   * start of an in-use fragment of this heap; returns nullptr for a null ptr, an address
+   * below every block, an address that is not a fragment start, or a free fragment. */
+  void* block_base(void* ptr) {
+    if (ptr == nullptr) return nullptr;
+
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+
+    // The candidate block is the last one whose base is <= addr.
+    auto owner = block_list_.upper_bound(addr);
+    if (owner == block_list_.begin()) return nullptr;
+    --owner;
+
+    auto& fragments = owner->second;
+    auto hit = fragments.find(addr);
+    const bool in_use = (hit != fragments.end()) && !isFree(hit->second);
+
+    return in_use ? reinterpret_cast<void*>(owner->first) : nullptr;
+  }
+
+  // Drops all bookkeeping and zeroes the size counters.
+  // Nothing is handed back to block_allocator_ here.
+  void reset() {
+    in_use_size_ = 0;
+    cache_size_ = 0;
+    block_cache_.clear();
+    block_list_.clear();
+    free_list_.clear();
+  }
+
+  // Returns the fragment starting at ptr to the heap.
+  // Adjacent free fragments are merged. When the whole block becomes free it is either handed
+  // back to the allocator (discarded blocks) or moved to the block cache.
+  // Returns true on success and for a null ptr; false when ptr is not the start of an in-use
+  // fragment of this heap (including a repeated free of the same pointer).
+  bool free(void* ptr) {
+    if (ptr == nullptr) return true;
+
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second)) return false;
+
+    bool discard = fragment->second.discard;
+
+    // Merge lower
+    if (fragment != frag_map.begin()) {
+      auto lower = fragment;
+      lower--;
+      if (isFree(lower->second)) {
+        removeFreeListEntry(lower->second);
+        lower->second.size += fragment->second.size;
+        frag_map.erase(fragment);
+        fragment = lower;
+      }
+    }
+
+    // Merge upper
+    {
+      auto upper = fragment;
+      upper++;
+      if ((upper != frag_map.end()) && isFree(upper->second)) {
+        removeFreeListEntry(upper->second);
+        fragment->second.size += upper->second.size;
+        frag_map.erase(upper);
+      }
+    }
+
+    // Release whole free blocks.
+    if (frag_map.size() == 1) {
+      Block block(fragment->first, fragment->second.size);
+      block_list_.erase(frag_map_it);
+
+      // Discard or add to the block cache.
+      if (discard) {
+        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+      } else {
+        block_cache_.push_back(block);
+        cache_size_ += block.length_;
+        in_use_size_ -= block.length_;
+      }
+
+      balance();
+
+      // Don't publish free space since block was moved to the cache.
+      return true;
+    }
+
+    // Don't report free memory if discarding the fragment, but still mark it free (without a
+    // free-list entry). Otherwise later frees of its neighbours could never merge with it, the
+    // block would never shrink to a single fragment, and it would never be released.
+    if (discard) {
+      setFree(fragment->second, free_list_.end());
+      return true;
+    }
+
+    // Report free fragment
+    const auto& freeEntry =
+        free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
+    setFree(fragment->second, freeEntry);
+
+    return true;
+  }
+
+  // Shrinks the block cache from its front while it holds more than one block and its total
+  // size exceeds twice the in-use size.
+  void balance() {
+    for (;;) {
+      if (block_cache_.size() <= 1) break;
+      if (cache_size_ <= in_use_size_ * 2) break;
+
+      const Block oldest = block_cache_.front();
+      block_allocator_.free(reinterpret_cast<void*>(oldest.base_ptr_), oldest.length_);
+      cache_size_ -= oldest.length_;
+      block_cache_.pop_front();
+    }
+  }
+
+  // Returns every cached block to the allocator and empties the cache.
+  void trim() {
+    for (auto it = block_cache_.begin(); it != block_cache_.end(); ++it) {
+      block_allocator_.free(reinterpret_cast<void*>(it->base_ptr_), it->length_);
+    }
+    cache_size_ = 0;
+    block_cache_.clear();
+  }
+
+  // Total size of the blocks currently held in the block cache.
+  size_t cache_size() const {
+    return cache_size_;
+  }
+
+  // Block size reported by the block allocator.
+  size_t default_block_size() const {
+    return block_allocator_.block_size();
+  }
+
+  // Retires the block containing ptr: none of its fragments stay on the free list, so nothing
+  // further is allocated from it, and free() will not place it in the block cache.
+  // Returns true on success, for a null ptr and for an already discarded block; false when ptr
+  // does not fall inside any block of this heap.
+  bool discardBlock(void* ptr) {
+    if (ptr == nullptr) return true;
+
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+
+    // Candidate block: the last one whose base is <= addr.
+    auto owner = block_list_.upper_bound(addr);
+    if (owner == block_list_.begin()) return false;
+    --owner;
+    auto& fragments = owner->second;
+
+    // Reject addresses outside [first fragment start, last fragment end).
+    const auto& first = *fragments.begin();
+    const auto& last = *fragments.rbegin();
+    const bool below = addr < first.first;
+    const bool above = last.first + last.second.size <= addr;
+    if (below || above) return false;
+
+    // Is block already discarded?
+    if (first.second.discard) return true;
+
+    // Flag every fragment, drop its free-list record and add up the block size.
+    size_t total = 0;
+    for (auto it = fragments.begin(); it != fragments.end(); ++it) {
+      discard(it->second);
+      total += it->second.size;
+    }
+
+    // The block no longer counts as in use; let the cache shrink accordingly.
+    in_use_size_ -= total;
+    balance();
+
+    return true;
+  }
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp
new file mode 100644
index 0000000000..bcaef5dd87
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp
@@ -0,0 +1,185 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "small_heap.h"
+
+namespace wsl {
+
+// Links node into the freelist directly after place; place must be free and below node.
+// Assumes node will not be an end of the list (list has guard nodes).
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) {
+  assert(place->first < node->first && "Order violation");
+  assert(isfree(place->second) && "Freelist operation error.");
+  iterator_t next = place->second.next;  // Old successor of place; becomes node's successor.
+  node->second.next = next;
+  node->second.prior = place;
+  place->second.next = node;
+  next->second.prior = node;
+}
+
+// Unlinks node from the freelist and marks it as used; the node stays in the memory map.
+// Assumes node will not be an end of the list (list has guard nodes).
+void SmallHeap::remove(SmallHeap::iterator_t node) {
+  assert(isfree(node->second) && "Freelist operation error.");
+  node->second.prior->second.next = node->second.next;
+  node->second.next->second.prior = node->second.prior;
+  setused(node->second);  // Overwrites next with the "used" marker, so unlink first.
+}
+
+// Coalesces low and high if they are address-adjacent; returns low if merged, otherwise high.
+SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low,
+                                               SmallHeap::memory_t::iterator high) {
+  assert(isfree(low->second) && "Merge with allocated block");
+  assert(isfree(high->second) && "Merge with allocated block");
+
+  if ((char*)low->first + low->second.len != (char*)high->first) return high;  // Not adjacent.
+
+  assert(!islastfree(high->second) && "Illegal merge.");
+
+  low->second.len += high->second.len;
+  low->second.next = high->second.next;
+  high->second.next->second.prior = low;
+
+  memory.erase(high);  // high is absorbed into low and no longer exists as a node.
+  return low;
+}
+
+// Returns the allocation starting at ptr to the heap and coalesces it with adjacent free blocks.
+void SmallHeap::free(void* ptr) {
+  if (ptr == nullptr) return;
+  auto iterator = memory.find(ptr);
+
+  // Check for illegal free: unknown address, guard node, or block already on the freelist.
+  if (iterator == memory.end() || isfree(iterator->second)) {
+    assert(false && "Illegal free.");
+    return;
+  }
+
+  // Return memory to total and link node into free list
+  total_free += iterator->second.len;
+
+  // Could also traverse the free list which might be faster in some cases.
+  auto before = iterator;
+  before--;
+  while (!isfree(before->second)) before--;
+  assert(before->second.next->first > iterator->first && "Inconsistency in small heap.");
+  insertafter(before, iterator);
+
+  // Attempt compaction
+  iterator = merge(before, iterator);
+  merge(iterator, iterator->second.next);
+
+  // Update the low/high boundary tracking (no-op unless ptr came from alloc_high).
+  high.erase(ptr);
+}
+
+void* SmallHeap::alloc(size_t bytes) {
+  // Reject empty requests and requests larger than the total free space.
+  if ((bytes > total_free) || (bytes == 0)) return nullptr;
+
+  iterator_t current;
+
+  // Walk the free list from the lowest address and allocate at the first fitting location.
+  current = firstfree();
+  while (!islastfree(current->second)) {
+    if (bytes <= current->second.len) {
+      // Decrement from total
+      total_free -= bytes;
+
+      // Split node: the tail of the block stays free, the head becomes the allocation.
+      if (bytes != current->second.len) {
+        void* remaining = (char*)current->first + bytes;
+        Node& node = memory[remaining];
+        node.len = current->second.len - bytes;
+        current->second.len = bytes;
+        insertafter(current, memory.find(remaining));
+      }
+
+      remove(current);  // Unlink from the freelist and mark as used.
+      return current->first;
+    }
+    current = current->second.next;
+  }
+  assert(current->second.len == 0 && "Freelist corruption.");  // Must stop on the tail guard.
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+void* SmallHeap::alloc_high(size_t bytes) {
+  // Reject empty requests and requests larger than the total free space.
+  if ((bytes > total_free) || (bytes == 0)) return nullptr;
+
+  iterator_t current;
+
+  // Walk the free list backwards from the highest address and carve from the top of the first fit.
+  current = lastfree();
+  while (!isfirstfree(current->second)) {
+    if (bytes <= current->second.len) {
+      // Decrement from total
+      total_free -= bytes;
+
+      void* alloc;
+      // Split node: the head of the block stays free and linked, the tail becomes the allocation.
+      if (bytes != current->second.len) {
+        alloc = (char*)current->first + current->second.len - bytes;
+        current->second.len -= bytes;
+        Node& node = memory[alloc];
+        node.len = bytes;
+        setused(node);  // New node was never on the freelist, so only the marker is needed.
+      } else {
+        alloc = current->first;
+        remove(current);
+      }
+
+      high.insert(alloc);  // Record the address as a high allocation (see high_split()).
+      return alloc;
+    }
+    current = current->second.prior;
+  }
+  assert(current->second.len == 0 && "Freelist corruption.");  // Must stop on the head guard.
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h
new file mode 100644
index 0000000000..f6e060cb09
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h
@@ -0,0 +1,131 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple first fit memory allocator with eager compaction.  For use with few
+// items (where list iteration is faster than trees).
+// Not thread safe!
+
+#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+
+#include <map>
+#include <set>
+
+#include "utils.h"
+
+namespace wsl {
+
+class SmallHeap {
+ private:
+  struct Node;
+  typedef std::map<void*, Node> memory_t;  // Address-ordered map of all nodes (free, used, guard).
+  typedef memory_t::iterator iterator_t;
+
+  struct Node {
+    size_t len;        // Size of the block in bytes (0 for the guard nodes).
+    iterator_t next;   // Next free node; memory.begin() = used, memory.end() = tail guard.
+    iterator_t prior;  // Previous free node; memory.end() = head guard.
+  };
+
+  SmallHeap(const SmallHeap& rhs) = delete;
+  SmallHeap& operator=(const SmallHeap& rhs) = delete;
+
+  void* const pool;     // Base address of the managed range.
+  const size_t length;  // Size of the managed range in bytes.
+
+  size_t total_free;     // Bytes currently free; adjusted by alloc, alloc_high and free.
+  memory_t memory;
+  std::set<void*> high;  // Addresses returned by alloc_high (sized ctor adds a max sentinel).
+
+  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
+  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
+  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
+  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
+  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
+  __forceinline void setused(Node& node) { node.next = memory.begin(); }
+
+  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
+  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }
+  void insertafter(iterator_t place, iterator_t node);
+  void remove(iterator_t node);
+  iterator_t merge(iterator_t low, iterator_t high);
+
+ public:
+  SmallHeap() : pool(nullptr), length(0), total_free(0) {}  // Empty heap: no nodes, no guards.
+  SmallHeap(void* base, size_t length)
+      : pool(base), length(length), total_free(length) {
+    assert(pool != nullptr && "Invalid base address.");
+    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
+    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");
+
+    Node& start = memory[0];  // Head guard keyed at the null address.
+    Node& node = memory[pool];  // Single free block covering the whole pool.
+    Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull];  // Tail guard keyed at the maximum address.
+
+    start.len = 0;
+    start.next = memory.find(pool);
+    setfirstfree(start);
+
+    node.len = length;
+    node.prior = memory.begin();
+    node.next = --memory.end();  // Last map element, i.e. the tail guard.
+
+    end.len = 0;
+    end.prior = start.next;
+    setlastfree(end);
+
+    high.insert((void*)0xFFFFFFFFFFFFFFFFull);  // Sentinel so high_split() always has a value.
+  }
+
+  void* alloc(size_t bytes);
+  void* alloc_high(size_t bytes);
+  void free(void* ptr);
+
+  void* base() const { return pool; }
+  size_t size() const { return length; }
+  size_t remaining() const { return total_free; }
+  void* high_split() const { return *high.begin(); }  // Lowest entry of high; needs sized ctor.
+};
+
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
new file mode 100644
index 0000000000..c5a2b57c64
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
@@ -0,0 +1,111 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/timer.h"
+
+namespace wsl {
+namespace timer {
+
+accurate_clock::init::init() {
+  accurate_clock::freq = os::AccurateClockFrequency();
+  period_ns = 1e9 / static_cast<double>(accurate_clock::freq);  // Scale applied to raw_now().
+}
+
+// Calibrates the fast clock using the accurate clock.
+fast_clock::init::init() {
+  typedef accurate_clock clock;
+  clock::duration delay(std::chrono::milliseconds(1));  // Initial busy-wait window per sample.
+
+  // Raw tick count and accurate-clock time of the best (shortest) accepted sample.
+  fast_clock::raw_rep min = 0;
+  clock::duration elapsed;
+
+  do {
+    elapsed = clock::duration::max();
+
+    for (int t = 0; t < 10; t++) {
+      fast_clock::raw_rep r1, r2;
+      clock::time_point t0, t1, t2, t3;
+
+      t0 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r1 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t1 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+
+      do {
+        t2 = clock::now();
+      } while (t2 - t1 < delay);  // Busy-wait until the window has elapsed.
+
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r2 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t3 = clock::now();
+
+      // If elapsed time is shorter than last recorded time and both the start
+      // and end times are confirmed correlated then record the clock readings.
+      // This protects against inaccuracy due to thread switching
+      if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
+          ((t3 - t2) * 10 < (t2 - t1))) {
+        elapsed = t3 - t1;
+        min = r2 - r1;
+      }
+    }
+    delay += delay;  // Double the window before the next round.
+  } while (min < 1000);  // Retry until an accepted sample spans at least 1000 raw ticks.
+
+  fast_clock::freq = double(min) / duration_in_seconds(elapsed);
+  fast_clock::period_ps = 1e12 / fast_clock::freq;
+  // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
+  // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
+}
+
+double accurate_clock::period_ns;
+accurate_clock::raw_frequency accurate_clock::freq;
+accurate_clock::init accurate_clock::accurate_clock_init;  // Its constructor fills the two above.
+
+double fast_clock::period_ps;
+fast_clock::raw_frequency fast_clock::freq;
+fast_clock::init fast_clock::fast_clock_init;  // Defined after accurate_clock_init, which it uses.
+}   //  namespace timer
+}   // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h
new file mode 100644
index 0000000000..3012685113
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h
@@ -0,0 +1,173 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_
+#define HSA_RUNTIME_CORE_UTIL_TIMER_H_
+
+#include "core/util/utils.h"
+#include "core/util/os.h"
+#include <chrono>
+#include <time.h>
+#include <type_traits>
+
+namespace wsl {
+namespace timer {
+
+// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of
+// VS 2013.
+template <bool isFloat, bool isSigned>
+struct wide_type {
+  typedef double type;  // Primary template: covers the floating-point representations.
+};
+template <>
+struct wide_type<false, false> {
+  typedef uintmax_t type;  // Unsigned integral representation.
+};
+template <>
+struct wide_type<false, true> {
+  typedef intmax_t type;  // Signed integral representation.
+};
+
+template <typename To, typename Rep, typename Period>
+static __forceinline To
+    duration_cast(const std::chrono::duration<Rep, Period>& d) {
+  typedef typename wide_type<std::is_floating_point<Rep>::value,
+                             std::is_signed<Rep>::value>::type wide;
+  typedef std::chrono::duration<wide, typename To::period> unit_convert_t;
+
+  unit_convert_t temp = std::chrono::duration_cast<unit_convert_t>(d);  // Unit change only.
+  return To(static_cast<typename To::rep>(temp.count()));  // Then narrow the representation.
+}
+// End patch
+
+template <typename Rep, typename Period>
+static __forceinline double duration_in_seconds(
+    std::chrono::duration<Rep, Period> delta) {
+  using fractional_seconds = std::chrono::duration<double>;  // Period defaults to one second.
+  return fractional_seconds(delta).count();
+}
+
+template <typename rep>
+static __forceinline rep duration_from_seconds(double delta) {
+  const std::chrono::duration<double> as_seconds(delta);  // rep names the target duration type.
+  return std::chrono::duration_cast<rep>(as_seconds);
+}
+
+// Provides a C++11 standard clock interface to the os::AccurateClock functions
+class accurate_clock {
+ public:
+  typedef double rep;
+  typedef std::nano period;
+  typedef std::chrono::duration<rep, period> duration;
+  typedef std::chrono::time_point<accurate_clock> time_point;
+
+  static const bool is_steady = true;
+
+  static __forceinline time_point now() {
+    return time_point(duration(raw_now() * period_ns));  // Raw ticks scaled to nanoseconds.
+  }
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  typedef uint64_t raw_rep;
+  typedef uint64_t raw_frequency;
+
+  static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); }
+  static __forceinline raw_frequency raw_freq() { return freq; }
+
+ private:
+  static double period_ns;    // Set to 1e9 / freq by init's constructor.
+  static raw_frequency freq;  // Set from os::AccurateClockFrequency() by init's constructor.
+
+  class init {
+   public:
+    init();
+  };
+  static init accurate_clock_init;  // Static instance whose constructor performs the setup.
+};
+
+// Provides a C++11 standard clock interface to the lowest latency approximate
+// clock
+class fast_clock {
+ public:
+  typedef double rep;
+  typedef std::pico period;
+  typedef std::chrono::duration<rep, period> duration;
+  typedef std::chrono::time_point<fast_clock> time_point;
+
+  static const bool is_steady = true;
+
+  static __forceinline time_point now() {
+    return time_point(duration(raw_now() * period_ps));  // Raw ticks scaled to picoseconds.
+  }
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  typedef uint64_t raw_rep;
+  typedef double raw_frequency;
+
+#if defined(__x86_64__) || defined(_M_X64)
+  static __forceinline raw_rep raw_now() { return __rdtsc(); }
+  static __forceinline raw_frequency raw_freq() { return freq; }  // Calibrated ticks per second.
+#else
+  static __forceinline raw_rep raw_now() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+    return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec));
+  }
+  static __forceinline raw_frequency raw_freq() { return 1.e9; }  // raw_now() counts nanoseconds.
+#endif
+
+ private:
+  static double period_ps;    // Set to 1e12 / freq by init's constructor.
+  static raw_frequency freq;  // Raw ticks per second measured by init's constructor.
+
+  class init {
+   public:
+    init();
+  };
+  static init fast_clock_init;  // Static instance whose constructor runs the calibration.
+};
+}   //  namespace timer
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h
new file mode 100644
index 0000000000..15d61a87e1
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h
@@ -0,0 +1,389 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Generally useful utility functions
+
+#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_
+#define HSA_RUNTIME_CORE_UTIL_UTILS_H_
+
+#include "stdint.h"
+#include "stddef.h"
+#include "stdlib.h"
+#include "stdarg.h"
+#include "unistd.h"
+#include <assert.h>
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <sstream>
+#include <thread>
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <x86intrin.h>  // Kept at global scope so intrinsics are not declared inside wsl.
+#endif
+
+namespace wsl {
+extern FILE* log_file;
+extern uint8_t log_flags[8];
+typedef unsigned int uint;
+typedef uint64_t uint64;
+
+#if defined(__GNUC__)
+
+// 2MB huge page size
+#define GPU_HUGE_PAGE_SIZE    (2 << 20)
+
+// 4KB page size
+#define DEFAULT_GPU_PAGE_SIZE (1 << 12)
+
+#define __forceinline __inline__ __attribute__((always_inline))
+#define __declspec(x) __attribute__((x))
+#undef __stdcall
+#define __stdcall  // __attribute__((__stdcall__))
+#define __ALIGNED__(x) __attribute__((aligned(x)))
+
+void log_printf(const char* file, int line, const char* format, ...);
+
+static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {
+#ifdef _ISOC11_SOURCE
+  return aligned_alloc(alignment, size);  // Note the swapped (alignment, size) argument order.
+#else
+  void *mem = NULL;
+  if (0 != posix_memalign(&mem, alignment, size)) return NULL;  // Nonzero result means failure.
+  return mem;
+#endif
+}
+static __forceinline void _aligned_free(void* ptr) { return free(ptr); }  // Pairs with the above.
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include "intrin.h"
+#define __ALIGNED__(x) __declspec(align(x))
+#if (_MSC_VER < 1800)  // < VS 2013
+static __forceinline unsigned long long int strtoull(const char* str,
+                                                     char** endptr, int base) {
+  return static_cast<unsigned long long>(_strtoui64(str, endptr, base));
+}
+#endif
+#if (_MSC_VER < 1900)  // < VS 2015
+#define thread_local __declspec(thread)
+#endif
+#else
+#error "Compiler and/or processor not identified."
+#endif
+
+#define STRING2(x) #x
+#define STRING(x) STRING2(x)
+
+#define PASTE2(x, y) x##y
+#define PASTE(x, y) PASTE2(x, y)
+
+#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+
+#define LogPrint(flag, format, ...)                                                                \
+  do {                                                                                             \
+    if (hsa_flag_isset64(log_flags, flag))                                                         \
+      wsl::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__);                             \
+  } while (false);
+
+// A macro to disallow the copy and move constructor and operator= functions
+#define DISALLOW_COPY_AND_ASSIGN(TypeName)                                                         \
+  TypeName(const TypeName&) = delete;                                                              \
+  TypeName(TypeName&&) = delete;                                                                   \
+  void operator=(const TypeName&) = delete;                                                        \
+  void operator=(TypeName&&) = delete;
+
+// Runs the stored callable on destruction unless dismissed; copying transfers that duty.
+template <typename lambda>
+class ScopeGuard {
+ public:
+  explicit __forceinline ScopeGuard(const lambda& release)
+      : release_(release), dismiss_(false) {}
+  // Copy-construct the members: closure types are neither default-constructible nor assignable.
+  ScopeGuard(ScopeGuard& rhs) : release_(rhs.release_), dismiss_(rhs.dismiss_) { rhs.Dismiss(); }
+  __forceinline ~ScopeGuard() {
+    if (!dismiss_) release_();
+  }
+  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
+    dismiss_ = rhs.dismiss_;
+    release_ = rhs.release_;
+    rhs.dismiss_ = true;
+    return *this;
+  }
+  __forceinline void Dismiss() { dismiss_ = true; }
+
+ private:
+  lambda release_;
+  bool dismiss_;
+};
+
+template <typename lambda>
+static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
+  // NOTE(review): ScopeGuard has no rvalue-accepting copy ctor; this return relies on elision.
+  return ScopeGuard<lambda>(rel);
+}
+
+#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
+  auto lname = __VA_ARGS__;                        \
+  ScopeGuard<decltype(lname)> sname(lname);
+#define MAKE_SCOPE_GUARD(...)                                   \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
+                          PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
+#define MAKE_NAMED_SCOPE_GUARD(name, ...)                             \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
+                          __VA_ARGS__)
+
+/// @brief: Returns the smaller of two inputs; T must support the ">" operator.
+/// When neither input compares greater, a is returned.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T.
+template <class T>
+static __forceinline T Min(const T& a, const T& b) {
+  return (a > b) ? b : a;
+}
+
+template <class T, class... Arg>
+static __forceinline T Min(const T& a, const T& b, Arg... args) {
+  return Min(a, Min(b, args...));  // Variadic form: folds right to left through the pair form.
+}
+
+/// @brief: Returns the larger of two inputs; T must support the ">" operator.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T.
+template <class T>
+static __forceinline T Max(const T& a, const T& b) {
+  return (b > a) ? b : a;  // When neither input compares greater, a is returned.
+}
+
+template <class T, class... Arg>
+static __forceinline T Max(const T& a, const T& b, Arg... args) {
+  return Max(a, Max(b, args...));  // Variadic form: folds right to left through the pair form.
+}
+
+/// @brief: Functor that deletes the object a pointer refers to.
+/// @param: ptr(Input), pointer to an object allocated with new; a null pointer is a no-op.
+/// @return: void.
+struct DeleteObject {
+  template <typename T>
+  void operator()(const T* ptr) const {
+    delete ptr;
+  }
+};
+
+/// @brief: Checks if a value has at most one bit set, i.e. is a power of two.
+/// Note that 0 also yields true, so callers must exclude 0 themselves if needed.
+/// @param: val(Input), the data to be checked.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsPowerOfTwo(T val) {
+  return (val & (val - 1)) == 0;
+}
+
+/// @brief: Rounds value down to the nearest multiple of alignment.
+/// If value is already a multiple of alignment, it is unchanged.
+/// @param: value(Input), value to be calculated.
+/// @param: alignment(Input), alignment value; must be non-zero (used as a divisor).
+/// @return: T.
+template <typename T>
+static __forceinline T AlignDown(T value, size_t alignment) {
+  return (T)((value / alignment) * alignment);
+}
+
+/// @brief: Pointer overload of AlignDown: rounds the address down to a multiple
+/// of alignment via the integer overload.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignDown(T* value, size_t alignment) {
+  return (T*)AlignDown((intptr_t)value, alignment);
+}
+
+/// @brief: Rounds value up to the nearest multiple of alignment, computed as
+/// AlignDown(value + alignment - 1).
+/// If value is already a multiple of alignment, it is unchanged.
+/// @param: value(Input), value to be calculated.
+/// @param: alignment(Input), alignment value.
+/// @return: T.
+template <typename T>
+static __forceinline T AlignUp(T value, size_t alignment) {
+  return AlignDown((T)(value + alignment - 1), alignment);
+}
+
+/// @brief: Pointer overload of AlignUp: rounds the address up to a multiple of
+/// alignment using byte-wise pointer arithmetic.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignUp(T* value, size_t alignment) {
+  return (T*)AlignDown((intptr_t)((uint8_t*)value + alignment - 1), alignment);
+}
+
+/// @brief: Checks whether the input value is a multiple of alignment, i.e.
+/// whether AlignUp leaves it unchanged.
+/// @param: value(Input), value to be checked.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T value, size_t alignment) {
+  return (AlignUp(value, alignment) == value);
+}
+
+/// @brief: Pointer overload of IsMultipleOf: checks whether the address is a
+/// multiple of alignment.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T* value, size_t alignment) {
+  return (AlignUp(value, alignment) == value);
+}
+
+static __forceinline uint32_t NextPow2(uint32_t value) {
+  if (value == 0) return 1;  // Smallest power of two.
+  uint32_t v = value - 1;    // Subtract first so exact powers of two map to themselves.
+  v |= v >> 1;               // Smear the highest set bit into every lower position.
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return v + 1;  // Wraps to 0 when value is above 2^31.
+}
+
+static __forceinline uint64_t NextPow2(uint64_t value) {
+  if (value == 0) return 1;  // Smallest power of two.
+  uint64_t v = value - 1;    // Subtract first so exact powers of two map to themselves.
+  v |= v >> 1;               // Smear the highest set bit into every lower position.
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v |= v >> 32;
+  return v + 1;  // Wraps to 0 when value is above 2^63.
+}
+
+static __forceinline bool strIsEmpty(const char* str) noexcept { return *str == '\0'; }
+
+static __forceinline std::string& ltrim(std::string& s) {
+  const auto not_space = [](char c) { return !std::isspace<char>(c, std::locale::classic()); };
+  // Drop everything before the first non-whitespace character, in place.
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), not_space));
+  return s;
+}
+
+static __forceinline std::string& rtrim(std::string& s) {
+  const auto not_space = [](char c) { return !std::isspace<char>(c, std::locale::classic()); };
+  // Drop everything after the last non-whitespace character, in place.
+  s.erase(std::find_if(s.rbegin(), s.rend(), not_space).base(), s.end());
+  return s;
+}
+
+static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); }  // In place.
+
+}  // namespace wsl
+
+template <uint32_t lowBit, uint32_t highBit, typename T>
+static __forceinline uint32_t BitSelect(T p) {
+  static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range.");
+  static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range.");
+
+  uintptr_t ptr = p;
+  if (highBit != (sizeof(uintptr_t) * 8 - 1))  // Masking is skipped for the top bit.
+    return (uint32_t)((ptr & ((1ull << (highBit + 1)) - 1)) >> lowBit);  // Bits [highBit:lowBit].
+  else
+    return (uint32_t)(ptr >> lowBit);
+}
+
+inline uint32_t PtrLow16Shift8(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>((addr & 0xFFFFULL) >> 8);  // Address bits [15:8].
+}
+
+inline uint32_t PtrHigh64Shift16(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>((addr & 0xFFFFFFFFFFFF0000ULL) >> 16);  // Truncated to 32 bits.
+}
+
+inline uint32_t PtrLow40Shift8(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>((addr & 0xFFFFFFFFFFULL) >> 8);  // Address bits [39:8].
+}
+
+inline uint32_t PtrHigh64Shift40(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>((addr & 0xFFFFFF0000000000ULL) >> 40);  // Address bits [63:40].
+}
+
+static inline uint8_t Ptr48High8(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint8_t>((addr & 0xFF0000000000ULL) >> 40);  // Address bits [47:40].
+}
+
+static inline uint32_t Ptr48Low32(const void* p) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
+  assert((addr & 0xFFFFFFFFFF00ULL) == addr);  // Must fit in 48 bits and be 256-byte aligned.
+  return static_cast<uint32_t>((addr & 0xFFFFFFFFFFULL) >> 8);  // Address bits [39:8].
+}
+
+inline uint32_t PtrLow32(const void* p) {
+  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));  // Address bits [31:0].
+}
+
+inline uint32_t PtrHigh32(const void* p) {
+#ifdef HSA_LARGE_MODEL
+  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);  // Address bits [63:32].
+#else
+  return 0;  // Without HSA_LARGE_MODEL the upper half is always reported as zero.
+#endif
+}
+
+inline uint32_t HighPart(uint64_t value) {
+  return static_cast<uint32_t>(value >> 32);  // Upper 32 bits.
+}
+
+inline uint32_t LowPart(uint64_t value) {
+  return static_cast<uint32_t>(value & 0xFFFFFFFFull);  // Lower 32 bits.
+}
+
+#include "atomic_helpers.h"
+
+#endif  // HSA_RUNTIME_CORE_UTIL_UTILS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp
new file mode 100644
index 0000000000..b7f2285623
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32  // Are we compiling for windows?
+#define NOMINMAX
+
+#include "core/util/os.h"
+
+#include <algorithm>
+#include <process.h>
+#include <string>
+#include <windows.h>
+
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+#undef Yield
+#undef CreateMutex
+
+namespace wsl {
+namespace os {
+
+static_assert(sizeof(LibHandle) == sizeof(HMODULE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(LibHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(EventHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+
+// Loads the library named by filename with LoadLibrary and returns the module
+// handle reinterpreted as a LibHandle. The result of LoadLibrary is passed
+// through unchanged; no failure check is done here.
+LibHandle LoadLib(std::string filename) {
+  HMODULE module = LoadLibrary(filename.c_str());
+  LibHandle* as_lib_handle = (LibHandle*)&module;
+  return *as_lib_handle;
+}
+
+// Looks up export_name in the library identified by lib via GetProcAddress
+// and returns whatever address that call yields.
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  HMODULE* module = (HMODULE*)&lib;
+  return GetProcAddress(*module, export_name.c_str());
+}
+
+// Releases the library identified by lib with FreeLibrary; the result of that
+// call is ignored.
+void CloseLib(LibHandle lib) {
+  ::HMODULE* module = (::HMODULE*)&lib;
+  FreeLibrary(*module);
+}
+
+// Not implemented for Windows. A real implementation would enumerate the
+// process modules (e.g. with EnumProcessModulesEx).
+//
+// The previous static_assert(false, ...) made this whole translation unit fail
+// to compile whenever _WIN32 is defined; the stub now follows the same
+// runtime "Not implemented" convention as the other stubs in this file:
+// assert in debug builds, abort otherwise. The return is unreachable and only
+// satisfies the signature.
+std::vector<LibHandle> GetLoadedLibs() {
+  assert(false && "Not implemented.");
+  abort();
+  return std::vector<LibHandle>();
+}
+
+// Not implemented for Windows.
+//
+// The previous static_assert(false, ...) made this whole translation unit fail
+// to compile whenever _WIN32 is defined; the stub now follows the same
+// runtime "Not implemented" convention as the other stubs in this file:
+// assert in debug builds, abort otherwise. The return is unreachable and only
+// satisfies the signature.
+std::string GetLibraryName(LibHandle lib) {
+  assert(false && "Not implemented.");
+  abort();
+  return std::string();
+}
+
+// Creates an unnamed Win32 semaphore with an initial count of 0 and a maximum
+// count of LONG_MAX, and returns its handle reinterpreted as a Semaphore.
+//
+// Debug builds assert that creation succeeded; in release builds a NULL handle
+// is returned to the caller unchanged.
+Semaphore CreateSemaphore() {
+  // The Win32 call is qualified with :: so that it resolves to the system API
+  // and not to this zero-argument function of the same (macro-expanded) name.
+  ::HANDLE sem = ::CreateSemaphore(NULL, 0, LONG_MAX, NULL);
+  assert(sem != NULL && "CreateSemaphore failed");
+
+  return *(Semaphore*)&sem;
+}
+
+// Blocks without timeout until the semaphore is signaled.
+// Returns true when the wait completed with WAIT_OBJECT_0, false otherwise.
+bool WaitSemaphore(Semaphore sem) {
+  // Wait on the semaphore that was passed in (the original referenced an
+  // undeclared identifier `lock`).
+  return WaitForSingleObject(*(::HANDLE*)&sem, INFINITE) == WAIT_OBJECT_0;
+}
+
+// Increments the semaphore count by one. The result of ReleaseSemaphore is
+// ignored.
+void PostSemaphore(Semaphore sem) {
+  // Semaphore is an opaque handle-sized value (see the static_asserts above);
+  // reinterpret it as a HANDLE the same way the rest of this file does rather
+  // than dereferencing it.
+  ::ReleaseSemaphore(*(::HANDLE*)&sem, 1, NULL);
+}
+
+// Closes the handle behind the semaphore. Debug builds assert if CloseHandle
+// reports failure; release builds ignore the failure.
+//
+// sem is received by value, so the caller's copy cannot be cleared from here.
+void DestroySemaphore(Semaphore sem) {
+  if (!CloseHandle(*(::HANDLE*)&sem)) {
+    // assert("literal") is always true; `false &&` makes the check fire.
+    assert(false && "CloseHandle() failed");
+  }
+}
+
+// Creates the object backing a Mutex: an unnamed auto-reset event that starts
+// out signaled, so the first wait on it succeeds immediately.
+Mutex CreateMutex() {
+  const bool manual_reset = false;
+  const bool initially_signaled = true;
+  return CreateEvent(NULL, manual_reset, initially_signaled, NULL);
+}
+
+// Non-blocking acquire: waits on the handle with a zero timeout and returns
+// true only if the wait completed with WAIT_OBJECT_0.
+bool TryAcquireMutex(Mutex lock) {
+  ::HANDLE* handle = (::HANDLE*)&lock;
+  const DWORD wait_result = WaitForSingleObject(*handle, 0);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+// Blocking acquire: waits on the handle without timeout and returns true only
+// if the wait completed with WAIT_OBJECT_0.
+bool AcquireMutex(Mutex lock) {
+  ::HANDLE* handle = (::HANDLE*)&lock;
+  const DWORD wait_result = WaitForSingleObject(*handle, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+// Releases the mutex by signaling its underlying event. The result of
+// SetEvent is ignored.
+void ReleaseMutex(Mutex lock) {
+  ::HANDLE* handle = (::HANDLE*)&lock;
+  SetEvent(*handle);
+}
+
+// Closes the handle behind the mutex. The result of CloseHandle is ignored.
+void DestroyMutex(Mutex lock) {
+  ::HANDLE* handle = (::HANDLE*)&lock;
+  CloseHandle(*handle);
+}
+
+// Suspends the calling thread for delay_in_millisecond milliseconds by
+// forwarding to the Win32 ::Sleep.
+void Sleep(int delay_in_millisecond) {
+  ::Sleep(delay_in_millisecond);
+}
+
+// Suspends the calling thread for delayInUs microseconds, at millisecond
+// granularity. The conversion is an integer division, so any value below 1000
+// becomes ::Sleep(0).
+void uSleep(int delayInUs) {
+  const int delay_in_ms = delayInUs / 1000;
+  ::Sleep(delay_in_ms);
+}
+
+// Yields the calling thread by issuing ::Sleep(0).
+void YieldThread() {
+  ::Sleep(0);
+}
+
+// Heap-allocated bundle handed from CreateThread to ThreadTrampoline: the
+// user's entry point together with the argument to call it with.
+// ThreadTrampoline deletes the bundle before invoking the entry point.
+struct ThreadArgs {
+  // Argument passed to entry_function.
+  void* entry_args;
+  // User-supplied thread entry point.
+  ThreadEntry entry_function;
+};
+
+// Start routine given to _beginthreadex. Takes ownership of the ThreadArgs
+// bundle passed in arg: copies out the entry point and its argument, frees the
+// bundle, runs the entry point, then ends the thread with exit code 0.
+unsigned __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* bundle = (ThreadArgs*)arg;
+
+  // Copy both fields out first so the bundle can be freed before user code
+  // starts running.
+  void* user_data = bundle->entry_args;
+  ThreadEntry user_entry = bundle->entry_function;
+  delete bundle;
+
+  user_entry(user_data);
+  _endthreadex(0);
+  return 0;
+}
+
+// Starts a new thread that runs entry_function(entry_argument) with the
+// requested stack size, and returns the thread handle reinterpreted as a
+// Thread.
+//
+// When _beginthreadex fails it returns 0; that value is passed through to the
+// caller as the returned Thread.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size) {
+  ThreadArgs* thread_args = new ThreadArgs();
+  thread_args->entry_args = entry_argument;
+  thread_args->entry_function = entry_function;
+  uintptr_t ret =
+      _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
+  if (ret == 0) {
+    // The trampoline never ran, so nobody else will free the argument bundle.
+    delete thread_args;
+  }
+  return *(Thread*)&ret;
+}
+
+// Closes the thread handle. The result of CloseHandle is ignored.
+void CloseThread(Thread thread) {
+  ::HANDLE* handle = (::HANDLE*)&thread;
+  CloseHandle(*handle);
+}
+
+// Waits without timeout on the thread handle and returns true only if the
+// wait completed with WAIT_OBJECT_0.
+bool WaitForThread(Thread thread) {
+  ::HANDLE* handle = (::HANDLE*)&thread;
+  const DWORD wait_result = WaitForSingleObject(*handle, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+// Waits without timeout on all thread_count handles in threads (wait-all
+// mode) and returns true only if the wait result equals WAIT_OBJECT_0.
+bool WaitForAllThreads(Thread* threads, uint thread_count) {
+  const DWORD wait_result =
+      WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+// Sets the process environment variable env_var_name to env_var_value. The
+// result of SetEnvironmentVariable is ignored.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  const char* name = env_var_name.c_str();
+  const char* value = env_var_value.c_str();
+  SetEnvironmentVariable(name, value);
+}
+
+// Returns the value of the process environment variable env_var_name, or an
+// empty string when the variable does not exist or cannot be read.
+std::string GetEnvVar(std::string env_var_name) {
+  // First call only queries the required buffer size (terminator included);
+  // 0 means the variable was not found.
+  DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0);
+  if (char_count == 0) return "";
+
+  // A std::string buffer replaces the former alloca(), so the size of the
+  // value no longer lands on the stack.
+  std::string value(char_count, '\0');
+  DWORD written =
+      GetEnvironmentVariable(env_var_name.c_str(), &value[0], char_count);
+
+  // On success the second call returns the number of characters stored, not
+  // counting the terminator. A result >= char_count means the variable grew
+  // between the two calls and nothing valid was stored, so report it as
+  // unreadable instead of returning stale or uninitialized bytes.
+  if (written >= char_count) return "";
+
+  value.resize(written);
+  return value;
+}
+
+// Returns lpMaximumApplicationAddress + 1 as reported by GetSystemInfo.
+size_t GetUserModeVirtualMemorySize() {
+  SYSTEM_INFO info = {0};
+  GetSystemInfo(&info);
+
+  const size_t highest_address = (size_t)info.lpMaximumApplicationAddress;
+  return (highest_address + 1);
+}
+
+// Returns the smaller of the total physical memory reported by
+// GlobalMemoryStatusEx and GetUserModeVirtualMemorySize(). Returns 0 when
+// GlobalMemoryStatusEx fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  MEMORYSTATUSEX status = {0};
+  status.dwLength = sizeof(status);
+
+  if (!GlobalMemoryStatusEx(&status)) {
+    return 0;
+  }
+
+  const size_t virtual_size = GetUserModeVirtualMemorySize();
+  const size_t physical_size = static_cast<size_t>(status.ullTotalPhys);
+  return std::min(virtual_size, physical_size);
+}
+
+// Returns the base of the user-mode virtual address range, which this
+// implementation reports as 0.
+uintptr_t GetUserModeVirtualMemoryBase() {
+  const uintptr_t base = 0;
+  return base;
+}
+
+// OS event wrappers
+
+// Creates an unnamed Win32 event. auto_reset selects an auto-reset event
+// (manual-reset otherwise); init_state is the initial signaled state. The
+// handle returned by CreateEvent is passed back unchanged.
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  const BOOL manual_reset = (BOOL)(!auto_reset);
+  const BOOL initially_signaled = (BOOL)init_state;
+  return reinterpret_cast<EventHandle>(
+      CreateEvent(NULL, manual_reset, initially_signaled, NULL));
+}
+
+// Closes the event handle. Returns -1 for a NULL event, otherwise the result
+// of CloseHandle.
+int DestroyOsEvent(EventHandle event) {
+  return (event == NULL) ? -1
+                         : CloseHandle(reinterpret_cast<::HANDLE>(event));
+}
+
+// Waits on the event for at most milli_seconds milliseconds.
+// Returns -1 for a NULL event, 0x14003 when the wait timed out, and otherwise
+// the raw WaitForSingleObject result.
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) return -1;
+
+  const int wait_result =
+      WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds);
+
+  // 0x14003 indicates timeout
+  return (wait_result == WAIT_TIMEOUT) ? 0x14003 : wait_result;
+}
+
+// Signals the event. Returns -1 for a NULL event, otherwise the result of
+// SetEvent.
+int SetOsEvent(EventHandle event) {
+  return (event == NULL) ? -1 : SetEvent(reinterpret_cast<::HANDLE>(event));
+}
+
+// Resets the event to non-signaled. Returns -1 for a NULL event, otherwise
+// the result of ResetEvent.
+int ResetOsEvent(EventHandle event) {
+  return (event == NULL) ? -1 : ResetEvent(reinterpret_cast<::HANDLE>(event));
+}
+
+// Returns the current QueryPerformanceCounter reading. The success flag of
+// the query is not checked.
+uint64_t ReadAccurateClock() {
+  uint64_t ticks;
+  LARGE_INTEGER* out = (LARGE_INTEGER*)&ticks;
+  QueryPerformanceCounter(out);
+  return ticks;
+}
+
+// Returns the QueryPerformanceFrequency value, i.e. the tick rate of the
+// counter read by ReadAccurateClock. The success flag of the query is not
+// checked.
+uint64_t AccurateClockFrequency() {
+  uint64_t ticks_per_second;
+  LARGE_INTEGER* out = (LARGE_INTEGER*)&ticks_per_second;
+  QueryPerformanceFrequency(out);
+  return ticks_per_second;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+SharedMutex CreateSharedMutex() {
+  assert(false && "Not implemented.");
+  abort();
+  return nullptr;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+bool AcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+void ReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+void DestroySharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+uint64_t ReadSystemClock() {
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+uint64_t SystemClockFrequency() {
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+// Not implemented in this file: asserts in debug builds and then aborts.
+// The return statement is unreachable and only satisfies the signature.
+bool ParseCpuID(cpuid_t* cpuinfo) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp
new file mode 100644
index 0000000000..80dc67d44f
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+// Build-identification string: "ROCDXG BUILD ID: " followed by the stringified
+// ROCDXG_VERSION macro. Marked __attribute__((used)) so the symbol is emitted
+// even though nothing in this file references it.
+// NOTE(review): STRING and ROCDXG_VERSION are not defined in this file -
+// presumably supplied by a header or the build system; confirm.
+const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION);
+
+// Reports the kernel interface version exposed by this backend.
+//
+// Writes the fixed version 1.17 into *VersionInfo and returns
+// HSAKMT_STATUS_SUCCESS. VersionInfo is dereferenced without a null check.
+// NOTE(review): CHECK_DXG_OPEN() is defined elsewhere - presumably it returns
+// early with an error status when the device has not been opened; confirm.
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) {
+  CHECK_DXG_OPEN();
+
+  VersionInfo->KernelInterfaceMajorVersion = 1;
+  VersionInfo->KernelInterfaceMinorVersion = 17;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp
new file mode 100644
index 0000000000..d650651e31
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp
@@ -0,0 +1,320 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+/*
+ * Assembles a COPY_DATA packet and copies it into pBuffer.
+ *
+ * pDstAddr supplies the destination address: its low 32 bits shifted right
+ * by 3 go into dst_64b_addr_lo, its high 32 bits into dst_addr_hi. The
+ * remaining arguments are stored verbatim in the packet's selector, cache
+ * policy, count and write-confirm fields.
+ *
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildCopyData(
+  uint64_t  *pDstAddr,
+  void      *pBuffer,
+  uint32_t  dstSel,
+  uint32_t  dstCachePolicy,
+  uint32_t  srcSel,
+  uint32_t  srcCachePolicy,
+  uint32_t  countSel,
+  uint32_t  wrConfirm) {
+  PM4MEC_COPY_DATA packet = {0};
+  const size_t packet_size = sizeof(packet);
+
+  GenerateCmdHeader(&packet, IT_COPY_DATA);
+
+  // Source / destination selection and behavior flags.
+  packet.bitfields2.dst_sel = dstSel;
+  packet.bitfields2.src_sel = srcSel;
+  packet.bitfields2.dst_cache_policy = dstCachePolicy;
+  packet.bitfields2.src_cache_policy = srcCachePolicy;
+  packet.bitfields2.count_sel = countSel;
+  packet.bitfields2.wr_confirm = wrConfirm;
+
+  // Destination address, split into its low and high words.
+  packet.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3);
+  packet.dst_addr_hi = PtrHigh32(pDstAddr);
+
+  memcpy(pBuffer, &packet, packet_size);
+  return packet_size;
+}
+
+/*
+ * Assembles an EVENT_WRITE packet and copies it into pBuffer.
+ * Applications can use the barrier command to ensure their command is
+ * executed only after all other commands have completed their execution.
+ *
+ * eventIndex and eventType are stored verbatim in the packet.
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildBarrier(
+  void      *pBuffer,
+  uint32_t  eventIndex,
+  uint32_t  eventType) {
+  BarrierTemplate packet = {0};
+  const size_t packet_size = sizeof(packet);
+
+  GenerateCmdHeader(&packet.event_write, IT_EVENT_WRITE);
+  packet.event_write.bitfields2.event_index = eventIndex;
+  packet.event_write.bitfields2.event_type = eventType;
+
+  memcpy(pBuffer, &packet, packet_size);
+  return packet_size;
+}
+
+/**
+ * Assembles a WRITE_DATA packet and copies it into pBuffer.
+ * The packet writes the 64-bit write_value (two DWORDs) to the GPU memory
+ * address write_addr, which must be 4-byte aligned (asserted in debug
+ * builds).
+ *
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildWriteData64Command(
+  void*     pBuffer,
+  uint64_t* write_addr,
+  uint64_t  write_value) {
+  WriteDataTemplate packet = {0};
+  GenerateCmdHeader(&packet.write_data, IT_WRITE_DATA);
+
+  // The destination address is encoded in DWORD units, so it has to be
+  // DWORD aligned.
+  uint64_t dst = uintptr_t(write_addr);
+  assert(!(dst & 0x3) && "WriteData address must be 4 byte aligned");
+
+  // Wait for write confirmation and bypass the cache.
+  packet.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
+  packet.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass;
+
+  // Increment the address when more than one DWORD is written.
+  packet.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
+  // The destination is memory.
+  packet.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory;
+
+  // Destination address, split into its low (DWORD index) and high words.
+  packet.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
+  packet.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);
+
+  // Payload.
+  packet.write_data.write_data_value = write_value;
+
+  const size_t packet_size = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_size);
+  return packet_size;
+}
+
+/*
+ * Builds an ACQUIRE_MEM packet and copies it into pBuffer.
+ * Users can submit this command to invalidate GPU caches - L1 and/or L2.
+ *
+ * major selects the packet layout: 9 uses the gfx9 template, 10 and above
+ * use the gfx10 template. For any other value nothing is written to pBuffer
+ * and 0 is returned (previously an uninitialized value was returned).
+ *
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildAcquireMem(
+  uint8_t major,
+  void    *pBuffer) {
+  // Stays 0 when major matches neither branch below.
+  size_t ret = 0;
+  if (major == 9) {
+    gfx9::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    // Specify the size of memory to invalidate. Size is
+    // specified in terms of 256 byte chunks. A coher_size
+    // of 0xFFFFFFFF actually specifies 0xFFFFFFFF00 (40 bits)
+    // of memory. The field coher_size_hi specifies memory from
+    // bits 40-64 for a total of 256 TB.
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    // Specify the address of memory to invalidate. The
+    // address must be 256 byte aligned.
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    // Specify the poll interval for determining if operation is complete
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields2.coher_cntl =
+      (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK
+      (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK
+      (1 << 28);  // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  } else if (major >= 10) {
+    // Same size/base/poll settings as the gfx9 path; the cache actions are
+    // expressed through gcr_cntl instead of coher_cntl.
+    gfx10::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields8.gcr_cntl =
+      (1 << 16) | // SEQ = FORWARD
+      (1 << 15) | // GL2_WB
+      (1 << 14) | // GL2_INV
+      (1 << 9) |  // GL1_INV
+      (1 << 8) |  // GLV_INV
+      (1 << 7) |  // GLK_INV
+      (1 << 6) |  // GLK_WB
+      (1 << 5) |  // GLM_INV
+      (1 << 4) |  // GLM_WB
+      (1 << 0);   // GLI_INV = ALL
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  }
+
+  return ret;
+}
+
+/*
+ * Assembles a packet that sets the scratch base registers, starting at
+ * mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO, and copies it into pBuffer.
+ *
+ * pScratchBase is encoded with Ptr48Low32 / Ptr48High8.
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildScratch(
+  void  *pScratchBase,
+  void  *pBuffer) {
+  struct SetScratchTemplate packet = {0};
+  const size_t packet_size = sizeof(packet);
+
+  GenerateSetShRegHeader(&packet, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
+  packet.scratch_lo = Ptr48Low32(pScratchBase);
+  packet.scratch_hi = Ptr48High8(pScratchBase);
+
+  memcpy(pBuffer, &packet, packet_size);
+  return packet_size;
+}
+
+/**
+ * Sets the compute shader parameter for gfx11 and above.
+ *
+ * Builds a packet that writes mmCOMPUTE_PGM_RSRC3 with only bit 31 set and
+ * copies it into pBuffer. Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildComputeShaderParams(void  *pBuffer) {
+  struct DispatchProgramResourceRegs compute_shader_params = {0};
+
+  GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
+  // IMAGE_OP: Indicates the compute program contains an image op
+  // instruction and should be stalled by its WAIT_SYNC fence.
+  // The shift is done on an unsigned constant: `1 << 31` would shift into the
+  // sign bit of a signed int. The resulting bit pattern is unchanged.
+  compute_shader_params.compute_pgm_rsrc3 = (1u << 31);
+
+  memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
+
+  return sizeof(compute_shader_params);
+}
+
+
+/*
+ * Builds a dispatch packet.
+ *
+ * Fills a DispatchTemplate from *pInfo (the AQL packet, kernel object, queue
+ * and scratch settings it references) and copies it into pBuffer. The
+ * template is a sequence of register-write groups followed by a
+ * DISPATCH_DIRECT command.
+ *
+ * Side effect on pInfo: for every user-data slot that holds a scratch-related
+ * value, the slot's byte offset inside DispatchTemplate is appended to
+ * pInfo->scratchBaseOffset and pInfo->offsetCnt is incremented.
+ *
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildDispatch(
+  struct DispatchInfo *pInfo,
+  void                *pBuffer) {
+  DispatchTemplate dispatch = {0};
+
+  // Workgroup dimensions, taken from the AQL packet.
+  GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
+  dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x;
+  dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y;
+  dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z;
+
+  // TODO: Add AQL packet index for debugger
+  // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
+  // Kernel entry address, encoded with Ptr48Low32 / Ptr48High8.
+  GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO);
+  dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
+  dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
+
+  // Program resource words come from the kernel object; rsrc1 gets an extra
+  // bit when major == 11, and rsrc2 is OR-ed with ldsBlks shifted to bit 15.
+  GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
+  dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1;
+  if (pInfo->major == 11) {
+    AMD_HSA_BITS_SET(dispatch.program_resource_regs.compute_pgm_rsrc1,
+        AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1);
+  }
+  dispatch.program_resource_regs.compute_pgm_rsrc2 =
+    (pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2;
+
+  // Resource limits and static thread management masks are fixed constants;
+  // the tmpring size is taken from the queue.
+  GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
+  dispatch.resource_regs.compute_resource_limits = 0x3ff;
+  dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+
+  dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size;
+
+  GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
+
+  // User-data slots are filled in the fixed order below; sgpr_no is the index
+  // of the next free slot. Each group is emitted only when the matching
+  // enable bit is set in the kernel's kernel_code_properties.
+  uint32_t sgpr_no = 0;
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+    assert(pInfo->major < 11);
+    // Record where this scratch descriptor lives inside the template.
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
+      sgpr_no * sizeof(uint32_t);
+
+    // Four slots: three descriptor words from the queue plus pInfo->srd.
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[0];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[1];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[2];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->srd;
+  }
+  // Two slots: address of the AQL packet (low, high).
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket);
+  }
+  // Two slots: address of the queue structure (low, high).
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue);
+  }
+  // Two slots: kernarg address from the AQL packet (low, high).
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrLow32(pInfo->pPacket->kernarg_address);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrHigh32(pInfo->pPacket->kernarg_address);
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
+    // This feature may be enabled as a side effect of indirect calls.
+    // However, the compiler team confirmed that the dispatch id itself is not used,
+    // so safe to send 0 for each dispatch.
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
+    assert(pInfo->major < 11);
+    // Record where this scratch base lives inside the template.
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
+      sgpr_no * sizeof(uint32_t);
+
+    // Two slots: scratch base address (low, high).
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrLow32(pInfo->pScratchBase);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrHigh32(pInfo->pScratchBase);
+  }
+  // One slot: per-wave scratch size divided by the wave width (32 or 64).
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+		       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64);
+  }
+
+  // The dispatch command itself: initiator flags plus the grid dimensions
+  // from the AQL packet.
+  GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT);
+  dispatch.dispatch_direct.dispatch_initiator =
+    (1 << 0) | // COMPUTE_SHADER_EN
+    (1 << 2) | // FORCE_START_AT_000
+    (1 << 5); // USE_THREAD_DIMENSIONS
+  if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN
+  dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x;
+  dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y;
+  dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z;
+  memcpy(pBuffer, &dispatch, sizeof(dispatch));
+
+  return sizeof(dispatch);
+}
+
+/*
+ * Assembles an ATOMIC_MEM packet and copies it into pBuffer.
+ * Users can submit this command to perform atomic operations.
+ *
+ * pAddr is the target address, atomic the operation selector, cachePolicy
+ * the cache policy and srcData the 64-bit source operand; each is stored in
+ * the corresponding packet field.
+ *
+ * Returns the number of bytes written to pBuffer.
+ */
+size_t CmdUtil::BuildAtomicMem(
+  uint64_t  *pAddr,
+  uint32_t  atomic,
+  void      *pBuffer,
+  uint32_t  cachePolicy,
+  uint64_t  srcData) {
+  AtomicTemplate packet = {0};
+  const size_t packet_size = sizeof(packet);
+
+  GenerateCmdHeader(&packet.atomic, IT_ATOMIC_MEM);
+
+  // Target address, split into its low and high words.
+  packet.atomic.addr_lo = PtrLow32(pAddr);
+  packet.atomic.addr_hi = PtrHigh32(pAddr);
+
+  // Operation and cache behavior.
+  packet.atomic.bitfields2.atomic = atomic;
+  packet.atomic.bitfields2.cache_policy = cachePolicy;
+
+  // Source operand, split into its low and high words.
+  packet.atomic.src_data_lo = LowPart(srcData);
+  packet.atomic.src_data_hi = HighPart(srcData);
+
+  memcpy(pBuffer, &packet, packet_size);
+  return packet_size;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
new file mode 100644
index 0000000000..f51af85404
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
@@ -0,0 +1,780 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cinttypes>
+#include <bitset>
+
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+
+namespace wsl {
+namespace thunk {
+
+// Out-of-class definition of the static constant (0x1000 = 4096).
+// NOTE(review): judging by the name this is a count of AQL frames for the
+// command buffer - confirm against its users in device.h / InitCmdbufInfo().
+const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000;
+
+// Constructs the device wrapper for the given adapter handle / LUID / node id.
+//
+// Zeroes device_info_ and then runs the initialization steps in order. The
+// boolean results of the steps that return one (e.g. CreateDevice,
+// CreatePagingQueue, QuerySegmentInfo) are not checked here, so a failure in
+// one step does not stop the following ones.
+WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id)
+  : adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  ParseDeviceInfo();
+  // The device must exist before the steps below: SetPowerOptimization and
+  // CreatePagingQueue both pass device_ to the kernel.
+  CreateDevice();
+  SetPowerOptimization(false);
+  CreatePagingQueue();
+  InitCmdbufInfo();
+  QuerySegmentInfo();
+}
+
+// Tears the device down: destroys the paging queue, calls
+// SetPowerOptimization(true) (the constructor called it with false), destroys
+// the device and finally releases the device info. Results of the individual
+// steps are not checked.
+WDDMDevice::~WDDMDevice() {
+  DestroyPagingQueue();
+  // Must run before DestroyDevice(): the escape call uses device_.
+  SetPowerOptimization(true);
+  DestroyDevice();
+
+  DestroyDeviceInfo();
+}
+
+// Issues D3DKMTQueryAdapterInfo for the given adapter and query type.
+// data / size describe the caller's private-data buffer handed to the call.
+// Returns the NTSTATUS of the call.
+static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type,
+                                 void *data, int size)
+{
+  D3DKMT_QUERYADAPTERINFO query = {0};
+
+  query.hAdapter = adapter;
+  query.Type = type;
+  query.pPrivateDriverData = data;
+  query.PrivateDriverDataSize = size;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTQueryAdapterInfo(&query));
+  return status;
+}
+
+// Rebuilds segment_infos_ from D3DKMTQueryStatistics.
+//
+// First asks the adapter how many memory segments it has, then queries each
+// segment and records its id, type, system-memory flag, aperture flag and
+// commit limit. Returns false as soon as any query fails; entries collected
+// up to that point stay in segment_infos_.
+bool WDDMDevice::QuerySegmentInfo()
+{
+  segment_infos_.clear();
+
+  // Step 1: number of segments on this adapter.
+  D3DKMT_QUERYSTATISTICS adapter_stats = {};
+  adapter_stats.Type = D3DKMT_QUERYSTATISTICS_ADAPTER;
+  adapter_stats.AdapterLuid = adapter_luid_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryStatistics(&adapter_stats));
+  if (status != STATUS_SUCCESS) {
+    pr_err("Failed to query adapter info\n");
+    return false;
+  }
+
+  const uint32_t total_segments =
+    adapter_stats.QueryResult.AdapterInformation.NbSegments;
+  pr_debug("Total Segments: %u\n", total_segments);
+
+  // Step 2: one statistics query per segment.
+  for (uint32_t seg_id = 0; seg_id < total_segments; ++seg_id) {
+    D3DKMT_QUERYSTATISTICS segment_stats = {};
+    segment_stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    segment_stats.AdapterLuid = adapter_luid_;
+    segment_stats.QuerySegment.SegmentId = seg_id;
+
+    status = DXCORE_CALL(D3DKMTQueryStatistics(&segment_stats));
+    if (status != STATUS_SUCCESS) {
+      pr_err("Failed to query segment %u info\n", seg_id);
+      return false;
+    }
+
+    auto& details = segment_stats.QueryResult.SegmentInformation;
+
+    SegmentInfo entry;
+    entry.segment_id = seg_id;
+    entry.segment_type = details.SegmentProperties.SegmentType;
+    entry.system_memory = details.SegmentProperties.SystemMemory;
+    entry.aperture = details.Aperture;
+    entry.commit_limit = details.CommitLimit;
+
+    segment_infos_.push_back(entry);
+  }
+
+  return true;
+}
+
+// Looks up the first entry of segment_infos_ whose type equals segment_type.
+// On a match stores its id in segment_id and returns true; otherwise logs an
+// error, leaves segment_id untouched and returns false.
+bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type,
+                              uint32_t &segment_id)
+{
+  for (auto it = segment_infos_.begin(); it != segment_infos_.end(); ++it) {
+    if (it->segment_type != segment_type)
+      continue;
+
+    segment_id = it->segment_id;
+    return true;
+  }
+
+  pr_err("Failed to get segment id for type %u\n", segment_type);
+  return false;
+}
+
+/* Returns the number of bytes of GPU memory currently available.
+ *
+ * Local heap (dedicated GPU memory) includes the visible heap and the
+ * invisible heap. Non-local heap refers to shared GPU memory, which is system
+ * memory.
+ *
+ * dGPU: LocalHeapSize() minus the resident bytes of the CPU-visible segment
+ *       and, when an invisible heap exists, of the invisible segment.
+ * APU:  NonLocalHeapSize() minus the resident bytes of the system-memory
+ *       segment.
+ *
+ * Returns 0 when the paging fence wait fails or the required segment cannot
+ * be found (previously HSA_STATUS_ERROR was returned and would have been read
+ * by the caller as a byte count). A failed statistics query counts as zero
+ * bytes in use. The result is clamped at 0 so that the unsigned subtraction
+ * cannot wrap around.
+ */
+uint64_t WDDMDevice::VramAvail(void) {
+  D3DKMT_QUERYSTATISTICS stats;
+  NTSTATUS ret;
+  uint64_t usedVis = 0;
+  uint64_t usedInv = 0;
+  uint64_t usedNonLocal = 0;
+  uint32_t segmentId = 0;
+
+  // Wait for the paging fence to reach the last recorded value before
+  // sampling residency.
+  uint64_t value = page_fence_value_.load();
+  if(!CpuWait(&page_syncobj_, &value, 1, false))
+    return 0;
+
+  if (IsDgpu()) {
+    // local cpu-visible memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId))
+      return 0;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedVis = stats.QueryResult.SegmentInformation.BytesResident;
+
+    // local invisible memory
+    if (device_info_.local_invisible_heap_size) {
+      // The invisible segment is taken to be the one right after the visible
+      // segment. The query previously hard-coded segment 1, which only
+      // matches when the visible segment has id 0.
+      segmentId++;
+      memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+      stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+      stats.AdapterLuid = adapter_luid_;
+      stats.QuerySegment.SegmentId = segmentId;
+
+      ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+      if (ret == 0)
+        usedInv = stats.QueryResult.SegmentInformation.BytesResident;
+    }
+
+    const uint64_t total = LocalHeapSize();
+    const uint64_t used = usedVis + usedInv;
+    return (total > used) ? (total - used) : 0;
+  } else {
+    // APU - NonLocal memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId))
+      return 0;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedNonLocal = stats.QueryResult.SegmentInformation.BytesResident;
+
+    const uint64_t total = NonLocalHeapSize();
+    return (total > usedNonLocal) ? (total - usedNonLocal) : 0;
+  }
+}
+
+// Creates the kernel device on adapter_ and stores its handle in device_.
+// Returns true on success; on failure logs the status and returns false,
+// leaving device_ unchanged.
+bool WDDMDevice::CreateDevice(void) {
+  D3DKMT_CREATEDEVICE create_args = {0};
+  create_args.hAdapter = adapter_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTCreateDevice(&create_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  device_ = create_args.hDevice;
+  return true;
+}
+
+// Destroys the kernel device identified by device_.
+// Returns true on success; on failure logs the status and returns false.
+bool WDDMDevice::DestroyDevice(void) {
+  D3DKMT_DESTROYDEVICE destroy_args = {0};
+  destroy_args.hDevice = device_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTDestroyDevice(&destroy_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a normal-priority paging queue on device_ and records its handle,
+// its sync object and the CPU address of its fence. The locally tracked
+// paging fence value is reset to 0. Returns false (after logging) on failure.
+bool WDDMDevice::CreatePagingQueue(void) {
+  D3DKMT_CREATEPAGINGQUEUE args = {0};
+  args.hDevice = device_;
+  args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTCreatePagingQueue(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  page_queue_ = args.hPagingQueue;
+  page_syncobj_ = args.hSyncObject;
+  page_fence_addr_ = (uint64_t *)args.FenceValueCPUVirtualAddress;
+  page_fence_value_ = 0;
+  return true;
+}
+
+// Destroys the paging queue referenced by page_queue_. Returns true on
+// success; logs the status and returns false otherwise.
+bool WDDMDevice::DestroyPagingQueue(void) {
+  D3DDDI_DESTROYPAGINGQUEUE args = {0};
+  args.hPagingQueue = page_queue_;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyPagingQueue(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Sends the driver-private "power optimization" escape to the KMD.
+//
+// restore: forwarded unchanged to thunk_proxy::FillinPowerOptPrivData, which
+//          builds the private payload.
+//
+// The escape status is only logged at debug level; the function has no
+// return value. If the private buffer cannot be allocated the escape is
+// skipped and an error is logged.
+void WDDMDevice::SetPowerOptimization(bool restore) {
+  const int priv_size = thunk_proxy::GetPowerOptPrivDataSize();
+
+  // calloc gives a zeroed buffer; check it at runtime because assert() is
+  // compiled out under NDEBUG and would let a NULL pointer through.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return;
+  }
+  thunk_proxy::FillinPowerOptPrivData(priv_data, restore);
+
+  D3DKMT_ESCAPE d3dkmt_escape;
+  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
+
+  d3dkmt_escape.hAdapter              = adapter_;
+  d3dkmt_escape.hDevice               = device_;
+  d3dkmt_escape.hContext              = 0; //KMD only use device to identify the process
+  d3dkmt_escape.Type                  = D3DKMT_ESCAPE_DRIVERPRIVATE;
+  d3dkmt_escape.pPrivateDriverData    = priv_data;
+  d3dkmt_escape.PrivateDriverDataSize = priv_size;
+  d3dkmt_escape.Flags.HardwareAccess  = true;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape));
+  pr_debug("status %d, restore %d\n", status, restore);
+  free(priv_data);
+}
+
+// Raises page_fence_value_ to fence_value if fence_value is larger than the
+// value currently stored; otherwise leaves it untouched.
+void WDDMDevice::UpdatePageFence(uint64_t fence_value) {
+  uint64_t observed = page_fence_value_.load();
+
+  // A failed compare_exchange_weak refreshes `observed` with the stored
+  // value, so the comparison is re-evaluated before every retry.
+  while (observed < fence_value &&
+         !page_fence_value_.compare_exchange_weak(observed, fence_value)) {
+  }
+}
+
+// Allocates a GpuMemory object bound to this device and initializes it.
+//
+// create_info: creation parameters; when dmabuf_fd > 0 the memory is imported
+//              from that handle, otherwise it is created via GpuMemory::Init.
+// gpu_mem:     receives the new object on success, nullptr on failure.
+// gpu_va:      forwarded to ImportPhysicalHandle on the import path only.
+//
+// Returns the ErrorCode produced by the import/init call.
+ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info,
+                                        GpuMemory **gpu_mem, gpusize *gpu_va) {
+  *gpu_mem = nullptr;
+
+  GpuMemory *candidate = new GpuMemory(this);
+  const ErrorCode status = (create_info.dmabuf_fd > 0)
+                               ? candidate->ImportPhysicalHandle(create_info, gpu_va)
+                               : candidate->Init(create_info);
+
+  if (status != ErrorCode::Success) {
+    delete candidate;
+    return status;
+  }
+
+  *gpu_mem = candidate;
+  return status;
+}
+
+// Locks the given allocation through D3DKMTLock2 and returns the pointer the
+// call reports in pData. Returns a null pointer (after logging) on failure.
+void *WDDMDevice::Lock(D3DKMT_HANDLE handle) {
+  D3DKMT_LOCK2 args = {0};
+  args.hDevice = device_;
+  args.hAllocation = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTLock2(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return NULL;
+  }
+
+  return args.pData;
+}
+
+// Releases a lock previously taken on the allocation via D3DKMTUnlock2.
+// Returns true on success; logs the status and returns false otherwise.
+bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
+  D3DKMT_UNLOCK2 args = {0};
+  args.hDevice = device_;
+  args.hAllocation = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTUnlock2(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a virtual-addressing context for the given engine.
+//
+// engine: engine identifier; translated to a node ordinal by EngineOrdinal.
+// handle: receives the new context handle on success.
+//
+// Returns false when the engine has no ordinal (negative result), when the
+// private-data buffer cannot be allocated, or when the kernel call fails
+// (the failing status is logged).
+bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
+  const int ordinal = EngineOrdinal(engine, &device_info_);
+  if (ordinal < 0)
+    return false;
+
+  const int priv_size = thunk_proxy::GetContextPrivDataSize();
+
+  // Zeroed buffer; checked at runtime because assert() vanishes under NDEBUG.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw());
+
+  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
+  args.hDevice = device_;
+  args.EngineAffinity = 1 << 0;
+  args.NodeOrdinal = ordinal;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
+
+  // HWS-enabled engines request hardware-queue support; for the others the
+  // GPU-timeout policy comes from the thunk proxy.
+  if (IsHwsEnabled(engine))
+    args.Flags.HwQueueSupported = 1;
+  else
+    args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_);
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args));
+
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  *handle = args.hContext;
+  return true;
+}
+
+// Destroys the context identified by handle. Returns true on success; logs
+// the status and returns false otherwise.
+bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYCONTEXT args = {0};
+  args.hContext = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyContext(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Queues a GPU-side wait on the queue's context for `count` sync objects.
+//
+// queue:    supplies the context that will wait.
+// syncobjs: array of `count` sync object handles.
+// values:   array of `count` monitored-fence values, one per sync object.
+//
+// Returns true on success; logs the status and returns false otherwise.
+bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+                         uint64_t *values, int count) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = queue->context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = values;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Queues a GPU-side signal of `count` sync objects on the given context.
+//
+// context:  context that performs the signal.
+// syncobjs: array of `count` sync object handles.
+// value:    array of `count` monitored-fence values to signal.
+//
+// Returns true on success; logs the status and returns false otherwise.
+bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+                           uint64_t *value, int count) {
+  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = value;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Waits from the CPU on `count` sync objects of this device.
+//
+// syncobjs: array of `count` sync object handles.
+// value:    array of `count` fence values to wait for.
+// wait_any: copied into Flags.WaitAny of the kernel request.
+//
+// Returns true on success; logs the status and returns false otherwise.
+bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+                         int count, bool wait_any) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU args = {0};
+  args.hDevice = device_;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.FenceValueArray = value;
+  args.Flags.WaitAny = wait_any;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Blocks on the paging-queue sync object until it reaches the fence value
+// currently recorded in page_fence_value_. Returns CpuWait's result.
+bool WDDMDevice::WaitOnPagingFenceFromCpu() {
+  // Snapshot the target once; CpuWait needs an addressable value.
+  uint64_t target_fence = page_fence_value_.load();
+
+  return CpuWait(&page_syncobj_, &target_fence, 1, false);
+}
+
+// Creates a monitored-fence sync object on this device.
+//
+// handle: receives the sync object handle on success.
+// addr:   receives the CPU virtual address of the fence value on success.
+//
+// Returns true on success; logs the status and returns false otherwise.
+bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
+  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 args = {0};
+  args.hDevice = device_;
+  args.Info.Type = D3DDDI_MONITORED_FENCE;
+  args.Info.MonitoredFence.EngineAffinity = 1 << 0;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  *handle = args.hSyncObject;
+  *addr = (uint64_t *)args.Info.MonitoredFence.FenceValueCPUVirtualAddress;
+  pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
+           args.Info.MonitoredFence.FenceValueCPUVirtualAddress,
+           args.Info.MonitoredFence.FenceValueGPUVirtualAddress);
+  return true;
+}
+
+// Destroys the sync object identified by handle. A failure is logged but not
+// reported to the caller.
+void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT args = {0};
+  args.hSyncObject = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroySynchronizationObject(&args));
+  if (status == STATUS_SUCCESS)
+    return;
+
+  pr_err("fail %x\n", status);
+}
+
+// Computes the per-AQL-packet command-buffer frame size and the total
+// command-buffer size from the sizes of the PM4 templates a frame contains.
+//
+// Results are stored in cmdbuf_aql_frame_size_ (16-byte aligned) and
+// cmdbuf_size_ (4 KiB aligned, cmdbuf_aql_frame_num_ frames).
+//
+// NOTE(review): for device_info_.major < 9 no base size is assigned here, so
+// the additions below build on whatever cmdbuf_aql_frame_size_ already held
+// -- confirm it is initialized elsewhere or that such devices never get here.
+void WDDMDevice::InitCmdbufInfo(void) {
+  const auto gfx_major = device_info_.major;
+
+  // Generation-specific base: two acquire-mem packets.
+  if (gfx_major == 9)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
+  else if (gfx_major >= 10)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
+
+  // gfx11+ frames additionally carry a scratch setup and the program
+  // resource registers (BuildComputeShaderParams).
+  if (gfx_major >= 11)
+    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate) + sizeof(DispatchProgramResourceRegs);
+
+  // Packets common to every generation, plus a 128-byte safety margin to
+  // account for alignment and future additions.
+  cmdbuf_aql_frame_size_ +=
+    2 * sizeof(PM4MEC_COPY_DATA) +
+    2 * sizeof(BarrierTemplate) +
+    sizeof(DispatchTemplate) +
+    2 * sizeof(AtomicTemplate) +
+    128;
+
+  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);
+  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
+}
+
+// Returns how many 512-byte blocks are needed to hold the packet's
+// group_segment_size, rounding up.
+uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
+  static const uint32_t kBlockBytes = 512;
+  const uint32_t group_bytes = pkt->group_segment_size;
+
+  // Ceiling division.
+  return (group_bytes + kBlockBytes - 1) / kBlockBytes;
+}
+
+// Enumerates the D3DKMT adapters and appends one WDDMDevice to `devices` for
+// every adapter whose vendor ID is 0x1002 and whose device ID is accepted by
+// thunk_proxy::QueryAdapterSupported.
+//
+// devices: output vector. New devices are appended; each receives
+//          devices.size() + 1 (at the time of creation) as its third
+//          constructor argument.
+//
+// Returns STATUS_SUCCESS when enumeration completes (including the case of
+// zero adapters), otherwise the failing NTSTATUS. On a failure after device
+// creation has started, every device in `devices` is deleted and the vector
+// is emptied so the caller is never left holding dangling pointers.
+NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices)
+{
+  // First call with no buffer only reports the adapter count.
+  D3DKMT_ENUMADAPTERS2 args = {0};
+  NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  if (!args.NumAdapters) {
+    return STATUS_SUCCESS;
+  }
+
+  D3DKMT_ADAPTERINFO *info = new D3DKMT_ADAPTERINFO[args.NumAdapters];
+  if (!info)
+    return STATUS_NO_MEMORY;
+
+  // Second call fills the adapter array.
+  args.pAdapters = info;
+  ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    goto err_out0;
+
+  for (int i = 0; i < args.NumAdapters; i++) {
+    D3DKMT_QUERY_DEVICE_IDS query = {0};
+
+    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS,
+                           &query, sizeof(query));
+    if (ret != STATUS_SUCCESS)
+      goto err_out1;
+
+    // Only adapters with vendor ID 0x1002 are considered.
+    if (query.DeviceIds.VendorID != 0x1002)
+      continue;
+
+    if (!thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID))
+      continue;
+
+    auto device = new WDDMDevice(
+      info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1);
+    if (!device) {
+      // Make sure the failure is reported instead of the STATUS_SUCCESS left
+      // in `ret` by the preceding query.
+      ret = STATUS_NO_MEMORY;
+      goto err_out1;
+    }
+    devices.push_back(device);
+  }
+
+  delete[] info;
+  return STATUS_SUCCESS;
+
+ err_out1:
+  for (auto *device : devices)
+    delete device;
+  // Drop the now-dangling pointers from the caller's vector.
+  devices.clear();
+ err_out0:
+  delete[] info;
+  return ret;
+}
+
+// Clears device_info_ and asks the thunk proxy to populate it from adapter_.
+// Returns the proxy's success indication.
+bool WDDMDevice::ParseDeviceInfo() {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  const bool parsed = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_);
+  return parsed;
+}
+
+// Releases the adapter_info buffer held by device_info_. The pointer is
+// cleared afterwards so that a repeated call (or a later free elsewhere) is
+// a harmless free(nullptr) rather than a double free.
+void WDDMDevice::DestroyDeviceInfo() {
+  free(device_info_.adapter_info);
+  device_info_.adapter_info = nullptr;
+}
+
+// Queries the GPU/CPU clock calibration pair for the compute engine's node.
+//
+// gpu: if non-null, receives the GPU clock counter on success.
+// cpu: if non-null, receives the CPU clock counter on success.
+//
+// On any failure (no node ordinal for the compute engine, or the kernel call
+// fails) the outputs are left untouched and the problem is logged at debug
+// level only.
+void WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
+
+  const uint32_t engine = GetComputeEngine();
+  const int ordinal = EngineOrdinal(engine, &device_info_);
+  // A negative ordinal means the engine could not be resolved (CreateContext
+  // treats it the same way); do not hand it to the kernel as a node index.
+  if (ordinal < 0) {
+    pr_debug("no node ordinal for engine %u\n", engine);
+    return;
+  }
+
+  D3DKMT_QUERYCLOCKCALIBRATION args = {0};
+
+ /* LDA (Linked Display Adapter)
+  * In the LDA design multiple physical GPUs are linked together to be controlled
+  * as a single object from the point of view of the power manager, GPU scheduler
+  * and GPU memory manager. The physical GPUs are represented by a single logical
+  * adapter object: there is a single DXGADAPTER object and a single KMD adapter
+  * object.
+  *
+  * PhysicalAdapterIndex is set to 0 by default, i.e. non-LDA mode.
+  */
+  args.hAdapter = adapter_;
+  args.NodeOrdinal = ordinal;
+  args.PhysicalAdapterIndex = 0;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&args));
+  if (status) {
+    pr_debug("status %d \n", status);
+    return;
+  }
+
+  if (gpu)
+    *gpu = args.ClockData.GpuClockCounter;
+
+  if (cpu)
+    *cpu = args.ClockData.CpuClockCounter;
+}
+
+// Creates the kernel context for a queue, allocates a system-memory command
+// buffer when the queue does not bring its own (cmdbuf_addr == 0), and then
+// runs queue->Init().
+//
+// Returns true on success. On failure everything created here is released:
+// the context is destroyed and, if this call allocated the command buffer,
+// it is deleted and queue->cmdbuf / queue->cmdbuf_addr are reset so the
+// queue no longer refers to freed memory.
+//
+// NOTE(review): queue->Init() is treated as failed when it evaluates to
+// true/non-zero, exactly as before -- confirm against WDDMQueue::Init.
+bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
+  if (!CreateContext(queue->queue_engine, &queue->context))
+    return false;
+
+  // Non-null only when the command buffer is allocated by this call.
+  GpuMemory *gpu_mem = nullptr;
+  if (queue->cmdbuf_addr == 0) {
+    GpuMemoryCreateInfo create_info{};
+    create_info.size = queue->cmdbuf_size;
+    create_info.domain = thunk_proxy::kSystem;
+
+    if (CreateGpuMemory(create_info, &gpu_mem) != ErrorCode::Success) {
+      DestroyContext(queue->context);
+      return false;
+    }
+
+    queue->cmdbuf = gpu_mem->GetGpuMemoryHandle();
+    queue->cmdbuf_addr = gpu_mem->GpuAddress();
+  }
+
+  if (queue->Init()) {
+    if (gpu_mem != nullptr) {
+      // Clear the references before freeing; otherwise a later DestroyQueue
+      // or a retried CreateQueue would operate on the deleted buffer.
+      queue->cmdbuf = {};
+      queue->cmdbuf_addr = 0;
+      delete gpu_mem;
+    }
+    DestroyContext(queue->context);
+    return false;
+  }
+
+  return true;
+}
+
+// Tears a queue down: runs queue->Fini(), deletes the GpuMemory object that
+// GpuMemory::Convert yields for queue->cmdbuf, then destroys the context.
+void WDDMDevice::DestroyQueue(WDDMQueue *queue) {
+  queue->Fini();
+
+  delete GpuMemory::Convert(queue->cmdbuf);
+
+  DestroyContext(queue->context);
+}
+
+// Submits a command buffer on the queue's context (software-queue path) and
+// then queues a GPU signal of the queue's sync object.
+//
+// queue:        supplies the context, the queue handle placed in the private
+//               data, and the sync object to signal.
+// command_addr: GPU address of the commands.
+// command_size: length of the commands.
+// fence_value:  value signalled on queue->syncobj after submission.
+//
+// Returns false when the private-data buffer cannot be allocated, when the
+// submission fails (status is logged), or when the signal fails.
+bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+
+  // Zeroed buffer; checked at runtime because assert() vanishes under NDEBUG.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false);
+
+  D3DKMT_SUBMITCOMMAND args = {0};
+  args.Commands = command_addr;
+  args.CommandLength = command_size;
+  args.BroadcastContextCount = 1;
+  args.BroadcastContext[0] = queue->context;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args));
+
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return GpuSignal(queue->context, &queue->syncobj, &fence_value, 1);
+}
+
+// Creates a hardware queue on queue->context and stores the resulting queue
+// handle, progress-fence handle and progress-fence CPU address in `queue`.
+//
+// Returns false when the private-data buffer cannot be allocated or when the
+// kernel call fails (status is logged); `queue` is not modified in that case.
+bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) {
+  const int priv_size = thunk_proxy::GetHwQueuePrivDataSize();
+
+  // Zeroed buffer; checked at runtime because assert() vanishes under NDEBUG.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  bool FwManagedGfxState = SupportStateShadowingByCpFw();
+  thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio);
+
+  D3DKMT_CREATEHWQUEUE createHwQueue = {0};
+  createHwQueue.hHwContext = queue->context;
+  createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_);
+  createHwQueue.pPrivateDriverData = priv_data;
+  createHwQueue.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue));
+
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  queue->queue = createHwQueue.hHwQueue;
+  queue->syncobj = createHwQueue.hHwQueueProgressFence;
+  queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress;
+
+  return true;
+}
+
+// Destroys the hardware queue referenced by queue->queue. Returns true on
+// success; logs the status and returns false otherwise.
+bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) {
+  // Zero-initialize and assign, matching the other D3DKMT wrappers here.
+  D3DKMT_DESTROYHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyHwQueue(&args));
+  if (status == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", status);
+  return false;
+}
+
+// Submits a command buffer to the queue's hardware queue.
+//
+// queue:        supplies the hardware queue handle.
+// command_addr: GPU address of the command buffer.
+// command_size: length of the command buffer.
+// fence_value:  passed as HwQueueProgressFenceId.
+//
+// Returns false when the private-data buffer cannot be allocated or when the
+// kernel call fails (status is logged).
+bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+
+  // Zeroed buffer; checked at runtime because assert() vanishes under NDEBUG.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true);
+
+  D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+  args.HwQueueProgressFenceId = fence_value;
+  args.CommandBuffer = command_addr;
+  args.CommandLength = command_size;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args));
+
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
new file mode 100644
index 0000000000..e374be8867
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
@@ -0,0 +1,594 @@
+#include <sys/stat.h>
+#include <cinttypes>
+#include <cassert>
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Returns the number of WDDMDevice::GpuMemoryChunkSize-sized chunks needed
+// to cover `size` bytes, rounding up (0 for size == 0).
+size_t GpuMemory::CalcChunkNumbers(gpusize size) {
+  const auto bytes_per_chunk = WDDMDevice::GpuMemoryChunkSize;
+  const auto rounded_up = size + bytes_per_chunk - 1;
+  return rounded_up / bytes_per_chunk;
+}
+
+// Rounds a requested size up to the alignment this allocation will use.
+//
+// Local-domain allocations on devices with big-page alignment enabled are
+// aligned to the largest applicable value among big_page_alignment_size,
+// hw_big_page_min_alignment_size and hw_big_page_alignment_size; if the
+// resulting alignment is 0 the size is returned unchanged. Every other
+// allocation is rounded up to 4096 bytes.
+gpusize GpuMemory::AdjustSize(gpusize size) const {
+  const auto &info = device_->DeviceInfo();
+
+  if (!(info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal)) {
+    const size_t page_bytes = 4096;
+    return AlignUp(size, page_bytes);
+  }
+
+  uint32_t alignment = info.big_page_alignment_size;
+
+  // BigPage is only supported for allocations > bigPageMinAlignment, and a
+  // bigPageMinAlignment of 0 means the KMD does not support the BigPage
+  // optimization. LargePage or BigPage alignment is applied, whichever is
+  // larger.
+  const bool big_page_applies = (info.hw_big_page_min_alignment_size > 0) &&
+                                (size > info.hw_big_page_min_alignment_size);
+  if (big_page_applies) {
+    alignment = std::max(alignment, info.hw_big_page_min_alignment_size);
+    if (size > info.hw_big_page_alignment_size)
+      alignment = std::max(alignment, info.hw_big_page_alignment_size);
+  }
+
+  if (alignment == 0)
+    return size;
+
+  return AlignUp(size, alignment);
+}
+
+// Constructs an empty GpuMemory bound to `device`: no allocation handles, no
+// resource handle and no memory fd (-1).
+GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
+  // Allocation bookkeeping starts out empty.
+  alloc_handle_ = 0;
+  alloc_handles_ptr_ = nullptr;
+  num_allocations_ = 0;
+
+  // No shared resource and no memory fd yet.
+  resource_ = 0;
+  mem_fd_ = -1;
+}
+
+// Releases everything this object may own, in order: the GPU virtual address
+// range (GpuAddress(), Size()), the physical allocations, and -- when one was
+// assigned (handle_ape_addr > 0) -- the handle-aperture address.
+// Return codes of the individual release calls are ignored.
+GpuMemory::~GpuMemory() {
+  FreeGpuVirtualAddress(GpuAddress(), Size());
+  FreePhysicalMemory();
+  if (desc_.handle_ape_addr > 0)
+    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
+}
+
+// Initializes this object from `create_info`.
+//
+// Steps, as visible in the body:
+//   1. Copy the creation parameters into desc_ and derive the adjusted size.
+//   2. Set up the allocation-handle storage (inline for one chunk, heap array
+//      otherwise), zero-filled.
+//   3. Physical-only: create the physical memory and allocate a
+//      handle-aperture address; no GPU VA is reserved.
+//   4. Otherwise reserve a GPU VA range. Virtual allocations stop here.
+//   5. Otherwise create physical memory, map it at the reserved VA, make it
+//      resident and wait for the paging fence from the CPU.
+//
+// Returns ErrorCode::Success or the first failing step's code
+// (ErrorCode::Unknown if the paging-fence wait fails).
+ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
+  desc_.domain = create_info.domain;
+  desc_.adapter_luid = device_->GetLuid();
+  desc_.client_size = create_info.size;
+  desc_.alignment = create_info.alignment;
+  desc_.mem_flags = create_info.mem_flags;
+  desc_.engine_flag = create_info.engine_flag;
+  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+  desc_.flags.is_physical_only = create_info.flags.physical_only;
+  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
+  desc_.flags.is_va_required = create_info.flags.alloc_va;
+  desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object;
+
+  /* A regular VMM allocation and an IPC allocation cannot be told apart at
+     creation time because they share the same creation parameters, so every
+     local (VRAM) allocation is forced to be shareable to support IPC memory. */
+  if (create_info.flags.interprocess ||
+      desc_.domain == thunk_proxy::AllocDomain::kLocal)
+    desc_.flags.is_shared = true;
+
+  desc_.flags.is_locked = create_info.flags.locked;
+  // desc_.size is the client size rounded up by AdjustSize().
+  desc_.size = AdjustSize(desc_.client_size);
+
+  // User and system memory use the caller-supplied CPU pointer.
+  if (IsUserMemory() || IsSystem())
+    desc_.cpu_addr = create_info.user_ptr;
+
+  // A single chunk uses the inline handle slot; more chunks need an array.
+  num_allocations_ = CalcChunkNumbers(Size());
+  if (num_allocations_ == 1)
+    alloc_handles_ptr_ = &alloc_handle_;
+  else
+    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+  auto code = ErrorCode::Success;
+
+  // Physical-only memory: backing store plus a handle-aperture address.
+  if (IsPhysicalOnly()) {
+    code = CreatePhysicalMemory();
+    if (code == ErrorCode::Success)
+      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    return code;
+  }
+
+  // Virtual allocations only reserve address space; a failed reservation
+  // also ends initialization here.
+  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+  if (IsVirtual() || (code != ErrorCode::Success))
+      return code;
+
+  bool physical_created = false;
+
+  // Cleanup for the remaining steps: the lambda inspects `code` and, on
+  // failure, frees the physical memory (if created) and the reserved VA.
+  // Presumably it runs when `guard` leaves scope -- see MakeScopeGuard.
+  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+    if (code != ErrorCode::Success) {
+
+      if (physical_created) {
+        FreePhysicalMemory();
+      }
+      FreeGpuVirtualAddress(GpuAddress(), Size());
+    }
+  });
+  (void)guard;
+
+  code = CreatePhysicalMemory();
+  if (code != ErrorCode::Success)
+    return code;
+
+  physical_created = true;
+
+  code = MapGpuVirtualAddress(GpuAddress(), Size());
+  if (code != ErrorCode::Success)
+    return code;
+
+  code = MakeResident();
+  if (code != ErrorCode::Success)
+    return code;
+
+  // Wait for the paging fence value recorded by the device.
+  if (!GetDevice()->WaitOnPagingFenceFromCpu())
+    code = ErrorCode::Unknown;
+
+  return code;
+}
+
+// Marks the GPU VA range [addr, addr + size) as no-access, one chunk of at
+// most WDDMDevice::GpuMemoryChunkSize bytes per request.
+//
+// addr:   first GPU virtual address to unmap.
+// size:   number of bytes to unmap.
+// offset: byte offset into this memory object; only used to select the
+//         starting chunk index.
+//
+// Returns the code of the last request issued; stops at the first code that
+// is neither Success nor NotReady.
+//
+// NOTE(review): unlike MapGpuVirtualAddress, a NotReady result is not turned
+// into Success here, so NotReady can be returned to the caller -- confirm
+// callers expect that.
+// NOTE(review): the remainder of `offset` inside the first chunk is computed
+// but never placed in the request (OffsetInPages is left at 0) -- confirm
+// this is intended for no-access mappings.
+ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+
+  // Skip whole chunks covered by `offset` to find the first chunk index.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.SizeInPages = block_size / 0x1000;
+    args.Protection.NoAccess = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    // NotReady: record the paging fence value reported by the request.
+    if (code == ErrorCode::NotReady)
+      device_->UpdatePageFence(args.PagingFenceValue);
+    else if (code != ErrorCode::Success)
+      break;
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0;   // reset second unmapped allocation offset to zero
+    i += 1;
+  }
+
+  return code;
+}
+
+// Maps this object's allocations, writable, into the GPU VA range
+// [addr, addr + size), one chunk of at most WDDMDevice::GpuMemoryChunkSize
+// bytes per request.
+//
+// addr:   first GPU virtual address of the mapping.
+// size:   number of bytes to map.
+// offset: byte offset into this memory object at which mapping starts; whole
+//         chunks select the first allocation, the remainder becomes
+//         OffsetInPages of the first request.
+//
+// A NotReady result is treated as success after recording the reported
+// paging fence value. On any other failure the chunks issued so far are
+// remapped as no-access and the failing code is returned.
+ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+  const size_t _4K = 0x1000;
+
+  // Skip whole chunks covered by `offset` to find the first chunk index.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+  // Remembered for the rollback below.
+  const size_t first_chunk = i;
+  const auto first_chunk_offset = offset;
+  /* Two limitations were found for local VRAM:
+   * 1. the VA of invisible VRAM has to be 64K aligned, otherwise mapping the
+   *    GPU VA fails
+   * 2. visible VRAM cannot be CPU mapped during command submission or after
+   *    it has been GPU mapped
+   */
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.OffsetInPages = offset / _4K;
+    args.SizeInPages = block_size / _4K;
+    args.Protection.Write = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code != ErrorCode::Success) {
+      if (code == ErrorCode::NotReady) {
+        // Record the paging fence value and carry on as if successful.
+        const uint64_t fence_value = args.PagingFenceValue;
+        device_->UpdatePageFence(fence_value);
+        code = ErrorCode::Success;
+      } else
+        break;
+    }
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0;  // reset second mapped allocation offset to zero
+    i++;
+  }
+
+  if (code != ErrorCode::Success) {
+    // Map failed: walk the chunks [first_chunk, i) that were issued before
+    // the failure and remap each as no-access with a null allocation.
+    // NOTE(review): `offset` is not reset to 0 after the first rollback
+    // chunk, unlike the forward loop -- confirm OffsetInPages is irrelevant
+    // when hAllocation is 0.
+    offset = first_chunk_offset;
+    map_addr = addr;
+    map_size = size;
+    for (size_t j = first_chunk; j < i; j++) {
+      auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+      D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+      args.hPagingQueue = device_->PagingQueue();
+      args.BaseAddress = map_addr;
+      args.hAllocation = 0;
+      args.OffsetInPages = offset / _4K;
+      args.SizeInPages = block_size / _4K;
+      args.Protection.NoAccess = 1;
+
+      // Rollback is best-effort: only NotReady is acted upon.
+      auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
+      if (unmap_code == ErrorCode::NotReady)
+        device_->UpdatePageFence(args.PagingFenceValue);
+
+      map_addr += block_size;
+      map_size -= block_size;
+    }
+  }
+
+  return code;
+}
+
+// Reserves GPU virtual address space for this object and records it in
+// desc_.gpu_addr (and in desc_.cpu_addr for system memory).
+//
+// base_virt_addr: address hint forwarded to the generic reservation.
+// size:           size forwarded to the generic reservation.
+// alignment:      alignment forwarded to the generic reservation.
+//
+// System-domain memory flagged as IPC exporter or imported memfd goes through
+// ReserveIPCSysMem (using Size() and desc_.alignment) and adopts the fd the
+// call leaves in its fd argument; everything else uses the generic
+// ReserveGpuVirtualAddress of the runtime. Returns the reservation's code.
+ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
+  gpusize reserved_va = 0;
+  ErrorCode status;
+
+  const bool is_ipc_sysmem =
+      (desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd) &&
+      desc_.domain == thunk_proxy::AllocDomain::kSystem;
+
+  if (is_ipc_sysmem) {
+    // Pass the existing fd when there is one, -1 otherwise.
+    int ipc_fd = (mem_fd_ > -1) ? mem_fd_ : -1;
+    status = dxg_runtime->ReserveIPCSysMem(Size(), &reserved_va, desc_.alignment, ipc_fd, desc_.flags.is_locked);
+    if (status == ErrorCode::Success)
+      mem_fd_ = ipc_fd;
+  } else {
+    status = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &reserved_va, alignment,
+        desc_.flags.is_locked);
+  }
+
+  if (status != ErrorCode::Success)
+    return status;
+
+  desc_.gpu_addr = reserved_va;
+
+  // For system memory the CPU address is the same value as the GPU address.
+  if (IsSystem())
+    desc_.cpu_addr = reinterpret_cast<void *>(desc_.gpu_addr);
+
+  return status;
+}
+
+// Releases GPU virtual address space held by this object.
+//
+// base_addr: start of the range to free (ignored on the IPC path, which
+//            always frees GpuAddress()/Size()); 0 means nothing to free.
+// size:      size of the range to free.
+//
+// When a memory fd is present (mem_fd_ > -1) the range is released through
+// FreeIPCSysMem, otherwise through the runtime's FreeGpuVirtualAddress.
+//
+// After a successful release the bookkeeping is cleared (mem_fd_ = -1 and,
+// when the freed range starts at desc_.gpu_addr, desc_.gpu_addr = 0). This
+// makes a repeated call a no-op: Init()'s failure cleanup frees the range
+// and the destructor then calls this function again for the same object,
+// which previously released the same range/fd twice.
+ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
+  if (mem_fd_ > -1) {
+    const ErrorCode status = dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
+    if (status == ErrorCode::Success) {
+      mem_fd_ = -1;
+      desc_.gpu_addr = 0;
+    }
+    return status;
+  }
+
+  if (base_addr == 0)
+    return ErrorCode::Success;
+
+  const ErrorCode status = dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size);
+  // Only forget our own reservation, not an unrelated range a caller freed.
+  if (status == ErrorCode::Success && base_addr == desc_.gpu_addr)
+    desc_.gpu_addr = 0;
+
+  return status;
+}
+
+// Creates the kernel allocations backing this object, one per chunk.
+//
+// A single zeroed buffer holds, back to back: the driver-private data, one
+// per-allocation private-data record for each chunk, and the
+// D3DDDI_ALLOCATIONINFO2 array. Shared objects additionally attach a
+// SharedHandleInfo as private runtime data and are created as an
+// NT-security shared resource.
+//
+// On success the allocation handles are stored in alloc_handles_ptr_ and the
+// resource handle in resource_. Returns ErrorCode::OutOfMemory if the
+// scratch buffer cannot be allocated, otherwise CreateAllocation's code.
+//
+// NOTE(review): the D3DDDI_ALLOCATIONINFO2 array starts at a byte offset
+// derived from the two private-data sizes; confirm those sizes keep it
+// suitably aligned.
+ErrorCode GpuMemory::CreatePhysicalMemory() {
+
+  assert(!IsVirtual() && NumChunks() > 0);
+
+  const auto num_allocations = NumChunks();
+  int priv_drv_data_size = 0;
+  int priv_alloc_data_size = 0;
+
+  thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size);
+
+  // Keep the total in size_t: it is a byte count handed to the allocator and
+  // must not be narrowed to int.
+  const size_t total_size = priv_drv_data_size +
+    num_allocations * priv_alloc_data_size +
+    num_allocations * sizeof(D3DDDI_ALLOCATIONINFO2);
+  void *priv_drv_data = calloc(1, total_size);
+  if (!priv_drv_data)
+    return ErrorCode::OutOfMemory;
+
+  thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size);
+
+  // Layout: [driver data][per-allocation data x N][allocation info x N]
+  void *priv_alloc_data = static_cast<unsigned char*>(priv_drv_data) + priv_drv_data_size;
+  auto alloc_info = reinterpret_cast<D3DDDI_ALLOCATIONINFO2*>(
+       static_cast<unsigned char*>(priv_alloc_data) + priv_alloc_data_size * num_allocations);
+
+  size_t size = desc_.size;
+  uint64_t addr = desc_.gpu_addr;
+  char *cpu_addr = static_cast<char *>(desc_.cpu_addr);
+  const auto &device_info = GetDevice()->DeviceInfo();
+
+  for (size_t i = 0; i < num_allocations; i++) {
+
+    void* priv_data = (void*)((char*)priv_alloc_data + priv_alloc_data_size * i);
+    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);
+
+    if (IsUserMemory() || IsSystem()) {
+      // System/user memory: the chunk is backed by the caller's CPU pages
+      // and no GPU address is passed in the private data.
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
+      alloc_info[i].pSystemMem = static_cast<void *>(cpu_addr);
+      cpu_addr += block_size;
+    } else {
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
+    }
+
+    size -= block_size;
+    addr += block_size;
+
+    alloc_info[i].pPrivateDriverData = priv_data;
+    alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size;
+    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
+  }
+
+  D3DKMT_CREATEALLOCATION args = {};
+  args.hDevice = device_->DeviceHandle();
+  args.pPrivateDriverData = priv_drv_data;
+  args.PrivateDriverDataSize = priv_drv_data_size;
+  args.NumAllocations = num_allocations;
+  args.pAllocationInfo2 = alloc_info;
+
+  /* The PhysicallyContiguous flag causes allocation failure
+   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
+   */
+
+  // Value-initialized so that padding and any field not assigned below reach
+  // the kernel (and later the importer) as zeros instead of stack garbage.
+  SharedHandleInfo shared_info{};
+  if (IsShared()) {
+    shared_info.size = desc_.size;
+    shared_info.client_size = desc_.client_size;
+    shared_info.domain = desc_.domain;
+    shared_info.adapter_luid = desc_.adapter_luid;
+    shared_info.flags = reinterpret_cast<uint32_t>(desc_.flags.reserved);
+    shared_info.mem_flags = desc_.mem_flags;
+    shared_info.pid = dxg_runtime->parent_pid;
+    shared_info.gpu_addr = desc_.gpu_addr;
+    args.pPrivateRuntimeData = &shared_info;
+    args.PrivateRuntimeDataSize = sizeof(shared_info);
+    args.Flags.NtSecuritySharing = 1;
+    args.Flags.CreateShared = 1;
+    args.Flags.CreateResource = 1;
+  }
+
+  auto status = d3dthunk::CreateAllocation(&args);
+  if (status == ErrorCode::Success) {
+    for (size_t i = 0; i < num_allocations; i++)
+      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;
+
+    resource_ = args.hResource;
+  }
+  free(priv_drv_data);
+  return status;
+}
+
+// Destroys the kernel allocations owned by this object.
+//
+// Nothing is done (Success is returned) when there is no handle storage, or
+// when the object has exactly one chunk whose handle is still 0. Otherwise
+// the allocations are destroyed, a heap-allocated handle array (more than
+// one chunk) is released, and alloc_handles_ptr_ is cleared. Returns
+// DestroyAllocation's code.
+ErrorCode GpuMemory::FreePhysicalMemory() {
+  if (alloc_handles_ptr_ == nullptr)
+    return ErrorCode::Success;
+
+  const bool single_unset_handle = (NumChunks() == 1) && (*alloc_handles_ptr_ == 0);
+  if (single_unset_handle)
+    return ErrorCode::Success;
+
+  ErrorCode status = d3dthunk::DestroyAllocation(device_->DeviceHandle(),
+                                                 resource_,
+                                                 NumChunks(),
+                                                 alloc_handles_ptr_);
+
+  // With a single chunk the pointer refers to inline storage, not the heap.
+  if (NumChunks() > 1)
+    delete[] alloc_handles_ptr_;
+  alloc_handles_ptr_ = nullptr;
+
+  return status;
+}
+
+// Requests residency for all chunks of this object on the device's paging
+// queue, with CantTrimFurther set.
+//
+// A NotReady result is reported as Success after the paging fence value
+// returned by the request has been recorded on the device; any other code is
+// returned unchanged.
+ErrorCode GpuMemory::MakeResident() {
+  D3DDDI_MAKERESIDENT request = {};
+  request.hPagingQueue = device_->PagingQueue();
+  request.NumAllocations = NumChunks();
+  request.AllocationList = alloc_handles_ptr_;
+  request.Flags.CantTrimFurther = 1;
+
+  const auto status = d3dthunk::MakeResident(&request);
+  if (status != ErrorCode::NotReady)
+    return status;
+
+  device_->UpdatePageFence(request.PagingFenceValue);
+  return ErrorCode::Success;
+}
+
+// Evicts all chunks of this object from the device. Returns the code of the
+// underlying evict call.
+ErrorCode GpuMemory::Evict() {
+  D3DKMT_EVICT request = {};
+  request.hDevice = device_->DeviceHandle();
+  request.AllocationList = alloc_handles_ptr_;
+  request.NumAllocations = NumChunks();
+
+  return d3dthunk::Evict(&request);
+}
+
+// Produces a file descriptor through which this memory can be shared.
+//
+// dmabuf_fd: receives the descriptor.
+// flags:     forwarded to d3dthunk::ShareObjects on the shared-resource path.
+//
+// If the object holds a memory fd (mem_fd_ > -1) that fd itself is returned.
+// Otherwise shared objects are exported via ShareObjects and non-shared
+// objects yield ErrorCode::UnSupported.
+ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
+  const bool has_mem_fd = mem_fd_ > -1;
+  if (has_mem_fd) {
+    *dmabuf_fd = mem_fd_;
+    return ErrorCode::Success;
+  }
+
+  if (!IsShared())
+    return ErrorCode::UnSupported;
+
+  return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd);
+}
+
+
+ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) {
+  D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args;
+  int dmabuf_fd = create_info.dmabuf_fd;
+
+  if (dmabuf_fd <= 0)
+    return ErrorCode::InvalidateParams;
+
+  if(create_info.flags.sysmem_ipc_sig_importer) {
+    // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
+    mem_fd_ = dup(dmabuf_fd);
+    desc_.client_size = create_info.size;
+    desc_.size = AdjustSize(desc_.client_size);
+    desc_.domain = thunk_proxy::AllocDomain::kSystem;
+    desc_.adapter_luid = device_->GetLuid();
+    desc_.alignment = 0x1000;
+    desc_.mem_flags = create_info.mem_flags;
+    desc_.engine_flag = create_info.engine_flag;
+    desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+    desc_.flags.is_physical_only = create_info.flags.physical_only;
+    desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+    desc_.flags.is_locked = create_info.flags.locked;
+
+    auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+    if (code != ErrorCode::Success)
+      return code;
+
+    bool physical_created = false;
+    auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+          if (code != ErrorCode::Success) {
+            if (physical_created)
+              FreePhysicalMemory();
+            FreeGpuVirtualAddress(GpuAddress(), Size());
+          }
+        });
+    (void)guard;
+
+    num_allocations_ = CalcChunkNumbers(Size());
+    if (num_allocations_ == 1)
+      alloc_handles_ptr_ = &alloc_handle_;
+    else
+      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+    memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+    code = CreatePhysicalMemory();
+    if (code != ErrorCode::Success)
+      return code;
+
+    physical_created = true;
+
+    code = MapGpuVirtualAddress(GpuAddress(), Size());
+    if (code != ErrorCode::Success)
+      return code;
+
+    code = MakeResident();
+    if (code != ErrorCode::Success)
+      return code;
+
+    if (!GetDevice()->WaitOnPagingFenceFromCpu())
+      code = ErrorCode::Unknown;
+
+    return code;
+  } else {
+    // vmem importer / ipc vram importer
+    memset(&query_args, 0, sizeof(query_args));
+    query_args.hDevice = device_->DeviceHandle();
+    query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
+    auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args);
+    if (ret != ErrorCode::Success) {
+      pr_err("query resource info from nt handle failed %d\n", static_cast<int>(ret));
+      return ErrorCode::InvalidateParams;
+    }
+    pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d,"
+             " runtime data size %#x total driver data size %#x resource data size=%#x\n",
+             dmabuf_fd,
+             query_args.NumAllocations,
+             query_args.PrivateRuntimeDataSize,
+             query_args.TotalPrivateDriverDataSize,
+             query_args.ResourcePrivateDriverDataSize);
+
+    SharedHandleInfo shared_info;
+    if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) {
+      pr_err("shared hanle info size mismatch:%d vs %ld\n",
+             query_args.PrivateRuntimeDataSize, sizeof(shared_info));
+      return ErrorCode::UnSupported;
+    }
+
+    uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) +
+      query_args.TotalPrivateDriverDataSize +
+      query_args.ResourcePrivateDriverDataSize;
+    D3DDDI_OPENALLOCATIONINFO2 *open_info =
+      reinterpret_cast<D3DDDI_OPENALLOCATIONINFO2*> (calloc(1, total_size));
+    if (!open_info) {
+      pr_err("alloc open_info failed, NumAllocations:%d\n",
+             query_args.NumAllocations);
+      return ErrorCode::OutOfMemory;
+    }
+
+    auto guard = MakeScopeGuard([&open_info]() { free(open_info); });
+
+    alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations];
+
+    D3DKMT_OPENRESOURCEFROMNTHANDLE open_args;
+    memset(&open_args, 0, sizeof(open_args));
+    open_args.hDevice = query_args.hDevice;
+    open_args.hNtHandle = query_args.hNtHandle;
+    open_args.NumAllocations = query_args.NumAllocations;
+    open_args.pOpenAllocationInfo2 = open_info;
+    open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize;
+    open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast<void*>
+      (open_args.pOpenAllocationInfo2 + open_args.NumAllocations);
+    open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize;
+    open_args.pResourcePrivateDriverData = reinterpret_cast<void*>
+      (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) +
+       open_args.TotalPrivateDriverDataBufferSize);
+    open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize;
+    open_args.pPrivateRuntimeData = reinterpret_cast<void*> (&shared_info);
+
+    ret = d3dthunk::OpenResourceFromNtHandle(&open_args);
+    if (ret != ErrorCode::Success) {
+      ret = ErrorCode::InvalidateParams;
+      pr_err("open resource failed %d\n", static_cast<int>(ret));
+      return ret;
+    }
+    if (shared_info.pid == dxg_runtime->parent_pid &&
+      create_info.flags.alloc_va &&
+      IsSameAdapter(shared_info.adapter_luid) &&
+      shared_info.gpu_addr) {
+      pr_info("import from same device and samve process, va is required. "
+               "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n");
+      if (gpu_addr)
+        *gpu_addr = shared_info.gpu_addr;
+      return ErrorCode::SameProcessSameDevice;
+    }
+
+    desc_.size = shared_info.size;
+    desc_.client_size = shared_info.client_size;
+    desc_.domain = shared_info.domain;
+    desc_.flags.reserved = shared_info.flags;
+    desc_.mem_flags = shared_info.mem_flags;
+    desc_.adapter_luid = shared_info.adapter_luid;
+    resource_ = open_args.hResource;
+    num_allocations_ = open_args.NumAllocations;
+    for (int i = 0; i < num_allocations_; i++)
+      alloc_handles_ptr_[i] = open_info[i].hAllocation;
+
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    if (desc_.flags.is_va_required) {
+      desc_.flags.is_imported_vram_ipc = 1;
+      ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment);
+      if (ret != ErrorCode::Success)
+        pr_err("failed to allocate svm range, error:%d\n", static_cast<int>(ret));
+
+      return ret;
+    } else {
+      desc_.flags.is_imported_vram_vmem = 1;
+      return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    }
+  }
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp
new file mode 100644
index 0000000000..44658819cb
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp
@@ -0,0 +1,1210 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cstring>
+#include <cinttypes>
+#include <cstddef>
+
+#include "impl/wddm/queue.h"
+#include "impl/registers.h"
+
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/hsa_ven_amd_loader.h"
+extern hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal);
+extern hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+extern void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value);
+extern hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address);
+
+namespace wsl {
+namespace thunk {
+
+// Prepares the software-scheduled path: creates the sync object and, when the
+// device asks the UMD to provide user-queue memory, allocates that memory and
+// records its handles. The sync object is destroyed again if the allocation fails.
+hsa_status_t WDDMQueue::SwsInit(void) {
+  const bool syncobj_ok = device->CreateSyncobj(&syncobj, &sync_addr);
+  if (!syncobj_ok)
+    return HSA_STATUS_ERROR;
+
+  // Nothing else to set up when the UMD does not allocate the queue memory.
+  if (!device->AllocUserQueueMemFromUMD())
+    return HSA_STATUS_SUCCESS;
+
+  GpuMemoryCreateInfo queue_mem_info{};
+  queue_mem_info.domain = thunk_proxy::kUserQueue;
+  queue_mem_info.size = device->GetSwsQueueSize();
+  queue_mem_info.engine_flag = thunk_proxy::QueueEngine2EngineFlag(queue_engine);
+
+  GpuMemory *backing = nullptr;
+  if (device->CreateGpuMemory(queue_mem_info, &backing) != ErrorCode::Success) {
+    // Undo the sync object so a failed init leaves nothing behind.
+    device->DestroySyncobj(syncobj);
+    return HSA_STATUS_ERROR;
+  }
+
+  queue_mem = backing->GetGpuMemoryHandle();
+  queue = backing->GetAllocationHandle(0);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Tears down the software-scheduled path by destroying the sync object.
+// Always reports success.
+hsa_status_t WDDMQueue::SwsFini(void) {
+  device->DestroySyncobj(syncobj);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Hands one command buffer to the device's software queue and maps the boolean
+// outcome onto an HSA status code.
+hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr,
+                                  uint64_t command_size,
+                                  uint64_t fence_value) {
+  const bool submitted =
+      device->SubmitToSwQueue(this, command_addr, command_size, fence_value);
+  return submitted ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Creates the hardware queue for this object on the device.
+hsa_status_t WDDMQueue::HwsInit(void) {
+  const bool created = device->CreateHwQueue(this);
+  return created ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Destroys the hardware queue that belongs to this object.
+hsa_status_t WDDMQueue::HwsFini(void) {
+  const bool destroyed = device->DestroyHwQueue(this);
+  return destroyed ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Hands one command buffer to the device's hardware queue and maps the boolean
+// outcome onto an HSA status code.
+hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr,
+                                  uint64_t command_size,
+                                  uint64_t fence_value) {
+  const bool submitted =
+      device->SubmitToHwQueue(this, command_addr, command_size, fence_value);
+  return submitted ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Changes the scheduling priority of a hardware-scheduled queue by destroying
+// the current hardware queue and creating a new one at the requested level.
+//
+// Returns HSA_STATUS_SUCCESS when nothing has to change (queue is not
+// hardware-scheduled, or the level is already in effect) or when the queue was
+// recreated; HSA_STATUS_ERROR when the old queue could not be destroyed or the
+// new one could not be created.
+hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) {
+  if (!use_hws)
+    return HSA_STATUS_SUCCESS;
+
+  thunk_proxy::SchedLevel new_prio = ConvertSchedLevel(priority);
+  if (prio == new_prio)
+    return HSA_STATUS_SUCCESS;
+
+  pr_debug("set prio %d -> %d\n", prio, new_prio);
+
+  // Do not create a second hardware queue (or record the new level) while the
+  // old queue is still alive: report the failed teardown instead.
+  if (!device->DestroyHwQueue(this)) {
+    pr_err("failed to destroy hw queue while changing priority\n");
+    return HSA_STATUS_ERROR;
+  }
+
+  prio = new_prio;
+  return HwsInit();
+}
+
+void ComputeQueue::HandleError(hsa_status_t status) {
+  hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal;
+  hsa_signal_value_t val = -1;
+
+  struct queue_error_t {
+    uint32_t code;
+    hsa_status_t status;
+  };
+  static const queue_error_t QueueErrors[] = {
+    {2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS},
+    {4, HSA_STATUS_ERROR_INVALID_ALLOCATION},
+    {8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT},
+    //{16, HSA_STATUS_ERROR_INVALID_ARGUMENT},
+    {32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT},
+    {64, HSA_STATUS_ERROR_INVALID_ARGUMENT},
+    //{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS},
+    //{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION},
+    //{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION},
+    {0x80000000, HSA_STATUS_ERROR_EXCEPTION},
+  };
+  for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) {
+    if (QueueErrors[i].status == status) {
+      val = QueueErrors[i].code;
+      pr_err("error %d, sig_val %ld\n", status, val);
+      break;
+    }
+  }
+
+  if (sig.handle) {
+    hsakmt_hsa_signal_store_screlease(sig, val);
+  }
+  if (error_code_) {
+    error_code_->store(val, std::memory_order_release);
+  }
+}
+
+// Worker-thread entry point (started from the ComputeQueue constructor).
+// Repeatedly calls queue->Process() while the current packet is not invalid,
+// and otherwise blocks on thread_cond_ until it is notified. The loop exits
+// when Process() fails (after reporting through HandleError) or when
+// thread_stop_ is observed under thread_cond_lock_ at a point where the thread
+// would otherwise go to sleep.
+void ComputeQueue::AqlToPm4Thread(ComputeQueue *queue) {
+
+  // This timer lets the thread stop spinning and go to sleep when the
+  // current packet has stayed invalid, with no change of the AQL write index,
+  // for about 2 seconds.
+  std::chrono::steady_clock::time_point start_time, time;
+  // Polling timeout: 2 seconds.
+  const std::chrono::milliseconds kMaxElapsed(2000);
+  uint64_t current_position = queue->GetAqlWriteIndex();
+  bool sleep = false;
+  start_time = std::chrono::steady_clock::now();
+
+  while (true) {
+    if (!queue->IsInvalidPacket()) {
+      hsa_status_t status = queue->Process();
+      if (status != HSA_STATUS_SUCCESS) {
+        // A processing failure is reported once and terminates the thread.
+        pr_err("process compute queue fail status = %08x\n", status);
+        queue->HandleError(status);
+        break;
+      }
+      sleep = false;
+    } else {
+      if (current_position == queue->GetAqlWriteIndex()) {
+        // Write index unchanged: request sleep once the timeout has elapsed.
+        time = std::chrono::steady_clock::now();
+        if (time - start_time > kMaxElapsed)
+          sleep = true;
+      } else {
+        // Write index moved: restart the timeout from the new position.
+        start_time = std::chrono::steady_clock::now();
+        current_position = queue->GetAqlWriteIndex();
+        sleep = false;
+      }
+    }
+
+    // Keep polling without taking the lock while the write pointer is ahead of
+    // the read pointer and no sleep was requested.
+    if ((queue->GetRingWptr()->load() > queue->GetRingRptr()->load()) && !sleep)
+      continue;
+
+    std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
+    // Re-check under the lock, then wait on the CPU for a valid packet.
+    if (queue->GetRingWptr()->load() <= queue->GetRingRptr()->load() ||
+        (sleep && queue->IsInvalidPacket())) {
+      // The stop request is only honoured here, right before sleeping.
+      if (queue->thread_stop_)
+        break;
+      pr_debug("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+               queue->ring, queue->GetRingWptr()->load(), queue->GetRingRptr()->load());
+      // No predicate: a spurious wake-up just re-runs the outer loop.
+      queue->thread_cond_.wait(lock);
+    }
+  }
+
+  pr_debug("aql to pm4 thread %p exit\n", queue->ring);
+}
+
+// Builds a compute queue on top of a caller-provided AQL ring.
+//
+//   device      - owning device; used to create the queue and its memory.
+//   ring        - address of the AQL ring.
+//   ring_size   - size of the ring as supplied by the caller.
+//   ring_wptr   - atomic write pointer of the ring.
+//   ring_rptr   - atomic read pointer of the ring; also used to locate the
+//                 enclosing amd_queue_v2_t (it is its read_dispatch_id field).
+//   error_addr  - optional location that receives the queue error code.
+//   cmdbuf_size, engine, use_hws - forwarded to WDDMQueue.
+//
+// The constructor creates the device queue, allocates one page of system
+// memory for the private amd_queue_v2_t copy, selects the scratch alignment
+// for the device generation and finally starts the AQL-to-PM4 worker thread.
+// Creation failures are only caught by assert().
+ComputeQueue::ComputeQueue(WDDMDevice *device,
+               void *ring,
+               uint64_t ring_size,
+               std::atomic<uint64_t> *ring_wptr,
+               std::atomic<uint64_t> *ring_rptr,
+               volatile int64_t *error_addr,
+               uint32_t cmdbuf_size,
+               uint32_t engine,
+               bool use_hws) :
+               WDDMQueue(device, 0, cmdbuf_size, engine, use_hws),
+               ring(ring),
+               ring_size(ring_size),
+               ring_wptr(ring_wptr),
+               ring_rptr(ring_rptr),
+               error_code_(reinterpret_cast<volatile std::atomic<long int>*>(error_addr)),
+               ib_start_addr(0),
+               ib_size(0),
+               sync_point(0),
+               cmdbuf_aql_frame_write_index(0),
+               cmdbuf_aql_frame_size(0),
+               needs_barrier(true),
+               ready_to_submit(false),
+               platform_atomic_support_(false),
+               signal_addr_(NULL),
+               thread_stop_(false),
+               max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()),
+               dispatch_waves_(0),
+               scratch_size_per_wave_(0),
+               scratch_size_(0),
+               total_scratch_size_(0),
+               scratch_base_(nullptr) {
+  bool ret = device->CreateQueue(this);
+  assert(ret);
+  (void)ret;  // only consumed by assert(); avoids an unused warning with NDEBUG
+
+  GpuMemoryCreateInfo create_info{};
+  create_info.size = dxg_runtime->page_size;
+  create_info.domain = thunk_proxy::kSystem;
+  GpuMemory *gpu_mem = nullptr;
+  auto code = device->CreateGpuMemory(create_info, &gpu_mem);
+  assert(code == ErrorCode::Success);
+  (void)code;  // only consumed by assert(); avoids an unused warning with NDEBUG
+  amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle();
+  amd_queue_ = reinterpret_cast<amd_queue_v2_t*>(gpu_mem->GpuAddress());
+
+  // ring_rptr points at read_dispatch_id inside the runtime's amd_queue_v2_t;
+  // step back by that field's offset to reach the start of the structure.
+  amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id));
+
+  if (device->Major() >= 11)
+    scratch_mem_alignment_size_ = 256;
+  else
+    scratch_mem_alignment_size_ = 1024;
+
+  // Start the worker last: it receives `this` and may read any member as soon
+  // as it runs, so every field above must already be initialized.
+  aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this);
+}
+
+// Stops and joins the worker thread, destroys the device queue, then frees the
+// scratch buffer (if one was allocated) and the private amd_queue memory.
+ComputeQueue::~ComputeQueue() {
+  {
+    // Raise the stop flag under the lock so the worker sees it before waiting.
+    std::lock_guard<std::mutex> stop_guard(thread_cond_lock_);
+    thread_stop_ = true;
+  }
+  thread_cond_.notify_one();
+  aql_to_pm4_thread_.join();
+
+  device->DestroyQueue(this);
+
+  if (scratch_base_)
+    delete GpuMemory::Convert(scratch_mem_);
+
+  delete GpuMemory::Convert(amd_queue_mem_);
+}
+
+// Rebuilds the scratch-related fields of amd_queue_ from the current scratch
+// state (scratch_base_, scratch_size_, scratch_size_per_wave_):
+//   - the four-dword scratch buffer resource descriptor, whose word 1 and
+//     word 3 layouts depend on the device major version,
+//   - scratch_backing_memory_location and scratch_wave64_lane_byte_size,
+//   - compute_tmpring_size, again with a per-generation layout.
+//
+// Every register union starts zeroed so that any bit-field not assigned below
+// (reserved bits in particular) is written as 0 rather than stack garbage.
+//
+// scratch_size_per_wave_ must be non-zero: it is used as a divisor.
+void ComputeQueue::InitScratchSRD() {
+  // Populate scratch resource descriptor
+  SQ_BUF_RSRC_WORD0 srd0 = {};
+
+  uintptr_t scratch_base = uintptr_t(scratch_base_);
+  srd0.bits.BASE_ADDRESS = scratch_base;
+
+  uint32_t srd1_u32;
+
+  if (device->Major() < 11) {
+    SQ_BUF_RSRC_WORD1 srd1 = {};
+
+    srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
+    srd1.bits.STRIDE = 0;
+    srd1.bits.CACHE_SWIZZLE = 0;
+    srd1.bits.SWIZZLE_ENABLE = 1;
+
+    srd1_u32 = srd1.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD1_GFX11 srd1 = {};
+
+    srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
+    srd1.bits.STRIDE = 0;
+    srd1.bits.SWIZZLE_ENABLE = 1;
+
+    srd1_u32 = srd1.u32All;
+  }
+
+  SQ_BUF_RSRC_WORD2 srd2 = {};
+
+  srd2.bits.NUM_RECORDS = scratch_size_;
+
+  uint32_t srd3_u32;
+
+  if (device->Major() < 10) {
+    SQ_BUF_RSRC_WORD3 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
+    srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+    srd3.bits.ELEMENT_SIZE = 1;  // 4
+    srd3.bits.INDEX_STRIDE = 3;  // 64
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.ATC__CI__VI = 0;
+    srd3.bits.HASH_ENABLE = 0;
+    srd3.bits.HEAP = 0;
+    srd3.bits.MTYPE__CI__VI = 0;
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else if (device->Major() == 10) {
+    SQ_BUF_RSRC_WORD3_GFX10 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESOURCE_LEVEL = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else if (device->Major() == 11) {
+    SQ_BUF_RSRC_WORD3_GFX11 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD3_GFX12 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.WRITE_COMPRESS_ENABLE = 0;
+    srd3.bits.COMPRESSION_EN = 0;
+    srd3.bits.COMPRESSION_ACCESS_MODE = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  }
+
+  // Update Queue's Scratch descriptor's property
+  amd_queue_->scratch_resource_descriptor[0] = srd0.u32All;
+  amd_queue_->scratch_resource_descriptor[1] = srd1_u32;
+  amd_queue_->scratch_resource_descriptor[2] = srd2.u32All;
+  amd_queue_->scratch_resource_descriptor[3] = srd3_u32;
+
+  // Populate flat scratch parameters in amd_queue_.
+  amd_queue_->scratch_backing_memory_location = scratch_base;
+
+  // For backwards compatibility this field records the per-lane scratch
+  // for a 64 lane wavefront. If scratch was allocated for 32 lane waves
+  // then the effective size for a 64 lane wave is halved.
+  amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64;
+
+  uint64_t num_waves;
+  if (device->Major() < 11) {
+    COMPUTE_TMPRING_SIZE tmpring_size = {};
+    // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    num_waves = scratch_size_ / scratch_size_per_wave_;
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  } else if (device->Major() == 11) {
+    COMPUTE_TMPRING_SIZE_GFX11 tmpring_size = {};
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX11 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  } else {
+    COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {};
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX12 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  }
+
+  return;
+}
+
+// Returns the number of work-groups to account for when sizing scratch for
+// `packet`: the per-engine maximum (after spreading the grid's work-groups
+// over the shader engines, including the share of CUs that does not divide
+// evenly) multiplied by the engine count.
+// All three work-group dimensions of the packet are used as divisors.
+uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet)
+{
+  // Rounded-up integer division: work-groups needed to cover one grid dimension.
+  const auto ceil_div = [](uint64_t numerator, uint64_t denominator) {
+    return (numerator + denominator - 1) / denominator;
+  };
+
+  const uint64_t wg_x = packet->workgroup_size_x;
+  const uint64_t wg_y = packet->workgroup_size_y;
+  const uint64_t wg_z = packet->workgroup_size_z;
+  const uint64_t lanes_per_group = wg_x * wg_y * wg_z;
+
+  const uint64_t groups = ceil_div(packet->grid_size_x, wg_x) *
+                          ceil_div(packet->grid_size_y, wg_y) *
+                          ceil_div(packet->grid_size_z, wg_z);
+
+  const uint32_t cu_count = device->ComputeUnitCount();
+  const uint32_t engines = device->NumShaderEngine();
+
+  // CUs left over once the CU count is rounded down to a multiple of the
+  // engine count; each full round over all CUs puts that many groups on them.
+  const uint32_t leftover_cus = cu_count - AlignDown(cu_count, engines);
+  const uint64_t rounds = groups / cu_count;
+  const uint64_t balanced_groups = groups - rounds * leftover_cus;
+
+  uint64_t groups_per_engine = (balanced_groups + engines - 1) / engines;
+  if (leftover_cus)
+    groups_per_engine += rounds;
+
+  // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or
+  // 16 groups to each engine.
+  const bool below_minimum = device->Major() >= 10 &&
+                             groups_per_engine < 16 &&
+                             lanes_per_group * groups_per_engine < 256;
+  if (below_minimum) {
+    const uint64_t groups_for_256_lanes = (256 + lanes_per_group - 1) / lanes_per_group;
+    groups_per_engine = std::min(groups_for_256_lanes, uint64_t(16ul));
+  }
+
+  return groups_per_engine * engines;
+}
+
+// Returns how many wavefronts one work-group of `packet` occupies: the
+// work-group's lane count divided by the wave width (32 or 64), rounded up.
+uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet,
+                                                  bool wave32)
+{
+  uint32_t wave_width = 64;
+  if (wave32)
+    wave_width = 32;
+
+  uint64_t group_lanes = packet->workgroup_size_x;
+  group_lanes *= packet->workgroup_size_y;
+  group_lanes *= packet->workgroup_size_z;
+
+  const uint64_t rounded_up = group_lanes + wave_width - 1;
+  return rounded_up / wave_width;
+}
+
+// Makes sure the queue's scratch buffer is large enough for `packet`.
+//
+// The per-wave scratch size and the wave count only ever grow (maximum over
+// all packets seen), the required size is capped at max_scratch_waves_ waves,
+// and a new local-memory buffer is allocated when the current one is too
+// small. After a reallocation the old buffer is freed and the scratch
+// descriptor in amd_queue_ is rebuilt.
+//
+//   packet - dispatch packet whose private segment and geometry are used.
+//   wave32 - true when the kernel runs with 32-lane wavefronts.
+//
+// Returns true when the existing or newly allocated buffer is sufficient;
+// false when the packet has a zero work-group dimension or the allocation
+// failed (the queue's scratch buffer is then left unchanged).
+bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) {
+  // The group and wave calculations divide by every work-group dimension; a
+  // malformed packet must not bring the worker thread down with a
+  // division by zero.
+  if (packet->workgroup_size_x == 0 ||
+      packet->workgroup_size_y == 0 ||
+      packet->workgroup_size_z == 0) {
+    pr_err("invalid workgroup size %d x %d x %d\n",
+           packet->workgroup_size_x, packet->workgroup_size_y, packet->workgroup_size_z);
+    return false;
+  }
+
+  const uint32_t lanes_per_wave = wave32 ? 32 : 64;
+  const uint64_t size_per_thread = AlignUp(packet->private_segment_size,
+                                  scratch_mem_alignment_size_ / lanes_per_wave);
+
+  uint64_t groups = CalcDispatchGroups(packet);
+  uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32);
+
+  // For packet batching, the maximum value must be used to fit all packets.
+  scratch_size_per_wave_ = std::max(size_per_thread * lanes_per_wave, scratch_size_per_wave_);
+  dispatch_waves_ = std::max(groups * waves_per_group, dispatch_waves_);
+
+  const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_;
+  const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_;
+
+  scratch_size_ = std::min(dispatch_size, max_scratch_size);
+
+  if (total_scratch_size_ >= scratch_size_)
+    return true;
+
+  // Cast explicitly so the format specifier is right whatever the width of
+  // the two size members is.
+  pr_debug("need realloc scratch buffer, size %" PRIx64 " -> %" PRIx64 "\n",
+           static_cast<uint64_t>(total_scratch_size_),
+           static_cast<uint64_t>(scratch_size_));
+
+  GpuMemoryCreateInfo create_info{};
+  create_info.size = scratch_size_;
+  create_info.domain = thunk_proxy::kLocal;
+  GpuMemory *gpu_mem = nullptr;
+  auto code = device->CreateGpuMemory(create_info, &gpu_mem);
+  if (code != ErrorCode::Success)
+    return false;
+
+  // The replacement exists; only now release the previous buffer.
+  if (scratch_base_) {
+    auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_);
+    delete scratch_gpu_mem;
+  }
+
+  total_scratch_size_ = scratch_size_;
+  scratch_base_ = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+  scratch_mem_ = gpu_mem->GetGpuMemoryHandle();
+
+  InitScratchSRD();
+  return true;
+}
+
+// Patches the current scratch base address into every command-buffer location
+// recorded in scratch_base_offset_array_ (offsets are relative to `addr`),
+// then forgets the recorded offsets. Always returns true.
+bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) {
+  for (const auto offset : scratch_base_offset_array_) {
+    uint32_t *user_data = reinterpret_cast<uint32_t *>(addr + offset);
+
+    if (device->Major() < 11) {
+      // Only the low 16 bits of the second dword are replaced.
+      user_data[0] = PtrLow32(scratch_base_);
+      user_data[1] = (user_data[1] & 0xffff0000) | PtrHigh32(scratch_base_);
+    } else {
+      user_data[0] = Ptr48Low32(scratch_base_);
+      user_data[1] = Ptr48High8(scratch_base_);
+    }
+  }
+
+  scratch_base_offset_array_.clear();
+  return true;
+}
+
+// Returns `srd` (word 3 of a buffer resource descriptor) with INDEX_STRIDE set
+// to 2 for wave32 or 3 for wave64, using the layout of the device's major
+// version (10, 11 or 12). For any other version the word is returned as is.
+uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) {
+  const int major = device->Major();
+  assert(major < 13);
+
+  const uint32_t index_stride = wave32 ? 2 : 3;
+
+  switch (major) {
+  case 10: {
+    SQ_BUF_RSRC_WORD3_GFX10 word3;
+    word3.u32All = srd;
+    word3.bits.INDEX_STRIDE = index_stride;
+    return word3.u32All;
+  }
+  case 11: {
+    SQ_BUF_RSRC_WORD3_GFX11 word3;
+    word3.u32All = srd;
+    word3.bits.INDEX_STRIDE = index_stride;
+    return word3.u32All;
+  }
+  case 12: {
+    SQ_BUF_RSRC_WORD3_GFX12 word3;
+    word3.u32All = srd;
+    word3.bits.INDEX_STRIDE = index_stride;
+    return word3.u32All;
+  }
+  default:
+    return srd;
+  }
+}
+
+// Resolves the address used to read the kernel object found at device address
+// `addr`: a blit kernel object known to get_gpu_mem() yields its GPU address;
+// anything else is translated by the loader's host-address query.
+// Returns 0 (after logging) when the query fails.
+uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const {
+  auto blit_mem = get_gpu_mem(reinterpret_cast<void *>(addr));
+  if (blit_mem && blit_mem->IsBlitKernelObject())
+    return blit_mem->GpuAddress();
+
+  uint64_t host_addr = 0;
+  const auto query_status = hsakmt_hsa_ven_amd_loader_query_host_address(
+      reinterpret_cast<const void *>(addr),
+      reinterpret_cast<const void **>(&host_addr));
+  if (query_status != HSA_STATUS_SUCCESS) {
+    pr_err("failed to query host address for kernel object %p, ret=%d\n", (void*)addr, query_status);
+    return 0;
+  }
+
+  return host_addr;
+}
+
+// Wakes the worker thread. The lock is acquired and released immediately
+// before notifying, so the notification is issued only after any worker that
+// held the lock has either released it or started waiting.
+void ComputeQueue::RingDoorbell() {
+  {
+    std::lock_guard<std::mutex> handshake(thread_cond_lock_);
+  }
+  pr_debug("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+           ring, GetRingWptr()->load(), GetRingRptr()->load());
+  thread_cond_.notify_one();
+}
+
+// Initializes the scheduling backend selected by use_hws and, on success,
+// caches the command-buffer start address, the AQL frame size and the
+// platform-atomic capability of the device.
+hsa_status_t ComputeQueue::Init(void) {
+  const hsa_status_t status = use_hws ? HwsInit() : SwsInit();
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+
+  ib_start_addr = cmdbuf_addr;
+  cmdbuf_aql_frame_size = device->GetAqlFrameSize();
+  platform_atomic_support_ = device->SupportPlatformAtomic();
+
+  return status;
+}
+
+// Shuts down whichever scheduling backend this queue was initialized with.
+hsa_status_t ComputeQueue::Fini(void) {
+  if (use_hws)
+    return HwsFini();
+
+  return SwsFini();
+}
+
+// Work done right before a submission: waits for the paging fence and patches
+// the scratch base into the command buffer that is about to be submitted.
+hsa_status_t ComputeQueue::PreSubmit(void) {
+  const bool paging_done = device->WaitPagingFence(this);
+  if (!paging_done)
+    return HSA_STATUS_ERROR;
+
+  RelocateCmdbufScratchBase(ib_start_addr);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Bookkeeping after a submission: remembers the submitted frame index and
+// points the indirect buffer at the next frame slot of the command buffer,
+// with an empty size. Always returns success.
+hsa_status_t ComputeQueue::EndSubmit(void) {
+  // record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry
+  sync_point = cmdbuf_aql_frame_write_index;
+
+  // Frames are laid out as a ring of GetAqlFrameNum() slots.
+  const auto frame_slot = cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum();
+  ib_start_addr = cmdbuf_addr + frame_slot * cmdbuf_aql_frame_size;
+  ib_size = 0;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Submits the indirect buffer built so far: PreSubmit(), the backend-specific
+// submit, then EndSubmit(). Any failing step is reported as HSA_STATUS_ERROR.
+hsa_status_t ComputeQueue::Submit(void) {
+  if (PreSubmit() != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  const hsa_status_t submit_status = use_hws ?
+        HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index) :
+        SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index);
+  if (submit_status != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  if (EndSubmit() != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Translates one AQL kernel-dispatch packet into PM4 commands appended to the
+// current indirect buffer.
+//
+//   cpu    - CPU-visible base of the command buffer being filled; commands are
+//            written starting at cpu + ib_size.
+//   packet - the AQL packet; its header is set to HSA_PACKET_TYPE_INVALID once
+//            the translation succeeded.
+//
+// Returns HSA_STATUS_SUCCESS on success, otherwise:
+//   HSA_STATUS_ERROR_INVALID_ARGUMENT       - a work-group dimension above 1024
+//                                             or more than 128 LDS blocks,
+//   HSA_STATUS_ERROR_INVALID_CODE_OBJECT    - kernel object cannot be resolved,
+//   HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS - setup is 0 or greater than 3,
+//   HSA_STATUS_ERROR_INVALID_ALLOCATION     - group segment exceeds the LDS size,
+//   HSA_STATUS_ERROR_OUT_OF_RESOURCES       - scratch memory could not be
+//                                             provided, or the generated
+//                                             commands exceed one AQL frame.
+hsa_status_t
+ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) {
+  pr_debug("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x "
+           "gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n",
+           ring, packet->header,
+           packet->setup, packet->workgroup_size_x, packet->workgroup_size_y,
+           packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y,
+           packet->grid_size_z, packet->private_segment_size,
+           packet->group_segment_size, packet->kernel_object, packet->kernarg_address,
+           packet->completion_signal.handle);
+
+  if (packet->workgroup_size_x > 1024 ||
+      packet->workgroup_size_y > 1024 ||
+      packet->workgroup_size_z > 1024)
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  int major = device->Major();
+  // `i` is the running write offset into the command buffer.
+  int i = ib_size;
+
+  const amd_kernel_code_t* kernel_object =
+    (const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object);
+  if (kernel_object == NULL) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  // The entry point is expressed in device address space: device address of
+  // the kernel object plus the entry offset read from its descriptor.
+  void* entry = (void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset);
+  assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0);
+
+  pr_debug("kernel object property=%x entry=%p lds=%x+%x\n",
+           kernel_object->kernel_code_properties, entry,
+           kernel_object->workgroup_group_segment_byte_size,
+           packet->group_segment_size);
+
+  if (packet->setup == 0 || packet->setup > 3)
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  if (packet->group_segment_size > device->LdsSize())
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+
+  uint32_t lds_blks = device->LdsBlocks(packet);
+  if (lds_blks > 128)
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  const bool wave32 =
+    AMD_HSA_BITS_GET(kernel_object->kernel_code_properties,
+                     AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32);
+
+  assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size);
+
+  // A kernel that needs scratch must not be launched when the scratch buffer
+  // could not be provided; nothing has been written to the command buffer yet,
+  // so bailing out here leaves the indirect buffer untouched.
+  if (packet->private_segment_size != 0 && !UpdateScratch(packet, wave32)) {
+    pr_err("failed to set up scratch memory for kernel dispatch\n");
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+
+  // Record start timestamp when enabling profiling
+  if (signal && EnableProfiling())
+    i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+  // Build a barrier packet if it is requested
+  const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1;
+  if (is_barrier_packet && needs_barrier)
+    i += cmd_util.BuildBarrier(cpu + i);
+
+  // flush cache
+  i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+  if (major >= 11) {
+    // Remember where the scratch base lives in the command stream so it can be
+    // patched if the scratch buffer is reallocated before submission.
+    AppendCmdbufSratchBaseOffset(
+      i + offsetof(struct SetScratchTemplate, scratch_lo));
+
+    i += cmd_util.BuildScratch(ScratchBase(), cpu + i);
+    i += cmd_util.BuildComputeShaderParams(cpu + i);
+  }
+
+  struct DispatchInfo info;
+  info.major = major;
+  info.pPacket = packet;
+  info.pEntry = entry;
+  info.pKernelObject = kernel_object;
+  info.ldsBlks = lds_blks;
+  info.pAmdQueue = amd_queue_;
+  info.wave32 = wave32;
+  info.srd = UpdateIndexStride(
+    info.pAmdQueue->scratch_resource_descriptor[3], wave32);
+  info.pScratchBase = ScratchBase();
+  info.scratchSizePerWave = ScratchSizePerWave();
+  memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset));
+  info.offsetCnt = 0;
+
+  size_t size;
+  size = cmd_util.BuildDispatch(&info, cpu + i);
+  // Offsets reported by BuildDispatch are relative to the dispatch commands.
+  for (int j = 0; j < info.offsetCnt; j++)
+    AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]);
+  i += size;
+
+  // A following barrier-bit packet needs an explicit barrier only when this
+  // dispatch has no completion signal (the signal path below adds its own).
+  needs_barrier = (packet->completion_signal.handle == 0);
+
+  if (signal) {
+    // wait cs done
+    i += cmd_util.BuildBarrier(cpu + i);
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+    // flush cache
+    i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+    assert(signal->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *signal_addr = (uint64_t *)&signal->value;
+    pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+    // With platform atomics the GPU updates the signal itself; otherwise the
+    // address is only recorded in signal_addr_.
+    if (platform_atomic_support_)
+      i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = signal_addr;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size.
+  // NOTE(review): this check runs after the commands were written, so it
+  // detects an overflow but cannot prevent the out-of-frame writes.
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in KernelDispatch: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Handles one AQL barrier-AND / barrier-OR packet.
+//
+// The dependency signals are resolved on the CPU before any PM4 is written:
+//   - is_or == true : polls the non-null dependency signals, sleeping 20 us
+//                     between rounds, until any of them reads 0. Skipped when
+//                     the packet carries no dependency signal.
+//   - is_or == false: waits on every non-null dependency signal until it
+//                     equals 0 (HSA_WAIT_STATE_BLOCKED, timeout UINT64_MAX).
+// Afterwards the completion-signal handling and the ring_rptr update are
+// appended to the command buffer at `cpu + ib_size`.
+//
+// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_OUT_OF_RESOURCES when the
+// bytes emitted for this packet exceed cmdbuf_aql_frame_size; in that case
+// ib_size, cmdbuf_aql_frame_write_index and the packet header are untouched.
+hsa_status_t
+ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) {
+  pr_debug("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64
+           " %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n",
+           ring, is_or ? "or" : "and",
+           packet->header, packet->dep_signal[0].handle,
+           packet->dep_signal[1].handle, packet->dep_signal[2].handle,
+           packet->dep_signal[3].handle, packet->dep_signal[4].handle,
+           packet->completion_signal.handle);
+  // fix me: can we use gpu packet?
+  if (is_or) {
+    // Despite its name, `unsignaled` stays true while every dependency is
+    // still non-zero and flips to false once one of them reads 0.
+    bool unsignaled = true;
+    hsa_signal_t sig[5];
+    int n = 0;
+    // Compact the non-null dependency handles into sig[0..n).
+    for (int i = 0; i < 5; i++) {
+        if (packet->dep_signal[i].handle)
+          sig[n++] = packet->dep_signal[i];
+    }
+
+    // n is never modified below; the loop only exits through the break.
+    while (n) {
+      for (int i = 0; i < n; i++) {
+        if (!hsakmt_hsa_signal_load_relaxed(sig[i])) {
+          unsignaled = false;
+          break;
+        }
+      }
+      if (!unsignaled)
+        break;
+
+      std::this_thread::sleep_for(std::chrono::microseconds(20));
+    }
+  } else {
+    for (int i = 0; i < 5; i++) {
+      if (!packet->dep_signal[i].handle)
+        continue;
+
+    // `value` is only consumed by the assert below.
+    hsa_signal_value_t value =
+      hsakmt_hsa_signal_wait_relaxed(packet->dep_signal[i], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+    assert(value == 0);
+    }
+  }
+
+  int major = device->Major();
+  // `i` is the running byte offset into the command buffer.
+  int i = ib_size;
+
+  if (packet->completion_signal.handle != 0) {
+    amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+    assert(signal->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *signal_addr = (uint64_t *)&signal->value;
+    pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+    // Record start timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+    if (needs_barrier)
+      i += cmd_util.BuildBarrier(cpu + i);
+
+    needs_barrier = false;
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+    // flush cache
+    i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+    // With platform atomics the GPU adds -1 to the signal value itself;
+    // otherwise the address is stashed in signal_addr_ for later use.
+    if (platform_atomic_support_)
+      i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = signal_addr;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  // NOTE(review): this runs after the commands were already written to `cpu`,
+  // so it reports an overrun rather than preventing it.
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in BarrierGeneric: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // Commit: advance the buffer offset and frame index, then mark the AQL slot
+  // as consumed.
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Handles an AMD vendor-specific AQL packet that wraps a PM4 indirect buffer.
+//
+// When dxg_runtime->vendor_packet_process is set, the referenced PM4 dwords are
+// copied verbatim into the command buffer at `cpu + ib_size`, followed by the
+// completion-signal handling. Otherwise the payload is skipped and the
+// completion signal (if any) is set to 0 from the CPU. In both cases the
+// ring_rptr update is appended.
+//
+// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_OUT_OF_RESOURCES when the
+// packet does not fit into one command-buffer frame; on error ib_size,
+// cmdbuf_aql_frame_write_index and the packet header are left untouched.
+hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) {
+  constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1;
+  assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB);
+
+  uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff;
+  assert(op == IT_INDIRECT_BUFFER);
+  // ib_jump_cmd[1]/[2] hold the low/high halves of the payload address (low two
+  // bits masked off); the low 20 bits of ib_jump_cmd[3] hold its dword count.
+  uint32_t* pm4_addr = reinterpret_cast<uint32_t*>((static_cast<uint64_t>(packet->ib_jump_cmd[2]) << 32) | (static_cast<uint64_t>(packet->ib_jump_cmd[1]) & ~3ull));
+  uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff;
+  pr_debug("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %#x cs=%" PRIx64"\n",
+           ring, dxg_runtime->vendor_packet_process ? "process" : "skip", pm4_addr, pm4_size,
+           packet->completion_signal.handle);
+  // pm4_size is at most 0xfffff, so the narrowing cast is lossless.
+  for (int dw = 0; dw < static_cast<int>(pm4_size); dw++) {
+    pr_debug("pm4_addr[%d]=%#x\n", dw, pm4_addr[dw]);
+  }
+
+  // `i` is the running byte offset into the command buffer.
+  int i = ib_size;
+
+  if (dxg_runtime->vendor_packet_process) {
+    int major = device->Major();
+
+    // Reject oversized payloads BEFORE copying: the size comes from the packet
+    // and the post-emission check below would only notice after the buffer had
+    // already been overrun.
+    const size_t pm4_bytes = static_cast<size_t>(pm4_size) * sizeof(uint32_t);
+    if (pm4_bytes > static_cast<size_t>(cmdbuf_aql_frame_size)) {
+      pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", static_cast<int>(pm4_bytes), cmdbuf_aql_frame_size);
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    memcpy(cpu+i, pm4_addr, pm4_bytes);
+    i += pm4_bytes;
+
+    if (packet->completion_signal.handle != 0) {
+      amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+      assert(signal->kind == AMD_SIGNAL_KIND_USER);
+      uint64_t *signal_addr = (uint64_t *)&signal->value;
+      pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+      // Record start timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+      // A barrier is always emitted here; unlike the barrier packets this path
+      // neither consults nor clears needs_barrier.
+      i += cmd_util.BuildBarrier(cpu + i);
+
+      // Record end timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+      // flush cache
+      i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+      // With platform atomics the GPU adds -1 to the signal value itself;
+      // otherwise the address is stashed in signal_addr_ for later use.
+      if (platform_atomic_support_)
+        i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+      else
+        signal_addr_ = signal_addr;
+    }
+  } else {
+    // Payload is skipped: complete the packet from the CPU side.
+    if (packet->completion_signal.handle != 0) {
+      hsakmt_hsa_signal_store_screlease(packet->completion_signal, 0);
+    }
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size (payload plus trailing commands)
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // Commit: advance the buffer offset and frame index, then mark the AQL slot
+  // as consumed.
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Decodes the AQL packet at cmdbuf_aql_frame_write_index and dispatches it to
+// the matching AQL-to-PM4 translator.
+//
+// On success ready_to_submit is set, except for a kernel dispatch that may
+// still be merged with following packets (see the condition below). Any error
+// returned by a translator is propagated and ready_to_submit is left
+// unchanged. An unknown packet type yields
+// HSA_STATUS_ERROR_INVALID_PACKET_FORMAT.
+hsa_status_t ComputeQueue::SwitchAql2PM4(void) {
+
+  // Each AQL slot is 64 bytes; the first 16-bit word is the packet header.
+  uint16_t *packet = (uint16_t *) ((char *)ring +
+    (cmdbuf_aql_frame_write_index % ring_size) * 64);
+  uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE);
+  header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+  hsa_kernel_dispatch_packet_t *aql_packet =
+    (hsa_kernel_dispatch_packet_t *)packet;
+  hsa_status_t ret;
+
+  switch (header) {
+  case HSA_PACKET_TYPE_KERNEL_DISPATCH:
+    ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // Stop merging packages until one of the conditions below is met:
+    // 1) The kernel with completion signal;
+    // 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf
+    // 3) The queue is empty now, submit the package right now.
+    if (!(aql_packet->completion_signal.handle) &&
+        (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) &&
+        (*sync_addr != sync_point))
+      return HSA_STATUS_SUCCESS;
+
+    break;
+  case HSA_PACKET_TYPE_BARRIER_AND:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet);
+    // A failed translation must not be submitted as if it had succeeded.
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_BARRIER_OR:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
+    ret = VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_INVALID:
+    // When packets are submitted out of order, the format field of current AQL packet
+    // may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and
+    // do not process AQL packets before the packet format field is updated.
+    assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer");
+    return HSA_STATUS_SUCCESS;
+  default:
+    return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+  }
+
+  ready_to_submit = true;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Drains the AQL ring: translates packets one by one while
+// cmdbuf_aql_frame_write_index is behind the ring write pointer and the next
+// packet is valid, submitting the accumulated command buffer whenever
+// SwitchAql2PM4() marks it ready.
+//
+// Returns HSA_STATUS_SUCCESS when the loop ends normally, HSA_STATUS_ERROR when
+// a CPU wait fails, or the status reported by SwitchAql2PM4() / Submit().
+hsa_status_t ComputeQueue::Process(void) {
+
+  while (cmdbuf_aql_frame_write_index < ring_wptr->load() &&
+         !IsInvalidPacket()) {
+    pr_debug("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+
+    hsa_status_t ret;
+
+    // wait for next few cmdbuf slots to be free
+    // If wptr catch up the rptr in the cmdbuf, this needs wait for the rptr to free the cmdbuf.
+    // Here the wptr comes from queue->cmdbuf_aql_frame_write_index, while rptr comes from *queue->sync_addr.
+    if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) {
+      // Wait until the sync object reaches the value that frees one frame.
+      uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1;
+      if (!device->CpuWait(&syncobj, &value, 1, false))
+        return HSA_STATUS_ERROR;
+    }
+
+    ret = SwitchAql2PM4();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // The translator decided to keep batching; pick up the next packet.
+    if (!ready_to_submit)
+      continue;
+
+    ret = Submit();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // CPU wait for GPU fence, and cpu update the signal.
+    // signal_addr_ is only set by the translators when platform atomics are
+    // unavailable.
+    if (!platform_atomic_support_ && signal_addr_) {
+      // CPU wait for GPU fence
+      if (!device->CpuWait(&syncobj, &cmdbuf_aql_frame_write_index, 1, false))
+        return HSA_STATUS_ERROR;
+      //CPU update completional signal
+      atomic::Decrement(signal_addr_);
+      signal_addr_ = NULL;
+    }
+
+    ready_to_submit = false;
+
+    pr_debug("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Worker thread body for an SDMA queue.
+//
+// Sleeps on thread_cond_ until RingDoorbell() queued at least one
+// (start, end) write-pointer range or thread_stop_ was raised. For every range
+// it resolves the leading poll packets on the CPU (waiting on the polled
+// signal and zeroing the packet) and then submits the range through
+// PreparePacket() + Submit(). Ranges still queued when thread_stop_ is
+// observed are not processed.
+void SDMAQueue::SdmaThread(SDMAQueue *queue) {
+
+  while (true) {
+    // Take the whole pending list in one go so the lock is not held while
+    // waiting on signals or submitting.
+    decltype(queue->wptr_queue_) pendings;
+    {
+      std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
+      while (queue->wptr_queue_.empty() && !queue->thread_stop_)
+        queue->thread_cond_.wait(lock);
+
+      if (queue->thread_stop_)
+        break;
+
+      pendings.swap(queue->wptr_queue_);
+    }
+
+    for (const auto [start, end] : pendings) {
+      pr_debug("wptr %lx %lx\n", start, end);
+
+      SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start));
+      SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1;
+      // NOTE(review): the walk is bounded only by IsPollPacket(), not by `end`
+      // - confirm that a non-poll packet always follows.
+      while (queue->IsPollPacket(poll_pkt)) {
+        uint64_t poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 |
+                             (uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+        uint64_t poll_val = poll_pkt->VALUE_UNION.value;
+        // Number of poll packets consumed by this iteration.
+        uint32_t skip = 1;
+
+        // Two consecutive poll packets whose addresses are 4 bytes apart
+        // (second one lower) are folded into a single 64-bit wait: the second
+        // packet supplies the low 32 bits, the first the high 32 bits.
+        if (queue->IsPollPacket(poll_next_pkt)) {
+          uint64_t poll_next_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 |
+                             (uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+          if (poll_next_addr + sizeof(uint32_t) == poll_addr) {
+            poll_addr = poll_next_addr;
+            poll_val = poll_next_pkt->VALUE_UNION.value |
+                            (uint64_t)poll_pkt->VALUE_UNION.value << 32;
+            skip = 2;
+          }
+        }
+
+        // The polled address is treated as the `value` field of an
+        // amd_signal_t; step back to the start of that structure to obtain the
+        // signal handle.
+        amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value));
+        uint64_t signal_handle = reinterpret_cast<uint64_t>(signal);
+        pr_debug("poll signal %#lx addr %#lx val %ld\n", signal_handle, poll_addr, poll_val);
+        hsa_signal_t hsa_signal = {signal_handle};
+        // `value` is only consumed by the assert below.
+        hsa_signal_value_t value =
+          hsakmt_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+        assert(value == poll_val);
+
+        // The wait was satisfied on the CPU: blank the consumed poll packet(s)
+        // in the command buffer.
+        memset(poll_pkt, 0, skip * sizeof(*poll_pkt));
+        poll_pkt += skip;
+        poll_next_pkt += skip;
+      }
+      queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start);
+      // Release fence placed between the buffer edits above and the submission.
+      std::atomic_thread_fence(std::memory_order_release);
+      // NOTE(review): the status returned by Submit() is discarded here.
+      queue->Submit();
+    }
+  }
+  pr_debug("sdma thread exit\n");
+}
+
+// Constructs an SDMA queue on `device` over the command buffer `ring` of
+// `cmdbuf_size` bytes, creates the device-side queue and starts the worker
+// thread running SdmaThread().
+//
+// NOTE(review): a CreateQueue() failure is only caught by the assert, which is
+// compiled out when NDEBUG is defined.
+SDMAQueue::SDMAQueue(WDDMDevice *device,
+          void *ring,
+          uint64_t cmdbuf_size,
+          uint32_t engine,
+          bool use_hws) :
+          WDDMQueue(device, reinterpret_cast<uint64_t>(ring), cmdbuf_size, engine, use_hws),
+          wptr_next_(0),
+          wptr_pre_(0),
+          rptr_next(0),
+          thread_stop_(false),
+          ib_size(0),
+          ib_start_addr(0) {
+  bool ret = device->CreateQueue(this);
+  assert(ret);
+
+  // Started last, after the device queue was created.
+  thread_ = std::thread(SdmaThread, this);
+}
+
+// Stops and joins the worker thread, then destroys the device-side queue.
+SDMAQueue::~SDMAQueue() {
+  {
+    // Raise the stop flag under the same mutex the worker waits with, so the
+    // request cannot slip in between its predicate check and its wait.
+    std::lock_guard<std::mutex> guard(thread_cond_lock_);
+    thread_stop_ = true;
+  }
+  thread_cond_.notify_one();
+  thread_.join();
+
+  device->DestroyQueue(this);
+}
+
+// Queues the write-pointer range [wptr_pre_, wptr_next_) for the worker thread
+// and wakes it up, then advances wptr_pre_ to wptr_next_.
+void SDMAQueue::RingDoorbell() {
+  pr_debug("ringdoorbell %#lx %#lx\n", wptr_pre_, wptr_next_);
+  {
+    // Scoped guard instead of manual lock()/unlock(): if emplace_back throws
+    // (allocation failure) the mutex is still released, so neither the worker
+    // nor the destructor can block on it forever.
+    std::lock_guard<std::mutex> guard(thread_cond_lock_);
+
+    wptr_queue_.emplace_back(wptr_pre_, wptr_next_);
+    thread_cond_.notify_one();
+  }
+  wptr_pre_ = wptr_next_;
+}
+
+// Initialises the queue through the HWS or SWS backend (selected by use_hws)
+// and, on success, zero-fills the whole command buffer.
+// Returns the backend status unchanged.
+hsa_status_t SDMAQueue::Init(void) {
+  const hsa_status_t status = use_hws ? HwsInit() : SwsInit();
+
+  // Only touch the command buffer once the backend is up.
+  if (status == HSA_STATUS_SUCCESS)
+    std::memset((char *)cmdbuf_addr, 0, cmdbuf_size);
+
+  return status;
+}
+
+// Tears the queue down through the backend selected by use_hws and returns
+// that backend's status.
+hsa_status_t SDMAQueue::Fini(void) {
+  if (use_hws)
+    return HwsFini();
+
+  return SwsFini();
+}
+
+// Records the next submission window: the indirect buffer starts `offset`
+// bytes into the command buffer and spans `size` bytes. rptr_next is advanced
+// by the stored ib_size. Always returns STATUS_SUCCESS.
+int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) {
+  ib_start_addr = cmdbuf_addr + offset;
+  ib_size = size;
+  // Deliberately uses the stored member rather than `size`.
+  rptr_next += ib_size;
+
+  return STATUS_SUCCESS;
+}
+
+// Submits the window recorded by PreparePacket() through the HWS or SWS
+// backend once the paging fence has been waited for.
+// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if the fence wait or the
+// backend submission fails.
+hsa_status_t SDMAQueue::Submit(void) {
+  if (!device->WaitPagingFence(this)) {
+    return HSA_STATUS_ERROR;
+  }
+
+  // Any non-zero backend result is collapsed into the generic error status.
+  const int rc = use_hws ?
+                 HwsSubmit(ib_start_addr, ib_size, rptr_next) :
+                 SwsSubmit(ib_start_addr, ib_size, rptr_next);
+
+  return rc ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
new file mode 100644
index 0000000000..4ea93c70f2
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
@@ -0,0 +1,165 @@
+#include <cassert>
+#include <map>
+#include <algorithm>
+#include "impl/wddm/va_mgr.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Creates a manager for the address range [start, start + size).
+// The whole range starts out as a single free fragment, registered both in
+// free_list_ (keyed by size) and in frag_map_ (keyed by base address).
+// `min_align` is stored and later used by AllocImpl().
+VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align) {
+  min_align_ = min_align;
+  auto free_it = free_list_.insert(make_pair(size, start));
+  frag_map_[start] = make_fragment(free_it, size);
+}
+
+// Warns when the manager is destroyed while the bookkeeping has not collapsed
+// back to a single fragment, then drops all bookkeeping.
+VaMgr::~VaMgr() {
+
+  // size() yields size_t, so the matching conversion is %zu rather than %ld.
+  if (free_list_.size() != 1)
+    pr_warn("free_list_ size:%zu which should be 1.\n", free_list_.size());
+  if (frag_map_.size() != 1)
+    pr_warn("frag_map_ size:%zu which should be 1.\n", frag_map_.size());
+
+  free_list_.clear();
+  frag_map_.clear();
+}
+
+// Allocates `bytes` of address space.
+//
+// When `addr` is non-zero and satisfies `align` (align == 0 means "no
+// constraint"), the range [addr, addr + bytes) is claimed if it lies entirely
+// inside one free fragment. In every other case the request falls back to
+// AllocImpl(bytes, align), which picks the address itself.
+//
+// Returns the allocated base address (`addr` for a successful fixed
+// allocation, otherwise whatever AllocImpl() returns).
+uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
+
+  if (addr > 0 &&
+      (align == 0 || (addr % align) == 0)) {
+
+    // Scoped to this block: AllocImpl() below takes lock_ itself.
+    lock_guard<mutex> gard(lock_);
+
+    // Fragments are keyed by base address, so the only fragment that can
+    // contain `addr` is the one with the greatest base <= addr. This includes
+    // the very first fragment of the map.
+    auto frag_it = frag_map_.upper_bound(addr);
+    if (frag_it != frag_map_.begin()) {
+      --frag_it;
+
+      const uint64_t base = frag_it->first;
+      const uint64_t size = frag_it->second.size;
+
+      // addr >= base holds by construction; the subtraction form keeps the
+      // range test free of unsigned overflow.
+      if (is_free(frag_it->second) && bytes <= size &&
+          addr - base <= size - bytes) {
+        auto free_it = frag_it->second.free_list_entry_;
+        assert(free_it != free_list_.end());
+
+        free_list_.erase(free_it);
+
+        if (addr == base) {
+          // The allocation starts at the fragment base: shrink in place.
+          frag_it->second.size = bytes;
+          set_used(frag_it->second);
+        } else {
+          // Split in front, same sequence AllocImpl() uses for an aligned
+          // carve-out: record the used part at `addr`, then turn the entry at
+          // `base` into the free head [base, addr).
+          add_used_fragment(bytes, addr);
+          add_free_fragment(addr - base, base);
+        }
+
+        // Free tail [addr + bytes, base + size), if any.
+        if (base + size > addr + bytes)
+          add_free_fragment(base + size - addr - bytes, addr + bytes);
+
+        return addr;
+      }
+    }
+  }
+
+  // Allocate not fixed address
+  return AllocImpl(bytes, align);
+}
+
+// Allocates `bytes` at an address chosen by the manager.
+//
+// Pass 0 looks up the smallest free fragment that holds `bytes`. If its base
+// already satisfies `align` it is used directly. Otherwise, when an alignment
+// was requested, pass 1 repeats the lookup with the request enlarged by the
+// effective alignment so that an aligned sub-range is guaranteed to fit, and
+// carves the allocation out of the middle of that fragment.
+//
+// Returns the allocated base address, or 0 when no free fragment is large
+// enough.
+uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) {
+  uint64_t addr = 0;
+  uint64_t align_bytes = bytes;
+  // With align == 0 every base is acceptable, so one pass is enough.
+  const int retry = align == 0 ? 0 : 1;
+  // NOTE(review): new_align only enlarges the search size; the alignment test
+  // and the aligned base below use the raw `align` - confirm this is intended.
+  const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_);
+
+  lock_guard<mutex> gard(lock_);
+  for (int i = 0; i <= retry; i++) {
+    // free_list_ is keyed by fragment size: first entry with size >= request.
+    auto free_it = free_list_.lower_bound(align_bytes);
+    if (free_it == free_list_.end()) break;
+
+    uint64_t base = free_it->second;
+    uint64_t size = free_it->first;
+
+    assert(size >= align_bytes);
+
+    auto fragment = frag_map_.find(base);
+
+    assert(fragment != frag_map_.end());
+    assert(size == fragment->second.size);
+
+    // Distance of `base` past the previous multiple of `align`.
+    uint64_t delta = align == 0 ? 0 : base % align;
+    if (delta == 0) {
+      // already find aligned address
+      addr = base;
+
+      free_list_.erase(free_it);
+      fragment->second.size = bytes;
+      set_used(fragment->second);
+
+      // Return the unused tail to the free list.
+      if (size > bytes) add_free_fragment(size - bytes, base + bytes);
+
+      break;
+    } else if (i == 0) {
+      // Misaligned on the first pass: retry with room for alignment padding.
+      align_bytes += new_align;
+      continue;
+    } else {
+      // Second pass: split into free head, used middle and optional free tail.
+      uint64_t aligned_base = base + align - delta;
+      addr = aligned_base;
+
+      free_list_.erase(free_it);
+
+      add_used_fragment(bytes, aligned_base);
+      // Re-registers the entry at `base` as the free head [base, aligned_base).
+      add_free_fragment(aligned_base - base, base);
+
+      if (size > aligned_base - base + bytes)
+        add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes);
+
+      break;
+    }
+  }
+  return addr;
+}
+
+// Releases the allocation that starts at `addr` and coalesces it with free
+// neighbours. A zero address, an unknown address or an already-free fragment
+// is ignored.
+void VaMgr::Free(uint64_t addr) {
+  if (addr == 0) return;
+
+  lock_guard<mutex> gard(lock_);
+
+  auto current = frag_map_.find(addr);
+  if (current == frag_map_.end()) return;
+  if (is_free(current->second)) return;
+
+  // Base of the free fragment that results after merging.
+  uint64_t merged_base = addr;
+
+  // Absorb into the fragment below when that one is free.
+  if (current != frag_map_.begin()) {
+    auto below = current;
+    --below;
+    if (is_free(below->second)) {
+      remove_free_list_entry(below->second);
+      merged_base -= below->second.size;
+      below->second.size += current->second.size;
+      frag_map_.erase(current);
+      current = below;
+    }
+  }
+
+  // Swallow the fragment above when that one is free.
+  auto above = current;
+  ++above;
+  if (above != frag_map_.end() && is_free(above->second)) {
+    remove_free_list_entry(above->second);
+    current->second.size += above->second.size;
+    frag_map_.erase(above);
+  }
+
+  // Register the merged fragment in the size-ordered free list.
+  uint64_t merged_size = current->second.size;
+  auto entry = free_list_.insert(make_pair(merged_size, merged_base));
+  set_free(current->second, entry);
+}
+
+} // namespace thunk
+} // namespace wsl