rocr/hsakmt/wsl Move WSL under ROCR hsakmt. (#2638)

## Motivation ROCR on Windows uses the WSL implementation as the codebase. We want to make sure Windows changes can continue to work with WSL and share the same core implementation. Hence, it's easier to maintain the code under the same rocm-system infrastructure and automate all builds/tests in the future. ## Technical Details The new files are a copy of https://github.com/ROCm/librocdxg/ with preserved history. Native Windows support and clean-ups will be added in the following check-ins. The same command lines can be used to build WSL under the libhsakmt folder for now. ``` # Set the Windows SDK path (adjust version number if different) export win_sdk='/mnt/c/Program Files (x86)/Windows Kits/10/Include/10.0.26100.0/' # Build the library mkdir -p build cd build cmake .. -DWIN_SDK="${win_sdk}/shared" make sudo make install ``` ## JIRA ID SWDEV-558849 ## Test Plan N/A ## Test Result N/A ## Submission Checklist
2026-01-21 20:00:33 -05:00
@@ -25,6 +25,9 @@

 cmake_minimum_required ( VERSION 3.6.3 )

+if (WIN_SDK)
+  include(${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists_wsl.txt)
+else ()
 set(CMAKE_VERBOSE_MAKEFILE ON)

 set ( HSAKMT "hsakmt" )
@@ -319,3 +322,4 @@ endif()
 ###########################
 # Use component packaging
 set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
+endif()
@@ -0,0 +1,309 @@
+################################################################################
+##
+## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required ( VERSION 3.15 )
+
+## Ask Makefile generators to echo the full command lines during the build.
+set ( CMAKE_VERBOSE_MAKEFILE ON )
+
+## Names and version used throughout this file.
+set ( ROCDXG_VERSION "1.1.0" )
+set ( ROCDXG "rocdxg" )
+set ( ROCDXG_TARGET "${ROCDXG}" )
+set ( ROCDXG_COMPONENT "lib${ROCDXG}" )
+set ( ROCDXG_PACKAGE "rocdxg-roct" )
+
+## The project is named after the library target and carries its version.
+project ( ${ROCDXG_TARGET} VERSION ${ROCDXG_VERSION} )
+
+## Optional ccache support: when ROCM_CCACHE_BUILD is ON and a ccache
+## executable is found, every compile command is launched through it.
+set ( ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build" )
+if ( ROCM_CCACHE_BUILD )
+  find_program ( CCACHE_PROGRAM ccache )
+  if ( NOT CCACHE_PROGRAM )
+    message ( WARNING "Unable to find ccache. Falling back to real compiler" )
+  else ()
+    set_property ( GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM} )
+  endif ()
+endif ()
+
+## Put the project's own helper modules (cmake_modules/) first on the module
+## search path.
+list( PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" )
+
+## Include common cmake modules
+include ( utils )
+include ( GNUInstallDirs )
+
+## Setup the package version.
+## NOTE(review): get_version() is not defined in this file -- presumably it is
+## provided by the utils module and sets VERSION_MAJOR/MINOR/PATCH (and, when
+## available, VERSION_BUILD), which are read below.  Confirm against
+## cmake_modules/utils.cmake.
+get_version ( "${ROCDXG_VERSION}" )
+
+## BUILD_VERSION_* describe the package; LIB_VERSION_* describe the shared
+## object (VERSION / SOVERSION properties).  Both start from the same values.
+set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
+set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
+set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
+
+set ( LIB_VERSION_MAJOR ${VERSION_MAJOR})
+set ( LIB_VERSION_MINOR ${VERSION_MINOR})
+set ( LIB_VERSION_PATCH ${VERSION_PATCH})
+
+set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" )
+
+## Append the build identifier to the package patch level when one is set.
+## The expansion is quoted so that a defined-but-empty VERSION_BUILD compares
+## as an empty string instead of disappearing from the if() argument list
+## (which would make the condition malformed).
+if ( DEFINED VERSION_BUILD AND NOT "${VERSION_BUILD}" STREQUAL "" )
+    message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
+    set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
+endif ()
+set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
+
+## Compiler flags.  ROCDXG_CXX_FLAGS is a CMake list that is later handed to
+## target_compile_options(); librocdxg.h is force-included into every
+## translation unit through -include.
+set (ROCDXG_CXX_FLAGS -fPIC -include ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.h)
+
+if ( CMAKE_COMPILER_IS_GNUCC )
+    list ( APPEND ROCDXG_CXX_FLAGS -Wlogical-op )
+endif ()
+## Test the variable by name rather than by expansion: an unset ROCDXG_WERROR
+## is then simply false, and its value is never re-read as a variable name.
+if ( ROCDXG_WERROR )
+    list ( APPEND ROCDXG_CXX_FLAGS -Werror )
+endif ()
+## Release builds are optimized; every other build type gets debug info.
+if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
+    list ( APPEND ROCDXG_CXX_FLAGS -O2 )
+else ()
+    list ( APPEND ROCDXG_CXX_FLAGS -g )
+endif ()
+
+## Symbol version script passed to the linker below
+set ( ROCDXG_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.ver" )
+
+## Linker Flags (a single space-separated string, stored in LINK_FLAGS later)
+## Add --enable-new-dtags to generate DT_RUNPATH
+set (ROCDXG_LINK_FLAGS "${ROCDXG_LINK_FLAGS} -Wl,--enable-new-dtags -Wl,--version-script=${ROCDXG_LINKER_SCRIPT} -Wl,-soname=${ROCDXG_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete")
+
+## Linker undefined symbol handling
+if ( CMAKE_COMPILER_IS_GNUCC )
+    string ( APPEND ROCDXG_LINK_FLAGS " -Wl,-no-undefined" )
+else ()
+    string ( APPEND ROCDXG_LINK_FLAGS " -Wl,-undefined,error" )
+endif ()
+
+## Sources compiled into the library (paths relative to this directory)
+set ( ROCDXG_SRC
+  src/dxg/debug.cpp
+  src/dxg/events.cpp
+  src/dxg/memory.cpp
+  src/dxg/libdrm.cpp
+  src/dxg/hsa.cpp
+  src/dxg/openclose.cpp
+  src/dxg/perfctr.cpp
+  src/dxg/queues.cpp
+  src/dxg/time.cpp
+  src/dxg/topology.cpp
+  src/dxg/spm.cpp
+  src/dxg/version.cpp
+  src/dxg/svm.cpp
+  src/dxg/pc_sampling.cpp
+  src/dxg/hsakmtmodel.cpp
+  src/dxg/dxcore_loader.cpp
+  src/dxg/ais.cpp
+  src/dxg/wddm/device.cpp
+  src/dxg/wddm/gpu_memory.cpp
+  src/dxg/wddm/va_mgr.cpp
+  src/dxg/wddm/queue.cpp
+  src/dxg/wddm/cmd_util.cpp
+)
+
+## Create the shared library target, then attach the sources to it
+add_library ( ${ROCDXG_TARGET} SHARED )
+target_sources ( ${ROCDXG_TARGET} PRIVATE ${ROCDXG_SRC} )
+
+## Add headers.  The public headers need to point at their location in both build and install
+## directory layouts.  This declaration allows publishing library use data to downstream clients.
+## WIN_SDK (the Windows SDK header directory supplied with -DWIN_SDK=...) and src/dxg
+## are only needed to build the library itself, so they stay PRIVATE.
+target_include_directories( ${ROCDXG_TARGET}
+  PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PRIVATE
+  "${WIN_SDK}"
+  "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg" )
+
+## Platform macros plus the library version string.  They are attached to the
+## target instead of the directory so they cannot leak into any other target
+## defined in the same directory scope.
+target_compile_definitions( ${ROCDXG_TARGET}
+  PRIVATE
+  LINUX
+  __AMD64__
+  LITTLEENDIAN_CPU
+  HSA_LARGE_MODEL
+  ROCDXG_VERSION="${ROCDXG_VERSION}" )
+
+## thunk_proxy is searched for in src/dxg/thunk_proxy inside the source tree.
+target_link_directories( ${ROCDXG_TARGET} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/thunk_proxy" )
+target_link_libraries( ${ROCDXG_TARGET} PRIVATE thunk_proxy )
+
+## Quoted so the whole flag string is stored as one property value.
+set_property( TARGET ${ROCDXG_TARGET} PROPERTY LINK_FLAGS "${ROCDXG_LINK_FLAGS}" )
+
+## Set the VERSION and SOVERSION values
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" )
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" )
+
+find_package(PkgConfig)
+# get OS-info for OS-specific build dependencies
+# NOTE(review): get_os_info() is not defined in this file -- presumably it
+# comes from the utils module; confirm.
+get_os_info()
+# Check for libraries required for building.
+# find_library() only recognizes the REQUIRED keyword from CMake 3.18 on, while
+# this file accepts CMake 3.15, so the check is spelled out to fail reliably on
+# every supported version.
+find_library(LIBC NAMES c)
+if (NOT LIBC)
+  message(FATAL_ERROR "Required library 'c' (libc) was not found")
+endif()
+message(STATUS "LIBC:" ${LIBC})
+
+# System libraries the shared object links against
+target_link_libraries ( ${ROCDXG_TARGET}
+  PRIVATE pthread rt c ${CMAKE_DL_LIBS}
+)
+
+# Apply the compiler flags collected in ROCDXG_CXX_FLAGS
+target_compile_options(${ROCDXG_TARGET} PRIVATE ${ROCDXG_CXX_FLAGS})
+
+## Installation defaults: when no install prefix was chosen, use /opt/rocm.
+## The resulting prefix is written back to the cache and is also the default
+## CPack packaging prefix.
+if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
+  set ( CMAKE_INSTALL_PREFIX "/opt/rocm" )
+endif ()
+set ( CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}
+      CACHE STRING "Default installation directory." FORCE )
+set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"
+      CACHE STRING "Default packaging prefix." )
+set ( CPACK_GENERATOR "DEB"
+      CACHE STRING "Default packaging generators." )
+
+## Install the library and record it in the ${ROCDXG_TARGET}Targets export set.
+install ( TARGETS ${ROCDXG_TARGET}
+  EXPORT ${ROCDXG_TARGET}Targets
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary )
+
+## Public header installation is currently disabled:
+#install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${ROCDXG_TARGET} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+#  COMPONENT dev PATTERN "*drm*" EXCLUDE )
+
+## Install the export set so that clients using find_package() can import
+## the target under the ${ROCDXG_TARGET}:: namespace.
+install ( EXPORT ${ROCDXG_TARGET}Targets
+  FILE ${ROCDXG_TARGET}Targets.cmake
+  NAMESPACE ${ROCDXG_TARGET}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+  COMPONENT dev )
+
+## Namespaced alias rocdxg::rocdxg for the library target.
+## Not needed today; it is harmless preparation for a future in which this
+## project might be pulled in through add_subdirectory() by another project.
+## With the alias, consumers can use target_link_libraries() with the same
+## target name whether the library is external (find_package) or part of
+## their own build tree.
+add_library ( ${ROCDXG_TARGET}::${ROCDXG_TARGET} ALIAS ${ROCDXG_TARGET} )
+
+# Create cmake configuration files
+include(CMakePackageConfigHelpers)
+
+# Generate <target>-config.cmake from its .in template
+configure_package_config_file(${ROCDXG_TARGET}-config.cmake.in
+                            ${ROCDXG_TARGET}-config.cmake
+                            INSTALL_DESTINATION
+                            ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET} )
+
+# Generate the matching version file; any installed version that is the same
+# as or newer than the requested one is reported as compatible.
+write_basic_package_version_file(${ROCDXG_TARGET}-config-version.cmake
+                 VERSION ${BUILD_VERSION_STRING}
+                 COMPATIBILITY
+                 AnyNewerVersion)
+
+# Install both generated files next to the exported targets file
+install(FILES
+        ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config-version.cmake
+        DESTINATION
+        ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+        COMPONENT dev )
+
+# Optionally record the package's find module in the user's package cache.
+if ( NOT DEFINED EXPORT_TO_USER_PACKAGE_REGISTRY )
+  set ( EXPORT_TO_USER_PACKAGE_REGISTRY "off" )
+endif()
+set ( EXPORT_TO_USER_PACKAGE_REGISTRY ${EXPORT_TO_USER_PACKAGE_REGISTRY}
+             CACHE BOOL "Add cmake package config location to the user's cmake package registry.")
+# Test the option by name rather than by expansion, so that an empty value is
+# simply false and the value is never re-read as another variable's name.
+if(EXPORT_TO_USER_PACKAGE_REGISTRY)
+  # Enable writing to the registry
+  set(CMAKE_EXPORT_PACKAGE_REGISTRY ON)
+  # Generate a target file for the build
+  export(TARGETS ${ROCDXG_TARGET} NAMESPACE ${ROCDXG_TARGET}:: FILE ${ROCDXG_TARGET}Targets.cmake)
+  # Record the package in the user's cache.
+  export(PACKAGE ${ROCDXG_TARGET})
+endif()
+
+# pkg-config metadata.  librocdxg.pc is installed into the same pkgconfig
+# directory as libhsakmt.pc, so its includedir can use libhsakmt's header
+# path directly; librocdxg then refers to the same headers as libhsakmt and
+# no second copy of them has to be installed.
+configure_file (
+  "${CMAKE_CURRENT_SOURCE_DIR}/librocdxg.pc.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/librocdxg.pc"
+  @ONLY )
+
+install (
+  FILES ${CMAKE_CURRENT_BINARY_DIR}/librocdxg.pc
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+  COMPONENT dev )
+
+# Run ldconfig as part of installing the "binary" component.
+install ( CODE "execute_process(COMMAND ldconfig)" COMPONENT binary )
+
+###########################
+# Packaging directives
+###########################
+
+# Component packaging: the install components ("binary" and "dev") are
+# packaged separately instead of being merged into one package.
+set ( CPACK_COMPONENTS_GROUPING IGNORE )
+set ( CPACK_DEB_COMPONENT_INSTALL ON )
+
+# General package metadata
+set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." )
+set ( CPACK_PACKAGE_CONTACT "AMD GFX mailing list <amd-gfx@lists.freedesktop.org>" )
+set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
+set ( CPACK_COMPONENT_DESCRIPTION "ROCDXG development package.\n This package includes the user-mode API interfaces\nused to interact with the ROCm driver.\n This package contains the libraries and cmake files for the ROCDXG package." )
+set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig." )
+
+# Package version, assembled from the major/minor/patch version components
+set ( CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR} )
+set ( CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR} )
+set ( CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH} )
+set ( PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" )
+set ( CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}" )
+
+# The license file is shipped with the runtime ("binary") component
+install ( FILES ${CPACK_RESOURCE_FILE_LICENSE}
+  DESTINATION ${CMAKE_INSTALL_DOCDIR}
+  COMPONENT binary )
+
+# Debian package names and descriptions: one default name, plus one
+# name/description pair for each of the two components.
+set ( CPACK_DEBIAN_PACKAGE_NAME "rocdxg-roct" )
+set ( CPACK_DEBIAN_BINARY_PACKAGE_NAME "rocdxg-roct" )
+set ( CPACK_DEBIAN_BINARY_PACKAGE_DESCRIPTION "ROCDXG runtime package containing libraries" )
+set ( CPACK_DEBIAN_DEV_PACKAGE_NAME "rocdxg-roct-dev" )
+set ( CPACK_DEBIAN_DEV_PACKAGE_DESCRIPTION "ROCDXG development package containing pkgconfig and cmake files" )
+
+# Generate the Debian maintainer scripts from their templates and register
+# them as extra control files.
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY )
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY )
+set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "DEBIAN/postinst;DEBIAN/prerm" )
+
+# Package dependencies.  The dev package requires the runtime package of
+# exactly the same version.
+set ( CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core" )
+set ( CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "rocm-core" )
+set ( CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "rocdxg-roct (= ${PACKAGE_VERSION_STR}), rocm-core" )
+
+# Let CPack pick the default Debian file names
+set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" )
+
+# Unless -DROCM_DEP_ROCMCORE=ON is given, strip the rocm-core entry (and the
+# separator in front of it) from every dependency list.
+if ( NOT ROCM_DEP_ROCMCORE )
+  foreach ( dep_var IN ITEMS
+            CPACK_DEBIAN_PACKAGE_DEPENDS
+            CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS
+            CPACK_DEBIAN_DEV_PACKAGE_DEPENDS )
+    string ( REGEX REPLACE ",? ?rocm-core" "" ${dep_var} "${${dep_var}}" )
+  endforeach ()
+endif ()
+
+include ( CPack )
+
+# Describe the two components to CPack
+cpack_add_component ( binary
+  DISPLAY_NAME "Runtime"
+  DESCRIPTION "ROCDXG runtime libraries" )
+
+cpack_add_component ( dev
+  DISPLAY_NAME "Development"
+  DESCRIPTION "ROCDXG development files (pkgconfig and cmake)" )
@@ -0,0 +1,983 @@
+/**
+ * \file xf86drm.h 
+ * OS-independent header for DRM user-level library interface.
+ *
+ * \author Rickard E. (Rik) Faith <faith@valinux.com>
+ */
+ 
+/*
+ * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _XF86DRM_H_
+#define _XF86DRM_H_
+
+#include <stdarg.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef DRM_MAX_MINOR
+#define DRM_MAX_MINOR   64 /* deprecated */
+#endif
+
+#if defined(__linux__)
+
+/* On Linux the DRM ioctl helpers map directly onto the _IOC* macros. */
+#define DRM_IOCTL_NR(n)		_IOC_NR(n)
+#define DRM_IOC_VOID		_IOC_NONE
+#define DRM_IOC_READ		_IOC_READ
+#define DRM_IOC_WRITE		_IOC_WRITE
+/* Parenthesized so the OR stays a single operand when the macro is used
+ * inside a larger expression (operators such as & and == bind tighter
+ * than |). */
+#define DRM_IOC_READWRITE	(_IOC_READ|_IOC_WRITE)
+#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size)
+
+#else /* One of the *BSDs */
+
+/* Elsewhere the direction flags come from <sys/ioccom.h>. */
+#include <sys/ioccom.h>
+#define DRM_IOCTL_NR(n)         ((n) & 0xff)
+#define DRM_IOC_VOID            IOC_VOID
+#define DRM_IOC_READ            IOC_OUT
+#define DRM_IOC_WRITE           IOC_IN
+#define DRM_IOC_READWRITE       IOC_INOUT
+#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size)
+
+#endif
+
+				/* Defaults, if nothing set in xf86config */
+#define DRM_DEV_UID	 0
+#define DRM_DEV_GID	 0
+/* Default /dev/dri directory permissions 0755 */
+#define DRM_DEV_DIRMODE	 	\
+	(S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
+#define DRM_DEV_MODE	 (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
+
+#ifdef __OpenBSD__
+#define DRM_DIR_NAME  "/dev"
+#define DRM_PRIMARY_MINOR_NAME  "drm"
+#define DRM_CONTROL_MINOR_NAME  "drmC" /* deprecated */
+#define DRM_RENDER_MINOR_NAME   "drmR"
+#else
+#define DRM_DIR_NAME  "/dev/dri"
+#define DRM_PRIMARY_MINOR_NAME  "card"
+#define DRM_CONTROL_MINOR_NAME  "controlD" /* deprecated */
+#define DRM_RENDER_MINOR_NAME   "renderD"
+#define DRM_PROC_NAME "/proc/dri/" /* For backward Linux compatibility */
+#endif
+
+#define DRM_DEV_NAME          "%s/" DRM_PRIMARY_MINOR_NAME "%d"
+#define DRM_CONTROL_DEV_NAME  "%s/" DRM_CONTROL_MINOR_NAME "%d" /* deprecated */
+#define DRM_RENDER_DEV_NAME   "%s/" DRM_RENDER_MINOR_NAME  "%d"
+
+#define DRM_NODE_NAME_MAX \
+    (sizeof(DRM_DIR_NAME) + 1 /* slash */ \
+     + MAX3(sizeof(DRM_PRIMARY_MINOR_NAME), \
+            sizeof(DRM_CONTROL_MINOR_NAME), \
+            sizeof(DRM_RENDER_MINOR_NAME)) \
+     + sizeof("1048575") /* highest possible node number 2^MINORBITS - 1 */ \
+     + 1) /* NULL-terminator */
+
+#define DRM_ERR_NO_DEVICE  (-1001)
+#define DRM_ERR_NO_ACCESS  (-1002)
+#define DRM_ERR_NOT_ROOT   (-1003)
+#define DRM_ERR_INVALID    (-1004)
+#define DRM_ERR_NO_FD      (-1005)
+
+#define DRM_AGP_NO_HANDLE 0
+
+typedef unsigned int  drmSize,     *drmSizePtr;	    /**< For mapped regions */
+typedef void          *drmAddress, **drmAddressPtr; /**< For mapped regions */
+
+#if (__GNUC__ >= 3)
+#define DRM_PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+#else
+#define DRM_PRINTFLIKE(f, a)
+#endif
+
+typedef struct _drmServerInfo {
+  int (*debug_print)(const char *format, va_list ap) DRM_PRINTFLIKE(1,0);
+  int (*load_module)(const char *name);
+  void (*get_perms)(gid_t *, mode_t *);
+} drmServerInfo, *drmServerInfoPtr;
+
+typedef struct drmHashEntry {
+    int      fd;
+    void     (*f)(int, void *, void *);
+    void     *tagTable;
+} drmHashEntry;
+
+extern int drmIoctl(int fd, unsigned long request, void *arg);
+extern void *drmGetHashTable(void);
+extern drmHashEntry *drmGetEntry(int fd);
+
+/**
+ * Driver version information.
+ *
+ * \sa drmGetVersion() and drmSetVersion().
+ */
+typedef struct _drmVersion {
+    int     version_major;        /**< Major version */
+    int     version_minor;        /**< Minor version */
+    int     version_patchlevel;   /**< Patch level */
+    int     name_len; 	          /**< Length of name buffer */
+    char    *name;	          /**< Name of driver */
+    int     date_len;             /**< Length of date buffer */
+    char    *date;                /**< User-space buffer to hold date */
+    int     desc_len;	          /**< Length of desc buffer */
+    char    *desc;                /**< User-space buffer to hold desc */
+} drmVersion, *drmVersionPtr;
+
+typedef struct _drmStats {
+    unsigned long count;	     /**< Number of data */
+    struct {
+	unsigned long value;	     /**< Value from kernel */
+	const char    *long_format;  /**< Suggested format for long_name */
+	const char    *long_name;    /**< Long name for value */
+	const char    *rate_format;  /**< Suggested format for rate_name */
+	const char    *rate_name;    /**< Short name for value per second */
+	int           isvalue;       /**< True if value (vs. counter) */
+	const char    *mult_names;   /**< Multiplier names (e.g., "KGM") */
+	int           mult;          /**< Multiplier value (e.g., 1024) */
+	int           verbose;       /**< Suggest only in verbose output */
+    } data[15];
+} drmStatsT;
+
+
+				/* All of these enums *MUST* match with the
+                                   kernel implementation -- so do *NOT*
+                                   change them!  (The drmlib implementation
+                                   will just copy the flags instead of
+                                   translating them.) */
+typedef enum {
+    DRM_FRAME_BUFFER    = 0,      /**< WC, no caching, no core dump */
+    DRM_REGISTERS       = 1,      /**< no caching, no core dump */
+    DRM_SHM             = 2,      /**< shared, cached */
+    DRM_AGP             = 3,	  /**< AGP/GART */
+    DRM_SCATTER_GATHER  = 4,	  /**< PCI scatter/gather */
+    DRM_CONSISTENT      = 5	  /**< PCI consistent */
+} drmMapType;
+
+typedef enum {
+    DRM_RESTRICTED      = 0x0001, /**< Cannot be mapped to client-virtual */
+    DRM_READ_ONLY       = 0x0002, /**< Read-only in client-virtual */
+    DRM_LOCKED          = 0x0004, /**< Physical pages locked */
+    DRM_KERNEL          = 0x0008, /**< Kernel requires access */
+    DRM_WRITE_COMBINING = 0x0010, /**< Use write-combining, if available */
+    DRM_CONTAINS_LOCK   = 0x0020, /**< SHM page that contains lock */
+    DRM_REMOVABLE	= 0x0040  /**< Removable mapping */
+} drmMapFlags;
+
+/**
+ * \warning These values *MUST* match drm.h
+ */
+typedef enum {
+    /** \name Flags for DMA buffer dispatch */
+    /*@{*/
+    DRM_DMA_BLOCK        = 0x01, /**< 
+				  * Block until buffer dispatched.
+				  * 
+				  * \note the buffer may not yet have been
+				  * processed by the hardware -- getting a
+				  * hardware lock with the hardware quiescent
+				  * will ensure that the buffer has been
+				  * processed.
+				  */
+    DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+    DRM_DMA_PRIORITY     = 0x04, /**< High priority dispatch */
+    /*@}*/
+
+    /** \name Flags for DMA buffer request */
+    /*@{*/
+    DRM_DMA_WAIT         = 0x10, /**< Wait for free buffers */
+    DRM_DMA_SMALLER_OK   = 0x20, /**< Smaller-than-requested buffers OK */
+    DRM_DMA_LARGER_OK    = 0x40  /**< Larger-than-requested buffers OK */
+    /*@}*/
+} drmDMAFlags;
+
+typedef enum {
+    DRM_PAGE_ALIGN       = 0x01,
+    DRM_AGP_BUFFER       = 0x02,
+    DRM_SG_BUFFER        = 0x04,
+    DRM_FB_BUFFER        = 0x08,
+    DRM_PCI_BUFFER_RO    = 0x10
+} drmBufDescFlags;
+
+typedef enum {
+    DRM_LOCK_READY      = 0x01, /**< Wait until hardware is ready for DMA */
+    DRM_LOCK_QUIESCENT  = 0x02, /**< Wait until hardware quiescent */
+    DRM_LOCK_FLUSH      = 0x04, /**< Flush this context's DMA queue first */
+    DRM_LOCK_FLUSH_ALL  = 0x08, /**< Flush all DMA queues first */
+				/* These *HALT* flags aren't supported yet
+                                   -- they will be used to support the
+                                   full-screen DGA-like mode. */
+    DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+    DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+} drmLockFlags;
+
+typedef enum {
+    DRM_CONTEXT_PRESERVED = 0x01, /**< This context is preserved and
+				     never swapped. */
+    DRM_CONTEXT_2DONLY    = 0x02  /**< This context is for 2D rendering only. */
+} drm_context_tFlags, *drm_context_tFlagsPtr;
+
+typedef struct _drmBufDesc {
+    int              count;	  /**< Number of buffers of this size */
+    int              size;	  /**< Size in bytes */
+    int              low_mark;	  /**< Low water mark */
+    int              high_mark;	  /**< High water mark */
+} drmBufDesc, *drmBufDescPtr;
+
+typedef struct _drmBufInfo {
+    int              count;	  /**< Number of buffers described in list */
+    drmBufDescPtr    list;	  /**< List of buffer descriptions */
+} drmBufInfo, *drmBufInfoPtr;
+
+typedef struct _drmBuf {
+    int              idx;	  /**< Index into the master buffer list */
+    int              total;	  /**< Buffer size */
+    int              used;	  /**< Amount of buffer in use (for DMA) */
+    drmAddress       address;	  /**< Address */
+} drmBuf, *drmBufPtr;
+
+/**
+ * Buffer mapping information.
+ *
+ * Used by drmMapBufs() and drmUnmapBufs() to store information about the
+ * mapped buffers.
+ */
+typedef struct _drmBufMap {
+    int              count;	  /**< Number of buffers mapped */
+    drmBufPtr        list;	  /**< Buffers */
+} drmBufMap, *drmBufMapPtr;
+
+/**
+ * Lock structure: an unsigned lock word followed by 60 bytes of padding
+ * (64 bytes in total where unsigned int is 4 bytes wide).
+ *
+ * The DRM_LOCK_HELD and DRM_LOCK_CONT bits defined in this header describe
+ * the state stored in the lock word.
+ */
+typedef struct _drmLock {
+    volatile unsigned int lock;
+    char                      padding[60];
+    /* This is big enough for most current (and future?) architectures:
+       DEC Alpha:              32 bytes
+       Intel Merced:           ?
+       Intel P5/PPro/PII/PIII: 32 bytes
+       Intel StrongARM:        32 bytes
+       Intel i386/i486:        16 bytes
+       MIPS:                   32 bytes (?)
+       Motorola 68k:           16 bytes
+       Motorola PowerPC:       32 bytes
+       Sun SPARC:              32 bytes
+    */
+    /* NOTE(review): the sizes listed above are presumably cache-line sizes,
+       i.e. the padding keeps the lock word alone on its line -- confirm. */
+} drmLock, *drmLockPtr;
+
+/**
+ * Indices here refer to the offset into
+ * list in drmBufInfo
+ */
+typedef struct _drmDMAReq {
+    drm_context_t    context;  	  /**< Context handle */
+    int           send_count;     /**< Number of buffers to send */
+    int           *send_list;     /**< List of handles to buffers */
+    int           *send_sizes;    /**< Lengths of data to send, in bytes */
+    drmDMAFlags   flags;          /**< Flags */
+    int           request_count;  /**< Number of buffers requested */
+    int           request_size;	  /**< Desired size of buffers requested */
+    int           *request_list;  /**< Buffer information */
+    int           *request_sizes; /**< Minimum acceptable sizes */
+    int           granted_count;  /**< Number of buffers granted at this size */
+} drmDMAReq, *drmDMAReqPtr;
+
+typedef struct _drmRegion {
+    drm_handle_t     handle;
+    unsigned int  offset;
+    drmSize       size;
+    drmAddress    map;
+} drmRegion, *drmRegionPtr;
+
+typedef struct _drmTextureRegion {
+    unsigned char next;
+    unsigned char prev;
+    unsigned char in_use;
+    unsigned char padding;	/**< Explicitly pad this out */
+    unsigned int  age;
+} drmTextureRegion, *drmTextureRegionPtr;
+
+
+typedef enum {
+    DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+    DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+    /* bits 1-6 are reserved for high crtcs */
+    DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+    DRM_VBLANK_EVENT = 0x4000000,	/**< Send event instead of blocking */
+    DRM_VBLANK_FLIP = 0x8000000,	/**< Scheduled buffer swap should flip */
+    DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+    DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+    DRM_VBLANK_SIGNAL   = 0x40000000	/* Send signal instead of blocking */
+} drmVBlankSeqType;
+#define DRM_VBLANK_HIGH_CRTC_SHIFT 1
+
+typedef struct _drmVBlankReq {
+	drmVBlankSeqType type;
+	unsigned int sequence;
+	unsigned long signal;
+} drmVBlankReq, *drmVBlankReqPtr;
+
+typedef struct _drmVBlankReply {
+	drmVBlankSeqType type;
+	unsigned int sequence;
+	long tval_sec;
+	long tval_usec;
+} drmVBlankReply, *drmVBlankReplyPtr;
+
+typedef union _drmVBlank {
+	drmVBlankReq request;
+	drmVBlankReply reply;
+} drmVBlank, *drmVBlankPtr;
+
+typedef struct _drmSetVersion {
+	int drm_di_major;
+	int drm_di_minor;
+	int drm_dd_major;
+	int drm_dd_minor;
+} drmSetVersion, *drmSetVersionPtr;
+
+/* Views the lock address as a volatile unsigned int lvalue (used as the "=m"
+ * operand of the DRM_CAS inline assembly).  The argument is parenthesized so
+ * that an expression such as `base + 1` is cast as a whole rather than having
+ * the cast bind only to its first operand. */
+#define __drm_dummy_lock(lock) (*(__volatile__ unsigned int *)(lock))
+
+#define DRM_LOCK_HELD  0x80000000U /**< Hardware lock is held */
+#define DRM_LOCK_CONT  0x40000000U /**< Hardware lock is contended */
+
+#if defined(__GNUC__) && (__GNUC__ >= 2)
+# if defined(__i386) || defined(__AMD64__) || defined(__x86_64__) || defined(__amd64__)
+				/* Reflect changes here to drmP.h */
+#define DRM_CAS(lock,old,new,__ret)                                    \
+	do {                                                           \
+                int __dummy;	/* Can't mark eax as clobbered */      \
+		__asm__ __volatile__(                                  \
+			"lock ; cmpxchg %4,%1\n\t"                     \
+                        "setnz %0"                                     \
+			: "=d" (__ret),                                \
+   			  "=m" (__drm_dummy_lock(lock)),               \
+                          "=a" (__dummy)                               \
+			: "2" (old),                                   \
+			  "r" (new));                                  \
+	} while (0)
+
+#elif defined(__alpha__)
+
+#define	DRM_CAS(lock, old, new, ret)		\
+	do {					\
+		int tmp, old32;			\
+		__asm__ __volatile__(		\
+		"	addl	$31, %5, %3\n"	\
+		"1:	ldl_l	%0, %2\n"	\
+		"	cmpeq	%0, %3, %1\n"	\
+		"	beq	%1, 2f\n"	\
+		"	mov	%4, %0\n"	\
+		"	stl_c	%0, %2\n"	\
+		"	beq	%0, 3f\n"	\
+		"	mb\n"			\
+		"2:	cmpeq	%1, 0, %1\n"	\
+		".subsection 2\n"		\
+		"3:	br	1b\n"		\
+		".previous"			\
+		: "=&r"(tmp), "=&r"(ret),	\
+		  "=m"(__drm_dummy_lock(lock)),	\
+		  "=&r"(old32)			\
+		: "r"(new), "r"(old)		\
+		: "memory");			\
+	} while (0)
+
+#elif defined(__sparc__)
+
+#define DRM_CAS(lock,old,new,__ret)				\
+do {	register unsigned int __old __asm("o0");		\
+	register unsigned int __new __asm("o1");		\
+	register volatile unsigned int *__lock __asm("o2");	\
+	__old = old;						\
+	__new = new;						\
+	__lock = (volatile unsigned int *)lock;			\
+	__asm__ __volatile__(					\
+		/*"cas [%2], %3, %0"*/				\
+		".word 0xd3e29008\n\t"				\
+		/*"membar #StoreStore | #StoreLoad"*/		\
+		".word 0x8143e00a"				\
+		: "=&r" (__new)					\
+		: "0" (__new),					\
+		  "r" (__lock),					\
+		  "r" (__old)					\
+		: "memory");					\
+	__ret = (__new != __old);				\
+} while(0)
+
+#elif defined(__ia64__)
+
+#ifdef __INTEL_COMPILER
+/* this currently generates bad code (missing stop bits)... */
+#include <ia64intrin.h>
+
+#define DRM_CAS(lock,old,new,__ret)					      \
+	do {								      \
+		unsigned long __result, __old = (old) & 0xffffffff;		\
+		__mf();							      	\
+		__result = _InterlockedCompareExchange_acq(&__drm_dummy_lock(lock), (new), __old);\
+		__ret = (__result) != (__old);					\
+/*		__ret = (__sync_val_compare_and_swap(&__drm_dummy_lock(lock), \
+						     (old), (new))	      \
+			 != (old));					      */\
+	} while (0)
+
+#else
+#define DRM_CAS(lock,old,new,__ret)					  \
+	do {								  \
+		unsigned int __result, __old = (old);			  \
+		__asm__ __volatile__(					  \
+			"mf\n"						  \
+			"mov ar.ccv=%2\n"				  \
+			";;\n"						  \
+			"cmpxchg4.acq %0=%1,%3,ar.ccv"			  \
+			: "=r" (__result), "=m" (__drm_dummy_lock(lock))  \
+			: "r" ((unsigned long)__old), "r" (new)			  \
+			: "memory");					  \
+		__ret = (__result) != (__old);				  \
+	} while (0)
+
+#endif
+
+#elif defined(__powerpc__)
+
+#define DRM_CAS(lock,old,new,__ret)			\
+	do { /* PowerPC: __ret ends up 0 only when new was stored */ \
+		__asm__ __volatile__(			\
+			"sync;"				\
+			"0:    lwarx %0,0,%1;"	/* load *lock into __ret */ \
+			"      xor. %0,%3,%0;"	/* __ret = old ^ loaded value */ \
+			"      bne 1f;"		/* mismatch: leave with __ret != 0 */ \
+			"      stwcx. %2,0,%1;"	/* conditionally store new */ \
+			"      bne- 0b;"	/* store did not succeed: retry from 0: */ \
+			"1:    "			\
+			"sync;"				\
+		: "=&r"(__ret)				\
+		: "r"(lock), "r"(new), "r"(old)		\
+		: "cr0", "memory");			\
+	} while (0)
+
+# elif defined (__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+	|| defined (__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+	|| defined (__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+	|| defined (__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+	|| defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+	|| defined(__ARM_ARCH_7EM__)
+       /* excluding ARMv4/ARMv5 and lower (lacking ldrex/strex support) */
+       #undef DRM_DEV_MODE /* this branch replaces any earlier DRM_DEV_MODE */
+       #define DRM_DEV_MODE     (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+
+       #define DRM_CAS(lock,old,new,__ret)             \
+       do { /* single attempt: nothing branches back to label 1 */ \
+               __asm__ __volatile__ (                  \
+                       "1: ldrex %0, [%1]\n"           /* __ret = *lock */ \
+                       "   teq %0, %2\n"               /* compare with old */ \
+                       "   ite eq\n"                   \
+                       "   strexeq %0, %3, [%1]\n"     /* equal: store new, status lands in __ret */ \
+                       "   movne   %0, #1\n"           /* not equal: __ret = 1 */ \
+               : "=&r" (__ret)                         \
+               : "r" (lock), "r" (old), "r" (new)      \
+               : "cc","memory");                       \
+       } while (0)
+
+#endif /* architecture */
+#endif /* __GNUC__ >= 2 */
+
+#ifndef DRM_CAS /* no compare-and-swap was defined above */
+#define DRM_CAS(lock,old,new,ret) do { ret=1; } while (0) /* FAST LOCK FAILS: always reports failure, lock/old/new unused */
+#endif
+
+#if defined(__alpha__) /* DRM_CAS_RESULT(v): declares the DRM_CAS status variable v */
+#define DRM_CAS_RESULT(_result)		long _result /* alpha: long */
+#elif defined(__powerpc__)
+#define DRM_CAS_RESULT(_result)		int _result /* powerpc: int */
+#else
+#define DRM_CAS_RESULT(_result)		char _result /* every other target: char */
+#endif
+
+#define DRM_LIGHT_LOCK(fd,lock,context)                                \
+	do { /* try CAS context -> DRM_LOCK_HELD|context first */      \
+		DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,(context),DRM_LOCK_HELD|(context),__ret); /* context parenthesized: safe for any expression */ \
+		if (__ret) drmGetLock(fd,context,0); /* CAS reported failure: take the lock via drmGetLock() */ \
+	} while(0)
+
+				/* Same as DRM_LIGHT_LOCK, but increments count when the CAS
+                                   succeeds (fast locks) -- for benchmarking only. */
+#define DRM_LIGHT_LOCK_COUNT(fd,lock,context,count)                    \
+	do {                                                           \
+		DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,(context),DRM_LOCK_HELD|(context),__ret); /* context parenthesized: safe for any expression */ \
+		if (__ret) drmGetLock(fd,context,0); /* CAS reported failure */ \
+		else       ++(count); /* parenthesized so any lvalue expression works */ \
+	} while(0)
+
+#define DRM_LOCK(fd,lock,context,flags)                                \
+	do { /* picks the locking path from flags */                   \
+		if (flags) drmGetLock(fd,context,flags); /* nonzero flags: always call drmGetLock() */ \
+		else       DRM_LIGHT_LOCK(fd,lock,context); /* flags == 0: use DRM_LIGHT_LOCK */ \
+	} while(0)
+
+#define DRM_UNLOCK(fd,lock,context)                                    \
+	do { /* try CAS DRM_LOCK_HELD|context -> context first */      \
+		DRM_CAS_RESULT(__ret);                                 \
+		DRM_CAS(lock,DRM_LOCK_HELD|(context),(context),__ret); /* context parenthesized: safe for any expression */ \
+		if (__ret) drmUnlock(fd,context); /* CAS reported failure: release via drmUnlock() */ \
+	} while(0)
+
+				/* Simple spin locks */
+#define DRM_SPINLOCK(spin,val)                                         \
+	do { /* loops until DRM_CAS(spin,0,val) reports success */     \
+            DRM_CAS_RESULT(__ret);                                     \
+	    do {                                                       \
+		DRM_CAS(spin,0,val,__ret);                             \
+		if (__ret) while ((spin)->lock); /* busy-wait until lock reads 0, then retry the CAS */ \
+	    } while (__ret);                                           \
+	} while(0)
+
+#define DRM_SPINLOCK_TAKE(spin,val)                                    \
+	do { /* replaces whatever value lock holds with val */         \
+	    DRM_CAS_RESULT(__ret);                                     \
+	    int  __cur; /* __-prefixed like __ret so a caller's own "cur" in val is not captured */ \
+	    do {                                                       \
+		__cur = (spin)->lock; /* re-read the current value on every attempt */ \
+		DRM_CAS(spin,__cur,val,__ret);                         \
+	    } while (__ret); /* retry until the CAS reports success */ \
+	} while(0)
+
+#define DRM_SPINLOCK_COUNT(spin,val,count,__ret)                       \
+	do { /* bounded DRM_SPINLOCK: __ret stays nonzero if the budget runs out */ \
+	    int  __i;                                                  \
+	    __ret = 1;                                                 \
+	    for (__i = 0; __ret && __i < (count); __i++) { /* count is evaluated on every test */ \
+		DRM_CAS(spin,0,val,__ret);                             \
+		if (__ret) for (;__i < (count) && (spin)->lock; __i++); /* wait for lock to read 0, using the same budget */ \
+	    }                                                          \
+	} while(0)
+
+#define DRM_SPINUNLOCK(spin,val)                                       \
+	do { /* resets lock from val to 0, but only if it still holds val */ \
+	    DRM_CAS_RESULT(__ret);                                     \
+	    if ((spin)->lock == (val)) { /* else server stole lock */  \
+		do {                                                   \
+		    DRM_CAS(spin,(val),0,__ret);                       \
+		} while (__ret); /* retry until the CAS reports success */ \
+	    }                                                          \
+	} while(0)
+
+
+
+/* General user-level programmer's API: unprivileged */
+extern int           drmAvailable(void);
+extern int           drmOpen(const char *name, const char *busid);
+
+#define DRM_NODE_PRIMARY 0
+#define DRM_NODE_CONTROL 1 /* deprecated: never returned */
+#define DRM_NODE_RENDER  2
+#define DRM_NODE_MAX     3
+
+extern int           drmOpenWithType(const char *name, const char *busid,
+                                     int type);
+
+extern int           drmOpenControl(int minor); /* deprecated: always fails */
+extern int           drmOpenRender(int minor);
+extern int           drmClose(int fd);
+extern drmVersionPtr drmGetVersion(int fd);
+extern drmVersionPtr drmGetLibVersion(int fd);
+extern int           drmGetCap(int fd, uint64_t capability, uint64_t *value);
+extern void          drmFreeVersion(drmVersionPtr);
+extern int           drmGetMagic(int fd, drm_magic_t * magic);
+extern char          *drmGetBusid(int fd);
+extern int           drmGetInterruptFromBusID(int fd, int busnum, int devnum,
+					      int funcnum);
+extern int           drmGetMap(int fd, int idx, drm_handle_t *offset,
+			       drmSize *size, drmMapType *type,
+			       drmMapFlags *flags, drm_handle_t *handle,
+			       int *mtrr);
+extern int           drmGetClient(int fd, int idx, int *auth, int *pid,
+				  int *uid, unsigned long *magic,
+				  unsigned long *iocs);
+extern int           drmGetStats(int fd, drmStatsT *stats);
+extern int           drmSetInterfaceVersion(int fd, drmSetVersion *version);
+extern int           drmCommandNone(int fd, unsigned long drmCommandIndex);
+extern int           drmCommandRead(int fd, unsigned long drmCommandIndex,
+                                    void *data, unsigned long size);
+extern int           drmCommandWrite(int fd, unsigned long drmCommandIndex,
+                                     void *data, unsigned long size);
+extern int           drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
+                                         void *data, unsigned long size);
+
+/* General user-level programmer's API: X server (root) only  */
+extern void          drmFreeBusid(const char *busid);
+extern int           drmSetBusid(int fd, const char *busid);
+extern int           drmAuthMagic(int fd, drm_magic_t magic);
+extern int           drmAddMap(int fd,
+			       drm_handle_t offset,
+			       drmSize size,
+			       drmMapType type,
+			       drmMapFlags flags,
+			       drm_handle_t * handle);
+extern int	     drmRmMap(int fd, drm_handle_t handle);
+extern int	     drmAddContextPrivateMapping(int fd, drm_context_t ctx_id,
+						 drm_handle_t handle);
+
+extern int           drmAddBufs(int fd, int count, int size,
+				drmBufDescFlags flags,
+				int agp_offset);
+extern int           drmMarkBufs(int fd, double low, double high);
+extern int           drmCreateContext(int fd, drm_context_t * handle);
+extern int           drmSetContextFlags(int fd, drm_context_t context,
+					drm_context_tFlags flags);
+extern int           drmGetContextFlags(int fd, drm_context_t context,
+					drm_context_tFlagsPtr flags);
+extern int           drmAddContextTag(int fd, drm_context_t context, void *tag);
+extern int           drmDelContextTag(int fd, drm_context_t context);
+extern void          *drmGetContextTag(int fd, drm_context_t context);
+extern drm_context_t * drmGetReservedContextList(int fd, int *count);
+extern void          drmFreeReservedContextList(drm_context_t *);
+extern int           drmSwitchToContext(int fd, drm_context_t context);
+extern int           drmDestroyContext(int fd, drm_context_t handle);
+extern int           drmCreateDrawable(int fd, drm_drawable_t * handle);
+extern int           drmDestroyDrawable(int fd, drm_drawable_t handle);
+extern int           drmUpdateDrawableInfo(int fd, drm_drawable_t handle,
+					   drm_drawable_info_type_t type,
+					   unsigned int num, void *data);
+extern int           drmCtlInstHandler(int fd, int irq);
+extern int           drmCtlUninstHandler(int fd);
+extern int           drmSetClientCap(int fd, uint64_t capability,
+				     uint64_t value);
+
+extern int           drmCrtcGetSequence(int fd, uint32_t crtcId,
+					uint64_t *sequence, uint64_t *ns);
+extern int           drmCrtcQueueSequence(int fd, uint32_t crtcId,
+					  uint32_t flags, uint64_t sequence,
+					  uint64_t *sequence_queued,
+					  uint64_t user_data);
+/* General user-level programmer's API: authenticated client and/or X */
+extern int           drmMap(int fd,
+			    drm_handle_t handle,
+			    drmSize size,
+			    drmAddressPtr address);
+extern int           drmUnmap(drmAddress address, drmSize size);
+extern drmBufInfoPtr drmGetBufInfo(int fd);
+extern drmBufMapPtr  drmMapBufs(int fd);
+extern int           drmUnmapBufs(drmBufMapPtr bufs);
+extern int           drmDMA(int fd, drmDMAReqPtr request);
+extern int           drmFreeBufs(int fd, int count, int *list);
+extern int           drmGetLock(int fd,
+			        drm_context_t context,
+			        drmLockFlags flags);
+extern int           drmUnlock(int fd, drm_context_t context);
+extern int           drmFinish(int fd, int context, drmLockFlags flags);
+extern int	     drmGetContextPrivateMapping(int fd, drm_context_t ctx_id, 
+						 drm_handle_t * handle);
+
+/* AGP/GART support: X server (root) only */
+extern int           drmAgpAcquire(int fd);
+extern int           drmAgpRelease(int fd);
+extern int           drmAgpEnable(int fd, unsigned long mode);
+extern int           drmAgpAlloc(int fd, unsigned long size,
+				 unsigned long type, unsigned long *address,
+				 drm_handle_t *handle);
+extern int           drmAgpFree(int fd, drm_handle_t handle);
+extern int 	     drmAgpBind(int fd, drm_handle_t handle,
+				unsigned long offset);
+extern int           drmAgpUnbind(int fd, drm_handle_t handle);
+
+/* AGP/GART info: authenticated client and/or X */
+extern int           drmAgpVersionMajor(int fd);
+extern int           drmAgpVersionMinor(int fd);
+extern unsigned long drmAgpGetMode(int fd);
+extern unsigned long drmAgpBase(int fd); /* Physical location */
+extern unsigned long drmAgpSize(int fd); /* Bytes */
+extern unsigned long drmAgpMemoryUsed(int fd);
+extern unsigned long drmAgpMemoryAvail(int fd);
+extern unsigned int  drmAgpVendorId(int fd);
+extern unsigned int  drmAgpDeviceId(int fd);
+
+/* PCI scatter/gather support: X server (root) only */
+extern int           drmScatterGatherAlloc(int fd, unsigned long size,
+					   drm_handle_t *handle);
+extern int           drmScatterGatherFree(int fd, drm_handle_t handle);
+
+extern int           drmWaitVBlank(int fd, drmVBlankPtr vbl);
+
+/* Support routines */
+extern void          drmSetServerInfo(drmServerInfoPtr info);
+extern int           drmError(int err, const char *label);
+extern void          *drmMalloc(int size);
+extern void          drmFree(void *pt);
+
+/* Hash table routines */
+extern void *drmHashCreate(void);
+extern int  drmHashDestroy(void *t);
+extern int  drmHashLookup(void *t, unsigned long key, void **value);
+extern int  drmHashInsert(void *t, unsigned long key, void *value);
+extern int  drmHashDelete(void *t, unsigned long key);
+extern int  drmHashFirst(void *t, unsigned long *key, void **value);
+extern int  drmHashNext(void *t, unsigned long *key, void **value);
+
+/* PRNG routines */
+extern void          *drmRandomCreate(unsigned long seed);
+extern int           drmRandomDestroy(void *state);
+extern unsigned long drmRandom(void *state);
+extern double        drmRandomDouble(void *state);
+
+/* Skip list routines */
+
+extern void *drmSLCreate(void);
+extern int  drmSLDestroy(void *l);
+extern int  drmSLLookup(void *l, unsigned long key, void **value);
+extern int  drmSLInsert(void *l, unsigned long key, void *value);
+extern int  drmSLDelete(void *l, unsigned long key);
+extern int  drmSLNext(void *l, unsigned long *key, void **value);
+extern int  drmSLFirst(void *l, unsigned long *key, void **value);
+extern void drmSLDump(void *l);
+extern int  drmSLLookupNeighbors(void *l, unsigned long key,
+				 unsigned long *prev_key, void **prev_value,
+				 unsigned long *next_key, void **next_value);
+
+extern int drmOpenOnce(void *unused, const char *BusID, int *newlyopened);
+extern int drmOpenOnceWithType(const char *BusID, int *newlyopened, int type);
+extern void drmCloseOnce(int fd);
+extern void drmMsg(const char *format, ...) DRM_PRINTFLIKE(1, 2);
+
+extern int drmSetMaster(int fd);
+extern int drmDropMaster(int fd);
+extern int drmIsMaster(int fd);
+
+#define DRM_EVENT_CONTEXT_VERSION 4
+
+typedef struct _drmEventContext {
+
+	/* This struct is versioned so we can add more pointers if we
+	 * add more events. */
+	int version; /* NOTE(review): presumably checked against DRM_EVENT_CONTEXT_VERSION - confirm */
+
+	void (*vblank_handler)(int fd, /* callback; 32-bit sequence plus tv_sec/tv_usec */
+			       unsigned int sequence, 
+			       unsigned int tv_sec,
+			       unsigned int tv_usec,
+			       void *user_data);
+
+	void (*page_flip_handler)(int fd, /* callback; same parameter list as vblank_handler */
+				  unsigned int sequence,
+				  unsigned int tv_sec,
+				  unsigned int tv_usec,
+				  void *user_data);
+
+	void (*page_flip_handler2)(int fd, /* page_flip_handler's parameters plus crtc_id */
+				   unsigned int sequence,
+				   unsigned int tv_sec,
+				   unsigned int tv_usec,
+				   unsigned int crtc_id,
+				   void *user_data);
+
+	void (*sequence_handler)(int fd, /* 64-bit sequence and ns; user_data is a uint64_t here, not a pointer */
+				 uint64_t sequence,
+				 uint64_t ns,
+				 uint64_t user_data);
+} drmEventContext, *drmEventContextPtr;
+
+extern int drmHandleEvent(int fd, drmEventContextPtr evctx);
+
+extern char *drmGetDeviceNameFromFd(int fd);
+
+/* Improved version of drmGetDeviceNameFromFd which accounts for any type of
+ * device/node - card or renderD.
+ */
+extern char *drmGetDeviceNameFromFd2(int fd);
+extern int drmGetNodeTypeFromFd(int fd);
+
+/* Convert between GEM handles and DMA-BUF file descriptors.
+ *
+ * Warning: since GEM handles are not reference-counted and are unique per
+ * DRM file description, the caller is expected to perform its own reference
+ * counting. drmPrimeFDToHandle is guaranteed to return the same handle for
+ * different FDs if they reference the same underlying buffer object. This
+ * could even be a buffer object originally created on the same DRM FD.
+ *
+ * When sharing a DRM FD with an API such as EGL or GBM, the caller must not
+ * use drmPrimeHandleToFD nor drmPrimeFDToHandle. A single user-space
+ * reference-counting implementation is necessary to avoid double-closing GEM
+ * handles.
+ *
+ * Two processes can't share the same DRM FD and both use it to create or
+ * import GEM handles, even when using a single user-space reference-counting
+ * implementation like GBM, because GBM doesn't share its state between
+ * processes.
+ */
+extern int drmPrimeHandleToFD(int fd, uint32_t handle, uint32_t flags, int *prime_fd);
+extern int drmPrimeFDToHandle(int fd, int prime_fd, uint32_t *handle);
+
+extern int drmCloseBufferHandle(int fd, uint32_t handle);
+
+extern char *drmGetPrimaryDeviceNameFromFd(int fd);
+extern char *drmGetRenderDeviceNameFromFd(int fd);
+
+#define DRM_BUS_PCI       0
+#define DRM_BUS_USB       1
+#define DRM_BUS_PLATFORM  2
+#define DRM_BUS_HOST1X    3
+
+typedef struct _drmPciBusInfo {
+    uint16_t domain;
+    uint8_t bus;
+    uint8_t dev;
+    uint8_t func;
+} drmPciBusInfo, *drmPciBusInfoPtr;
+
+typedef struct _drmPciDeviceInfo {
+    uint16_t vendor_id;
+    uint16_t device_id;
+    uint16_t subvendor_id;
+    uint16_t subdevice_id;
+    uint8_t revision_id;
+} drmPciDeviceInfo, *drmPciDeviceInfoPtr;
+
+typedef struct _drmUsbBusInfo {
+    uint8_t bus;
+    uint8_t dev;
+} drmUsbBusInfo, *drmUsbBusInfoPtr;
+
+typedef struct _drmUsbDeviceInfo {
+    uint16_t vendor;
+    uint16_t product;
+} drmUsbDeviceInfo, *drmUsbDeviceInfoPtr;
+
+#define DRM_PLATFORM_DEVICE_NAME_LEN 512
+
+typedef struct _drmPlatformBusInfo {
+    char fullname[DRM_PLATFORM_DEVICE_NAME_LEN];
+} drmPlatformBusInfo, *drmPlatformBusInfoPtr;
+
+typedef struct _drmPlatformDeviceInfo {
+    char **compatible; /* NULL terminated list of compatible strings */
+} drmPlatformDeviceInfo, *drmPlatformDeviceInfoPtr;
+
+#define DRM_HOST1X_DEVICE_NAME_LEN 512
+
+typedef struct _drmHost1xBusInfo {
+    char fullname[DRM_HOST1X_DEVICE_NAME_LEN];
+} drmHost1xBusInfo, *drmHost1xBusInfoPtr;
+
+typedef struct _drmHost1xDeviceInfo {
+    char **compatible; /* NULL terminated list of compatible strings */
+} drmHost1xDeviceInfo, *drmHost1xDeviceInfoPtr;
+
+typedef struct _drmDevice {
+    char **nodes; /* DRM_NODE_MAX sized array */
+    int available_nodes; /* DRM_NODE_* bitmask */
+    int bustype; /* NOTE(review): presumably a DRM_BUS_* value selecting the valid union members - confirm */
+    union { /* one pointer per bus type, each to the matching *BusInfo struct */
+        drmPciBusInfoPtr pci;
+        drmUsbBusInfoPtr usb;
+        drmPlatformBusInfoPtr platform;
+        drmHost1xBusInfoPtr host1x;
+    } businfo;
+    union { /* one pointer per bus type, each to the matching *DeviceInfo struct */
+        drmPciDeviceInfoPtr pci;
+        drmUsbDeviceInfoPtr usb;
+        drmPlatformDeviceInfoPtr platform;
+        drmHost1xDeviceInfoPtr host1x;
+    } deviceinfo;
+} drmDevice, *drmDevicePtr;
+
+extern int drmGetDevice(int fd, drmDevicePtr *device);
+extern void drmFreeDevice(drmDevicePtr *device);
+
+extern int drmGetDevices(drmDevicePtr devices[], int max_devices);
+extern void drmFreeDevices(drmDevicePtr devices[], int count);
+
+#define DRM_DEVICE_GET_PCI_REVISION (1 << 0)
+extern int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device);
+extern int drmGetDevices2(uint32_t flags, drmDevicePtr devices[], int max_devices);
+
+extern int drmGetDeviceFromDevId(dev_t dev_id, uint32_t flags, drmDevicePtr *device);
+
+/**
+ * Get the node type (DRM_NODE_PRIMARY or DRM_NODE_RENDER) from a device ID.
+ *
+ * Returns negative errno on error.
+ */
+extern int drmGetNodeTypeFromDevId(dev_t devid);
+
+/**
+ * Check if two drmDevice pointers represent the same DRM device.
+ *
+ * Returns 1 if the devices are equal, 0 otherwise.
+ */
+extern int drmDevicesEqual(drmDevicePtr a, drmDevicePtr b);
+
+extern int drmSyncobjCreate(int fd, uint32_t flags, uint32_t *handle);
+extern int drmSyncobjDestroy(int fd, uint32_t handle);
+extern int drmSyncobjHandleToFD(int fd, uint32_t handle, int *obj_fd);
+extern int drmSyncobjFDToHandle(int fd, int obj_fd, uint32_t *handle);
+
+extern int drmSyncobjImportSyncFile(int fd, uint32_t handle, int sync_file_fd);
+extern int drmSyncobjExportSyncFile(int fd, uint32_t handle, int *sync_file_fd);
+extern int drmSyncobjWait(int fd, uint32_t *handles, unsigned num_handles,
+			  int64_t timeout_nsec, unsigned flags,
+			  uint32_t *first_signaled);
+extern int drmSyncobjReset(int fd, const uint32_t *handles, uint32_t handle_count);
+extern int drmSyncobjSignal(int fd, const uint32_t *handles, uint32_t handle_count);
+extern int drmSyncobjTimelineSignal(int fd, const uint32_t *handles,
+				    uint64_t *points, uint32_t handle_count);
+extern int drmSyncobjTimelineWait(int fd, uint32_t *handles, uint64_t *points,
+				  unsigned num_handles,
+				  int64_t timeout_nsec, unsigned flags,
+				  uint32_t *first_signaled);
+extern int drmSyncobjQuery(int fd, uint32_t *handles, uint64_t *points,
+			   uint32_t handle_count);
+extern int drmSyncobjQuery2(int fd, uint32_t *handles, uint64_t *points,
+			    uint32_t handle_count, uint32_t flags);
+extern int drmSyncobjTransfer(int fd,
+			      uint32_t dst_handle, uint64_t dst_point,
+			      uint32_t src_handle, uint64_t src_point,
+			      uint32_t flags);
+extern int drmSyncobjEventfd(int fd, uint32_t handle, uint64_t point, int ev_fd,
+                             uint32_t flags);
+
+extern char *
+drmGetFormatModifierVendor(uint64_t modifier);
+
+extern char *
+drmGetFormatModifierName(uint64_t modifier);
+
+extern char *
+drmGetFormatName(uint32_t format);
+
+#ifndef fourcc_mod_get_vendor /* only defined here when no earlier definition exists */
+#define fourcc_mod_get_vendor(modifier) \
+       (((modifier) >> 56) & 0xff) /* yields bits 56..63 of modifier as a value in 0..255 */
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
@@ -398,6 +398,12 @@ hsaKmtGetQueueInfo(
    HsaQueueInfo *QueueInfo	//IN
 );

+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueueRingDoorbell(
+    HSA_QUEUEID QueueId
+);
+
 /**
  Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch. 
 */
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _HSAKMT_DRM_H_
+#define _HSAKMT_DRM_H_
+
+#include "drm/xf86drm.h"
+#include "drm/amdgpu.h"
+#include "drm/amdgpu_drm.h"
+
+#endif
@@ -0,0 +1,91 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The following set of header files provides definitions for AMD GPU
+// Architecture:
+//   - amd_hsa_common.h
+//   - amd_hsa_elf.h
+//   - amd_hsa_kernel_code.h
+//   - amd_hsa_queue.h
+//   - amd_hsa_signal.h
+//
+// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more
+// information.
+
+#ifndef AMD_HSA_COMMON_H
+#define AMD_HSA_COMMON_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Descriptive version of the HSA Application Binary Interface.
+#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)"
+
+// Alignment attribute that specifies a minimum alignment (in bytes) for
+// variables of the specified type. Unknown compilers stop with a clear error.
+#if defined(__GNUC__)
+#  define __ALIGNED__(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#  define __ALIGNED__(x) __declspec(align(x))
+#elif defined(RC_INVOKED)
+#  define __ALIGNED__(x) /* expands to nothing when RC_INVOKED is defined */
+#else
+#  error "__ALIGNED__: unsupported compiler (expected __GNUC__, _MSC_VER or RC_INVOKED)"
+#endif
+
+// Creates enumeration entries for packed types: name##_SHIFT (bit shift amount),
+// name##_WIDTH (bit width) and name (bit mask). No comma follows the last entry.
+#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width)                   \
+  name##_SHIFT = (shift),                                                      \
+  name##_WIDTH = (width),                                                      \
+  name = (((1 << (width)) - 1) << (shift)) /* NOTE(review): 1 is a plain int, so large width/shift can overflow - confirm callers */ \
+
+// Gets bits for specified mask from specified src packed instance. src may be any expression.
+#define AMD_HSA_BITS_GET(src, mask)                                            \
+  (((src) & (mask)) >> mask##_SHIFT) /* mask must be the enum name so mask##_SHIFT resolves */
+
+// Sets val bits for specified mask in specified dst packed instance. Expands to one statement.
+#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
+  do { (dst) &= (~(1 << mask##_SHIFT) & ~(mask)); /* clear the field first */  \
+       (dst) |= (((val) << mask##_SHIFT) & (mask)); } while (0) /* do/while keeps both steps under an unbraced if/else */
+
+#endif // AMD_HSA_COMMON_H
@@ -0,0 +1,467 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Undefine the macro in case it is defined in the system elf.h.
+#undef EM_AMDGPU
+
+#ifndef AMD_HSA_ELF_H
+#define AMD_HSA_ELF_H
+
+// AMD GPU Specific ELF Header Enumeration Values.
+//
+// Values are copied from LLVM BinaryFormat/ELF.h . This file also contains
+// code object V1 definitions which are not part of the LLVM header. Code object
+// V1 was only supported by the Finalizer which is now deprecated and removed.
+//
+// TODO: Deprecate and remove V1 support and replace this header with using the
+// LLVM header.
+namespace ELF {  // AMDGPU ELF header constants: machine, OS ABI, ABI version, e_flags, relocations.
+
+// Machine architectures
+// See current registered ELF machine architectures at:
+//    http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html
+enum {
+  EM_AMDGPU = 224,        // AMD GPU architecture
+};
+
+// OS ABI identification.
+enum {
+  ELFOSABI_AMDGPU_HSA = 64,    // AMD HSA runtime
+};
+
+// AMDGPU OS ABI Version identification.
+enum {
+  // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification
+  // was never defined for V1.
+  ELFABIVERSION_AMDGPU_HSA_V2 = 0,
+  ELFABIVERSION_AMDGPU_HSA_V3 = 1,
+  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
+  ELFABIVERSION_AMDGPU_HSA_V5 = 3,
+  ELFABIVERSION_AMDGPU_HSA_V6 = 4,
+};
+
+// AMDGPU specific e_flags.
+enum : unsigned {
+  // Processor selection mask for EF_AMDGPU_MACH_* values.
+  EF_AMDGPU_MACH = 0x0ff,
+
+  // Not specified processor.
+  EF_AMDGPU_MACH_NONE = 0x000,
+
+  // AMDGCN-based processors.
+  // clang-format off
+  EF_AMDGPU_MACH_AMDGCN_GFX600          = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601          = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700          = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701          = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702          = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703          = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704          = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27   = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801          = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802          = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803          = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810          = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900          = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902          = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904          = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906          = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908          = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909          = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C          = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010         = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011         = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012         = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030         = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031         = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032         = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033         = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602          = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705          = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805          = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035         = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034         = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A          = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940          = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100         = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013         = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_GFX1150         = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103         = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036         = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101         = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102         = 0x047,
+  EF_AMDGPU_MACH_AMDGCN_GFX1200         = 0x048,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49   = 0x049,
+  EF_AMDGPU_MACH_AMDGCN_GFX1151         = 0x04a,
+  EF_AMDGPU_MACH_AMDGCN_GFX941          = 0x04b,
+  EF_AMDGPU_MACH_AMDGCN_GFX942          = 0x04c,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D   = 0x04d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1201         = 0x04e,
+  EF_AMDGPU_MACH_AMDGCN_GFX950          = 0x04f,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50   = 0x050,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC    = 0x051,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
+  EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC   = 0x054,
+  EF_AMDGPU_MACH_AMDGCN_GFX1152         = 0x055,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56   = 0x056,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57   = 0x057,
+  EF_AMDGPU_MACH_AMDGCN_GFX1153         = 0x058,
+  EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC   = 0x059,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC  = 0x05f,
+  // clang-format on
+
+  // First/last AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_XNACK_V2 = 0x01,
+  // Indicates if the trap handler is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
+  // Indicates if the "sramecc" target feature is enabled for all code
+  // contained in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
+
+  // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
+  // XNACK is not supported.
+  EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
+  // XNACK is any/default/unspecified.
+  EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
+  // XNACK is off.
+  EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
+  // XNACK is on.
+  EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
+
+  // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
+  // SRAMECC is not supported.
+  EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
+  // SRAMECC is any/default/unspecified.
+  EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
+  // SRAMECC is off.
+  EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
+  // SRAMECC is on.
+  EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
+
+  // Generic target versioning. This is contained in the most significant byte of EFLAGS (mask 0xff000000, shift 24).
+  EF_AMDGPU_GENERIC_VERSION = 0xff000000,
+  EF_AMDGPU_GENERIC_VERSION_OFFSET = 24,
+  EF_AMDGPU_GENERIC_VERSION_MIN = 1,
+  EF_AMDGPU_GENERIC_VERSION_MAX = 0xff,
+};
+
+// ELF Relocation types for AMDGPU.
+enum : unsigned {
+  R_AMDGPU_ABS32_LO = 1,
+  R_AMDGPU_ABS32_HI = 2,
+  R_AMDGPU_ABS64 = 3,
+  R_AMDGPU_ABS32 = 6,
+  R_AMDGPU_RELATIVE64 = 13,
+};
+
+} // end namespace ELF
+
+// ELF Section Header Flag Enumeration Values; each is masked with SHF_MASKOS, which this header does not define.
+#define SHF_AMDGPU_HSA_GLOBAL   (0x00100000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_CODE     (0x00400000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_AGENT    (0x00800000 & SHF_MASKOS)
+
+// AMD GPU HSA segment kinds; the PT_AMDGPU_HSA_LOAD_* macros add these values to PT_LOOS.
+typedef enum {
+  AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0,
+  AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1,
+  AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2,
+  AMDGPU_HSA_SEGMENT_CODE_AGENT = 3,
+  AMDGPU_HSA_SEGMENT_LAST,  // one past the last segment kind (4)
+} amdgpu_hsa_elf_segment_t;
+
+// ELF Program Header Type Enumeration Values: PT_LOOS (not defined in this header) plus an amdgpu_hsa_elf_segment_t value.
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM)
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT   (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT)
+#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT)
+#define PT_AMDGPU_HSA_LOAD_CODE_AGENT     (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT)
+
+// ELF Symbol Type Enumeration Values, expressed as offsets from STT_LOOS (not defined in this header).
+#define STT_AMDGPU_HSA_KERNEL            (STT_LOOS + 0)
+#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1)
+#define STT_AMDGPU_HSA_METADATA          (STT_LOOS + 2)
+
+// ELF Symbol Binding Enumeration Values, expressed as offsets from STB_LOOS (not defined in this header).
+#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0)
+
+// ELF Symbol Other Information Creation/Retrieval: bits 1:0 = v, bits 3:2 = allocation (a), bits 4 and up = flags (f).
+#define ELF64_ST_AMDGPU_ALLOCATION(o)  (((o) >> 2) & 0x3)
+#define ELF64_ST_AMDGPU_FLAGS(o)       ((o) >> 4)
+#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3))
+
+typedef enum {  // Symbol allocation kinds; aliased by the STA_AMDGPU_HSA_* macros.
+  AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_LAST,  // one past the last allocation kind (4)
+} amdgpu_hsa_symbol_allocation_t;
+
+// ELF Symbol Allocation Enumeration Values: short aliases for the amdgpu_hsa_symbol_allocation_t enumerators.
+#define STA_AMDGPU_HSA_DEFAULT        AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT
+#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM
+#define STA_AMDGPU_HSA_GLOBAL_AGENT   AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT
+#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT
+
+typedef enum {  // Symbol flag values; CONST is aliased by STF_AMDGPU_HSA_CONST.
+  AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_FLAG_CONST = 1,
+  AMDGPU_HSA_SYMBOL_FLAG_LAST,  // one past the last flag value (2)
+} amdgpu_hsa_symbol_flag_t;
+
+// ELF Symbol Flag Enumeration Values: short alias for AMDGPU_HSA_SYMBOL_FLAG_CONST.
+#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST
+
+// Legacy/V1 AMD GPU Relocation Type Enumeration Values (plain macros, outside namespace ELF).
+#define R_AMDGPU_V1_NONE         0
+#define R_AMDGPU_V1_32_LOW       1
+#define R_AMDGPU_V1_32_HIGH      2
+#define R_AMDGPU_V1_64           3
+#define R_AMDGPU_V1_INIT_SAMPLER 4
+#define R_AMDGPU_V1_INIT_IMAGE   5
+#define R_AMDGPU_V1_RELATIVE64   13
+
+// AMD GPU Note Type Enumeration Values (the numbering is not contiguous).
+#define NT_AMD_HSA_CODE_OBJECT_VERSION 1
+#define NT_AMD_HSA_HSAIL               2
+#define NT_AMD_HSA_ISA_VERSION         3
+#define NT_AMD_HSA_PRODUCER            4
+#define NT_AMD_HSA_PRODUCER_OPTIONS    5
+#define NT_AMD_HSA_EXTENSION           6
+#define NT_AMD_HSA_ISA_NAME            11
+/* AMDGPU snapshots of runtime, agent and queues state for use in core dump */
+#define NT_AMDGPU_CORE_STATE           33
+#define NT_AMD_HSA_HLDEBUG_DEBUG       101
+#define NT_AMD_HSA_HLDEBUG_TARGET      102
+
+// AMD GPU Metadata Kind Enumeration Values; the 16-bit typedef is the type of the descriptors' `kind` member.
+typedef uint16_t amdgpu_hsa_metadata_kind16_t;
+typedef enum {
+  AMDGPU_HSA_METADATA_KIND_NONE = 0,
+  AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1,
+  AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2,
+  AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3,
+  AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4
+} amdgpu_hsa_metadata_kind_t;
+
+// AMD GPU Sampler Coordinate Normalization Enumeration Values; the 8-bit typedef is the type of the sampler descriptor's `coord`.
+typedef uint8_t amdgpu_hsa_sampler_coord8_t;
+typedef enum {
+  AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0,
+  AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1
+} amdgpu_hsa_sampler_coord_t;
+
+// AMD GPU Sampler Filter Enumeration Values; the 8-bit typedef is the type of the sampler descriptor's `filter`.
+typedef uint8_t amdgpu_hsa_sampler_filter8_t;
+typedef enum {
+  AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0,
+  AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1
+} amdgpu_hsa_sampler_filter_t;
+
+// AMD GPU Sampler Addressing Enumeration Values; the 8-bit typedef is the type of the sampler descriptor's `addressing`.
+typedef uint8_t amdgpu_hsa_sampler_addressing8_t;
+typedef enum {
+  AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4
+} amdgpu_hsa_sampler_addressing_t;
+
+// AMD GPU Sampler Descriptor; the members total 8 bytes and need no padding under natural alignment.
+typedef struct amdgpu_hsa_sampler_descriptor_s {
+  uint16_t size;  // presumably the descriptor size in bytes -- TODO confirm
+  amdgpu_hsa_metadata_kind16_t kind;
+  amdgpu_hsa_sampler_coord8_t coord;
+  amdgpu_hsa_sampler_filter8_t filter;
+  amdgpu_hsa_sampler_addressing8_t addressing;
+  uint8_t reserved1;
+} amdgpu_hsa_sampler_descriptor_t;
+
+// AMD GPU Image Geometry Enumeration Values; the 8-bit typedef is the type of the image descriptor's `geometry`.
+typedef uint8_t amdgpu_hsa_image_geometry8_t;
+typedef enum {
+  AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1,
+  AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7
+} amdgpu_hsa_image_geometry_t;
+
+// AMD GPU Image Channel Order Enumeration Values; the 8-bit typedef is the type of the image descriptor's `channel_order`.
+typedef uint8_t amdgpu_hsa_image_channel_order8_t;
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} amdgpu_hsa_image_channel_order_t;
+
+// AMD GPU Image Channel Type Enumeration Values; the 8-bit typedef is the type of the image descriptor's `channel_type`.
+typedef uint8_t amdgpu_hsa_image_channel_type8_t;
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} amdgpu_hsa_image_channel_type_t;
+
+// AMD GPU Image Descriptor; an 8-byte header followed by four 64-bit extents (40 bytes, no padding under natural alignment).
+typedef struct amdgpu_hsa_image_descriptor_s {
+  uint16_t size;  // presumably the descriptor size in bytes -- TODO confirm
+  amdgpu_hsa_metadata_kind16_t kind;
+  amdgpu_hsa_image_geometry8_t geometry;
+  amdgpu_hsa_image_channel_order8_t channel_order;
+  amdgpu_hsa_image_channel_type8_t channel_type;
+  uint8_t reserved1;
+  uint64_t width;
+  uint64_t height;
+  uint64_t depth;
+  uint64_t array;
+} amdgpu_hsa_image_descriptor_t;
+
+typedef struct amdgpu_hsa_note_code_object_version_s {  // presumably the NT_AMD_HSA_CODE_OBJECT_VERSION payload -- confirm
+  uint32_t major_version;
+  uint32_t minor_version;
+} amdgpu_hsa_note_code_object_version_t;
+
+typedef struct amdgpu_hsa_note_hsail_s {  // presumably the NT_AMD_HSA_HSAIL payload -- confirm
+  uint32_t hsail_major_version;
+  uint32_t hsail_minor_version;
+  uint8_t profile;
+  uint8_t machine_model;
+  uint8_t default_float_round;  // members end at byte 11; sizeof is typically padded to 12
+} amdgpu_hsa_note_hsail_t;
+
+typedef struct amdgpu_hsa_note_isa_s {  // presumably the NT_AMD_HSA_ISA_VERSION payload -- confirm
+  uint16_t vendor_name_size;
+  uint16_t architecture_name_size;
+  uint32_t major;
+  uint32_t minor;
+  uint32_t stepping;
+  char vendor_and_architecture_name[1];  // declared with 1 element; presumably a variable-length tail sized by the two *_size fields -- confirm
+} amdgpu_hsa_note_isa_t;
+
+typedef struct amdgpu_hsa_note_producer_s {  // presumably the NT_AMD_HSA_PRODUCER payload -- confirm
+  uint16_t producer_name_size;
+  uint16_t reserved;
+  uint32_t producer_major_version;
+  uint32_t producer_minor_version;
+  char producer_name[1];  // declared with 1 element; presumably a variable-length tail sized by producer_name_size -- confirm
+} amdgpu_hsa_note_producer_t;
+
+typedef struct amdgpu_hsa_note_producer_options_s {  // presumably the NT_AMD_HSA_PRODUCER_OPTIONS payload -- confirm
+  uint16_t producer_options_size;
+  char producer_options[1];  // declared with 1 element; presumably a variable-length tail sized by producer_options_size -- confirm
+} amdgpu_hsa_note_producer_options_t;
+
+typedef enum {  // Section kinds: {RODATA, DATA, BSS} x {GLOBAL_PROGRAM, GLOBAL_AGENT, READONLY_AGENT}, numbered 0..8.
+  AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0,
+  AMDGPU_HSA_RODATA_GLOBAL_AGENT,
+  AMDGPU_HSA_RODATA_READONLY_AGENT,
+  AMDGPU_HSA_DATA_GLOBAL_PROGRAM,
+  AMDGPU_HSA_DATA_GLOBAL_AGENT,
+  AMDGPU_HSA_DATA_READONLY_AGENT,
+  AMDGPU_HSA_BSS_GLOBAL_PROGRAM,
+  AMDGPU_HSA_BSS_GLOBAL_AGENT,
+  AMDGPU_HSA_BSS_READONLY_AGENT,
+  AMDGPU_HSA_SECTION_LAST,  // one past the last section kind (9)
+} amdgpu_hsa_elf_section_t;
+
+#endif // AMD_HSA_ELF_H
@@ -0,0 +1,270 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_KERNEL_CODE_H
+#define AMD_HSA_KERNEL_CODE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Kernel Code Version Enumeration Values; the 32-bit typedef is the type of amd_kernel_code_t's two version members.
+typedef uint32_t amd_kernel_code_version32_t;
+enum amd_kernel_code_version_t {
+  AMD_KERNEL_CODE_VERSION_MAJOR = 1,
+  AMD_KERNEL_CODE_VERSION_MINOR = 1
+};
+
+// AMD Machine Kind Enumeration Values; the 16-bit typedef is the type of amd_kernel_code_t::amd_machine_kind.
+typedef uint16_t amd_machine_kind16_t;
+enum amd_machine_kind_t {
+  AMD_MACHINE_KIND_UNDEFINED = 0,
+  AMD_MACHINE_KIND_AMDGPU = 1
+};
+
+// AMD Machine Version; type of amd_kernel_code_t's amd_machine_version_{major,minor,stepping} members.
+typedef uint16_t amd_machine_version16_t;
+
+// AMD Float Round Mode Enumeration Values (four values, 0..3, i.e. representable in a 2-bit field).
+enum amd_float_round_mode_t {
+  AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0,
+  AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+  AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+  AMD_FLOAT_ROUND_MODE_ZERO = 3
+};
+
+// AMD Float Denorm Mode Enumeration Values (four values, 0..3, i.e. representable in a 2-bit field).
+enum amd_float_denorm_mode_t {
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0,
+  AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1,
+  AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2,
+  AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
+};
+
+// AMD Compute Program Resource Register One; entries are (name, shift, width) and together cover bits 0..31 without gaps.
+typedef uint32_t amd_compute_pgm_rsrc_one32_t;
+enum amd_compute_pgm_rsrc_one_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6)
+};
+
+// AMD System VGPR Workitem ID Enumeration Values (four values, 0..3, i.e. representable in a 2-bit field).
+enum amd_system_vgpr_workitem_id_t {
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+  AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
+};
+
+// AMD Compute Program Resource Register Two; entries are (name, shift, width) and together cover bits 0..31 without gaps.
+typedef uint32_t amd_compute_pgm_rsrc_two32_t;
+enum amd_compute_pgm_rsrc_two_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1)
+};
+
+// AMD Element Byte Size Enumeration Values; each enumerator equals log2(named byte size) - 1.
+enum amd_element_byte_size_t {
+  AMD_ELEMENT_BYTE_SIZE_2 = 0,
+  AMD_ELEMENT_BYTE_SIZE_4 = 1,
+  AMD_ELEMENT_BYTE_SIZE_8 = 2,
+  AMD_ELEMENT_BYTE_SIZE_16 = 3
+};
+
+// AMD Kernel Code Properties; entries are (name, shift, width) and together cover bits 0..31 without gaps.
+typedef uint32_t amd_kernel_code_properties32_t;
+enum amd_kernel_code_properties_t {
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9)
+};
+
+// AMD Power Of Two Enumeration Values; each enumerator equals log2 of the power of two in its name.
+typedef uint8_t amd_powertwo8_t;
+enum amd_powertwo_t {
+  AMD_POWERTWO_1 = 0,
+  AMD_POWERTWO_2 = 1,
+  AMD_POWERTWO_4 = 2,
+  AMD_POWERTWO_8 = 3,
+  AMD_POWERTWO_16 = 4,
+  AMD_POWERTWO_32 = 5,
+  AMD_POWERTWO_64 = 6,
+  AMD_POWERTWO_128 = 7,
+  AMD_POWERTWO_256 = 8
+};
+
+// AMD Enabled Control Directive Enumeration Values; each enumerator is a distinct single bit (1 << 0 .. 1 << 8).
+typedef uint64_t amd_enabled_control_directive64_t;
+enum amd_enabled_control_directive_t {
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1,
+  AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8,
+  AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128,
+  AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
+};
+
+// AMD Exception Kind Enumeration Values; each enumerator is a distinct single bit (1 << 0 .. 1 << 4).
+typedef uint16_t amd_exception_kind16_t;
+enum amd_exception_kind_t {
+  AMD_EXCEPTION_KIND_INVALID_OPERATION = 1,
+  AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2,
+  AMD_EXCEPTION_KIND_OVERFLOW = 4,
+  AMD_EXCEPTION_KIND_UNDERFLOW = 8,
+  AMD_EXCEPTION_KIND_INEXACT = 16
+};
+
+// AMD Control Directives; 64-byte aligned, members total 128 bytes with no padding needed under natural alignment.
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64
+#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES)
+typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s {
+  amd_enabled_control_directive64_t enabled_control_directives;  // presumably an OR of amd_enabled_control_directive_t bits -- confirm
+  uint16_t enable_break_exceptions;  // presumably an OR of amd_exception_kind_t bits -- confirm
+  uint16_t enable_detect_exceptions;  // presumably an OR of amd_exception_kind_t bits -- confirm
+  uint32_t max_dynamic_group_size;
+  uint64_t max_flat_grid_size;
+  uint32_t max_flat_workgroup_size;
+  uint8_t required_dim;
+  uint8_t reserved1[3];
+  uint64_t required_grid_size[3];
+  uint32_t required_workgroup_size[3];
+  uint8_t reserved2[60];  // brings the member total to 128 bytes
+} amd_control_directives_t;
+
+// AMD Kernel Code; 64-byte aligned, members total 256 bytes with no padding needed under natural alignment.
+#define AMD_ISA_ALIGN_BYTES 256
+#define AMD_KERNEL_CODE_ALIGN_BYTES 64
+#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES)
+typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s {
+  amd_kernel_code_version32_t amd_kernel_code_version_major;
+  amd_kernel_code_version32_t amd_kernel_code_version_minor;
+  amd_machine_kind16_t amd_machine_kind;
+  amd_machine_version16_t amd_machine_version_major;
+  amd_machine_version16_t amd_machine_version_minor;
+  amd_machine_version16_t amd_machine_version_stepping;
+  int64_t kernel_code_entry_byte_offset;
+  int64_t kernel_code_prefetch_byte_offset;
+  uint64_t kernel_code_prefetch_byte_size;
+  uint64_t max_scratch_backing_memory_byte_size;
+  amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1;  // bit layout: amd_compute_pgm_rsrc_one_t
+  amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2;  // bit layout: amd_compute_pgm_rsrc_two_t
+  amd_kernel_code_properties32_t kernel_code_properties;  // bit layout: amd_kernel_code_properties_t
+  uint32_t workitem_private_segment_byte_size;
+  uint32_t workgroup_group_segment_byte_size;
+  uint32_t gds_segment_byte_size;
+  uint64_t kernarg_segment_byte_size;
+  uint32_t workgroup_fbarrier_count;
+  uint16_t wavefront_sgpr_count;
+  uint16_t workitem_vgpr_count;
+  uint16_t reserved_vgpr_first;
+  uint16_t reserved_vgpr_count;
+  uint16_t reserved_sgpr_first;
+  uint16_t reserved_sgpr_count;
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+  uint16_t debug_private_segment_buffer_sgpr;
+  amd_powertwo8_t kernarg_segment_alignment;  // presumably an amd_powertwo_t value (log2 encoding) -- confirm
+  amd_powertwo8_t group_segment_alignment;
+  amd_powertwo8_t private_segment_alignment;
+  amd_powertwo8_t wavefront_size;
+  int32_t call_convention;
+  uint8_t reserved1[12];
+  uint64_t runtime_loader_kernel_symbol;
+  amd_control_directives_t control_directives;  // starts at byte offset 128, a multiple of its 64-byte alignment
+} amd_kernel_code_t;
+
+// TODO: this struct should be completely gone once the debugger designs and
+// implements Debugger APIs. It holds four members: a raw ELF pointer, its size, a kernel name and an owning-segment pointer.
+typedef struct amd_runtime_loader_debug_info_s {
+  const void* elf_raw;
+  size_t elf_size;  // presumably the size in bytes of the image at elf_raw -- confirm
+  const char *kernel_name;
+  const void *owning_segment;
+} amd_runtime_loader_debug_info_t;
+
+#endif // AMD_HSA_KERNEL_CODE_H
@@ -0,0 +1,154 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_QUEUE_H
+#define AMD_HSA_QUEUE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Queue Properties.
+typedef uint32_t amd_queue_properties32_t;  // 32-bit storage type used for the queue_properties member of the queue structs.
+enum amd_queue_properties_t {  // Entries come from AMD_HSA_BITS_CREATE_ENUM_ENTRIES (defined outside this view); args look like (name, bit shift, bit width) - TODO confirm.
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1),
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27)  // 5 + 27 = 32, so the entries tile the whole 32-bit word under that reading.
+};
+
+// AMD Queue: alignment, in bytes, applied to both amd_queue_s and amd_queue_v2_s via AMD_QUEUE_ALIGN.
+#define AMD_QUEUE_ALIGN_BYTES 64
+#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES)
+
+// AMD Queue Capabilities.
+typedef uint32_t amd_queue_capabilities32_t;  // 32-bit storage type for capability bits; the queue structs declare `caps` as plain uint32_t.
+enum amd_queue_capabilities_t {  // Two single-bit entries (arguments 0,1 and 1,1); the generating macro is defined outside this view.
+  /* This version of CP FW supports dual-scratch and async-reclaim */
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_CP_ASYNC_RECLAIM, 0, 1),
+
+  /*
+   * This version of ROCr supports async-reclaim and CP FW may access the
+   * V2 fields.
+   */
+  AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM, 1, 1),
+};
+
+/* This is the original amd_queue_t structure. The definition is only kept
+ * for reference purposes. This structure should not be used. */
+typedef struct AMD_QUEUE_ALIGN amd_queue_s {  // Members up to scratch_backing_memory_location are declared identically in amd_queue_v2_s.
+  hsa_queue_t hsa_queue;  // First member; type comes from hsa.h.
+  uint32_t caps;  // presumably holds amd_queue_capabilities_t bits - TODO confirm.
+  uint32_t reserved1[3];
+  volatile uint64_t write_dispatch_id;
+  uint32_t group_segment_aperture_base_hi;
+  uint32_t private_segment_aperture_base_hi;
+  uint32_t max_cu_id;
+  uint32_t max_wave_id;
+  volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
+  volatile uint32_t legacy_doorbell_lock;
+  uint32_t reserved2[9];
+  volatile uint64_t read_dispatch_id;
+  uint32_t read_dispatch_id_field_base_byte_offset;
+  uint32_t compute_tmpring_size;
+  uint32_t scratch_resource_descriptor[4];
+  uint64_t scratch_backing_memory_location;
+  uint32_t reserved3[2];  // 8 bytes; amd_queue_v2_s declares uint64_t scratch_backing_memory_byte_size at this position.
+  uint32_t scratch_wave64_lane_byte_size;
+  amd_queue_properties32_t queue_properties;
+  uint32_t reserved4[2];  // 8 bytes; amd_queue_v2_s declares uint64_t scratch_max_use_index at this position.
+  hsa_signal_t queue_inactive_signal;
+  uint32_t reserved5[14];  // 56 bytes; amd_queue_v2_s declares its remaining V2 members (also 56 bytes in total) at this position.
+} amd_queue_t;
+
+/*
+ * AMD_QUEUE Version 2
+ * amd_queue_v2_t is backwards compatible with amd_queue_t structure and can
+ * be used with previous versions of CP FW. The added fields tagged as V2 are
+ * ignored when running previous versions of CP FW.
+ * CP FW will not try to access elements beyond the original 64-bytes
+ * (sizeof(amd_queue_t)) unless the AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM bit is set.
+ */
+
+#define MAX_NUM_XCC 128
+typedef struct scratch_last_used_index_xcc_s {  // One element of amd_queue_v2_s::scratch_last_used_index, which has MAX_NUM_XCC entries.
+  volatile uint64_t main;  // presumably tracks the main scratch area - TODO confirm.
+  volatile uint64_t alt;  // presumably tracks the alternate scratch area - TODO confirm.
+} scratch_last_used_index_xcc_t;
+
+typedef struct AMD_QUEUE_ALIGN amd_queue_v2_s {  // Declares the same leading members as amd_queue_s; later members replace that struct's reserved3/4/5 slots.
+  hsa_queue_t hsa_queue;
+  uint32_t caps;
+  uint32_t reserved1[3];
+  volatile uint64_t write_dispatch_id;
+  uint32_t group_segment_aperture_base_hi;
+  uint32_t private_segment_aperture_base_hi;
+  uint32_t max_cu_id;
+  uint32_t max_wave_id;
+  volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
+  volatile uint32_t legacy_doorbell_lock;
+  uint32_t reserved2[9];
+  volatile uint64_t read_dispatch_id;
+  uint32_t read_dispatch_id_field_base_byte_offset;
+  uint32_t compute_tmpring_size;
+  uint32_t scratch_resource_descriptor[4];
+  uint64_t scratch_backing_memory_location;
+  uint64_t scratch_backing_memory_byte_size;  // Sits where amd_queue_s has reserved3[2]; not tagged V2 in this file.
+  uint32_t scratch_wave64_lane_byte_size;
+  amd_queue_properties32_t queue_properties;
+  volatile uint64_t scratch_max_use_index;       /* V2 */
+  hsa_signal_t queue_inactive_signal;
+  volatile uint64_t alt_scratch_max_use_index;  /* V2 */
+  uint32_t alt_scratch_resource_descriptor[4];   /* V2 */
+  uint64_t alt_scratch_backing_memory_location;  /* V2 */
+  uint32_t alt_scratch_dispatch_limit_x;         /* V2 */
+  uint32_t alt_scratch_dispatch_limit_y;         /* V2 */
+  uint32_t alt_scratch_dispatch_limit_z;         /* V2 */
+  uint32_t alt_scratch_wave64_lane_byte_size;    /* V2 */
+  uint32_t alt_compute_tmpring_size;             /* V2 */
+  uint32_t reserved5;  // 4 bytes; brings the members since queue_inactive_signal to 56 bytes, matching amd_queue_s::reserved5[14].
+
+  scratch_last_used_index_xcc_t scratch_last_used_index[MAX_NUM_XCC];  // Appended after all members that mirror amd_queue_s.
+} amd_queue_v2_t;
+
+#endif // AMD_HSA_QUEUE_H
@@ -0,0 +1,79 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_SIGNAL_H
+#define AMD_HSA_SIGNAL_H
+
+#include "amd_hsa_common.h"
+#include "amd_hsa_queue.h"
+
+// AMD Signal Kind Enumeration Values.
+typedef int64_t amd_signal_kind64_t;  // Signed 64-bit storage type used for amd_signal_s::kind.
+enum amd_signal_kind_t {  // Invalid is 0, the user kind is positive, both doorbell kinds are negative.
+  AMD_SIGNAL_KIND_INVALID = 0,
+  AMD_SIGNAL_KIND_USER = 1,
+  AMD_SIGNAL_KIND_DOORBELL = -1,
+  AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
+};
+
+// AMD Signal.
+#define AMD_SIGNAL_ALIGN_BYTES 64
+#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES)
+typedef struct AMD_SIGNAL_ALIGN amd_signal_s {  // Signal record declared with AMD_SIGNAL_ALIGN (64-byte alignment request).
+  amd_signal_kind64_t kind;  // presumably one of the amd_signal_kind_t values; selects how the union below is read - TODO confirm.
+  union {  // Anonymous union: value and hardware_doorbell_ptr share the same storage.
+    volatile int64_t value;
+    volatile uint64_t* hardware_doorbell_ptr;
+  };
+  uint64_t event_mailbox_ptr;  // Stored as a 64-bit integer rather than a pointer type.
+  uint32_t event_id;
+  uint32_t reserved1;
+  uint64_t start_ts;
+  uint64_t end_ts;
+  union {  // Anonymous union: queue_ptr overlays the 64-bit reserved2 slot.
+    amd_queue_v2_t* queue_ptr;
+    uint64_t reserved2;
+  };
+  uint32_t reserved3[2];
+} amd_signal_t;
+
+#endif // AMD_HSA_SIGNAL_H
@@ -0,0 +1,97 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+ 
+#ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_
+#define HSA_RUNTIME_AMD_TOOL_EVENTS_H_
+
+// Insert license header
+
+#include <stddef.h>
+#include <stdint.h>
+#include "hsa.h"
+
+
+typedef enum {  // Bit flags (values 0, 1 << 0, 1 << 1) carried in the `flags` member of the scratch event structs.
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE = 0,
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE =
+      (1 << 0),  // This scratch allocation is only valid for 1 dispatch.
+  HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT =
+      (1 << 1),  // Used alternate scratch instead of main scratch
+} hsa_amd_event_scratch_alloc_flag_t;
+
+typedef enum {  // Event discriminator stored in the `kind` member of every event struct in this header.
+  HSA_AMD_TOOL_EVENT_MIN = 0,  // Lower sentinel; real events start at 1 because the following entries auto-increment.
+
+  // Scratch memory tracking
+  HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END,
+  HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START,
+  HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END,
+
+  // Add new events above ^
+  HSA_AMD_TOOL_EVENT_MAX  // Upper sentinel; its value is one past the last real event.
+} hsa_amd_tool_event_kind_t;
+
+typedef struct {  // Minimal event view holding only the discriminator.
+  hsa_amd_tool_event_kind_t kind;  // Every event struct in this header also starts with this member.
+} hsa_amd_tool_event_none_t;
+
+typedef struct {  // Payload for a scratch-allocation start event.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+  uint64_t dispatch_id;  // Dispatch ID of the AQL packet that needs more scratch memory
+} hsa_amd_event_scratch_alloc_start_t;
+
+typedef struct {  // Payload for a scratch-allocation end event; same leading members as the start payload plus size and num_slots.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+  uint64_t dispatch_id;  // Dispatch ID of the AQL packet that needs more scratch memory
+  size_t size;           // Amount of scratch allocated - in bytes
+  size_t num_slots;      // limit of number of waves
+} hsa_amd_event_scratch_alloc_end_t;
+
+typedef struct {  // Payload for a scratch-free start event; same members as the free-end payload.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_free_start_t;
+
+typedef struct {  // Payload for a scratch-free end event; same members as the free-start payload.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_free_end_t;
+
+typedef struct {  // Payload for an async-reclaim start event; same members as the async-reclaim end payload.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_async_reclaim_start_t;
+
+typedef struct {  // Payload for an async-reclaim end event; same members as the async-reclaim start payload.
+  hsa_amd_tool_event_kind_t kind;  // presumably HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END - set by the producer, not shown here.
+  const hsa_queue_t* queue;  // Read-only pointer to the queue the event refers to.
+  hsa_amd_event_scratch_alloc_flag_t flags;
+} hsa_amd_event_scratch_async_reclaim_end_t;
+
+typedef union {  // One pointer viewed as any of the event payload types; all members are pointers to const, so the union is pointer-sized.
+  const hsa_amd_tool_event_none_t* none;  // presumably used to read `kind` before picking a specific view - TODO confirm.
+  const hsa_amd_event_scratch_alloc_start_t* scratch_alloc_start;
+  const hsa_amd_event_scratch_alloc_end_t* scratch_alloc_end;
+  const hsa_amd_event_scratch_free_start_t* scratch_free_start;
+  const hsa_amd_event_scratch_free_end_t* scratch_free_end;
+  const hsa_amd_event_scratch_async_reclaim_start_t* scratch_async_reclaim_start;
+  const hsa_amd_event_scratch_async_reclaim_end_t* scratch_async_reclaim_end;
+} hsa_amd_tool_event_t;
+
+typedef hsa_status_t (*hsa_amd_tool_event)(hsa_amd_tool_event_t);  // Tool callback type: takes the event union by value and returns an hsa_status_t.
+
+
+#endif
@@ -0,0 +1,587 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H
+#define HSA_RUNTIME_INC_HSA_API_TRACE_H
+
+#include "hsa.h"
+#include "hsa_api_trace_version.h"
+#ifdef AMD_INTERNAL_BUILD
+#include "hsa_ext_image.h"
+#include "hsa_ext_amd.h"
+#include "hsa_ext_finalize.h"
+#include "hsa_amd_tool.h"
+#include "hsa_ven_amd_pc_sampling.h"
+#else
+#include "inc/hsa_ext_image.h"
+#include "inc/hsa_ext_amd.h"
+#include "inc/hsa_ext_finalize.h"
+#include "inc/hsa_amd_tool.h"
+#include "inc/hsa_ven_amd_pc_sampling.h"
+#endif
+
+#include <string.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h
+
+// Min function used to copy Api Tables
+static inline uint32_t Min(const uint32_t a, const uint32_t b) {  // Returns the smaller of the two unsigned 32-bit inputs.
+  return (a > b) ? b : a;  // b only when a is strictly greater; equal inputs return a.
+}
+
+// Declarations of APIs intended for use only by tools.
+
+// An AQL packet that can be put in an intercept queue to cause a callback to
+// be invoked when the packet is about to be submitted to the underlying
+// hardware queue. These packets are not copied to the underlying hardware
+// queue. These packets should come immediately before the regular AQL packet
+// they relate to. This implies that packet rewriters should always keep these
+// packets adjacent to the regular AQL packet that follows them.
+const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE;  // Value required in amd_aql_intercept_marker_s::format.
+
+struct amd_aql_intercept_marker_s;
+
+// When an intercept queue is processing rewritten packets to put them on the
+// underlying hardware queue, if it encounters a
+// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following
+// handler. packet points to the packet, queue is the underlying hardware
+// queue, and packet_id is the packet id of the next packet to be put on the
+// underlying hardware queue. The intercept queue does not put these packets
+// onto the underlying hardware queue.
+typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet,  // Uses the forward declaration above; the struct itself is defined after this typedef.
+                                             hsa_queue_t* queue, uint64_t packet_id);
+// An AQL vendor packet used by the intercept queue to mark the following
+// packet. The callback will be invoked to allow a tool to know where in the
+// underlying hardware queue the following packet will be placed. user_data can
+// be used to hold any data useful to the tool.
+typedef struct amd_aql_intercept_marker_s {  // Members total 64 bytes when the callback slot is 8 bytes wide: 2 + 1 + 5 + 8 + 6 * 8.
+  uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC.
+  uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER.
+  uint8_t reserved[5]; // Must be 0.
+#ifdef HSA_LARGE_MODEL
+  amd_intercept_marker_handler callback;  // Large model: the pointer alone fills the slot (presumably 8 bytes - confirm).
+#elif defined HSA_LITTLE_ENDIAN
+  amd_intercept_marker_handler callback;  // Small model, little endian: pointer first, then a 32-bit pad.
+  uint32_t reserved1; // Must be 0.
+#else
+  uint32_t reserved1; // Must be 0.
+  amd_intercept_marker_handler callback;  // Small model otherwise: 32-bit pad first, then the pointer.
+#endif
+  uint64_t user_data[6];  // 48 bytes left for the tool's own data.
+} amd_aql_intercept_marker_t;
+
+typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count);  // Writer callback handed to the intercept handler: takes a packet buffer and a packet count.
+typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count,  // Handler type registered through hsa_amd_queue_intercept_register.
+                                                uint64_t user_pkt_index, void* data,
+                                                hsa_amd_queue_intercept_packet_writer writer);
+hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue,  // Declaration only; the definition is outside this header.
+                                              hsa_amd_queue_intercept_handler callback,
+                                              void* user_data);
+hsa_status_t hsa_amd_queue_intercept_create(  // Declaration only; returns the new queue through the trailing hsa_queue_t** parameter.
+    hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type,
+    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
+    uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue);
+
+typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent,  // Notifier type registered through hsa_amd_runtime_queue_create_register.
+                                               void* data);
+hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback,  // Declaration only; the definition is outside this header.
+                                                   void* user_data);
+
+// Structure of Version used to identify an instance of Api table
+// Must be the first member (offsetof == 0) of all API tables.
+// This is the root of the table passing ABI.
+struct ApiTableVersion {  // Four uint32_t members (16 bytes); declared as the first member of every table struct in this header.
+  uint32_t major_id;
+  uint32_t minor_id;
+  uint32_t step_id;
+  uint32_t reserved;
+};
+
+struct ToolsApiTable {  // Version header followed by six callbacks, all of the hsa_amd_tool_event function-pointer type.
+  ApiTableVersion version;
+
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_end_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_free_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_free_end_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_start_fn;
+  hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_end_fn;
+};
+
+// Table to export HSA Finalizer Extension Apis
+struct FinalizerExtTable {  // Each entry is decltype(api)*, a pointer with exactly the declared signature of that API; member order is the table layout.
+  ApiTableVersion version;
+	decltype(hsa_ext_program_create)* hsa_ext_program_create_fn;
+	decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn;
+	decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn;
+	decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn;
+	decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn;
+	decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn;
+};
+
+// Table to export HSA Image Extension Apis
+struct ImageExtTable {  // Each entry is decltype(api)*, a pointer with exactly the declared signature of that API; member order is the table layout.
+  ApiTableVersion version;
+	decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn;
+	decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn;
+	decltype(hsa_ext_image_create)* hsa_ext_image_create_fn;
+	decltype(hsa_ext_image_import)* hsa_ext_image_import_fn;
+	decltype(hsa_ext_image_export)* hsa_ext_image_export_fn;
+	decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn;
+	decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn;
+	decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn;
+	decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn;
+	decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn;
+  decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn;  // The *_with_layout and *_v2 entries come after the original ten.
+  decltype(hsa_ext_image_data_get_info_with_layout)* hsa_ext_image_data_get_info_with_layout_fn;
+  decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn;
+  decltype(hsa_ext_sampler_create_v2)* hsa_ext_sampler_create_v2_fn;
+
+};
+
+// Table to export HSA PC Sampling Extension Apis
+struct PcSamplingExtTable {  // Each entry is decltype(api)*, a pointer with exactly the declared signature of that hsa_ven_amd_pcs_* API.
+  ApiTableVersion version;
+  decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn;
+  decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn;
+  decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn;
+  decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn;
+  decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn;
+  decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn;
+  decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn;
+};
+
+
+// Table to export AMD Extension Apis
+struct AmdExtTable {  // Each entry is decltype(api)*, a pointer with exactly the declared signature of that API; member order is the table layout.
+  ApiTableVersion version;
+	decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn;
+	decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn;
+  decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn;
+  decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn;
+  decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn;
+  decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn;
+  decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn;
+  decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn;
+  decltype(hsa_amd_async_function)* hsa_amd_async_function_fn;
+  decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn;
+  decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn;
+  decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn;
+  decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn;
+  decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn;
+  decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn;
+  decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
+  decltype(hsa_amd_memory_async_copy_on_engine)* hsa_amd_memory_async_copy_on_engine_fn;
+  decltype(hsa_amd_memory_copy_engine_status)* hsa_amd_memory_copy_engine_status_fn;
+  decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn;
+  decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn;
+  decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn;
+  decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn;
+  decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn;
+  decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn;
+  decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn;
+  decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn;
+  decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn;
+  decltype(hsa_amd_image_create)* hsa_amd_image_create_fn;
+  decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn;
+  decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn;
+  decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn;
+  decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn;
+  decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn;
+  decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn;
+  decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn;
+  decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn;
+  decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn;
+  decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;  // Prototype is declared earlier in this header.
+  decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;  // Prototype is declared earlier in this header.
+  decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
+  decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
+  decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn;  // Prototype is declared earlier in this header.
+  decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn;
+  decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
+  decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
+  decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn;
+  decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn;
+  decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn;
+  decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn;
+  decltype(hsa_amd_spm_acquire)* hsa_amd_spm_acquire_fn;
+  decltype(hsa_amd_spm_release)* hsa_amd_spm_release_fn;
+  decltype(hsa_amd_spm_set_dest_buffer)* hsa_amd_spm_set_dest_buffer_fn;
+  decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn;
+  decltype(hsa_amd_portable_export_dmabuf)* hsa_amd_portable_export_dmabuf_fn;
+  decltype(hsa_amd_portable_close_dmabuf)* hsa_amd_portable_close_dmabuf_fn;
+  decltype(hsa_amd_vmem_address_reserve)* hsa_amd_vmem_address_reserve_fn;
+  decltype(hsa_amd_vmem_address_free)* hsa_amd_vmem_address_free_fn;
+  decltype(hsa_amd_vmem_handle_create)* hsa_amd_vmem_handle_create_fn;
+  decltype(hsa_amd_vmem_handle_release)* hsa_amd_vmem_handle_release_fn;
+  decltype(hsa_amd_vmem_map)* hsa_amd_vmem_map_fn;
+  decltype(hsa_amd_vmem_unmap)* hsa_amd_vmem_unmap_fn;
+  decltype(hsa_amd_vmem_set_access)* hsa_amd_vmem_set_access_fn;
+  decltype(hsa_amd_vmem_get_access)* hsa_amd_vmem_get_access_fn;
+  decltype(hsa_amd_vmem_export_shareable_handle)* hsa_amd_vmem_export_shareable_handle_fn;
+  decltype(hsa_amd_vmem_import_shareable_handle)* hsa_amd_vmem_import_shareable_handle_fn;
+  decltype(hsa_amd_vmem_retain_alloc_handle)* hsa_amd_vmem_retain_alloc_handle_fn;
+  decltype(hsa_amd_vmem_get_alloc_properties_from_handle)*
+      hsa_amd_vmem_get_alloc_properties_from_handle_fn;
+  decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn;
+  decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn;
+  decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn;
+  decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn;
+  decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn;
+  decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn;
+  decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn;
+  decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn;
+  decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn;
+};
+
+// Table to export HSA Core Runtime Apis
+struct CoreApiTable {
+  ApiTableVersion version;
+  decltype(hsa_init)* hsa_init_fn;
+  decltype(hsa_shut_down)* hsa_shut_down_fn;
+  decltype(hsa_system_get_info)* hsa_system_get_info_fn;
+  decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn;
+  decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn;
+  decltype(hsa_iterate_agents)* hsa_iterate_agents_fn;
+  decltype(hsa_agent_get_info)* hsa_agent_get_info_fn;
+  decltype(hsa_queue_create)* hsa_queue_create_fn;
+  decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn;
+  decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
+  decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn;
+  decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
+  decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
+  decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
+  decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
+  decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
+  decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
+  decltype(hsa_queue_cas_write_index_scacq_screl)* hsa_queue_cas_write_index_scacq_screl_fn;
+  decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn;
+  decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn;
+  decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn;
+  decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn;
+  decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn;
+  decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn;
+  decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn;
+  decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn;
+  decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn;
+  decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn;
+  decltype(hsa_region_get_info)* hsa_region_get_info_fn;
+  decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn;
+  decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn;
+  decltype(hsa_memory_register)* hsa_memory_register_fn;
+  decltype(hsa_memory_deregister)* hsa_memory_deregister_fn;
+  decltype(hsa_memory_allocate)* hsa_memory_allocate_fn;
+  decltype(hsa_memory_free)* hsa_memory_free_fn;
+  decltype(hsa_memory_copy)* hsa_memory_copy_fn;
+  decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn;
+  decltype(hsa_signal_create)* hsa_signal_create_fn;
+  decltype(hsa_signal_destroy)* hsa_signal_destroy_fn;
+  decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn;
+  decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn;
+  decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
+  decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn;
+  decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn;
+  decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn;
+  decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn;
+  decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn;
+  decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn;
+  decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn;
+  decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn;
+  decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn;
+  decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn;
+  decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn;
+  decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn;
+  decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn;
+  decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn;
+  decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn;
+  decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn;
+  decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn;
+  decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn;
+  decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn;
+  decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn;
+  decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn;
+  decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn;
+  decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn;
+  decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn;
+  decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn;
+  decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn;
+  decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn;
+  decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn;
+  decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn;
+  decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn;
+  decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn;
+
+  //===--- Instruction Set Architecture -----------------------------------===//
+
+  decltype(hsa_isa_from_name)* hsa_isa_from_name_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_isa_get_info)* hsa_isa_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_isa_compatible)* hsa_isa_compatible_fn;
+
+  //===--- Code Objects (deprecated) --------------------------------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn;
+
+  //===--- Executable -----------------------------------------------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_executable_create)* hsa_executable_create_fn;
+  decltype(hsa_executable_destroy)* hsa_executable_destroy_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn;
+  decltype(hsa_executable_freeze)* hsa_executable_freeze_fn;
+  decltype(hsa_executable_get_info)* hsa_executable_get_info_fn;
+  decltype(hsa_executable_global_variable_define)*
+      hsa_executable_global_variable_define_fn;
+  decltype(hsa_executable_agent_global_variable_define)*
+      hsa_executable_agent_global_variable_define_fn;
+  decltype(hsa_executable_readonly_variable_define)*
+      hsa_executable_readonly_variable_define_fn;
+  decltype(hsa_executable_validate)* hsa_executable_validate_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn;
+  decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn;
+  // Deprecated since v1.1.
+  decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn;
+
+  //===--- Runtime Notifications ------------------------------------------===//
+
+  decltype(hsa_status_string)* hsa_status_string_fn;
+
+  // Start HSA v1.1 additions
+  decltype(hsa_extension_get_name)* hsa_extension_get_name_fn;
+  decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn;
+  decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn;
+  decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn;
+  decltype(hsa_cache_get_info)* hsa_cache_get_info_fn;
+  decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn;
+  decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn;
+  decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn;
+  decltype(hsa_signal_group_create)* hsa_signal_group_create_fn;
+  decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn;
+  decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn;
+  decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn;
+
+  //===--- Instruction Set Architecture - HSA v1.1 additions --------------===//
+
+  decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn;
+  decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn;
+  decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn;
+  decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn;
+  decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn;
+  decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn;
+
+  //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===//
+
+  // Deprecated since v1.1.
+  decltype(hsa_code_object_get_symbol_from_name)*
+      hsa_code_object_get_symbol_from_name_fn;
+
+  //===--- Executable - HSA v1.1 additions --------------------------------===//
+
+  decltype(hsa_code_object_reader_create_from_file)*
+      hsa_code_object_reader_create_from_file_fn;
+  decltype(hsa_code_object_reader_create_from_memory)*
+      hsa_code_object_reader_create_from_memory_fn;
+  decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn;
+  decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn;
+  decltype(hsa_executable_load_program_code_object)*
+      hsa_executable_load_program_code_object_fn;
+  decltype(hsa_executable_load_agent_code_object)*
+      hsa_executable_load_agent_code_object_fn;
+  decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn;
+  decltype(hsa_executable_get_symbol_by_name)*
+      hsa_executable_get_symbol_by_name_fn;
+  decltype(hsa_executable_iterate_agent_symbols)*
+      hsa_executable_iterate_agent_symbols_fn;
+  decltype(hsa_executable_iterate_program_symbols)*
+      hsa_executable_iterate_program_symbols_fn;
+};
+
+// Root table through which the HSA runtime exports its Apis: Core Runtime,
+// Amd Extensions, Finalizer, Images, Tools and PC Sampling
+struct HsaApiTable {
+
+  // Version of the root Api table
+  ApiTableVersion version;
+
+  // Function pointers of the HSA Core Runtime
+  CoreApiTable* core_;
+
+  // Function pointers of the AMD extensions
+  AmdExtTable* amd_ext_;
+
+  // Function pointers of the HSA Finalizer Extension
+  FinalizerExtTable* finalizer_ext_;
+
+  // Function pointers of the HSA Image Extension
+  ImageExtTable* image_ext_;
+
+  // Function pointers made available for tools to use
+  ToolsApiTable* tools_;
+
+  // Function pointers of the AMD PC Sampling Extension
+  PcSamplingExtTable* pc_sampling_ext_;
+};
+
+// Holds one instance of the root Api table and of every child Api table
+struct HsaApiTableContainer {
+  HsaApiTable root;
+  CoreApiTable core;
+  AmdExtTable amd_ext;
+  FinalizerExtTable finalizer_ext;
+  ImageExtTable image_ext;
+  ToolsApiTable tools;
+  PcSamplingExtTable pc_sampling_ext;
+
+  // Stamp each table with its version ids and link the children into root
+  HsaApiTableContainer() {
+    root.version.major_id = HSA_API_TABLE_MAJOR_VERSION;
+    root.version.minor_id = sizeof(HsaApiTable);
+    root.version.step_id = HSA_API_TABLE_STEP_VERSION;
+
+    root.core_ = &core;
+    core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION;
+    core.version.minor_id = sizeof(CoreApiTable);
+    core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION;
+
+    root.amd_ext_ = &amd_ext;
+    amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION;
+    amd_ext.version.minor_id = sizeof(AmdExtTable);
+    amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION;
+
+    root.finalizer_ext_ = &finalizer_ext;
+    finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION;
+    finalizer_ext.version.minor_id = sizeof(FinalizerExtTable);
+    finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION;
+
+    root.image_ext_ = &image_ext;
+    image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION;
+    image_ext.version.minor_id = sizeof(ImageExtTable);
+    image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION;
+
+    root.tools_ = &tools;
+    tools.version.major_id = HSA_TOOLS_API_TABLE_MAJOR_VERSION;
+    tools.version.minor_id = sizeof(ToolsApiTable);
+    tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION;
+
+    root.pc_sampling_ext_ = &pc_sampling_ext;
+    pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION;
+    pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable);
+    pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION;
+  }
+};
+
+// Copy the function pointers of Api table src into dest. size is the table
+// size in bytes; the leading ApiTableVersion of each table is skipped.
+static void inline copyApi(void* dest, const void* src, size_t size) {
+  assert(size >= sizeof(ApiTableVersion));
+  if (size <= sizeof(ApiTableVersion)) return;  // No underflow if NDEBUG.
+  memcpy((char*)dest + sizeof(ApiTableVersion),
+         (const char*)src + sizeof(ApiTableVersion), size - sizeof(ApiTableVersion));
+}
+
+// Copy one child Api table, or zero dest's version if src is not compatible.
+static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) {
+  if (!src->major_id || (dest->major_id != src->major_id)) {
+    dest->major_id = 0;
+    dest->minor_id = 0;
+    dest->step_id = 0;
+    return;
+  }
+  dest->step_id = src->step_id;
+  dest->minor_id = Min(dest->minor_id, src->minor_id);
+  copyApi(dest, src, dest->minor_id);
+}
+
+// Copies the root Api table and its child tables from src into dest. The
+// caller is expected to have initialized dest with the Major, Minor and
+// Stepping Ids of the Root and Child Api tables it was built against.
+// The Minor Id of dest is overwritten with the minimum of the source and
+// destination values, and the Stepping Id of dest is overwritten with
+// the value from the source.
+static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) {
+  // Root tables whose Major Ids differ are incompatible: zero dest's version
+  if (src->version.major_id != dest->version.major_id) {
+    dest->version.major_id = 0;
+    dest->version.minor_id = 0;
+    dest->version.step_id = 0;
+    return;
+  }
+
+  // The minor id encodes the struct size, so keep the smaller of the two
+  // values. The stepping id is always taken from the source
+  dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id);
+  dest->version.step_id = src->version.step_id;
+  const size_t root_size = dest->version.minor_id;
+
+  // Copy a child table only if its pointer lies within the common root size
+  if (offsetof(HsaApiTable, core_) < root_size)
+    copyElement(&dest->core_->version, &src->core_->version);
+  if (offsetof(HsaApiTable, amd_ext_) < root_size)
+    copyElement(&dest->amd_ext_->version, &src->amd_ext_->version);
+  if (offsetof(HsaApiTable, finalizer_ext_) < root_size)
+    copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version);
+  if (offsetof(HsaApiTable, image_ext_) < root_size)
+    copyElement(&dest->image_ext_->version, &src->image_ext_->version);
+  if (offsetof(HsaApiTable, tools_) < root_size)
+    copyElement(&dest->tools_->version, &src->tools_->version);
+  if (offsetof(HsaApiTable, pc_sampling_ext_) < root_size)
+    copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version);
+}
+#endif
@@ -0,0 +1,70 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
+#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
+
+// CODE IN THIS FILE **MUST** BE C-COMPATIBLE
+
+// Major Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_MAJOR_VERSION                 0x03
+#define HSA_CORE_API_TABLE_MAJOR_VERSION            0x02
+#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION         0x02
+#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION       0x02
+#define HSA_IMAGE_API_TABLE_MAJOR_VERSION           0x02
+#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION      0x01
+#define HSA_TOOLS_API_TABLE_MAJOR_VERSION           0x01
+#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION     0x01
+
+// Step Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_STEP_VERSION                  0x01
+#define HSA_CORE_API_TABLE_STEP_VERSION             0x00
+#define HSA_AMD_EXT_API_TABLE_STEP_VERSION          0x08
+#define HSA_FINALIZER_API_TABLE_STEP_VERSION        0x00
+#define HSA_IMAGE_API_TABLE_STEP_VERSION            0x01
+// Rocprofiler just checks HSA_IMAGE_EXT_API_TABLE_STEP_VERSION
+#define HSA_IMAGE_EXT_API_TABLE_STEP_VERSION        HSA_IMAGE_API_TABLE_STEP_VERSION
+#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION       0x00
+#define HSA_TOOLS_API_TABLE_STEP_VERSION            0x00
+#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION      0x00
+
+#endif  // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
@@ -0,0 +1,531 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_FINALIZER
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct BrigModuleHeader;
+typedef struct BrigModuleHeader* BrigModule_t;
+
+/** \defgroup ext-alt-finalizer-extensions Finalization Extensions
+ *  @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ */
+enum {
+  /**
+   * The HSAIL program is invalid.
+   */
+  HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000,
+  /**
+   * The HSAIL module is invalid.
+   */
+  HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001,
+  /**
+   * Machine model or profile of the HSAIL module do not match the machine model
+   * or profile of the HSAIL program.
+   */
+  HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002,
+  /**
+   * The HSAIL module is already a part of the HSAIL program.
+   */
+  HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003,
+  /**
+   * Compatibility mismatch between symbol declaration and symbol definition.
+   */
+  HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004,
+  /**
+   * The finalization encountered an error while finalizing a kernel or
+   * indirect function.
+   */
+  HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005,
+  /**
+   * Mismatch between a directive in the control directive structure and in
+   * the HSAIL kernel.
+   */
+  HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006
+};
+
+/** @} */
+
+/** \defgroup ext-alt-finalizer-program Finalization Program
+ *  @{
+ */
+
+/**
+ * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains
+ * the definition of the BrigModule_t type.
+ */
+typedef BrigModule_t hsa_ext_module_t;
+
+/**
+ * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL
+ * modules that collectively define functions and variables used by kernels and
+ * indirect functions.
+ */
+typedef struct hsa_ext_program_s {
+  /**
+   * Opaque handle.
+   */
+  uint64_t handle;
+} hsa_ext_program_t;
+
+/**
+ * @brief Create an empty HSAIL program.
+ *
+ * @param[in] machine_model Machine model used in the HSAIL program.
+ *
+ * @param[in] profile Profile used in the HSAIL program.
+ *
+ * @param[in] default_float_rounding_mode Default float rounding mode used in
+ * the HSAIL program.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[out] program Memory location where the HSA runtime stores the newly
+ * created HSAIL program handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid,
+ * @p profile is invalid, @p default_float_rounding_mode is invalid, or
+ * @p program is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_create(
+    hsa_machine_model_t machine_model,
+    hsa_profile_t profile,
+    hsa_default_float_rounding_mode_t default_float_rounding_mode,
+    const char *options,
+    hsa_ext_program_t *program);
+
+/**
+ * @brief Destroy a HSAIL program.
+ *
+ * @details The HSAIL program handle becomes invalid after it has been
+ * destroyed. Code object handles produced by ::hsa_ext_program_finalize are
+ * still valid after the HSAIL program has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with the HSAIL program
+ * (such as HSAIL modules that are added to the HSAIL program) can be released
+ * after the finalization program has been destroyed.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ */
+hsa_status_t HSA_API hsa_ext_program_destroy(
+    hsa_ext_program_t program);
+
+/**
+ * @brief Add a HSAIL module to an existing HSAIL program.
+ *
+ * @details The HSA runtime does not perform a deep copy of the HSAIL module
+ * upon addition. Instead, it stores a pointer to the HSAIL module. The
+ * ownership of the HSAIL module belongs to the application, which must ensure
+ * that @p module is not released before destroying the HSAIL program.
+ *
+ * The HSAIL module is successfully added to the HSAIL program if @p module is
+ * valid, if all the declarations and definitions for the same symbol are
+ * compatible, and if @p module specifies a machine model and profile that
+ * match those of the HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] module HSAIL module. The application can add the same HSAIL module
+ * to @p program at most once. The HSAIL module must specify the same machine
+ * model and profile as @p program. If the float rounding mode of @p
+ * module is not default, then it should match that of @p program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p
+ * module does not match machine model of @p program, or the profile of @p
+ * module does not match profile of @p program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is
+ * already a part of the HSAIL program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol
+ * definition compatibility mismatch. See the symbol compatibility rules in the
+ * HSA Programmer's Reference Manual.
+ */
+hsa_status_t HSA_API hsa_ext_program_add_module(
+    hsa_ext_program_t program,
+    hsa_ext_module_t module);
+
+/**
+ * @brief Iterate over the HSAIL modules in a program, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] callback Callback to be invoked once per HSAIL module in the
+ * program. The HSA runtime passes three arguments to the callback: the program,
+ * a HSAIL module, and the application data.  If @p callback returns a status
+ * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal
+ * stops and ::hsa_ext_program_iterate_modules returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_iterate_modules(
+    hsa_ext_program_t program,
+    hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module,
+                             void* data),
+    void* data);
+
+/**
+ * @brief HSAIL program attributes.
+ */
+typedef enum {
+  /**
+   * Machine model specified when the HSAIL program was created. The type
+   * of this attribute is ::hsa_machine_model_t.
+   */
+  HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0,
+  /**
+   * Profile specified when the HSAIL program was created. The type of
+   * this attribute is ::hsa_profile_t.
+   */
+  HSA_EXT_PROGRAM_INFO_PROFILE = 1,
+  /**
+   * Default float rounding mode specified when the HSAIL program was
+   * created. The type of this attribute is ::hsa_default_float_rounding_mode_t.
+   */
+  HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2
+} hsa_ext_program_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behaviour is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * HSAIL program attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_get_info(
+    hsa_ext_program_t program,
+    hsa_ext_program_info_t attribute,
+    void *value);
+
+/**
+ * @brief Finalizer-determined call convention.
+ */
+typedef enum {
+ /**
+  * Finalizer-determined call convention.
+  */
+  HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1
+} hsa_ext_finalizer_call_convention_t;
+
+/**
+ * @brief Control directives specify low-level information about the
+ * finalization process.
+ */
+typedef struct hsa_ext_control_directives_s {
+  /**
+   * Bitset indicating which control directives are enabled. The bit assigned to
+   * a control directive is determined by the corresponding value in
+   * BrigControlDirective.
+   *
+   * If a control directive is disabled, its corresponding field value (if any)
+   * must be 0. Control directives that are only present or absent (such as
+   * partial workgroups) have no corresponding field as the presence of the bit
+   * in this mask is sufficient.
+   */
+  uint64_t control_directives_mask;
+  /**
+   * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit
+   * assigned to an HSAIL exception is determined by the corresponding value
+   * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions
+   * control directive, the finalizer uses the union of the two masks.
+   */
+  uint16_t break_exceptions_mask;
+  /**
+   * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The
+   * bit assigned to an HSAIL exception is determined by the corresponding value
+   * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions
+   * control directive, the finalizer uses the union of the two masks.
+   */
+  uint16_t detect_exceptions_mask;
+  /**
+   * Maximum size (in bytes) of dynamic group memory that will be allocated by
+   * the application for any dispatch of the kernel.  If the kernel contains a
+   * maxdynamicsize control directive, the two values should match.
+   */
+  uint32_t max_dynamic_group_size;
+  /**
+   * Maximum number of grid work-items that will be used by the application to
+   * launch the kernel. If the kernel contains a maxflatgridsize control
+   * directive, the value of @a max_flat_grid_size must not be greater than the
+   * value of the directive, and takes precedence.
+   *
+   * The value specified for maximum absolute grid size must be greater than or
+   * equal to the product of the values specified by @a required_grid_size.
+   *
+   * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a
+   * control_directives_mask, this field must be greater than 0.
+   */
+  uint64_t max_flat_grid_size;
+  /**
+   * Maximum number of work-group work-items that will be used by the
+   * application to launch the kernel. If the kernel contains a
+   * maxflatworkgroupsize control directive, the value of @a
+   * max_flat_workgroup_size must not be greater than the value of the
+   * directive, and takes precedence.
+   *
+   * The value specified for maximum work-group size must be greater than or
+   * equal to the product of the values specified by @a required_workgroup_size.
+   *
+   * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a
+   * control_directives_mask, this field must be greater than 0.
+   */
+  uint32_t max_flat_workgroup_size;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+  /**
+   * Grid size that will be used by the application in any dispatch of the
+   * kernel. If the kernel contains a requiredgridsize control directive, the
+   * dimensions should match.
+   *
+   * The specified grid size must be consistent with @a required_workgroup_size
+   * and @a required_dim. Also, the product of the three dimensions must not
+   * exceed @a max_flat_grid_size. Note that the listed invariants must hold
+   * only if all the corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a
+   * control_directives_mask, the three dimension values must be greater than 0.
+   */
+  uint64_t required_grid_size[3];
+  /**
+   * Work-group size that will be used by the application in any dispatch of the
+   * kernel. If the kernel contains a requiredworkgroupsize control directive,
+   * the dimensions should match.
+   *
+   * The specified work-group size must be consistent with @a required_grid_size
+   * and @a required_dim. Also, the product of the three dimensions must not
+   * exceed @a max_flat_workgroup_size. Note that the listed invariants must
+   * hold only if all the corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a
+   * control_directives_mask, the three dimension values must be greater than 0.
+   */
+  hsa_dim3_t required_workgroup_size;
+  /**
+   * Number of dimensions that will be used by the application to launch the
+   * kernel. If the kernel contains a requireddim control directive, the two
+   * values should match.
+   *
+   * The specified dimensions must be consistent with @a required_grid_size and
+   * @a required_workgroup_size. This invariant must hold only if all the
+   * corresponding control directives are enabled.
+   *
+   * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a
+   * control_directives_mask, this field must be 1, 2, or 3.
+   */
+  uint8_t required_dim;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved2[75];
+} hsa_ext_control_directives_t;
+
+/**
+ * @brief Finalize an HSAIL program for a given instruction set architecture.
+ *
+ * @details Finalize all of the kernels and indirect functions that belong to
+ * the same HSAIL program for a specific instruction set architecture (ISA). The
+ * transitive closure of all functions specified by call or scall must be
+ * defined. Kernels and indirect functions that are being finalized must be
+ * defined. Kernels and indirect functions that are referenced in kernels and
+ * indirect functions being finalized may or may not be defined, but must be
+ * declared. All the global/readonly segment variables that are referenced in
+ * kernels and indirect functions being finalized may or may not be defined, but
+ * must be declared.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] isa Instruction set architecture to finalize for.
+ *
+ * @param[in] call_convention A call convention used in a finalization. Must
+ * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive)
+ * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p
+ * isa (not inclusive).
+ *
+ * @param[in] control_directives Low-level control directives that influence
+ * the finalization process.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[in] code_object_type Type of code object to produce.
+ *
+ * @param[out] code_object Code object generated by the Finalizer, which
+ * contains the machine code for the kernels and indirect functions in the HSAIL
+ * program. The code object is independent of the HSAIL module that was used to
+ * generate it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in
+ * the control directive structure and in the HSAIL kernel mismatch, or if the
+ * same directive is used with a different value in one of the functions used by
+ * this kernel.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer
+ * encountered an error while compiling a kernel or an indirect function.
+ */
+hsa_status_t HSA_API hsa_ext_program_finalize(
+    hsa_ext_program_t program,
+    hsa_isa_t isa,
+    int32_t call_convention,
+    hsa_ext_control_directives_t control_directives,
+    const char *options,
+    hsa_code_object_type_t code_object_type,
+    hsa_code_object_t *code_object);
+
+/** @} */
+
+#define hsa_ext_finalizer_1_00
+
+typedef struct hsa_ext_finalizer_1_00_pfn_s {
+  hsa_status_t (*hsa_ext_program_create)(
+      hsa_machine_model_t machine_model, hsa_profile_t profile,
+      hsa_default_float_rounding_mode_t default_float_rounding_mode,
+      const char *options, hsa_ext_program_t *program);
+
+  hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program);
+
+  hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program,
+                                                 hsa_ext_module_t module);
+
+  hsa_status_t (*hsa_ext_program_iterate_modules)(
+      hsa_ext_program_t program,
+      hsa_status_t (*callback)(hsa_ext_program_t program,
+                               hsa_ext_module_t module, void *data),
+      void *data);
+
+  hsa_status_t (*hsa_ext_program_get_info)(
+      hsa_ext_program_t program, hsa_ext_program_info_t attribute,
+      void *value);
+
+  hsa_status_t (*hsa_ext_program_finalize)(
+      hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention,
+      hsa_ext_control_directives_t control_directives, const char *options,
+      hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object);
+} hsa_ext_finalizer_1_00_pfn_t;
+
+#ifdef __cplusplus
+} // extern "C" block
+#endif // __cplusplus
+
+#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
@@ -0,0 +1,488 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+
+#include <stdint.h>
+#include "hsa.h"
+
+#define HSA_AQLPROFILE_VERSION_MAJOR 2
+#define HSA_AQLPROFILE_VERSION_MINOR 0
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+////////////////////////////////////////////////////////////////////////////////
+// Library version (major/minor). The explicit (void) makes these true prototypes when compiled as C.
+uint32_t hsa_ven_amd_aqlprofile_version_major(void);
+uint32_t hsa_ven_amd_aqlprofile_version_minor(void);
+
+///////////////////////////////////////////////////////////////////////
+// Library API:
+// The library provides helper methods for instantiation of
+// the profile context object and for populating of the start
+// and stop AQL packets. The profile object contains a profiling
+// events list and needed for profiling buffers descriptors,
+// a command buffer and an output data buffer. To check if there
+// was an error the library methods return a status code. Also
+// the library provides methods for querying required buffers
+// attributes, to validate the event attributes and to get profiling
+// output data.
+//
+// Returned status:
+//     hsa_status_t - HSA status codes are used from hsa.h header
+//
+// Supported profiling features:
+//
+// Supported profiling events
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0,    // performance counter (PMC) events
+  HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1,  // trace events
+} hsa_ven_amd_aqlprofile_event_type_t;          // used as the 'type' field of the profile context object
+
+// Supported performance counters (PMC) blocks
+// The block ID is the same for a block instances set, for example
+// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN
+// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC.
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14,
+  // Memory related blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24,
+  // System blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25,
+  // GFX10 added blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31,
+
+  // UMC & MMEA System Blocks
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32,
+  HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33,
+
+  HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER  // = 34: count of the block IDs above; must stay the last enumerator
+} hsa_ven_amd_aqlprofile_block_name_t;
+
+// PMC event object structure
+// 'counter_id' value is specified in GFXIPs perfcounter user guides
+// which is the counters select value, "Performance Counters Selection"
+// chapter.
+typedef struct {
+  hsa_ven_amd_aqlprofile_block_name_t block_name;  // block ID (shared by all instances of a block set)
+  uint32_t block_index;                            // presumably the instance index within the block set - TODO confirm
+  uint32_t counter_id;                             // counter select value
+} hsa_ven_amd_aqlprofile_event_t;
+
+// Check if event is valid for the specific GPU
+hsa_status_t hsa_ven_amd_aqlprofile_validate_event(
+    hsa_agent_t agent,                            // HSA handle for the profiling GPU
+    const hsa_ven_amd_aqlprofile_event_t* event,  // [in] Pointer to the event to validate
+    bool* result);                                // [out] True if the event is valid, False otherwise
+
+// Profiling parameters
+// All parameters are generic; if a parameter is not applicable to a specific
+// profile configuration then an error status will be returned.
+typedef enum {
+  /**
+   * Select the target compute unit (wgp) for profiling.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0,
+  /**
+   * VMID Mask
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4,
+  /**
+   * Shader engine mask for selection.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6,
+  /**
+   * Legacy. Deprecated.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7,
+  /**
+   * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi)
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8,
+  /**
+   * Set true for occupancy collection only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9,
+  /**
+   * ATT collection max data size, in MB. Shared among shader engines.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10,
+  /**
+   * Mask of which compute units to generate perfcounters. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240,  // note: values 11..239 are not assigned in this enum
+  /**
+   * Select collection period for perfcounters. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241,
+  /**
+   * Select perfcounter ID (SQ block) for collection. GFX9 only.
+   */
+  HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242,
+} hsa_ven_amd_aqlprofile_parameter_name_t;
+
+// Profile parameter object: one (name, value) pair
+typedef struct {
+  hsa_ven_amd_aqlprofile_parameter_name_t parameter_name;  // which parameter this entry sets
+  uint32_t value;                                          // parameter value; its meaning depends on parameter_name
+} hsa_ven_amd_aqlprofile_parameter_t;
+
+typedef enum {  // Channel selector, the 'channel' argument of hsa_ven_amd_aqlprofile_att_marker()
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 = 0,
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1,  // = 1
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2,  // = 2
+  HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3   // = 3
+} hsa_ven_amd_aqlprofile_att_marker_channel_t;
+
+//
+// Profile context object:
+// The library provides a profile object structure which contains
+// the events array, a buffer for the profiling start/stop commands
+// and a buffer for the output data.
+// The buffers are specified by the buffer descriptors and allocated
+// by the application. The buffers allocation attributes, the command
+// buffer size, the PMC output buffer size as well as profiling output
+// data can be obtained using the generic get profile info helper _get_info.
+//
+// Buffer descriptor
+typedef struct {
+  void* ptr;      // start address of the buffer
+  uint32_t size;  // buffer size (presumably in bytes - TODO confirm)
+} hsa_ven_amd_aqlprofile_descriptor_t;
+
+// Profile context object structure: holds the profiling events list, the
+// profiling parameters and the descriptors of the buffers needed for
+// profiling (a command buffer and an output data buffer)
+typedef struct {
+  hsa_agent_t agent;                                     // GFXIP handle
+  hsa_ven_amd_aqlprofile_event_type_t type;              // Events type
+  const hsa_ven_amd_aqlprofile_event_t* events;          // Events array
+  uint32_t event_count;                                  // Number of entries in 'events'
+  const hsa_ven_amd_aqlprofile_parameter_t* parameters;  // Parameters array
+  uint32_t parameter_count;                              // Number of entries in 'parameters'
+  hsa_ven_amd_aqlprofile_descriptor_t output_buffer;     // Output buffer
+  hsa_ven_amd_aqlprofile_descriptor_t command_buffer;    // PM4 commands
+} hsa_ven_amd_aqlprofile_profile_t;
+
+//
+// AQL packets populating methods:
+// Helper methods that populate the START and STOP AQL packets provided by
+// the application, which the application is required to submit before and
+// after the profiled GPU task packets respectively.
+//
+// AQL Vendor Specific packet which carries a PM4 command
+typedef struct {
+  uint16_t header;                 // packet header; the application sets the Vendor Specific type
+  uint16_t pm4_command[27];        // PM4 command payload: 27 16-bit words (54 bytes)
+  hsa_signal_t completion_signal;  // completion signal, set by the application
+} hsa_ext_amd_aql_pm4_packet_t;
+
+// Method to populate the provided AQL packet with profiling start commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_start(
+    hsa_ven_amd_aqlprofile_profile_t* profile,        // [in,out] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);  // [out] profile start AQL packet
+
+// Method to populate the provided AQL packet with profiling stop commands.
+// Only the 'pm4_command' fields of the packet are set; the application is
+// responsible for setting the Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_stop(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);   // [out] profile stop AQL packet
+
+// Method to populate the provided AQL packet with profiling read commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_read(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_read_packet);   // [out] profile read AQL packet
+
+// Legacy devices, PM4 profiling packet size. 'static' gives the constant internal linkage so several C files can include this header.
+static const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192;
+// Legacy devices, converting the profiling AQL packet to PM4 packet blob
+hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4(
+    const hsa_ext_amd_aql_pm4_packet_t* aql_packet,  // [in] AQL packet
+    void* data);                                     // [out] PM4 packet blob; presumably sized by HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE - TODO confirm
+
+// Method to add a marker (correlation ID) into the ATT buffer.
+hsa_status_t hsa_ven_amd_aqlprofile_att_marker(
+    hsa_ven_amd_aqlprofile_profile_t* profile,            // [in,out] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_marker_packet,      // [out] profile marker AQL packet
+    uint32_t data,                                        // [in] Marker data to be inserted
+    hsa_ven_amd_aqlprofile_att_marker_channel_t channel); // [in] Comm channel, one of the ..._ATT_CHANNEL_0..3 values
+
+//
+// Get profile info:
+// Generic method for getting various profile info including profile buffers
+// attributes like the command buffer size and the profiling PMC results.
+// It is implied that all counters are 64-bit values.
+//
+// Profile generic output data:
+typedef struct {
+  uint32_t sample_id;  // PMC sample or trace buffer index
+  union {  // presumably selected by the info type (PMC data vs trace data) - TODO confirm
+    struct {
+      hsa_ven_amd_aqlprofile_event_t event;  // PMC event
+      uint64_t result;                       // PMC result
+    } pmc_data;
+    hsa_ven_amd_aqlprofile_descriptor_t trace_data;  // Trace output data descriptor
+  };
+} hsa_ven_amd_aqlprofile_info_data_t;
+
+// ID query type (used with HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID)
+typedef struct {
+  const char* name;         // block name string to look up
+  uint32_t id;              // block id
+  uint32_t instance_count;  // block instances (presumably the instance count - TODO confirm)
+} hsa_ven_amd_aqlprofile_id_query_t;
+
+// Profile attributes (the 'attribute' argument of _get_info)
+typedef enum {
+  HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0,  // get_info returns uint32_t value
+  HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1,        // get_info returns uint32_t value
+  HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2,             // get_info returns PMC uint64_t value
+                                                        // in info_data object
+  HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3,           // get_info returns trace buffer ptr/size
+                                                        // in info_data object
+  HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4,       // get_info returns number of block counters
+  HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5,             // get_info returns block id, instances
+                                                        // by name string using _id_query_t
+  HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6,           // get_info returns size/pointer for
+                                                        // counters enable command buffer
+  HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7,          // get_info returns size/pointer for
+                                                        // counters disable command buffer
+} hsa_ven_amd_aqlprofile_info_type_t;
+
+
+// Definition of the output data iterator callback (the 'callback' argument of _iterate_data)
+typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)(
+    hsa_ven_amd_aqlprofile_info_type_t info_type,   // [in] data type, PMC or trace data
+    hsa_ven_amd_aqlprofile_info_data_t* info_data,  // [in] info_data object
+    void* callback_data);                           // [in,out] data passed to the callback
+
+// Method for getting the profile info; what 'value' receives depends on 'attribute' (see hsa_ven_amd_aqlprofile_info_type_t)
+hsa_status_t hsa_ven_amd_aqlprofile_get_info(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ven_amd_aqlprofile_info_type_t attribute,     // [in] requested profile attribute
+    void* value);                                     // [in,out] returned value
+
+// Method for iterating the events output data through 'callback'
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_data(
+    const hsa_ven_amd_aqlprofile_profile_t* profile,  // [in] profile context object
+    hsa_ven_amd_aqlprofile_data_callback_t callback,  // [in] callback to iterate the output data
+    void* data);                                      // [in,out] data passed to the callback
+
+// Return error string
+hsa_status_t hsa_ven_amd_aqlprofile_error_string(
+    const char** str);  // [out] pointer to the error string
+
+/**
+ * @brief Callback for iteration of all possible event coordinate IDs and coordinate names.
+ */
+typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name);  // id: coordinate ID, name: coordinate name
+/**
+ * @brief Iterate over all possible event coordinate IDs and their names, reporting them through the given callback.
+ */
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t);
+
+/**
+ * @brief Callback used by _iterate_event_coord to return event coordinates.
+ * @param position A counting sequence indicating callback number.
+ * @param id Coordinate ID as in _iterate_event_ids.
+ * @param extent Coordinate extent indicating maximum allowed instances.
+ * @param coordinate The coordinate, in the range [0,extent-1].
+ * @param name Coordinate name as in _iterate_event_ids.
+ * @param userdata Userdata passed to the _iterate_event_coord function.
+ */
+typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)(
+  int position,
+  int id,
+  int extent,
+  int coordinate,
+  const char* name,
+  void* userdata
+);
+
+/**
+ * @brief Iterate over all event coordinates for a given agent_t and event_t.
+ * @param[in] agent HSA agent.
+ * @param[in] event The event ID and block ID to iterate for.
+ * @param[in] sample_id hsa_ven_amd_aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data.
+ * @param[in] callback Callback function to return the coordinates.
+ * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
+ */
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord(
+  hsa_agent_t agent,
+  hsa_ven_amd_aqlprofile_event_t event,
+  uint32_t sample_id,
+  hsa_ven_amd_aqlprofile_coordinate_callback_t callback,
+  void* userdata
+);
+
+/**
+ * @brief Extension version and the file name of the aqlprofile shared library.
+ */
+#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1
+#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so"  // adjacent string literals are concatenated
+
+#ifdef HSA_LARGE_MODEL
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64");  // "libhsa-amd-aqlprofile64.so"
+#else
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("");  // "libhsa-amd-aqlprofile.so"
+#endif
+
+/**
+ * @brief Extension function table; each member is named after the API function it is meant to point to.
+ */
+typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s {
+  uint32_t (*hsa_ven_amd_aqlprofile_version_major)(void);  // explicit (void): a true prototype in C as well
+  uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(void);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)(
+      const char** str);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)(
+      hsa_agent_t agent,
+      const hsa_ven_amd_aqlprofile_event_t* event,
+      bool* result);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_start)(
+      hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_stop)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_read)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_read_packet);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)(
+      const hsa_ext_amd_aql_pm4_packet_t* aql_packet,
+      void* data);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ven_amd_aqlprofile_info_type_t attribute,
+      void* value);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)(
+      const hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ven_amd_aqlprofile_data_callback_t callback,
+      void* data);
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_ids)(
+      hsa_ven_amd_aqlprofile_eventname_callback_t
+  );
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_coord)(
+      hsa_agent_t agent,
+      hsa_ven_amd_aqlprofile_event_t event,
+      uint32_t sample_id,
+      hsa_ven_amd_aqlprofile_coordinate_callback_t callback,
+      void* userdata
+  );
+
+  hsa_status_t (*hsa_ven_amd_aqlprofile_att_marker)(
+      hsa_ven_amd_aqlprofile_profile_t* profile,
+      hsa_ext_amd_aql_pm4_packet_t* aql_packet,
+      uint32_t data,
+      hsa_ven_amd_aqlprofile_att_marker_channel_t channel
+  );
+} hsa_ven_amd_aqlprofile_1_00_pfn_t;
+
+typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t;  // unversioned alias of the 1_00 function table
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
@@ -0,0 +1,667 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension for additional loader functionality.
+
+#ifndef HSA_VEN_AMD_LOADER_H
+#define HSA_VEN_AMD_LOADER_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @brief Queries equivalent host address for given @p device_address, and
+ * records it in @p host_address.
+ *
+ *
+ * @details Contents of memory pointed to by @p host_address would be identical
+ * to contents of memory pointed to by @p device_address. Only difference
+ * between the two is host accessibility: @p host_address is always accessible
+ * from host, @p device_address might not be accessible from host.
+ *
+ * If @p device_address already points to host accessible memory, then the value
+ * of @p device_address is simply copied into @p host_address.
+ *
+ * The lifetime of @p host_address is the same as the lifetime of @p
+ * device_address, and both lifetimes are limited by the lifetime of the
+ * executable that is managing these addresses.
+ *
+ *
+ * @param[in] device_address Device address to query equivalent host address
+ * for.
+ *
+ * @param[out] host_address Pointer to application-allocated buffer to record
+ * queried equivalent host address in.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or
+ * null, or @p host_address is null.
+ */
+hsa_status_t hsa_ven_amd_loader_query_host_address(
+  const void *device_address,
+  const void **host_address);
+
+/**
+ * @brief The storage type of the code object that is backing loaded memory
+ * segment.
+ */
+typedef enum {
+  /**
+   * Loaded memory segment is not backed by any code object (anonymous), as the
+   * case would be with BSS (uninitialized data).
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0,
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * file.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1,
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * memory.
+   */
+  HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2
+} hsa_ven_amd_loader_code_object_storage_type_t;
+
+/**
+ * @brief Loaded memory segment descriptor.
+ *
+ *
+ * @details Loaded memory segment descriptor describes underlying loaded memory
+ * segment. Loaded memory segment is created/allocated by the executable during
+ * the loading of the code object that is backing underlying memory segment.
+ *
+ * The lifetime of underlying memory segment is limited by the lifetime of the
+ * executable that is managing underlying memory segment.
+ */
+typedef struct hsa_ven_amd_loader_segment_descriptor_s {
+  /**
+   * Agent underlying memory segment is allocated on. If the code object that is
+   * backing underlying memory segment is program code object, then 0.
+   */
+  hsa_agent_t agent;
+  /**
+   * Executable that is managing this underlying memory segment.
+   */
+  hsa_executable_t executable;
+  /**
+   * Storage type of the code object that is backing underlying memory segment.
+   */
+  hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated
+   *     filepath to the code object;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host
+   *     accessible pointer to the first byte of the code object.
+   */
+  const void *code_object_storage_base;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of
+   *     the filepath to the code object (including null-terminating character);
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in
+   *     bytes, of the memory occupied by the code object.
+   */
+  size_t code_object_storage_size;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+   *   - other, then offset, in bytes, from the beginning of the code object to
+   *     the first byte in the code object data is copied from.
+   */
+  size_t code_object_storage_offset;
+  /**
+   * Starting address of the underlying memory segment.
+   */
+  const void *segment_base;
+  /**
+   * Size, in bytes, of the underlying memory segment.
+   */
+  size_t segment_size;
+} hsa_ven_amd_loader_segment_descriptor_t;
+
+/**
+ * @brief Either queries loaded memory segment descriptors, or total number of
+ * loaded memory segment descriptors.
+ *
+ *
+ * @details If @p segment_descriptors is not null and @p num_segment_descriptors
+ * points to number that exactly matches total number of loaded memory segment
+ * descriptors, then queries loaded memory segment descriptors, and records them
+ * in @p segment_descriptors. If @p segment_descriptors is null and @p
+ * num_segment_descriptors points to zero, then queries total number of loaded
+ * memory segment descriptors, and records it in @p num_segment_descriptors. In
+ * all other cases returns appropriate error code (see below).
+ *
+ * The caller of this function is responsible for the allocation/deallocation
+ * and the lifetime of @p segment_descriptors and @p num_segment_descriptors.
+ *
+ * The lifetime of loaded memory segments that are described by queried loaded
+ * memory segment descriptors is limited by the lifetime of the executable that
+ * is managing loaded memory segments.
+ *
+ * Queried loaded memory segment descriptors are always self-consistent: they
+ * describe a complete set of loaded memory segments that are being backed by
+ * fully loaded code objects that are present at the time (i.e. this function
+ * is blocked until all executable manipulations are fully complete).
+ *
+ *
+ * @param[out] segment_descriptors Pointer to application-allocated buffer to
+ * record queried loaded memory segment descriptors in. Can be null if @p
+ * num_segment_descriptors points to zero.
+ *
+ * @param[in,out] num_segment_descriptors Pointer to application-allocated
+ * buffer that contains either total number of loaded memory segment descriptors
+ * or zero.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null
+ * while @p num_segment_descriptors points to non-zero number, @p
+ * segment_descriptors is not null while @p num_segment_descriptors points to
+ * zero, or @p num_segment_descriptors is null.
+ *
+ * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors
+ * does not point to number that exactly matches total number of loaded memory
+ * segment descriptors.
+ */
+hsa_status_t hsa_ven_amd_loader_query_segment_descriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors);
+
+/**
+ * @brief Obtains the handle of executable to which the device address belongs.
+ *
+ * @details This method should not be used to obtain executable handle by using
+ * a host address. The executable returned is expected to be alive until it is
+ * destroyed by the user.
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there
+ * is no executable found for this kernel code object.
+ */
+hsa_status_t hsa_ven_amd_loader_query_executable(
+  const void *device_address,     /* [in] device address to look up */
+  hsa_executable_t *executable);  /* [out] handle of the executable the address belongs to */
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the loaded code objects in an executable, and invoke
+ * an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per loaded code object. The
+ * HSA runtime passes three arguments to the callback: the executable, a
+ * loaded code object, and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and
+ * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that
+ * status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+  hsa_executable_t executable,
+  hsa_status_t (*callback)(
+    hsa_executable_t executable,
+    hsa_loaded_code_object_t loaded_code_object,
+    void *data),
+  void *data);
+
+/**
+ * @brief Loaded code object kind (see ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND).
+ */
+typedef enum {
+  /**
+   * Program code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1,
+  /**
+   * Agent code object.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2
+} hsa_ven_amd_loader_loaded_code_object_kind_t;
+
+/**
+ * @brief Loaded code object attributes.
+ */
+typedef enum hsa_ven_amd_loader_loaded_code_object_info_e {
+  /**
+   * The executable in which this loaded code object is loaded. The
+   * type of this attribute is ::hsa_executable_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1,
+  /**
+   * The kind of this loaded code object. The type of this attribute is
+   * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2,
+  /**
+   * The agent on which this loaded code object is loaded. The
+   * value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this
+   * attribute is ::hsa_agent_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3,
+  /**
+   * The storage type of the code object reader used to load the loaded code object.
+   * The type of this attribute is ::uint32_t interpreted as a
+   * ::hsa_ven_amd_loader_code_object_storage_type_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4,
+  /**
+   * The memory address of the first byte of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5,
+  /**
+   * The memory size in bytes of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+   * attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6,
+  /**
+   * The file descriptor of the code object that was loaded.
+   * The value of this attribute is only defined if
+   * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+   * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this
+   * attribute is ::int.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7,
+  /**
+   * The signed byte address difference of the memory address at which the code
+   * object is loaded minus the virtual address specified in the code object
+   * that is loaded. The value of this attribute is only defined if the
+   * executable in which the code object is loaded is frozen. The type of this
+   * attribute is ::int64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8,
+  /**
+   * The base memory address at which the code object is loaded. This is the
+   * base address of the allocation for the lowest addressed segment of the code
+   * object that is loaded. Note that any non-loaded segments before the first
+   * loaded segment are ignored. The value of this attribute is only defined if
+   * the executable in which the code object is loaded is frozen. The type of
+   * this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9,
+  /**
+   * The byte size of the loaded code object's contiguous memory allocation. The
+   * value of this attribute is only defined if the executable in which the code
+   * object is loaded is frozen. The type of this attribute is ::uint64_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10,
+  /**
+   * The length of the URI in bytes, not including the NUL terminator. The type
+   * of this attribute is uint32_t.
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11,
+  /**
+   * The URI name from which the code object was loaded. The type of this
+   * attribute is a NUL terminated \p char* with the length equal to the value
+   * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute.
+   * The URI name syntax is defined by the following BNF syntax:
+   *
+   *     code_object_uri ::== file_uri | memory_uri
+   *     file_uri        ::== "file://" file_path [ range_specifier ]
+   *     memory_uri      ::== "memory://" process_id range_specifier
+   *     range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
+   *     file_path       ::== URI_ENCODED_OS_FILE_PATH
+   *     process_id      ::== DECIMAL_NUMBER
+   *     number          ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
+   *
+   * ``number`` is a C integral literal where hexadecimal values are prefixed by
+   * "0x" or "0X", and octal values by "0".
+   *
+   * ``file_path`` is the file's path specified as a URI encoded UTF-8 string.
+   * In URI encoding, every character that is not in the regular expression
+   * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits
+   * preceded by "%".  Directories in the path are separated by "/".
+   *
+   * ``offset`` is a 0-based byte offset to the start of the code object.  For a
+   * file URI, it is from the start of the file specified by the ``file_path``,
+   * and if omitted defaults to 0. For a memory URI, it is the memory address
+   * and is required.
+   *
+   * ``size`` is the number of bytes in the code object.  For a file URI, if
+   * omitted it defaults to the size of the file.  It is required for a memory
+   * URI.
+   *
+   * ``process_id`` is the identity of the process owning the memory.  For Linux
+   * it is the C unsigned integral decimal literal for the process ID (PID).
+   *
+   * For example:
+   *
+   *     file:///dir1/dir2/file1
+   *     file:///dir3/dir4/file2#offset=0x2000&size=3000
+   *     memory://1234#offset=0x20000&size=3000
+   */
+  HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12,
+} hsa_ven_amd_loader_loaded_code_object_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given loaded code
+ * object.
+ *
+ * @param[in] loaded_code_object Loaded code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * loaded code object attribute, or @p value is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info(
+  hsa_loaded_code_object_t loaded_code_object,
+  hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+  void *value);  /* caller-allocated; must be large enough for the attribute's type */
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Create a code object reader to operate on a file with size and offset.
+ *
+ * @param[in] file File descriptor. The file must have been opened by
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[in] offset 0-based offset relative to the beginning of the @p file
+ * that denotes the beginning of the code object embedded within the @p file.
+ *
+ * @param[in] size Size of the code object embedded in @p file.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least
+ * read permissions. This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset
+ * do not form a valid code object, the file size is 0, or offset > file size.
+ * This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size(
+    hsa_file_t file,
+    size_t offset,
+    size_t size,
+    hsa_code_object_reader_t *code_object_reader);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the available executables, and invoke an
+ * application-defined callback on every iteration. While
+ * ::hsa_ven_amd_loader_iterate_executables is executing any calls to
+ * ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy will be blocked.
+ *
+ * @param[in] callback Callback to be invoked once per executable. The HSA
+ * runtime passes two arguments to the callback: the executable and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_ven_amd_loader_iterate_executables returns that status value. If
+ * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy then the behavior is undefined.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_iterate_executables(
+    hsa_status_t (*callback)(  /* must not create or destroy executables */
+      hsa_executable_t executable,
+      void *data),
+    void *data);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Extension version. NOTE(review): the leading zeros make 001003 an octal literal in C/C++.
+ */
+#define hsa_ven_amd_loader 001003
+
+/**
+ * @brief Extension function table version 1.00: host address, segment descriptor, executable queries.
+ */
+typedef struct hsa_ven_amd_loader_1_00_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+} hsa_ven_amd_loader_1_00_pfn_t;
+
+/**
+ * @brief Extension function table version 1.01: the 1.00 entries plus loaded code object iteration/info.
+ */
+typedef struct hsa_ven_amd_loader_1_01_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+} hsa_ven_amd_loader_1_01_pfn_t;
+
+/**
+ * @brief Extension function table version 1.02: the 1.01 entries plus the file offset/size reader.
+ */
+typedef struct hsa_ven_amd_loader_1_02_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+} hsa_ven_amd_loader_1_02_pfn_t;
+
+/**
+ * @brief Extension function table version 1.03: the 1.02 entries plus executable iteration.
+ */
+typedef struct hsa_ven_amd_loader_1_03_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+
+  hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+    const void *device_address,
+    hsa_executable_t *executable);
+
+  hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(
+      hsa_executable_t executable,
+      hsa_loaded_code_object_t loaded_code_object,
+      void *data),
+    void *data);
+
+  hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+    hsa_loaded_code_object_t loaded_code_object,
+    hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+    void *value);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+      hsa_file_t file,
+      size_t offset,
+      size_t size,
+      hsa_code_object_reader_t *code_object_reader);
+
+  hsa_status_t
+    (*hsa_ven_amd_loader_iterate_executables)(
+      hsa_status_t (*callback)(
+        hsa_executable_t executable,
+        void *data),
+      void *data);
+} hsa_ven_amd_loader_1_03_pfn_t;
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* HSA_VEN_AMD_LOADER_H */
@@ -0,0 +1,416 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_VEN_AMD_PC_SAMPLING_H
+#define HSA_VEN_AMD_PC_SAMPLING_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+
+/**
+ * @brief HSA AMD Vendor PC Sampling APIs
+ * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be
+ * modified extensively in the future
+ */
+
+/**
+ * @brief PC Sampling sample data for the hosttrap sampling method.
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;  // Bit-field widths total 32 (6 + 3 + 23)
+  uint32_t hw_id;
+  uint32_t reserved0;
+  uint64_t reserved1;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_hosttrap_v1_t;
+
+/**
+ * @brief PC Sampling sample data for the stochastic sampling method.
+ */
+typedef struct {
+  uint64_t pc;
+  uint64_t exec_mask;
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint32_t workgroup_id_z;
+  uint32_t wave_in_wg : 6;
+  uint32_t chiplet    : 3;   // Currently not used
+  uint32_t reserved   : 23;  // Bit-field widths total 32 (6 + 3 + 23)
+  uint32_t hw_id;
+  uint32_t perf_snapshot_data;
+  uint32_t perf_snapshot_data1;
+  uint32_t perf_snapshot_data2;
+  uint64_t timestamp;
+  uint64_t correlation_id;
+} perf_sample_snapshot_v1_t;
+
+/**
+ * @brief PC Sampling method kinds.
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1,  // = 0 (implicit)
+  HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1  // = 1 (implicit)
+} hsa_ven_amd_pcs_method_kind_t;
+
+/**
+ * @brief PC Sampling interval unit type.
+ */
+typedef enum {
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS,  // = 0 (implicit)
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES,  // = 1 (implicit)
+  HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS  // = 2 (implicit)
+} hsa_ven_amd_pcs_units_t;
+
+/**
+ * @brief HSA callback function to perform the copy into a destination buffer.
+ *
+ * If data_size is 0, HSA stops the current copy operation and keeps remaining data in internal
+ * buffers. Remaining contents of HSA internal buffers will be included in the next
+ * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling
+ * hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] hsa_callback_data private data to pass back to HSA. Provided in
+ * hsa_ven_amd_pcs_data_ready_callback_t
+ *
+ * @param[in] data_size size of destination buffer in bytes.
+ * @param[in] destination destination buffer
+ * @retval    TBD: but could be used to indicate that there is no more data to be read,
+ * or to indicate an error and abort the current copy operation.
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data,
+                                                             size_t data_size, void* destination);
+
+/**
+ * @brief HSA callback function to indicate that there is data ready to be copied
+ *
+ * When the client receives this callback, the client should call back @p data_copy_callback for HSA
+ * to perform the copy operation into an available buffer. @p data_copy_callback can be called back
+ * multiple times with smaller @p data_size to split the copy operation.
+ *
+ * This callback must not call ::hsa_ven_amd_pcs_flush.
+ *
+ * @param[in] client_callback_data client private data passed in via
+ * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id
+ * @param[in] data_size size of data available to be copied
+ * @param[in] lost_sample_count number of lost samples since the last call to
+ * hsa_ven_amd_pcs_data_ready_callback_t.
+ * @param[in] data_copy_callback callback function for HSA to perform the actual copy
+ * @param[in] hsa_callback_data private data to pass back to HSA
+ */
+typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)(
+    void* client_callback_data, size_t data_size, size_t lost_sample_count,
+    hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data);
+
+/**
+ * @brief Opaque handle representing a sampling session.
+ * Two handles with the same value refer to the same session.
+ */
+typedef struct {
+  uint64_t handle;
+} hsa_ven_amd_pcs_t;
+
+/**
+ * @brief PC Sampling configuration flag options
+ */
+typedef enum {
+  /* The interval for this sampling method has to be a power of 2 */
+  HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0)
+} hsa_ven_amd_pcs_configuration_flags_t;
+
+/**
+ * @brief PC Sampling method information.
+ * Used to provide the client with the list of supported PC Sampling methods.
+ */
+typedef struct {
+  hsa_ven_amd_pcs_method_kind_t method;
+  hsa_ven_amd_pcs_units_t units;  // presumably the unit of min_interval/max_interval - confirm
+  size_t min_interval;
+  size_t max_interval;
+  uint64_t flags;  // presumably hsa_ven_amd_pcs_configuration_flags_t bits - confirm
+} hsa_ven_amd_pcs_configuration_t;
+
+/**
+ * @brief Callback function to iterate through the list of supported PC Sampling configurations
+ *
+ * @param[in] configuration one entry describing a supported PC Sampling method and its options
+ * @param[in] callback_data client private callback data that was passed in when calling
+ * hsa_ven_amd_pcs_iterate_configuration
+ */
+typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)(
+    const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data);
+
+/**
+ * @brief Iterate through list of currently supported PC Sampling configurations for this @p agent
+ *
+ * HSA will call back @p configuration_callback for each currently available PC Sampling
+ * configuration. The list of currently available configurations may not be the complete list of
+ * configurations supported on the @p agent. The list of currently available configurations may be
+ * reduced if the @p agent is currently handling other PC sampling sessions.
+ *
+ * @param[in] agent target agent
+ * @param[in] configuration_callback callback function to iterate through list of configurations
+ * @param[in] callback_data client private callback data
+ **/
+hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
+    hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+    void* callback_data);
+
+/**
+ * @brief  Create a PC Sampling session on @p agent
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, as described by the
+ * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of
+ * hsa_ven_amd_pcs_iterate_configuration for this @p agent.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                    size_t buffer_size,
+                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                    void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling);
+
+
+/**
+ * @brief  Creates a PC Sampling session on @p agent. Assumes that the caller provides the
+ * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling
+ * on the @p agent.
+ *
+ * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing
+ * PC sampling session that was previously created in the underlying driver.
+ *
+ * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval
+ * parameters must be a legal configuration value, and match the parameters that we used to create
+ * the underlying PC Sampling session in the underlying driver.
+ * A successful call may restrict the list of possible PC sampling methods available to subsequent
+ * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations
+ * on what types of PC sampling they can perform concurrently.
+ * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call
+ *
+ * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver
+ * @param[in] agent target agent
+ * @param[in] method method to use
+ * @param[in] units sampling units
+ * @param[in] interval sampling interval in @p units
+ * @param[in] latency expected latency in microseconds for client to provide a buffer for the data
+ * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the
+ * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate
+ * how many samples are received within @p latency and call @p data_ready_callback ahead of time so
+ * that the client has @p latency time to allocate the buffer before the HSA-runtime internal
+ * buffers are full. The value of latency can be 0.
+ * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once
+ * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of
+ * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t.
+ * @param[in] data_ready_callback client callback function that will be called when:
+ *   1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received
+ *      within @p latency period.
+ * OR
+ *   2. When hsa_ven_amd_pcs_flush is called.
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback
+ * is called.
+ * @param[out] pc_sampling PC sampling session handle used to reference this session when calling
+ * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy
+ *
+ * @retval ::HSA_STATUS_SUCCESS session created successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters
+ * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and
+ * cannot handle the type requested.
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources
+ * @retval ::HSA_STATUS_ERROR Unexpected error
+ **/
+hsa_status_t hsa_ven_amd_pcs_create_from_id(
+    uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+    hsa_ven_amd_pcs_t* pc_sampling);
+
+/**
+ * @brief  Free a PC Sampling session
+ *
+ * Free all the resources allocated for the PC Sampling session @p pc_sampling.
+ * Internal buffers for this session will be lost.
+ * If the session was active, the session will be stopped before it is destroyed.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Start a PC Sampling session
+ *
+ * Activate a PC Sampling session that was previously created.
+ * The session will be in an active state after this call.
+ * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session started successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ * @retval ::HSA_STATUS_ERROR unexpected error
+ */
+hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Stop a PC Sampling session
+ *
+ * Stop a session that is currently active.
+ * After a session is stopped, HSA may still have some PC Sampling data in its internal buffers.
+ * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal
+ * buffers are not drained and the session is started again, the internal buffers will be available
+ * on the next data_ready_callback.
+ * If the session was already inactive, this will result in a no-op and will return
+ * HSA_STATUS_SUCCESS.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session stopped successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling);
+
+/**
+ * @brief  Flush internal buffers for a PC Sampling session
+ *
+ * Drain internal buffers for a PC Sampling session. If internal buffers have available data,
+ * this triggers a data_ready_callback.
+ *
+ * The function blocks until all PC samples associated with the @p pc_sampling session
+ * generated prior to the function call have been communicated by invocations of
+ * @p data_ready_callback having completed execution.
+ *
+ * @param[in] pc_sampling PC sampling session handle
+ *
+ * @retval ::HSA_STATUS_SUCCESS Session flushed successfully
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle
+ */
+hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling);
+
+#define hsa_ven_amd_pc_sampling_1_00  /* expands to nothing; presumably a feature-test marker - confirm */
+
+/**
+ * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by
+ * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */
+typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t {  // one entry per hsa_ven_amd_pcs_* function
+  hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)(
+      hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
+      void* callback_data);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                         hsa_ven_amd_pcs_units_t units, size_t interval,
+                                         size_t latency, size_t buffer_size,
+                                         hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                         void* client_callback_data,
+                                         hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_create_from_id)(
+      uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+      hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+      hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+      hsa_ven_amd_pcs_t* pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling);
+
+  hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling);
+
+} hsa_ven_amd_pc_sampling_1_00_pfn_t;
+
+#ifdef __cplusplus
+}  // end extern "C" block
+#endif /*__cplusplus*/
+
+#endif /* HSA_VEN_AMD_PC_SAMPLING_H */
@@ -0,0 +1,363 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// This file is used only for open source cmake builds. If we hardcode the
+// register values in amd_aql_queue.cpp then this file won't be required. For
+// now we are using this file, where register details are spelled out in the
+// structs/unions below.
+#ifndef _WSL_INC_REGISTERS_H_
+#define _WSL_INC_REGISTERS_H_
+
+// Resource type codes 0-3; only SQ_RSRC_BUF (0) has a non-reserved name.
+// NOTE(review): presumably the value written to the 2-bit TYPE field of the
+// SQ_BUF_RSRC_WORD3* unions below - confirm against the users of this header.
+typedef enum SQ_RSRC_BUF_TYPE {
+SQ_RSRC_BUF                              = 0x00000000,
+SQ_RSRC_BUF_RSVD_1                       = 0x00000001,
+SQ_RSRC_BUF_RSVD_2                       = 0x00000002,
+SQ_RSRC_BUF_RSVD_3                       = 0x00000003,
+} SQ_RSRC_BUF_TYPE;
+
+// Buffer data-format codes; sixteen consecutive values 0x0-0xf.
+// NOTE(review): presumably the encoding of the 4-bit DATA_FORMAT field of
+// SQ_BUF_RSRC_WORD3 - confirm.
+typedef enum BUF_DATA_FORMAT {
+BUF_DATA_FORMAT_INVALID                  = 0x00000000,
+BUF_DATA_FORMAT_8                        = 0x00000001,
+BUF_DATA_FORMAT_16                       = 0x00000002,
+BUF_DATA_FORMAT_8_8                      = 0x00000003,
+BUF_DATA_FORMAT_32                       = 0x00000004,
+BUF_DATA_FORMAT_16_16                    = 0x00000005,
+BUF_DATA_FORMAT_10_11_11                 = 0x00000006,
+BUF_DATA_FORMAT_11_11_10                 = 0x00000007,
+BUF_DATA_FORMAT_10_10_10_2               = 0x00000008,
+BUF_DATA_FORMAT_2_10_10_10               = 0x00000009,
+BUF_DATA_FORMAT_8_8_8_8                  = 0x0000000a,
+BUF_DATA_FORMAT_32_32                    = 0x0000000b,
+BUF_DATA_FORMAT_16_16_16_16              = 0x0000000c,
+BUF_DATA_FORMAT_32_32_32                 = 0x0000000d,
+BUF_DATA_FORMAT_32_32_32_32              = 0x0000000e,
+BUF_DATA_FORMAT_RESERVED_15              = 0x0000000f,
+} BUF_DATA_FORMAT;
+
+// Buffer numeric-format codes 0-7.
+// Value 6 deliberately has two names: one suffixed __SI__CI and one suffixed
+// __VI (where it is named as reserved).
+// NOTE(review): presumably the encoding of the 3-bit NUM_FORMAT field of
+// SQ_BUF_RSRC_WORD3 - confirm.
+typedef enum BUF_NUM_FORMAT {
+BUF_NUM_FORMAT_UNORM                     = 0x00000000,
+BUF_NUM_FORMAT_SNORM                     = 0x00000001,
+BUF_NUM_FORMAT_USCALED                   = 0x00000002,
+BUF_NUM_FORMAT_SSCALED                   = 0x00000003,
+BUF_NUM_FORMAT_UINT                      = 0x00000004,
+BUF_NUM_FORMAT_SINT                      = 0x00000005,
+BUF_NUM_FORMAT_SNORM_OGL__SI__CI         = 0x00000006,
+BUF_NUM_FORMAT_RESERVED_6__VI            = 0x00000006,
+BUF_NUM_FORMAT_FLOAT                     = 0x00000007,
+} BUF_NUM_FORMAT;
+
+// Combined buffer format code; only the single value 0x14 is defined here.
+// NOTE(review): presumably used for the FORMAT field of the GFX10+ variants of
+// SQ_BUF_RSRC_WORD3 - confirm.
+typedef enum BUF_FORMAT {
+BUF_FORMAT_32_UINT                       = 0x00000014,
+} BUF_FORMAT;
+
+// Component selector codes 0-7: constants 0 and 1, two reserved values, then
+// the X/Y/Z/W components.
+// NOTE(review): presumably the encoding of the 3-bit DST_SEL_* fields of the
+// SQ_BUF_RSRC_WORD3* unions - confirm.
+typedef enum SQ_SEL_XYZW01 {
+SQ_SEL_0                                 = 0x00000000,
+SQ_SEL_1                                 = 0x00000001,
+SQ_SEL_RESERVED_0                        = 0x00000002,
+SQ_SEL_RESERVED_1                        = 0x00000003,
+SQ_SEL_X                                 = 0x00000004,
+SQ_SEL_Y                                 = 0x00000005,
+SQ_SEL_Z                                 = 0x00000006,
+SQ_SEL_W                                 = 0x00000007,
+} SQ_SEL_XYZW01;
+
+	// 32-bit COMPUTE_TMPRING_SIZE image: WAVES (12 bits), WAVESIZE (13 bits) and
+	// 7 unnamed padding bits. The declaration order is reversed under
+	// BIGENDIAN_CPU; if neither endianness macro is defined the bitfield struct
+	// has no members. u32All/i32All/f32All alias the same storage.
+	union COMPUTE_TMPRING_SIZE {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                           WAVES : 12;
+		unsigned int                        WAVESIZE : 13;
+		unsigned int                                 : 7;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                                 : 7;
+		unsigned int                        WAVESIZE : 13;
+		unsigned int                           WAVES : 12;
+#endif
+	} bitfields, bits;
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+        // GFX11 variant of COMPUTE_TMPRING_SIZE: WAVES stays 12 bits, WAVESIZE is
+        // 15 bits and the unnamed padding is 5 bits (32 bits in total).
+        // u32All/i32All/f32All alias the same storage.
+        union COMPUTE_TMPRING_SIZE_GFX11 {
+          struct {
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int WAVES : 12;
+            unsigned int WAVESIZE : 15;
+            unsigned int : 5;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int : 5;
+            unsigned int WAVESIZE : 15;
+            unsigned int WAVES : 12;
+#endif
+          } bitfields, bits;
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+        // GFX12 variant of COMPUTE_TMPRING_SIZE: WAVES stays 12 bits, WAVESIZE is
+        // 18 bits and the unnamed padding is 2 bits (32 bits in total).
+        // u32All/i32All/f32All alias the same storage.
+        union COMPUTE_TMPRING_SIZE_GFX12 {
+          struct {
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int WAVES : 12;
+            unsigned int WAVESIZE : 18;
+            unsigned int : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int : 2;
+            unsigned int WAVESIZE : 18;
+            unsigned int WAVES : 12;
+#endif
+          } bitfields, bits;
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+        // Word 0 of the buffer resource descriptor: a single 32-bit BASE_ADDRESS
+        // field (both endianness branches are identical). The upper address bits
+        // are carried by BASE_ADDRESS_HI in SQ_BUF_RSRC_WORD1.
+        union SQ_BUF_RSRC_WORD0 {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                    BASE_ADDRESS : 32;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                    BASE_ADDRESS : 32;
+#endif
+	} bitfields, bits;
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+
+	// Word 1 of the buffer resource descriptor: BASE_ADDRESS_HI (16 bits),
+	// STRIDE (14 bits), CACHE_SWIZZLE (1 bit) and SWIZZLE_ENABLE (1 bit).
+	// u32All/i32All/f32All alias the same storage.
+	union SQ_BUF_RSRC_WORD1 {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                 BASE_ADDRESS_HI : 16;
+		unsigned int                          STRIDE : 14;
+		unsigned int                   CACHE_SWIZZLE : 1;
+		unsigned int                  SWIZZLE_ENABLE : 1;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                  SWIZZLE_ENABLE : 1;
+		unsigned int                   CACHE_SWIZZLE : 1;
+		unsigned int                          STRIDE : 14;
+		unsigned int                 BASE_ADDRESS_HI : 16;
+#endif
+	} bitfields, bits;
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+        // GFX11 variant of word 1: there is no CACHE_SWIZZLE bit and
+        // SWIZZLE_ENABLE is widened to 2 bits; BASE_ADDRESS_HI (16) and
+        // STRIDE (14) are unchanged.
+        union SQ_BUF_RSRC_WORD1_GFX11 {
+          struct {
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int BASE_ADDRESS_HI : 16;
+            unsigned int STRIDE : 14;
+            unsigned int SWIZZLE_ENABLE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int SWIZZLE_ENABLE : 2;
+            unsigned int STRIDE : 14;
+            unsigned int BASE_ADDRESS_HI : 16;
+#endif
+          } bitfields, bits;
+          unsigned int u32All;
+          signed int i32All;
+          float f32All;
+        };
+
+
+        // Word 2 of the buffer resource descriptor: a single 32-bit NUM_RECORDS
+        // field (both endianness branches are identical).
+        union SQ_BUF_RSRC_WORD2 {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+		unsigned int                     NUM_RECORDS : 32;
+#elif		defined(BIGENDIAN_CPU)
+		unsigned int                     NUM_RECORDS : 32;
+#endif
+	} bitfields, bits;
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+
+	// Word 3 of the buffer resource descriptor (base layout): four 3-bit
+	// destination selects, separate NUM_FORMAT (3) and DATA_FORMAT (4) fields,
+	// then element/stride/flag fields and a 2-bit TYPE; widths sum to 32 bits.
+	// Fields suffixed __CI__VI carry that suffix in the original register naming.
+	// u32All/i32All/f32All alias the same storage.
+	union SQ_BUF_RSRC_WORD3 {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                      NUM_FORMAT : 3;
+                unsigned int                     DATA_FORMAT : 4;
+                unsigned int                    ELEMENT_SIZE : 2;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                     ATC__CI__VI : 1;
+                unsigned int                     HASH_ENABLE : 1;
+                unsigned int                            HEAP : 1;
+                unsigned int                   MTYPE__CI__VI : 3;
+                unsigned int                            TYPE : 2;
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;
+                unsigned int                   MTYPE__CI__VI : 3;
+                unsigned int                            HEAP : 1;
+                unsigned int                     HASH_ENABLE : 1;
+                unsigned int                     ATC__CI__VI : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                    ELEMENT_SIZE : 2;
+                unsigned int                     DATA_FORMAT : 4;
+                unsigned int                      NUM_FORMAT : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif
+	} bitfields, bits;
+	unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+	};
+
+	// GFX10 variant of word 3: NUM_FORMAT/DATA_FORMAT are replaced by one 7-bit
+	// FORMAT field, and RESOURCE_LEVEL and OOB_SELECT are introduced; widths sum
+	// to 32 bits. u32All/i32All/f32All alias the same storage.
+	union SQ_BUF_RSRC_WORD3_GFX10 {
+	struct {
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                            TYPE : 2;
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif
+        } bitfields, bits;
+        unsigned int u32All;
+        signed int i32All;
+        float f32All;
+        };
+
+        // From V# Table
+        // GFX11 variant of word 3: FORMAT is 6 bits, RESERVED1 is 3 bits and
+        // RESERVED2 is 4 bits (no RESOURCE_LEVEL field); widths sum to 32 bits.
+        // u32All/i32All/f32All alias the same storage.
+        union SQ_BUF_RSRC_WORD3_GFX11 {
+          struct {
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int DST_SEL_X : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_W : 3;
+            unsigned int FORMAT : 6;
+            unsigned int RESERVED1 : 3;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int RESERVED2 : 4;
+            unsigned int OOB_SELECT : 2;
+            unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int TYPE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int RESERVED2 : 4;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int RESERVED1 : 3;
+            unsigned int FORMAT : 6;
+            unsigned int DST_SEL_W : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_X : 3;
+#endif
+          } bitfields, bits;
+        unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+        };
+                        // From V# Table
+                        // GFX12 variant of word 3: same leading fields as the GFX11
+                        // layout, but the 4 bits before OOB_SELECT are split into
+                        // WRITE_COMPRESS_ENABLE (1), COMPRESSION_EN (1) and
+                        // COMPRESSION_ACCESS_MODE (2); widths sum to 32 bits.
+        union SQ_BUF_RSRC_WORD3_GFX12 {
+          struct {
+#if defined(LITTLEENDIAN_CPU)
+            unsigned int DST_SEL_X : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_W : 3;
+            unsigned int FORMAT : 6;
+            unsigned int RESERVED1 : 3;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int WRITE_COMPRESS_ENABLE : 1;
+            unsigned int COMPRESSION_EN : 1;
+            unsigned int COMPRESSION_ACCESS_MODE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+            unsigned int TYPE : 2;
+            unsigned int OOB_SELECT : 2;
+            unsigned int COMPRESSION_ACCESS_MODE : 2;
+            unsigned int COMPRESSION_EN : 1;
+            unsigned int WRITE_COMPRESS_ENABLE : 1;
+            unsigned int ADD_TID_ENABLE : 1;
+            unsigned int INDEX_STRIDE : 2;
+            unsigned int RESERVED1 : 3;
+            unsigned int FORMAT : 6;
+            unsigned int DST_SEL_W : 3;
+            unsigned int DST_SEL_Z : 3;
+            unsigned int DST_SEL_Y : 3;
+            unsigned int DST_SEL_X : 3;
+#endif
+          } bitfields, bits;
+        unsigned int	u32All;
+	signed int	i32All;
+	float	f32All;
+        };
+#endif  // header guard
@@ -0,0 +1,122 @@
+#ifndef _WSL_INC_THUNK_PROXY_H_
+#define _WSL_INC_THUNK_PROXY_H_
+
+#include <vector>
+
+namespace thunk_proxy {
+// Allocation domain passed to SetAllocationInfo(). Values are consecutive from
+// 0; kDomainCount is the number of real domains and is not itself a domain.
+enum AllocDomain {
+  kSystem,
+  kLocal,
+  kUserMemory,
+  kUserQueue,
+  kDomainCount,
+};
+
+// Single-bit memory attribute flags; distinct bits, so they can be OR-ed.
+// NOTE(review): presumably the values carried by the mem_flags argument of
+// SetAllocationInfo() - confirm.
+enum MemFlag {
+  kFineGrain  = (1ULL << 0),
+  kKernarg    = (1ULL << 1),
+};
+
+// Single-bit engine flags; distinct bits, so they can be OR-ed.
+// NOTE(review): presumably what QueueEngine2EngineFlag() returns and what the
+// engine_flag argument of SetAllocationInfo() carries - confirm.
+enum EngineFlag {
+  KCOMPUTE0   = (1ULL << 0),
+  KDRMDMA     = (1ULL << 1),
+  KDRMDMA1    = (1ULL << 2),
+};
+
+// Scheduling level for FillinHwQueuePrivData(), which defaults to kNormal.
+enum SchedLevel {
+  kLow = 0,
+  kNormal = 1,
+  kHigh = 2,
+};
+
+// Hardware-scheduler (HWS) enablement information.
+// The anonymous union exposes the same 32 bits either as per-engine flags
+// (hwsMask) or as one raw word (osHwsEnableFlags).
+struct HwsInfo {
+  union {
+    struct {
+      uint32_t gfxHwsEnabled     : 1;
+      uint32_t computeHwsEnabled : 1;
+      uint32_t dmaHwsEnabled     : 1;
+      uint32_t dma1HwsEnabled    : 1;
+      uint32_t reserved          : 28;  // pads the flags out to 32 bits
+    } hwsMask;
+    uint32_t osHwsEnableFlags;
+  };
+  uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS
+};
+
+// Per-adapter device description, passed by pointer or reference to the
+// thunk_proxy functions declared below.
+// The struct is not trivially copyable because it contains a std::vector
+// (sdma_schedid), so it must not be memset/memcpy'd.
+// NOTE(review): this header uses MAX_PATH, D3DKMT_HANDLE and the fixed-width
+// integer types without including their definitions; it relies on the including
+// file having pulled them in first - confirm include order at each use site.
+typedef struct {
+  int major;
+  int minor;
+  int stepping;
+  bool is_dgpu;
+  char product_name[MAX_PATH];
+  uint64_t uuid;
+  uint32_t family;
+  uint32_t device_id;
+  uint32_t wavefront_size;
+  uint32_t compute_unit_count;
+  uint32_t max_engine_clock_mhz;
+  uint32_t watch_points_num;
+  uint32_t pci_bus_addr;
+  uint32_t memory_bus_width;
+  uint32_t max_memory_clock_mhz;
+  uint64_t gpu_counter_frequency;
+  uint32_t wave_per_cu;
+  uint32_t simd_per_cu;
+  uint32_t max_scratch_slots_per_cu;
+  uint32_t num_shader_engine;
+  uint32_t shader_array_per_shader_engine;
+  uint32_t domain;
+  uint32_t num_gws;
+  uint32_t asic_revision;
+  uint64_t local_visible_heap_size;
+  uint64_t local_invisible_heap_size;
+  uint64_t non_local_heap_size;
+  uint64_t private_aperture_base;
+  uint64_t private_aperture_size;
+  uint64_t shared_aperture_base;
+  uint64_t shared_aperture_size;
+  uint32_t user_queue_size;
+  uint32_t lds_size;
+  uint32_t big_page_alignment_size;
+  uint32_t hw_big_page_min_alignment_size;
+  uint32_t hw_big_page_alignment_size;
+  bool enable_big_page_alignment;
+  uint32_t mec_fw_version;
+  uint32_t sdma_fw_version;
+  uint32_t l1_cache_size;
+  uint32_t l2_cache_size;
+  uint32_t l3_cache_size;
+  uint32_t gl2_cacheline_size;
+  uint32_t num_cp_queues;
+  HwsInfo hwsInfo;
+  std::vector<int> sdma_schedid;  // one entry per SDMA engine
+  uint32_t compute_schedid;
+  bool state_shadowing_by_cpfw;
+  bool platform_atomic_support;
+  void *adapter_info;  // opaque pointer; NOTE(review): ownership not visible here - confirm
+  uint32_t kmd_version;
+} DeviceInfo;
+
+// --- Adapter / engine queries -------------------------------------------------
+// All functions below are only declared here; their definitions are not part of
+// this header.
+int EngineOrdinal(int engine, DeviceInfo *device_info);
+bool GetHwsEnabled(int engine, DeviceInfo *device_info);
+bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info);
+// NOTE(review): presumably populates *device_info from the adapter and returns
+// false on failure - confirm against the implementation.
+bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info);
+bool QueryAdapterSupported(unsigned int device_id);
+
+// --- Allocation private data --------------------------------------------------
+uint32_t QueueEngine2EngineFlag(uint32_t queue_engine);
+void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain,
+                      uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info);
+void GetAllocPrivDataSize(int *priv_drv_data_size, int *priv_alloc_data_size);
+void FillinAllocPrivDrvData(void *drv_priv, int priv_alloc_data_size);
+
+// --- Submit / queue / context / power private data ----------------------------
+// Each Get*PrivDataSize() is paired with a Fillin*PrivData() that writes into a
+// caller-provided buffer.
+// NOTE(review): presumably the buffer must be at least the size returned by the
+// matching Get*PrivDataSize() - confirm.
+int GetSubmitPrivDataSize();
+void FillinSubmitPrivData(void *priv_data, D3DKMT_HANDLE queue, uint64_t command_addr,
+                        uint64_t command_size, bool is_hw_queue);
+int GetHwQueuePrivDataSize();
+void FillinHwQueuePrivData(void *priv_data, bool FwManagedGfxState, SchedLevel level = kNormal);
+int GetContextPrivDataSize();
+void FillinContextPrivData(void *priv_data, bool FwManagedGfxState);
+int GetPowerOptPrivDataSize();
+void FillinPowerOptPrivData(void *priv_data, bool restore);
+}
+#endif
@@ -0,0 +1,169 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+#define _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+
+#include <stdint.h>
+
+#include <no_sal2.h>
+
+// Fixed-width stand-ins for the Windows SDK basic types, so that SDK headers can
+// be compiled without <windows.h>.
+// NOTE(review): UINT_PTR, ULONG_PTR and ULONG64_PTR are declared here as
+// *pointers* to the 32/64-bit integer types, whereas the Windows SDK defines
+// UINT_PTR/ULONG_PTR as pointer-sized unsigned *integers*. On a 64-bit target
+// the sizes match, but arithmetic and conversions differ - confirm this is
+// intended before using them outside struct layouts.
+// NOTE(review): WCHAR/PCWSTR use a signed 16-bit element and SIZE_T is fixed at
+// 64 bits - confirm against the targets this header is built for.
+typedef uint32_t UINT, *UINT_PTR;
+typedef int32_t  INT32;
+typedef int32_t  LONG;
+typedef uint32_t ULONG, *ULONG_PTR;
+typedef int64_t  LONGLONG;
+typedef int64_t  LONG64;
+typedef uint64_t ULONGLONG;
+typedef uint64_t ULONG64, *ULONG64_PTR;
+typedef uint8_t  BYTE;
+typedef uint16_t WORD;
+typedef uint32_t DWORD;
+typedef int32_t  BOOL;
+typedef int32_t  NTSTATUS;
+typedef uint16_t USHORT;
+typedef uint16_t UINT16;
+typedef uint32_t UINT32;
+typedef uint64_t UINT64;
+typedef int32_t  INT;
+typedef uint64_t SIZE_T;
+typedef void VOID;
+typedef float FLOAT;
+typedef char CHAR;
+typedef unsigned char UCHAR;
+typedef UCHAR BOOLEAN;
+typedef int16_t WCHAR;
+typedef void *HANDLE;
+typedef void *PVOID;
+typedef void *LPVOID;
+typedef const int16_t *PCWSTR;
+
+// Self-referential defines: they expand to the typedef names above, but make
+// `#ifdef ULONG`-style checks in other headers see these names as defined.
+#define ULONG ULONG
+#define ULONG_PTR ULONG_PTR
+#define USHORT USHORT
+
+// DECLARE_HANDLE(name): declares a distinct opaque pointer type `name` backed by
+// a dummy struct `name__`, so different handle kinds do not convert implicitly.
+#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name
+// C_ASSERT(e): compile-time assertion; a false `e` yields a negative array size.
+#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1]
+
+// Opaque handle types.
+// NOTE(review): PALETTEENTRY is declared here as a handle (a pointer type); in
+// the Windows SDK it is a struct - presumably only needed so SDK headers parse.
+DECLARE_HANDLE(HWND);
+DECLARE_HANDLE(HDC);
+DECLARE_HANDLE(PALETTEENTRY);
+
+// 2-D point with 32-bit signed coordinates.
+typedef struct tagPOINT {
+    LONG x;
+    LONG y;
+} POINT;
+
+// Rectangle described by four 32-bit signed edge coordinates.
+typedef struct tagRECT {
+    LONG left;
+    LONG top;
+    LONG right;
+    LONG bottom;
+} RECT;
+
+// Same member layout as RECT, declared as a separate type.
+typedef struct tagRECTL {
+    LONG left;
+    LONG top;
+    LONG right;
+    LONG bottom;
+} RECTL;
+
+// 64-bit signed value (QuadPart) that can also be accessed as two 32-bit halves
+// through `u`.
+// NOTE(review): HighPart is an unsigned DWORD here; the Windows SDK declares it
+// as a signed LONG - confirm no caller relies on its sign.
+typedef union _LARGE_INTEGER {
+	struct {
+		DWORD LowPart;
+		DWORD HighPart;
+	} u;
+	LONGLONG QuadPart;
+} LARGE_INTEGER;
+
+typedef LARGE_INTEGER *PLARGE_INTEGER;
+
+// 64-bit unsigned value (QuadPart) that can also be accessed as two 32-bit
+// halves, either through `u` or through the first struct member.
+// NOTE(review): unless DUMMYSTRUCTNAME is defined as an empty macro by an
+// earlier include, the first struct is a regular member literally named
+// DUMMYSTRUCTNAME rather than an anonymous struct - confirm.
+typedef union _ULARGE_INTEGER {
+    struct {
+        ULONG LowPart;
+        ULONG HighPart;
+    } DUMMYSTRUCTNAME;
+    struct {
+        ULONG LowPart;
+        ULONG HighPart;
+    } u;
+    ULONGLONG QuadPart;
+} ULARGE_INTEGER;
+
+typedef ULARGE_INTEGER *PULARGE_INTEGER;
+
+// 64-bit identifier split into an unsigned low half and a signed high half.
+typedef struct _LUID {
+    ULONG LowPart;
+    LONG HighPart;
+} LUID, *PLUID;
+
+// Device power states; values are consecutive from 0 (Unspecified) through
+// D0..D3, with PowerDeviceMaximum as the count/sentinel value.
+typedef enum _DEVICE_POWER_STATE {
+    PowerDeviceUnspecified = 0,
+    PowerDeviceD0,
+    PowerDeviceD1,
+    PowerDeviceD2,
+    PowerDeviceD3,
+    PowerDeviceMaximum
+} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE;
+
+// Windows annotations and calling-convention keywords are defined to nothing so
+// that SDK declarations using them still parse; CONST maps to `const` and
+// MAX_PATH is given the value 260.
+#define _Check_return_
+#define APIENTRY
+#define CONST const
+#define IN
+#define OUT
+#define FAR
+#define MAX_PATH 260
+#define __stdcall
+
+// 128-bit GUID (4 + 2 + 2 + 8 bytes). Guarded by GUID_DEFINED so that the
+// <guiddef.h> included right after this block does not declare it again.
+#ifndef GUID_DEFINED
+#define GUID_DEFINED
+typedef struct _GUID {
+    uint32_t Data1;
+    uint16_t Data2;
+    uint16_t Data3;
+    uint8_t  Data4[ 8 ];
+} GUID;
+#endif
+
+#include <guiddef.h>
+
+#endif
@@ -0,0 +1,82 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+
+#ifndef _WSL_INC_WDDM_CMD_UTIL_H_
+#define _WSL_INC_WDDM_CMD_UTIL_H_
+
+#include <string.h>
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/amd_hsa_queue.h"
+#include "impl/hsa/amd_hsa_kernel_code.h"
+#include "impl/pm4_cmds.h"
+#include "util/utils.h"
+
+namespace wsl {
+namespace thunk {
+
+// Parameter bundle handed to CmdUtil::BuildDispatch().
+// All pointer members are non-owning as far as this header shows.
+// NOTE(review): the per-field meanings below are inferred from names - confirm
+// against BuildDispatch()'s implementation.
+struct DispatchInfo {
+  uint8_t                       major;              // presumably the GFX major version
+  hsa_kernel_dispatch_packet_t  *pPacket;           // AQL kernel dispatch packet
+  void                          *pEntry;
+  const amd_kernel_code_t       *pKernelObject;
+  uint32_t                      ldsBlks;
+  amd_queue_v2_t                *pAmdQueue;
+  bool                          wave32;
+  uint32_t                      srd;
+  void                          *pScratchBase;
+  uint32_t                      scratchSizePerWave;
+  uint32_t                      scratchBaseOffset[2];
+  uint32_t                      offsetCnt;          // presumably the number of valid scratchBaseOffset entries
+};
+
+// Stateless collection of command builders: every member function is static,
+// takes the destination as `pBuffer`, and returns a size_t.
+// NOTE(review): presumably each builder writes one packet into pBuffer and
+// returns the amount written; the unit (bytes vs. dwords) is not visible in
+// this header - confirm in the implementation.
+// The default argument values are enumerators/constants that this header does
+// not define itself (they come from the headers included above).
+class CmdUtil {
+public:
+  CmdUtil() {};
+  ~CmdUtil() {};
+
+  // Copy-data command targeting *pDstAddr; the defaults select the GPU clock
+  // count as source and a 64-bit count with write confirmation.
+  static size_t BuildCopyData(
+    uint64_t  *pDstAddr,
+    void      *pBuffer,
+    uint32_t  dstSel = dst_sel__mec_copy_data__tc_l2,
+    uint32_t  dstCachePolicy = dst_cache_policy__mec_copy_data__stream,
+    uint32_t  srcSel = src_sel__mec_copy_data__gpu_clock_count,
+    uint32_t  srcCachePolicy = src_cache_policy__mec_copy_data__lru,
+    uint32_t  countSel = count_sel__mec_copy_data__64_bits_of_data,
+    uint32_t  wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation);
+
+  // Event-write barrier; defaults to a CS partial flush.
+  static size_t BuildBarrier(
+    void      *pBuffer,
+    uint32_t  eventIndex = event_index__mec_event_write__cs_partial_flush,
+    uint32_t  eventType = CS_PARTIAL_FLUSH);
+
+  // Command that writes the 64-bit write_value to write_addr.
+  static size_t BuildWriteData64Command(
+    void      *pBuffer,
+    uint64_t* write_addr,
+    uint64_t write_value);
+
+  // Acquire-mem command; `major` selects the variant to emit.
+  static size_t BuildAcquireMem(
+    uint8_t major,
+    void    *pBuffer);
+
+  static size_t BuildScratch(
+    void  *pScratchBase,
+    void  *pBuffer);
+
+  static size_t BuildComputeShaderParams(
+    void  *pBuffer);
+
+  // Dispatch command sequence described by *pInfo.
+  static size_t BuildDispatch(
+    struct DispatchInfo *pInfo,
+    void                *pBuffer);
+
+  // Atomic memory operation `atomic` on *pAddr; srcData defaults to 1.
+  static size_t BuildAtomicMem(
+    uint64_t  *pAddr,
+    uint32_t  atomic,
+    void      *pBuffer,
+    uint32_t  cachePolicy = cache_policy__mec_atomic_mem__stream,
+    uint64_t  srcData = 1);
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
@@ -0,0 +1,246 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_DEVICE_H_
+#define _WSL_INC_WDDM_DEVICE_H_
+
+#include <cassert>
+#include <ntstatus.h>
+
+#include <atomic>
+#include <memory>
+#include <vector>
+
+#include "impl/wddm/types.h"
+#include "impl/thunk_proxy/thunk_proxy.h"
+#include "impl/wddm/va_mgr.h"
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+//class Queue;
+class WDDMQueue;
+
+// WSL2 hyperv GPADL protocol limitation
+#define MAX_USERPTR_BLOCK_SIZE 0xf0000000
+// Bounds of the non-canonical hole of a 48-bit virtual address space:
+// [1 << 47, ~0 - (1 << 47)]. Both bounds use ULL so the arithmetic is done in
+// 64 bits even where `unsigned long` is only 32 bits wide (LLP64, e.g. Windows).
+#define START_NON_CANONICAL_ADDR (1ULL << 47)
+#define END_NON_CANONICAL_ADDR (~0ULL - (1ULL << 47))
+// True when the half-open ranges [start1, start1 + size1) and
+// [start2, start2 + size2) intersect. Every argument is parenthesized so that
+// expressions (e.g. `a ? b : c`, `x & mask`) can be passed safely; note that
+// start1 and start2 are still evaluated twice.
+#define IS_OVERLAPPING(start1, size1, start2, size2) \
+  (((start1) < ((start2) + (size2))) && ((start2) < ((start1) + (size1))))
+
+// Description of one memory segment as stored in WDDMDevice::segment_infos_.
+// A default-constructed entry has every numeric field at zero and both flags
+// cleared.
+struct SegmentInfo {
+  uint32_t segment_id;
+  uint32_t segment_type;    // 0=aperture, 1=gpu memory, 2=system memory
+  bool aperture;
+  bool system_memory;
+  uint64_t commit_limit;
+
+  // Initializers are listed one per line, in declaration order.
+  SegmentInfo()
+      : segment_id(0),
+        segment_type(0),
+        aperture(false),
+        system_memory(false),
+        commit_limit(0) {}
+};
+
+// One WDDM adapter/device as seen by the thunk layer.
+// Holds the adapter and device handles, the paging queue with its fence, the
+// command-buffer sizing, and a cached thunk_proxy::DeviceInfo that the inline
+// accessors below read from. The accessors only read cached fields and are
+// therefore const, so they can be called through a const WDDMDevice reference
+// or pointer.
+class WDDMDevice {
+public:
+  static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30);   // 2 GB
+
+  WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id);
+  ~WDDMDevice();
+
+  // --- Cached device properties (plain reads of device_info_) ---
+  int NodeId() const { return node_id_; }
+  int Major() const { return device_info_.major; }
+  int Minor() const { return device_info_.minor; }
+  int Stepping() const { return device_info_.stepping; }
+  bool IsDgpu() const { return device_info_.is_dgpu; }
+  const char *ProductName() const { return device_info_.product_name; }
+  uint64_t Uuid() const { return device_info_.uuid; }
+  uint32_t GfxFamily() const { return device_info_.family; }
+  uint32_t DeviceId() const { return device_info_.device_id; }
+  uint32_t WavefrontSize() const { return device_info_.wavefront_size; }
+  uint32_t ComputeUnitCount() const { return device_info_.compute_unit_count; }
+  uint32_t MaxEngineClockMhz() const { return device_info_.max_engine_clock_mhz; }
+  uint32_t WatchPointsNum() const { return device_info_.watch_points_num; }
+  uint32_t PciBusAddr() const { return device_info_.pci_bus_addr; }
+
+  uint32_t MemoryBusWidth() const { return device_info_.memory_bus_width; }
+  uint32_t MaxMemoryClockMhz() const { return device_info_.max_memory_clock_mhz; }
+  uint32_t WavePerCu() const { return device_info_.wave_per_cu; }
+  uint32_t SimdPerCu() const { return device_info_.simd_per_cu; }
+  uint32_t MaxScratchSlotsPerCu() const { return device_info_.max_scratch_slots_per_cu; }
+  uint32_t NumShaderEngine() const { return device_info_.num_shader_engine; }
+  uint32_t ShaderArrayPerShaderEngine() const { return device_info_.shader_array_per_shader_engine; }
+  // Number of SDMA engines; the size_t -> uint32_t narrowing is made explicit.
+  uint32_t NumSdmaEngine() const { return static_cast<uint32_t>(device_info_.sdma_schedid.size()); }
+  uint32_t Domain() const { return device_info_.domain; }
+  uint32_t NumGws() const { return device_info_.num_gws; }
+  uint32_t AsicRevision() const { return device_info_.asic_revision; }
+  // Sum of the visible and invisible local heap sizes.
+  uint64_t LocalHeapSize() const { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; }
+  uint64_t LocalVisibleHeapSize() const { return device_info_.local_visible_heap_size; }
+  uint64_t LocalInvisibleHeapSize() const { return device_info_.local_invisible_heap_size; }
+  uint64_t NonLocalHeapSize() const { return device_info_.non_local_heap_size; }
+  uint64_t PrivateApertureBase() const { return device_info_.private_aperture_base; }
+  uint64_t PrivateApertureSize() const { return device_info_.private_aperture_size; }
+  uint64_t SharedApertureBase() const { return device_info_.shared_aperture_base; }
+  uint64_t SharedApertureSize() const { return device_info_.shared_aperture_size; }
+  uint32_t LdsSize() const { return device_info_.lds_size; }
+  uint64_t GPUCounterFrequency() const { return device_info_.gpu_counter_frequency; }
+  uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; }
+  uint32_t GetMecFwVersion() const { return device_info_.mec_fw_version; }
+  uint32_t GetSdmaFwVersion() const { return device_info_.sdma_fw_version; }
+  uint32_t GetL1CacheSize() const { return device_info_.l1_cache_size; }
+  uint32_t GetL2CacheSize() const { return device_info_.l2_cache_size; }
+  uint32_t GetL3CacheSize() const { return device_info_.l3_cache_size; }
+  uint32_t Gl2CacheLineSize() const { return device_info_.gl2_cacheline_size; }
+  bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; }
+  bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; }
+  // Scheduler id of the idx-th SDMA engine; idx must be < NumSdmaEngine()
+  // (checked only by assert, i.e. not in NDEBUG builds).
+  uint32_t GetSdmaEngine(uint32_t idx) const {
+    assert(idx < NumSdmaEngine());
+    return device_info_.sdma_schedid[idx];
+  }
+  uint32_t GetComputeEngine() const { return device_info_.compute_schedid; }
+
+  uint64_t VramAvail();
+
+  void GetClockCounters(uint64_t *gpu, uint64_t *cpu);
+  uint32_t GetNumCpQueues() const { return device_info_.num_cp_queues; }
+
+  // --- Synchronization objects and queues (defined out of line) ---
+  bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr);
+  void DestroySyncobj(D3DKMT_HANDLE handle);
+
+  bool CreateQueue(WDDMQueue *queue);
+  void DestroyQueue(WDDMQueue *queue);
+  bool CreateHwQueue(WDDMQueue *queue);
+  bool DestroyHwQueue(WDDMQueue *queue);
+  bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                      uint64_t command_size, uint64_t fence_value);
+  bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                      uint64_t command_size, uint64_t fence_value);
+
+  // Makes `queue` wait on the paging fence when the value currently stored at
+  // page_fence_addr_ is still below the latest recorded page_fence_value_.
+  // Returns false only if that GpuWait() call fails.
+  bool WaitPagingFence(WDDMQueue *queue) {
+    // Snapshot the atomic once so the comparison and the wait use one value.
+    uint64_t value = page_fence_value_;
+
+    if (*page_fence_addr_ < value &&
+        !GpuWait(queue, &page_syncobj_, &value, 1))
+      return false;
+
+    return true;
+  }
+
+  bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+	       uint64_t *values, int count);
+  bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+		  uint64_t *value, int count);
+  bool CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+	       int count, bool wait_any);
+  bool WaitOnPagingFenceFromCpu();
+
+  uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt);
+  uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; }
+  uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size_; }
+  static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; }
+
+  // Both legacy HWS and stage 1 HWS use KMD to alloc user queue memory,
+  // return false by default
+  bool AllocUserQueueMemFromUMD(void) const { return false; }
+
+  // Not const: thunk_proxy::GetHwsEnabled() takes a non-const DeviceInfo*.
+  bool IsHwsEnabled(int engine) {
+    return thunk_proxy::GetHwsEnabled(engine, &device_info_);
+  }
+
+  void UpdatePageFence(uint64_t fence_value);
+
+  D3DKMT_HANDLE PagingQueue() const { return page_queue_; }
+  D3DKMT_HANDLE PagingFence() const { return page_syncobj_; }
+  D3DKMT_HANDLE DeviceHandle() const { return device_; }
+  LUID GetLuid() const { return adapter_luid_; }
+  D3DKMT_HANDLE GetAdapter() const { return adapter_; }
+
+  // Read-only view of the cached device description. Inside this class the
+  // member function name hides the type name, hence the qualified
+  // thunk_proxy::DeviceInfo spelling.
+  const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; }
+
+  ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr);
+
+private:
+  bool ParseDeviceInfo(void);
+  void DestroyDeviceInfo(void);
+  bool CreateDevice(void);
+  bool DestroyDevice(void);
+  bool CreatePagingQueue(void);
+  bool DestroyPagingQueue(void);
+  void *Lock(D3DKMT_HANDLE handle);
+  bool Unlock(D3DKMT_HANDLE handle);
+  bool CreateContext(int engine, D3DKMT_HANDLE *handle);
+  bool DestroyContext(D3DKMT_HANDLE handle);
+
+  void SetPowerOptimization(bool restore);
+  void InitCmdbufInfo(void);
+
+  bool QuerySegmentInfo();
+  bool GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, uint32_t &segment_id);
+
+  D3DKMT_HANDLE adapter_;
+  LUID adapter_luid_;
+  D3DKMT_HANDLE device_;
+
+  // Paging queue state: queue handle, its sync object, the address of the
+  // fence value, and the latest fence value recorded by this class.
+  D3DKMT_HANDLE page_queue_;
+  D3DKMT_HANDLE page_syncobj_;
+  uint64_t *page_fence_addr_;
+  std::atomic<uint64_t> page_fence_value_;
+
+  uint32_t cmdbuf_size_;
+  uint32_t cmdbuf_aql_frame_size_;
+  static const uint32_t cmdbuf_aql_frame_num_;
+  uint32_t node_id_;
+  // device info
+  thunk_proxy::DeviceInfo device_info_;
+  std::vector<struct SegmentInfo> segment_infos_;
+  //CmdUtil cmd_util;
+};
+
+// Device enumeration entry point; only declared here, defined elsewhere.
+// NOTE(review): presumably appends one heap-allocated WDDMDevice per supported
+// adapter to `devices` and leaves ownership with the caller - confirm against
+// the implementation.
+NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices);
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
@@ -0,0 +1,249 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_GPU_MEMORY_H_
+#define _WSL_INC_WDDM_GPU_MEMORY_H_
+
+#include <cstddef>
+#include <cstdint>
+#include "util/utils.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/thunks.h"
+#include "impl/thunk_proxy/thunk_proxy.h"
+
+namespace wsl {
+namespace thunk {
+
+class WDDMDevice;
+
+union GpuMemoryCreateFlags {  // Creation-time option bits; `reserved` views all 64 bits as one integer.
+  struct {
+    uint64_t virtual_alloc              : 1; // only allocate a virtual address, without a physical buffer
+    uint64_t physical_only              : 1; // only allocate a physical buffer, without a virtual address
+    uint64_t interprocess               : 1; // physical buffer needs to share info between exporter and importer
+    uint64_t locked                     : 1; // lock the virtual address space into RAM so it cannot be paged out to swap
+    uint64_t physical_contiguous        : 1; // contiguous physical pages
+    uint64_t sysmem_ipc_sig_importer         : 1; // allocate system memory for an IPC signal
+    uint64_t sysmem_ipc_sig_exporter            : 1; // allocate system memory for an IPC signal, prepared for export
+    uint64_t alloc_va                   : 1; // allocate a VA; 0 for vmem import
+    uint64_t blit_kernel_object         : 1; // allocate an executable blit kernel object
+    uint64_t unused                     : 55; // remaining bits: 9 flag bits + 55 = 64
+  };
+  uint64_t reserved; // raw view of every bit above; GpuMemoryCreateInfo() zeroes the flags through it
+};
+
+union GpuMemoryDescFlags {  // State bits stored in GpuMemoryDesc; `reserved` is the raw 32-bit view.
+  struct {
+    uint32_t is_virtual  : 1;
+    uint32_t is_shared   : 1;
+    uint32_t is_external : 1;
+    uint32_t is_physical_only : 1;
+    uint32_t is_locked : 1;
+    uint32_t is_queue_referenced : 1; // set by GpuMemory::GetQueueReference(), cleared by PutQueueReference()
+    uint32_t is_physical_contiguous : 1;
+    uint32_t is_imported_sys_memfd : 1;     // 0 - ignored; 1 - va from system heap
+    uint32_t is_sysmem_exporter : 1; // allocate system memory for IPC signal, prepare to export
+    uint32_t is_va_required :1; // read back through GpuMemory::IsVaAllocated()
+    uint32_t is_imported_vram_vmem	:1;
+    uint32_t is_imported_vram_ipc	:1;
+    uint32_t is_imported_from_same_process : 3; // imported from same process; 3-bit share counter (0..7)
+    uint32_t is_blit_kernel_object : 1; // blit kernel object
+    uint32_t unused : 16; // remaining bits: 16 used + 16 = 32
+  };
+
+  uint32_t reserved; // raw view of all bits; returned by GpuMemory::Flags()
+};
+
+struct GpuMemoryCreateInfo {
+  GpuMemoryCreateInfo()  // defaults: local domain, zero sizes/flags, no user pointer, no dmabuf
+      : domain(thunk_proxy::kLocal),
+        size(0),
+        alignment(0),
+        mem_flags(0),
+        engine_flag(0),
+        dmabuf_fd(-1),
+        user_ptr(nullptr),
+        va_hint(0) {
+    flags.reserved = 0;  // clears every creation flag bit at once
+  }
+
+  GpuMemoryCreateFlags flags;        // creation option bits
+  thunk_proxy::AllocDomain domain;   // requested allocation domain
+  gpusize size;                      // requested size
+  gpusize alignment;                 // requested alignment
+  int mem_flags;
+  int engine_flag;
+  int dmabuf_fd;                     // dmabuf to import from; -1 when there is none
+
+  void *user_ptr;
+  gpusize va_hint;
+};
+
+struct GpuMemoryDesc {
+  GpuMemoryDesc() {
+    domain = thunk_proxy::kLocal;  // same default domain GpuMemoryCreateInfo uses
+    adapter_luid = {};             // zero LUID instead of an indeterminate one
+    gpu_addr = 0;
+    cpu_addr = nullptr;
+    client_size = size = alignment = 0;
+    handle_ape_addr = 0;
+    flags.reserved = 0;            // clears every descriptor flag bit at once
+    mem_flags = engine_flag = 0;
+  }
+
+  thunk_proxy::AllocDomain domain;  // read by GpuMemory::IsLocal()/IsSystem()/IsUserMemory()/IsUserQueue()
+  LUID adapter_luid;      // Where is the backing store location
+  gpusize gpu_addr;
+  void *cpu_addr;
+  gpusize client_size;    // user request size
+  gpusize size;
+  gpusize alignment;
+  gpusize handle_ape_addr;
+
+  GpuMemoryDescFlags flags;
+  int mem_flags;
+  int engine_flag;
+};
+
+struct SharedHandleInfo {  // Plain record describing a shared allocation; most fields have a same-named GpuMemoryDesc member.
+  thunk_proxy::AllocDomain domain;
+  LUID adapter_luid;
+  gpusize client_size;    // user request size
+  uint64_t size;
+  uint32_t flags;         // presumably GpuMemoryDescFlags::reserved — TODO confirm against the exporter
+  int mem_flags;
+  pid_t pid;              // NOTE(review): pid_t needs <sys/types.h>, which this header does not include directly
+  gpusize gpu_addr;
+};
+
+using GpuMemoryHandle = void *;
+
+class GpuMemory {  // GPU memory object tied to one WDDMDevice; the constructor is protected and WDDMDevice is a friend.
+public:
+  static size_t CalcChunkNumbers(gpusize size);
+
+  ErrorCode Init(const GpuMemoryCreateInfo &create_info);
+
+  WDDMDevice *GetDevice() const { return device_; }
+  gpusize Size() const { return desc_.size; }
+  gpusize ClientSize() const { return desc_.client_size; }  // size the user requested
+  uint64_t GpuAddress() const { return desc_.gpu_addr; }
+  void *CpuAddress() const { return desc_.cpu_addr; }
+  uint64_t HandleApeAddress() const { return desc_.handle_ape_addr; }
+  // Predicates over desc_.domain and desc_.flags.
+  inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; }
+  inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; }
+  inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; }
+  inline bool IsSysMemFd() const { return desc_.flags.is_imported_sys_memfd; }
+  inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; }
+  inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; }
+  inline bool IsPhysicalContiguous() const { return desc_.flags.is_physical_contiguous; }
+  inline bool IsVirtual() const { return desc_.flags.is_virtual; }
+  inline bool IsShared() const { return desc_.flags.is_shared; }
+  inline bool IsExternal() const { return desc_.flags.is_external; }
+  inline bool IsVaAllocated() const { return desc_.flags.is_va_required; }  // reads the is_va_required bit
+  inline bool IsBlitKernelObject() const { return desc_.flags.is_blit_kernel_object; }
+
+  inline uint32_t Flags() const { return desc_.flags.reserved; }  // raw 32-bit view of desc_.flags
+  inline int GetAllocInfo() const { return desc_.mem_flags; }
+  inline bool IsFineGrain() const { return (desc_.mem_flags & thunk_proxy::kFineGrain) != 0; }
+  inline bool IsSameAdapter(const LUID &luid) const {  // compares both halves of the LUID
+    return (desc_.adapter_luid.HighPart == luid.HighPart &&
+      desc_.adapter_luid.LowPart == luid.LowPart);
+  }
+  inline void GetQueueReference() { desc_.flags.is_queue_referenced = 1; }  // a single bit, not a counter
+  inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; }
+  inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; }
+  inline void IncSharedReference() { desc_.flags.is_imported_from_same_process++; }  // NOTE(review): 3-bit field, wraps to 0 on the 8th increment
+  inline uint32_t DecSharedReference() { return (desc_.flags.is_imported_from_same_process == 0) ? 0 : --desc_.flags.is_imported_from_same_process; }
+  inline bool IsSharedFromSameProcess() const { return desc_.flags.is_imported_from_same_process > 0; }
+
+  WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; }  // index is not bounds-checked
+  size_t NumChunks() const { return num_allocations_; }
+
+  // Opaque handle for this object; Convert() is the inverse. The by-value return carries no
+  // top-level const: the language ignores it and compilers warn (-Wignored-qualifiers).
+  GpuMemoryHandle GetGpuMemoryHandle() const { return const_cast<GpuMemory *>(this); }
+
+  static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast<GpuMemory *>(handle); }
+
+  ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment);
+  ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size);
+
+  ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0);
+  ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0);
+
+  ErrorCode MakeResident();
+  ErrorCode Evict();
+
+  ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS);
+  ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr = nullptr);
+  ~GpuMemory();
+protected:
+  explicit GpuMemory(WDDMDevice *device);
+private:
+  ErrorCode CreatePhysicalMemory();
+  ErrorCode FreePhysicalMemory();
+
+  uint64_t AdjustSize(gpusize size) const;
+private:
+  friend class WDDMDevice;
+
+  WDDMDevice *const device_;  // fixed for the lifetime of the object
+
+  GpuMemoryDesc desc_;  // backs every accessor and predicate above
+
+  size_t num_allocations_;  // reported by NumChunks()
+  WinAllocationHandle *alloc_handles_ptr_;  // indexed by GetAllocationHandle()
+  WinAllocationHandle alloc_handle_; // Optimization for when num_allocations_ is 1
+
+  WinResourceHandle resource_;     // Handle to a resource object that wraps the allocation. Used for shared resources
+
+  int mem_fd_; // IPC signal's system-memory fd
+
+  DISALLOW_COPY_AND_ASSIGN(GpuMemory);
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
@@ -0,0 +1,370 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+#ifndef _WSL_INC_WDDM_QUEUE_H_
+#define _WSL_INC_WDDM_QUEUE_H_
+
+#include <cinttypes>
+#include <condition_variable>
+#include <iostream>
+#include <queue>
+#include <utility>
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/gpu_memory.h"
+#include "impl/hsa/hsa_ext_amd.h"
+#include "impl/hsa/amd_hsa_queue.h"
+#include "impl/hsa/amd_hsa_signal.h"
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+class Queue;
+class WDDMDevice;
+
+class WDDMQueue {  // Base class shared by ComputeQueue and SDMAQueue; holds the queue's handles and command-buffer info.
+public:
+  WDDMQueue(WDDMDevice *device,
+            uint64_t cmdbuf_addr,
+            uint32_t cmdbuf_size,
+            uint32_t engine,
+            bool use_hws = true) :
+            device(device),
+            context(0),
+            queue(0),
+            syncobj(0),
+            sync_addr(nullptr),
+            cmdbuf(nullptr),
+            cmdbuf_addr(cmdbuf_addr),
+            cmdbuf_size(cmdbuf_size),
+            queue_mem(nullptr),  // start null rather than indeterminate
+            queue_addr(0),       // start zero rather than indeterminate
+            queue_engine(engine),
+            use_hws(use_hws),
+            prio(thunk_proxy::kNormal) {}
+
+  virtual ~WDDMQueue() { }
+  // Hooks redeclared by ComputeQueue and SDMAQueue; the base versions do nothing and report success.
+  virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; }
+  virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; }
+  virtual void RingDoorbell() { }
+  virtual void* GetHsaQueueAddr(void) const { return reinterpret_cast<void*>(GetCmdbufAddr()); }
+
+  hsa_status_t SwsInit(void);
+  hsa_status_t SwsFini(void);
+  hsa_status_t SwsSubmit(uint64_t command_addr,
+                         uint64_t command_size,
+                         uint64_t fence_value);
+
+  hsa_status_t HwsInit(void);
+  hsa_status_t HwsFini(void);
+  hsa_status_t HwsSubmit(uint64_t command_addr,
+                         uint64_t command_size,
+                         uint64_t fence_value);
+  hsa_status_t SetPriority(hsa_amd_queue_priority_t priority);
+
+  uint64_t *GetSyncAddr(void) const { return sync_addr; }
+  uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; }
+  // Maps an HSA queue priority to a thunk_proxy scheduling level; anything other than LOW/HIGH is kNormal.
+  thunk_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const {
+    switch (prio) {
+    case HSA_AMD_QUEUE_PRIORITY_LOW:
+      return thunk_proxy::kLow;
+    case HSA_AMD_QUEUE_PRIORITY_HIGH:
+      return thunk_proxy::kHigh;
+    case HSA_AMD_QUEUE_PRIORITY_NORMAL:
+    default:
+      return thunk_proxy::kNormal;
+    }
+  }
+
+  WDDMDevice *device;
+
+  D3DKMT_HANDLE context;
+  D3DKMT_HANDLE queue;
+
+  D3DKMT_HANDLE syncobj;
+  uint64_t *sync_addr;  // returned by GetSyncAddr()
+
+  GpuMemoryHandle cmdbuf;
+  uint64_t cmdbuf_addr;  // returned by GetCmdbufAddr()
+  uint32_t cmdbuf_size;
+
+  GpuMemoryHandle queue_mem;
+  uint64_t queue_addr;
+
+  uint32_t queue_engine;  // the `engine` constructor argument
+
+  bool use_hws;
+  thunk_proxy::SchedLevel prio;  // starts at kNormal
+};
+
+class ComputeQueue : public WDDMQueue {  // AQL compute queue; the helper names indicate AQL packets are translated to PM4.
+public:
+  ComputeQueue(WDDMDevice *device,
+               void *ring,
+               uint64_t ring_size,
+               std::atomic<uint64_t> *ring_wptr,
+               std::atomic<uint64_t> *ring_rptr,
+               volatile int64_t *error_addr,
+               uint32_t cmdbuf_size,
+               uint32_t engine,
+               bool use_hws = true);
+
+  ~ComputeQueue() override;
+  // WDDMQueue overrides are marked `override` so a signature mismatch fails to compile.
+  hsa_status_t Init(void) override;
+  hsa_status_t Fini(void) override;
+  virtual hsa_status_t Submit(void);  // new virtual; WDDMQueue declares no Submit()
+
+  void* GetRing(void) const { return ring; }
+  uint64_t GetRingSize(void) const { return ring_size; }
+  std::atomic<uint64_t>* GetRingWptr(void) const { return ring_wptr; }
+  std::atomic<uint64_t>* GetRingRptr(void) const { return ring_rptr; }
+
+  uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; }
+  uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; }
+  void* GetHsaQueueAddr(void) const override { return ring; }  // the ring, not the command buffer address
+  // True when the header type of the ring slot at the current write index is HSA_PACKET_TYPE_INVALID.
+  bool IsInvalidPacket(void) const {
+    uint16_t *packet = (uint16_t *)((char *)ring +
+                       (cmdbuf_aql_frame_write_index % ring_size) * 64);  // slots are 64 bytes apart
+    return ((*packet >> HSA_PACKET_HEADER_TYPE) & ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1))
+           == HSA_PACKET_TYPE_INVALID;
+  }
+
+  hsa_status_t Process(void);
+  uint64_t * GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_value_; }  // the cast drops const
+  void RingDoorbell() override;
+private:
+  hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet);
+  hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false);
+
+  uint64_t CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet);
+  uint64_t CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, bool wave32);
+  // Packet layout consumed by VendorSpecificAqlToPm4().
+  struct amd_aql_pm4_ib {
+      uint16_t header;
+      uint16_t ven_hdr;
+      uint32_t ib_jump_cmd[4];
+      uint32_t dw_cnt_remain;
+      uint32_t reserved[8];
+      hsa_signal_t completion_signal;
+  };
+  hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet);
+  hsa_status_t SwitchAql2PM4(void);
+
+  hsa_status_t PreSubmit(void);
+  hsa_status_t EndSubmit(void);
+
+  void *ring;
+  uint64_t ring_size;  // used as the modulus for the write index in IsInvalidPacket()
+  std::atomic<uint64_t> *ring_wptr;
+  std::atomic<uint64_t> *ring_rptr;
+
+  // ib_start_addr is the current ib start address
+  uint64_t ib_start_addr;
+
+  // ib_size is the current ib size.
+  uint64_t ib_size;
+
+  // record the last submitted aql frame write index
+  uint64_t sync_point;
+
+  uint64_t cmdbuf_aql_frame_write_index;
+  uint32_t cmdbuf_aql_frame_size;
+
+  uint64_t  *signal_addr_;
+  bool platform_atomic_support_;
+  bool needs_barrier;
+  bool ready_to_submit;
+
+  CmdUtil cmd_util;
+
+private:
+  bool EnableProfiling() {  // reads the ENABLE_PROFILING bit of amd_queue_rocr_->queue_properties
+    return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING);
+  }
+  void HandleError(hsa_status_t status);
+  bool UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32);
+
+  uint32_t UpdateIndexStride(uint32_t srd, bool wave32);
+
+  void *ScratchBase() { return scratch_base_; }
+  // Records an offset in scratch_base_offset_array_ ("Sratch" is the spelling callers use).
+  void AppendCmdbufSratchBaseOffset(int offset) {
+      scratch_base_offset_array_.push_back(offset);
+  }
+
+  bool RelocateCmdbufScratchBase(uint64_t addr);
+
+  uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; }  // NOTE(review): narrows the uint64_t member to uint32_t
+  uint64_t GetKernelObjAddr(uint64_t addr) const;
+  void InitScratchSRD();
+  GpuMemoryHandle amd_queue_mem_;
+  amd_queue_v2_t *amd_queue_;
+  amd_queue_v2_t *amd_queue_rocr_;
+  uint64_t doorbell_signal_value_;  // its address is what GetDoorbellPtr() returns
+  volatile std::atomic<int64_t> *error_code_;  // NOTE(review): the constructor takes a plain volatile int64_t*
+  std::thread aql_to_pm4_thread_;
+  bool thread_stop_;
+  std::mutex thread_cond_lock_;
+  std::condition_variable thread_cond_;
+  static void AqlToPm4Thread(ComputeQueue *queue);
+
+  uint64_t max_scratch_waves_;
+  uint64_t dispatch_waves_;
+  uint64_t scratch_size_per_wave_;
+  uint64_t scratch_size_;
+  uint64_t total_scratch_size_;
+  void *scratch_base_;
+  uint32_t scratch_mem_alignment_size_;
+  GpuMemoryHandle scratch_mem_;
+
+  std::vector<int> scratch_base_offset_array_;  // filled by AppendCmdbufSratchBaseOffset()
+};
+
+class SDMAQueue : public WDDMQueue {  // SDMA flavour of WDDMQueue with its own write/read pointers and doorbell word.
+public:
+  SDMAQueue(WDDMDevice *device,
+            void *ring,
+            uint64_t cmdbuf_size,
+            uint32_t engine,
+            bool use_hws = true);
+
+  ~SDMAQueue() override;
+  // WDDMQueue overrides are marked `override` so a signature mismatch fails to compile.
+  hsa_status_t Init(void) override;
+  hsa_status_t Fini(void) override;
+  hsa_status_t Submit(void);  // not virtual here; WDDMQueue declares no Submit()
+
+  int PreparePacket(uint32_t offset, uint64_t size);
+
+  void WaitQueue(void) {  // CPU wait on this queue's syncobj, passing rptr_next as the value
+    device->CpuWait(&syncobj, &rptr_next, 1, false);
+  }
+
+  uint64_t * GetRingWptr(void) { return &wptr_next_; }
+  uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); }  // the read pointer is the base class sync address
+  uint64_t * GetDoorbellPtr() { return &doorbell_; }
+  void RingDoorbell() override;
+  void* GetHsaQueueAddr(void) const override { return reinterpret_cast<void*>(GetCmdbufAddr()); }  // same result as the base version
+
+private:
+  uint64_t wptr_next_;
+  uint64_t wptr_pre_;
+  uint64_t rptr_next;
+  uint64_t doorbell_;
+  std::vector<std::pair<uint64_t, uint64_t>> wptr_queue_;
+  uint64_t ib_size;
+  uint64_t ib_start_addr;
+
+  std::thread thread_;
+  bool thread_stop_;
+  std::mutex thread_cond_lock_;
+  std::condition_variable thread_cond_;
+  static void SdmaThread(SDMAQueue *queue);
+  // Six-dword POLL_REGMEM packet layout, inspected by IsPollPacket().
+  struct SDMA_PKT_POLL_REGMEM {
+    union {
+      struct {
+        unsigned int op : 8;
+        unsigned int sub_op : 8;
+        unsigned int reserved_0 : 10;
+        unsigned int hdp_flush : 1;
+        unsigned int reserved_1 : 1;
+        unsigned int func : 3;
+        unsigned int mem_poll : 1;
+      };
+      unsigned int DW_0_DATA;
+    } HEADER_UNION;
+
+    union {
+      struct {
+        unsigned int addr_31_0 : 32;
+      };
+      unsigned int DW_1_DATA;
+    } ADDR_LO_UNION;
+
+    union {
+      struct {
+        unsigned int addr_63_32 : 32;
+      };
+      unsigned int DW_2_DATA;
+    } ADDR_HI_UNION;
+
+    union {
+      struct {
+        unsigned int value : 32;
+      };
+      unsigned int DW_3_DATA;
+    } VALUE_UNION;
+
+    union {
+      struct {
+        unsigned int mask : 32;
+      };
+      unsigned int DW_4_DATA;
+    } MASK_UNION;
+
+    union {
+      struct {
+        unsigned int interval : 16;
+        unsigned int retry_count : 12;
+        unsigned int reserved_0 : 4;
+      };
+      unsigned int DW_5_DATA;
+    } DW5_UNION;
+  };
+  const unsigned int SDMA_OP_POLL_REGMEM = 8;  // opcode compared against HEADER_UNION.op
+  bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) {  // op == 8, mem_poll == 1 and func == 3
+    return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM &&
+          pkt->HEADER_UNION.mem_poll == 1 &&
+          pkt->HEADER_UNION.func == 3;
+  }
+  uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); }  // NOTE(review): a true wrap only if cmdbuf_size is a power of two
+};
+
+} // namespace thunk
+} // namespace wsl
+
+#endif
@@ -0,0 +1,61 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_STATUS_H
+#define _WSL_INC_WDDM_STATUS_H
+
+enum class ErrorCode {  // Thunk-level status; TranslateNtStatus() in thunks.h produces the values annotated below.
+  Success,                // from STATUS_SUCCESS
+  DeviceLost,             // from STATUS_DEVICE_REMOVED
+  UnSupported,
+  NotReady,               // from STATUS_PENDING
+  OutOfMemory,            // from STATUS_NO_MEMORY
+  OutOfGpuMemory,         // from STATUS_GRAPHICS_NO_VIDEO_MEMORY
+  OutOfHandleApeMemory,
+  Timeout,                // from STATUS_TIMEOUT
+  SyscallFail,
+  InvalidateParams,       // from STATUS_INVALID_PARAMETER
+  SameProcessSameDevice,
+  Unknown,                // any NTSTATUS without a dedicated mapping
+};
+
+#endif
@@ -0,0 +1,233 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_THUNKS_H
+#define _WSL_INC_WDDM_THUNKS_H
+
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "dxcore_loader.h"
+
+namespace wsl {
+namespace thunk {
+
+// Maps the NTSTATUS of a D3DKMT call onto the thunk-level ErrorCode.
+// Statuses without a dedicated entry below are reported as ErrorCode::Unknown.
+inline ErrorCode TranslateNtStatus(NTSTATUS status) {
+  // Fallback for every status the switch does not list.
+  ErrorCode code = ErrorCode::Unknown;
+
+  switch (status) {
+  case STATUS_SUCCESS:                  code = ErrorCode::Success;          break;
+  case STATUS_PENDING:                  code = ErrorCode::NotReady;         break;
+  case STATUS_NO_MEMORY:                code = ErrorCode::OutOfMemory;      break;
+  case STATUS_DEVICE_REMOVED:           code = ErrorCode::DeviceLost;       break;
+  case STATUS_GRAPHICS_NO_VIDEO_MEMORY: code = ErrorCode::OutOfGpuMemory;   break;
+  case STATUS_TIMEOUT:                  code = ErrorCode::Timeout;          break;
+  case STATUS_INVALID_PARAMETER:        code = ErrorCode::InvalidateParams; break;
+  default:
+    // keep the Unknown fallback
+    break;
+  }
+
+  return code;
+}
+
+namespace d3dthunk {
+
+typedef D3DKMT_CREATEALLOCATION                      CreateAllocationArgs;
+typedef D3DKMT_CREATECONTEXT                         CreateContextArgs;
+typedef D3DKMT_CREATECONTEXTVIRTUAL                  CreateContextVirtualArgs;
+typedef D3DKMT_CREATEPAGINGQUEUE                     CreatePagingQueueArgs;
+typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT           CreateSynchronizationObjectArgs;
+typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT2          CreateSynchronizationObject2Args;
+typedef D3DKMT_ESCAPE                                EscapeArgs;
+typedef D3DKMT_EVICT                                 EvictArgs;
+typedef D3DKMT_FREEGPUVIRTUALADDRESS                 FreeGpuVirtualAddressArgs;
+typedef D3DKMT_LOCK                                  LockArgs;
+typedef D3DKMT_LOCK2                                 Lock2Args;
+typedef D3DKMT_OPENRESOURCE                          OpenResourceArgs;
+typedef D3DKMT_OPENRESOURCEFROMNTHANDLE              OpenResourceFromNtHandleArgs;
+typedef D3DKMT_QUERYADAPTERINFO                      QueryAdapterInfoArgs;
+typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT           SignalSynchronizationObjectArgs;
+typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2          SignalSynchronizationObject2Args;
+typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU    SignalSynchronizationObjectFromCpuArgs;
+typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2   SignalSynchronizationObjectFromGpuArgs;
+typedef D3DKMT_SUBMITCOMMAND                         SubmitCommandArgs;
+typedef D3DKMT_UNLOCK                                UnlockArgs;
+typedef D3DKMT_UNLOCK2                               Unlock2Args;
+typedef D3DKMT_UPDATEGPUVIRTUALADDRESS               UpdateGpuVirtualAddressArgs;
+typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT          WaitForSynchronizationObjectArgs;
+typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2         WaitForSynchronizationObject2Args;
+typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU   WaitForSynchronizationObjectFromCpuArgs;
+typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU   WaitForSynchronizationObjectFromGpuArgs;
+typedef D3DKMT_ACQUIREKEYEDMUTEX                     AcquireKeyedMutexArgs;
+typedef D3DKMT_RELEASEKEYEDMUTEX                     ReleaseKeyedMutexArgs;
+typedef D3DKMT_OPENKEYEDMUTEX                        OpenKeyedMutexArgs;
+typedef D3DKMT_DESTROYKEYEDMUTEX                     DestroyKeyedMutexArgs;
+typedef D3DKMT_QUERYVIDEOMEMORYINFO                  QueryVideoMemoryInfoArgs;
+typedef D3DKMT_CREATEHWQUEUE                         CreateHwQueueArgs;
+typedef D3DKMT_DESTROYHWQUEUE                        DestroyHwQueueArgs;
+typedef D3DKMT_SUBMITCOMMANDTOHWQUEUE                SubmitCommandToHwQueueArgs;
+typedef D3DKMT_SUBMITPRESENTTOHWQUEUE                SubmitPresentToHwQueueArgs;
+typedef D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE      SubmitSignalSyncObjectsToHwQueueArgs;
+typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE     SubmitWaitForSyncObjectsToHwQueueArgs;
+typedef D3DKMT_CREATESYNCFILE                        CreateSyncFileArgs;
+
+inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) {  // D3DKMTMapGpuVirtualAddress via DXCORE_CALL; status translated to ErrorCode
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTMapGpuVirtualAddress(args)));
+}
+
+inline ErrorCode CreateAllocation(CreateAllocationArgs *args) {  // note: calls the "2" entry point, D3DKMTCreateAllocation2
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTCreateAllocation2(args)));
+}
+
+inline ErrorCode DestroyAllocation(
+            WinDeviceHandle device,
+            WinResourceHandle resource,
+            size_t num_allocations,
+            const WinAllocationHandle *alloc_handles) {
+  // Destroys either a whole resource or an explicit allocation list: a non-zero `resource`
+  // takes precedence and the list arguments are then ignored. `args{}` already
+  // value-initializes every field, so the former extra memset() is not needed.
+  D3DKMT_DESTROYALLOCATION2 args{};
+  args.hDevice = device;
+  if (resource) {
+    args.hResource = resource;
+  } else {
+    args.phAllocationList = alloc_handles;
+    args.AllocationCount = num_allocations;
+  }
+
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTDestroyAllocation2(&args)));
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) {  // raw form; the overloads below build `args`
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTReserveGpuVirtualAddress(args)));
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle,
+                                          gpusize size,
+                                          gpusize base_address,
+                                          gpusize *out_addr) {
+  D3DDDI_RESERVEGPUVIRTUALADDRESS request{};
+  request.hPagingQueue = handle;
+  request.BaseAddress = base_address;
+  request.Size = size;
+  // Reserve at a fixed base address; *out_addr is written only on success.
+  const ErrorCode result = ReserveGpuVirtualAddress(&request);
+  if (result != ErrorCode::Success) return result;
+  *out_addr = request.VirtualAddress;
+  return result;
+}
+
+inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle,
+                                          gpusize size,
+                                          gpusize minimum_address,
+                                          gpusize maximum_address,
+                                          gpusize *out_addr) {
+  D3DDDI_RESERVEGPUVIRTUALADDRESS request{};
+  request.hPagingQueue = handle;
+  request.MinimumAddress = minimum_address;
+  request.MaximumAddress = maximum_address;
+  request.Size = size;
+  // Reserve anywhere inside [minimum, maximum]; *out_addr is written only on success.
+  const ErrorCode result = ReserveGpuVirtualAddress(&request);
+  if (result != ErrorCode::Success) return result;
+  *out_addr = request.VirtualAddress;
+  return result;
+}
+
+inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) {  // raw form; the overload below builds `args`
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTFreeGpuVirtualAddress(args)));
+}
+
+// Convenience overload: fills a zero-initialized FreeGpuVirtualAddressArgs from the three
+// values and forwards it to the pointer-taking overload.
+inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, gpusize base_address, gpusize size) {
+  FreeGpuVirtualAddressArgs request{};
+  request.hAdapter = handle;
+  request.BaseAddress = base_address;
+  request.Size = size;
+  return FreeGpuVirtualAddress(&request);
+}
+
+inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) {  // D3DKMTMakeResident via DXCORE_CALL; status translated to ErrorCode
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTMakeResident(args)));
+}
+
+inline ErrorCode Evict(EvictArgs *args) {  // D3DKMTEvict via DXCORE_CALL; status translated to ErrorCode
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTEvict(args)));
+}
+
+inline ErrorCode ShareObjects(size_t num_allocations,
+                               WinResourceHandle resource,
+                               uint32_t flags,
+                               int* dmabuf_fd) {
+  // Shares `resource` through D3DKMTShareObjects and stores the resulting NT handle,
+  // narrowed to an int, in *dmabuf_fd (-1 on failure).
+  OBJECT_ATTRIBUTES obj_attr;
+  HANDLE nt_handle{};
+  InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr);
+
+  const ErrorCode ret = TranslateNtStatus(DXCORE_CALL(D3DKMTShareObjects(num_allocations,
+        &resource, &obj_attr, flags, &nt_handle)));
+  // Convert by value rather than reading the HANDLE's storage through an int*: that
+  // broke strict aliasing and picked the right bytes only on little-endian targets.
+  *dmabuf_fd = (ret == ErrorCode::Success)
+      ? static_cast<int>(reinterpret_cast<intptr_t>(nt_handle)) : -1;
+  return ret;
+}
+
+inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) {  // status translated to ErrorCode
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTQueryResourceInfoFromNtHandle(args)));
+}
+
+inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) {  // status translated to ErrorCode
+  return TranslateNtStatus(DXCORE_CALL(D3DKMTOpenResourceFromNtHandle(args)));
+}
+
+} // namespace d3dthunk
+} // namespace thunk
+} // namespace wsl
+
+#endif
@@ -0,0 +1,101 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_TYPES_H_
+#define _WSL_INC_WDDM_TYPES_H_
+
+#include <cstdint>
+#include <ntstatus.h>
+#include "impl/thunk_proxy/wddm_types.h"
+// Windows wchar is 16-bit, but Linux wchar_t is 32-bit.
+// It seems that libdxcore (not dxgkrnl.ko) converts Windows wide characters
+// passed through the thunk into Linux ones, so only 32-bit wchar arguments are
+// accepted. Note that driver-private data structures still use 16-bit wchar.
+#define WCHAR wchar_t
+#define PCWSTR const wchar_t *
+#include <d3dkmthk.h>
+#undef WCHAR
+#undef PCWSTR
+
+using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations
+// The handle aliases below all name the same D3DKMT_HANDLE type, so they are
+// interchangeable to the compiler; the distinct names only document intent.
+using WinAllocationHandle = D3DKMT_HANDLE;
+using WinResourceHandle = D3DKMT_HANDLE;
+using WinContextHandle = D3DKMT_HANDLE;
+using WinDeviceHandle = D3DKMT_HANDLE;
+using WinAdapterHandle = D3DKMT_HANDLE;
+
+// Local copies of a few Windows definitions needed by this header.
+//reference dk/winnt.h
+#define STANDARD_RIGHTS_REQUIRED         (0x000F0000L)
+
+//reference dk/ntdef.h
+#define OBJ_INHERIT                      (0x00000002L)
+// NOTE(review): the WCHAR macro was #undef'd above, so WCHAR here resolves to
+// whatever type of that name the included headers declare -- confirm its width
+// is the intended one.
+typedef WCHAR *PWCHAR, *LPWCH, *PWCH;
+// Counted string: Length and MaximumLength are USHORT fields, Buffer points to
+// the characters.
+typedef struct _UNICODE_STRING {
+    USHORT Length;
+    USHORT MaximumLength;
+#ifdef MIDL_PASS
+    [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer;
+#else // MIDL_PASS
+    _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH   Buffer;
+#endif // MIDL_PASS
+} UNICODE_STRING;
+typedef UNICODE_STRING *PUNICODE_STRING;
+typedef const UNICODE_STRING *PCUNICODE_STRING;
+
+// Attribute block filled in by InitializeObjectAttributes below.
+typedef struct _OBJECT_ATTRIBUTES {
+  ULONG           Length;                    // set to sizeof(OBJECT_ATTRIBUTES) by the init macro
+  HANDLE          RootDirectory;
+  PUNICODE_STRING ObjectName;
+  ULONG           Attributes;                // OBJ_* flags, e.g. OBJ_INHERIT
+  PVOID           SecurityDescriptor;
+  PVOID           SecurityQualityOfService;  // set to NULL by the init macro
+} OBJECT_ATTRIBUTES;
+#define InitializeObjectAttributes( p, n, a, r, s ) {   \
+    (p)->Length = sizeof( OBJECT_ATTRIBUTES );          \
+    (p)->RootDirectory = r;                             \
+    (p)->Attributes = a;                                \
+    (p)->ObjectName = n;                                \
+    (p)->SecurityDescriptor = s;                        \
+    (p)->SecurityQualityOfService = NULL;               \
+    }
+
+#endif
@@ -0,0 +1,86 @@
+#ifndef _WSL_INC_WDDM_VA_MGR_H_
+#define _WSL_INC_WDDM_VA_MGR_H_
+
+#include <mutex>
+#include <map>
+#include "util/utils.h"
+
+namespace wsl {
+namespace thunk {
+
+// Virtual-address range manager. Bookkeeping is kept in two containers:
+// `frag_map_` (fragments keyed by base address) and `free_list_` (free
+// fragments keyed by size). The allocation/free algorithms themselves are
+// implemented outside this header.
+class VaMgr {
+public:
+  VaMgr(uint64_t start, uint64_t size, uint64_t min_align);
+  ~VaMgr();
+
+  /* Allocate `bytes` VA, if `align` is not zero, the returned address is aligned by `align`.
+   * If `addr` parameter is not zero, try best to allocate VA from fixed address `addr`.
+   */
+  uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0);
+
+  // Releases the VA previously handed out at `addr`.
+  // NOTE(review): behaviour for an unknown `addr` is defined in the .cpp -- confirm.
+  void Free(uint64_t addr);
+
+private:
+  uint64_t AllocImpl(uint64_t bytes, uint64_t align);
+
+  // One contiguous VA fragment, either free or in use.
+  struct Fragment {
+    using ptr = std::multimap<uint64_t, uint64_t>::iterator;
+    // Position of this fragment in free_list_; free_list_.end() when the
+    // fragment is created/marked as used by the helpers below.
+    ptr free_list_entry_;
+
+    // Size and free flag packed into bit-fields (63 + 1 bits).
+    struct {
+      uint64_t size : 63;
+      bool is_free : 1;
+    };
+
+    // Note: leaves free_list_entry_ default-constructed.
+    Fragment() : size(0), is_free(false) {}
+    Fragment(ptr iterator, uint64_t len, bool is_free)
+        : free_list_entry_(iterator), size(len), is_free(is_free) {}
+  };
+
+  // Builds a *free* fragment of `len` bytes tied to free-list entry `iter`.
+  static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) {
+    return {iter, len, true};
+  }
+
+  // Builds a *used* fragment of `len` bytes (no free-list entry).
+  inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; }
+
+  static inline bool is_free(const Fragment& f) { return f.is_free; }
+  // Marks `f` as used. Only resets the stored iterator; it does not erase the
+  // entry from free_list_ (see remove_free_list_entry for that).
+  void set_used(Fragment& f) {
+    f.is_free = false;
+    f.free_list_entry_ = free_list_.end();
+  }
+  // Marks `f` as free and records its free-list entry `iter`.
+  static void set_free(Fragment& f, typename Fragment::ptr iter) {
+    f.free_list_entry_ = iter;
+    f.is_free = true;
+  }
+
+  // Erases the fragment's entry from free_list_, if it has one, and resets the
+  // stored iterator to end() so a second call is a no-op.
+  inline void remove_free_list_entry(Fragment& frag) {
+    if (frag.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(frag.free_list_entry_);
+      frag.free_list_entry_ = free_list_.end();
+    }
+  }
+
+  // Registers [base, base + size) as free in both containers. Note the
+  // argument order: size first, then base.
+  inline void add_free_fragment(uint64_t size, uint64_t base) {
+    auto it = free_list_.insert(std::make_pair(size, base));
+    frag_map_[base] = make_fragment(it, size);
+  }
+
+  // Registers [base, base + size) as used (frag_map_ only).
+  inline void add_used_fragment(uint64_t size, uint64_t base) {
+    frag_map_[base] = make_fragment(size);
+  }
+  // Indexed by size
+  std::multimap<uint64_t, uint64_t> free_list_;
+  // Indexed by VA, each fragment has no overlap
+  std::map<uint64_t, Fragment> frag_map_;
+
+  uint64_t min_align_;
+
+  std::mutex lock_;  // Mutex protecting allocation and free of va
+
+
+  DISALLOW_COPY_AND_ASSIGN(VaMgr);
+};
+
+} // namespace thunk
+} // namespace wsl
+#endif
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../../..
+exec_prefix=${prefix}
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: librocdxg
+Description: HSA Kernel Mode Thunk library for WSL support
+Version: @LIB_VERSION_STRING@
+
+Libs: -L${libdir} -lrocdxg
+Cflags: -I${includedir}
@@ -0,0 +1,14 @@
+@PACKAGE_INIT@
+
+include( CMakeFindDependencyMacro )
+
+# Locate dependent packages here.  Finding them propagates usage requirements,
+# if any, to our clients and ensures that their target names are in scope for
+# the build.  rocdxg has no CMake project dependencies, so there is nothing to
+# find.  If we switch to using find_package with library dependencies external
+# to ROCm (e.g. libnuma), then those packages should be located here using
+# find_dependency as shown below.
+#find_dependency(Bar 2.0)
+
+include( "${CMAKE_CURRENT_LIST_DIR}/@ROCDXG_TARGET@Targets.cmake" )
+
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+/*
+ * Stub: this backend does not implement the AIS read/write file operation.
+ * After the CHECK_DXG_OPEN() guard it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. None of the parameters are used in the body,
+ * and the output pointers (SizeCopiedInBytes, status) are not written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes,
+					      HSAint32 fd,
+					      HSAint64 file_offset,
+					      HsaAisFlags AisFlags,
+					      HSAuint64 *SizeCopiedInBytes,
+					      HSAint32 *status)
+{
+	/* presumably bails out early when the DXG device is not open -- confirm against the macro */
+	CHECK_DXG_OPEN();
+
+	pr_warn_once("not implemented\n");
+	return HSAKMT_STATUS_NOT_SUPPORTED;
+}
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cassert>
+#include <cstring>
+
+
+static uint32_t runtime_capabilities_mask = 0;
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. NodeId is not used.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. NodeId is not used.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. No argument is used and
+ * DbgWaveMsgRing is not touched.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(
+    HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode,
+    HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. None of the watch-point arrays are
+ * read or written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(
+    HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[],
+    void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Reports whether runtime debugging is available. In this implementation the
+ * only outcome after the CHECK_DXG_OPEN() guard is a pr_warn_once log and
+ * HSAKMT_STATUS_NOT_SUPPORTED. hsaKmtRuntimeEnable/Disable below branch on
+ * this result.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Enables runtime debugging. Any non-zero status from
+ * hsaKmtCheckRuntimeDebugSupport() is passed straight back to the caller.
+ * The "supported" path has no implementation: it asserts and, when
+ * assertions are compiled out, returns HSAKMT_STATUS_SUCCESS.
+ * rDebug and setupTtmp are not used.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) {
+  const HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();
+
+  if (!support) {
+    /* Debug support reported as available, but enabling is not implemented. */
+    assert(false);
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return support;
+}
+
+/*
+ * Disables runtime debugging. Always returns HSAKMT_STATUS_SUCCESS: when
+ * hsaKmtCheckRuntimeDebugSupport() reports a non-zero status there is nothing
+ * to disable; the zero-status ("supported") path is unimplemented and asserts
+ * first.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) {
+  if (!hsaKmtCheckRuntimeDebugSupport()) {
+    /* Debug support reported as available, but disabling is not implemented. */
+    assert(false);
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Copies the file-level runtime_capabilities_mask into *caps_mask.
+ *
+ * caps_mask: out parameter receiving the mask.
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when caps_mask is null,
+ * HSAKMT_STATUS_SUCCESS otherwise (after the CHECK_DXG_OPEN() guard).
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) {
+  CHECK_DXG_OPEN();
+  /* Reject a null output pointer instead of dereferencing it. */
+  if (!caps_mask)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  *caps_mask = runtime_capabilities_mask;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. The output parameters runtime_info and
+ * data_size are not written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
+                                        HSAuint32 *data_size) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. The output parameters data, n_entries
+ * and entry_size are not written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
+                                               HSAuint32 *n_entries,
+                                               HSAuint32 *entry_size) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. The output parameters are not written
+ * and suspend_queues is not used.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries,
+                                              HSAuint32 *entry_size,
+                                              bool suspend_queues) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Stub: after the CHECK_DXG_OPEN() guard, logs through pr_warn_once and
+ * returns HSAKMT_STATUS_NOT_SUPPORTED. args, Queues and DebugReturn are
+ * neither read nor written.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues,
+                     HSAuint64 *DebugReturn) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#include "dxcore_loader.h"
+#include "librocdxg.h"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <ntstatus.h>
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+// Constructs an unloaded loader: every D3DKMT entry point and the library
+// handle start out as nullptr.
+//
+// Members are always initialised in the order they are declared in the class
+// (function pointers first, then dxcore_handle_ and init_flag_), so the
+// initializer list is written in that same order to keep it truthful and to
+// avoid -Wreorder diagnostics.
+DxcoreLoader::DxcoreLoader()
+    : pfn_D3DKMTCreateAllocation2(nullptr)
+    , pfn_D3DKMTDestroyAllocation2(nullptr)
+    , pfn_D3DKMTMapGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTReserveGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTFreeGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTCreateDevice(nullptr)
+    , pfn_D3DKMTDestroyDevice(nullptr)
+    , pfn_D3DKMTEnumAdapters2(nullptr)
+    , pfn_D3DKMTQueryAdapterInfo(nullptr)
+    , pfn_D3DKMTCreateContextVirtual(nullptr)
+    , pfn_D3DKMTDestroyContext(nullptr)
+    , pfn_D3DKMTSubmitCommand(nullptr)
+    , pfn_D3DKMTCreateSynchronizationObject2(nullptr)
+    , pfn_D3DKMTDestroySynchronizationObject(nullptr)
+    , pfn_D3DKMTQueryStatistics(nullptr)
+    , pfn_D3DKMTEscape(nullptr)
+    , pfn_D3DKMTLock2(nullptr)
+    , pfn_D3DKMTUnlock2(nullptr)
+    , pfn_D3DKMTCreatePagingQueue(nullptr)
+    , pfn_D3DKMTDestroyPagingQueue(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr)
+    , pfn_D3DKMTQueryClockCalibration(nullptr)
+    , pfn_D3DKMTMakeResident(nullptr)
+    , pfn_D3DKMTEvict(nullptr)
+    , pfn_D3DKMTShareObjects(nullptr)
+    , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr)
+    , pfn_D3DKMTOpenResourceFromNtHandle(nullptr)
+    , pfn_D3DKMTCreateHwQueue(nullptr)
+    , pfn_D3DKMTDestroyHwQueue(nullptr)
+    , pfn_D3DKMTSubmitCommandToHwQueue(nullptr)
+    , dxcore_handle_(nullptr)
+    , init_flag_() {
+}
+
+// Unloads the library if it is still loaded.
+// NOTE(review): the destructor is private and Instance() never deletes the
+// singleton, so in the visible code this only documents intent.
+DxcoreLoader::~DxcoreLoader() {
+    Shutdown();
+}
+
+// Loads libdxcore.so and resolves every D3DKMT entry point.
+//
+// Returns true when the library is loaded and all symbols were resolved
+// (including when a previous call already succeeded); false when dlopen() or
+// any symbol lookup fails, in which case the library handle is released again.
+//
+// This function does no locking of its own; callers that may race must
+// serialise calls to it.
+bool DxcoreLoader::Initialize() {
+    // Idempotence guard: a second dlopen() would overwrite dxcore_handle_ and
+    // take an extra library reference that Shutdown() never releases.
+    if (IsLoaded()) {
+        return true;
+    }
+
+    dlerror(); // Clear error
+    dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY);
+
+    if (!dxcore_handle_) {
+        pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror());
+        return false;
+    }
+
+    pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n");
+    if (!LoadDxcoreApis()) {
+        // If API loading failed, close the handle to indicate failure
+        dlclose(dxcore_handle_);
+        dxcore_handle_ = nullptr;
+        return false;
+    }
+
+    return true;
+}
+
+// Releases the library handle, if any, and logs the outcome. The handle is
+// cleared whether or not dlclose() succeeds. The pfn_ members are left
+// untouched by this function.
+void DxcoreLoader::Shutdown() {
+    if (dxcore_handle_ == nullptr) {
+        return;
+    }
+
+    const int close_rc = dlclose(dxcore_handle_);
+    if (close_rc == 0) {
+        pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n");
+    } else {
+        pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror());
+    }
+    dxcore_handle_ = nullptr;
+}
+
+// Resolves every D3DKMT entry point from the already-opened library into the
+// matching pfn_ member, in the order listed below. Stops at the first symbol
+// that cannot be found, logs it, and returns false; pointers resolved before
+// that point keep their values. Returns true once all symbols are resolved.
+bool DxcoreLoader::LoadDxcoreApis() {
+    if (dxcore_handle_ == nullptr) {
+        pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n");
+        return false;
+    }
+
+    dlerror(); // Clear error
+
+    // Look up one symbol, store it, and leave the function on failure.
+    #define LOAD_DXCORE_API(func_name) \
+        do { \
+            void* sym_addr = dlsym(dxcore_handle_, #func_name); \
+            DXCORE_PFN(func_name) = reinterpret_cast<DXCORE_DEF(func_name)*>(sym_addr); \
+            if (sym_addr == nullptr) { \
+                pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \
+                pr_err("[DxcoreLoader] Failed to load DXCore APIs\n"); \
+                return false; \
+            } \
+        } while (0)
+
+    // Allocations and GPU virtual addresses
+    LOAD_DXCORE_API(D3DKMTCreateAllocation2);
+    LOAD_DXCORE_API(D3DKMTDestroyAllocation2);
+    LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress);
+    // Devices, adapters and contexts
+    LOAD_DXCORE_API(D3DKMTCreateDevice);
+    LOAD_DXCORE_API(D3DKMTDestroyDevice);
+    LOAD_DXCORE_API(D3DKMTEnumAdapters2);
+    LOAD_DXCORE_API(D3DKMTQueryAdapterInfo);
+    LOAD_DXCORE_API(D3DKMTCreateContextVirtual);
+    LOAD_DXCORE_API(D3DKMTDestroyContext);
+    LOAD_DXCORE_API(D3DKMTSubmitCommand);
+    // Synchronization objects, statistics, escape, lock/unlock
+    LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2);
+    LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject);
+    LOAD_DXCORE_API(D3DKMTQueryStatistics);
+    LOAD_DXCORE_API(D3DKMTEscape);
+    LOAD_DXCORE_API(D3DKMTLock2);
+    LOAD_DXCORE_API(D3DKMTUnlock2);
+    // Paging queues and waits/signals
+    LOAD_DXCORE_API(D3DKMTCreatePagingQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyPagingQueue);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu);
+    LOAD_DXCORE_API(D3DKMTQueryClockCalibration);
+    // Residency and resource sharing
+    LOAD_DXCORE_API(D3DKMTMakeResident);
+    LOAD_DXCORE_API(D3DKMTEvict);
+    LOAD_DXCORE_API(D3DKMTShareObjects);
+    LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle);
+    LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle);
+    // Hardware queues
+    LOAD_DXCORE_API(D3DKMTCreateHwQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyHwQueue);
+    LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue);
+
+    #undef LOAD_DXCORE_API
+
+    pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n");
+    return true;
+}
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBROCDXG_DXCORE_LOADER_H
+#define LIBROCDXG_DXCORE_LOADER_H
+
+#include "impl/wddm/types.h"
+#include <dlfcn.h>
+#include <mutex>
+
+#define DXCORE_CALL(function_name)  wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+/**
+ * @brief DxcoreLoader class for dynamic loading of libdxcore.so
+ *
+ * This class provides a singleton loader for the DXCore library. Loading is
+ * intended to be optional, based on the environment variable
+ * LIBROCDXG_ENABLE_DXCORE. Supported values: "1", "true", "yes"
+ * (case-sensitive). If it is not set or invalid, stub implementations are
+ * used as a fallback.
+ *
+ * Initialization is intended to be thread-safe using std::call_once.
+ * NOTE(review): DxcoreLoader::Initialize() itself neither reads the
+ * environment variable nor uses init_flag_; confirm that callers do.
+ */
+
+// Macro definitions mimicking HSAKMT design
+#define DXCORE_DEF(function_name)   PFN##function_name
+#define DXCORE_PFN(function_name)   pfn_##function_name
+
+// Process-wide holder of the libdxcore.so handle and of one function pointer
+// per D3DKMT entry point. The pointers are public so the DXCORE_CALL macro can
+// reach them through Instance().
+class DxcoreLoader {
+public:
+    // D3DKMT function type definitions
+    // Each typedef declares a *function type* named PFN<name> (via DXCORE_DEF).
+    // All but D3DKMTShareObjects take a single untyped `void* args`, so the
+    // compiler does not check the argument structure passed at the call site.
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommand))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args);
+
+    // Returns the singleton. The object is heap-allocated on first use and is
+    // never deleted here, so its destructor does not run at process exit.
+    static DxcoreLoader& Instance() {
+        static DxcoreLoader* instance = new DxcoreLoader();
+        return (*instance);
+    }
+
+    // Opens libdxcore.so and resolves the entry points; true on success.
+    bool Initialize();
+    // Closes the library handle if one is held.
+    void Shutdown();
+    // True while a library handle is held.
+    bool IsLoaded() const { return dxcore_handle_ != nullptr; }
+
+    // Function pointer declarations
+    // One pfn_<name> member per entry point (via DXCORE_PFN). They are declared
+    // before dxcore_handle_/init_flag_, which fixes the member initialization
+    // order used by the constructor.
+    DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2);
+    DXCORE_DEF(D3DKMTDestroyAllocation2)* DXCORE_PFN(D3DKMTDestroyAllocation2);
+    DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice);
+    DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice);
+    DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2);
+    DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo);
+    DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual);
+    DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext);
+    DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand);
+    DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2);
+    DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject);
+    DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics);
+    DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape);
+    DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2);
+    DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2);
+    DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue);
+    DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu);
+    DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration);
+    DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident);
+    DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict);
+    DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects);
+    DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle);
+    DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle);
+    DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue);
+    DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue);
+    DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue);
+
+private:
+    // Construction and destruction are private; use Instance().
+    DxcoreLoader();
+    ~DxcoreLoader();
+
+    // Resolves every entry point from dxcore_handle_; true when all are found.
+    bool LoadDxcoreApis();
+
+    // Handle returned by dlopen(); nullptr while the library is not loaded.
+    void* dxcore_handle_;
+    std::once_flag init_flag_;  // For thread-safe initialization
+
+    // Disable copy
+    DxcoreLoader(const DxcoreLoader&) = delete;
+    DxcoreLoader& operator=(const DxcoreLoader&) = delete;
+};
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
+
+#endif // LIBROCDXG_DXCORE_LOADER_H
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdio>
+#include <cassert>
+#include <thread>
+#include <chrono>
+
+/*
+ * Unsupported on this backend: after the CHECK_DXG_OPEN() guard it logs
+ * through pr_warn_once and asserts.
+ * NOTE(review): when assertions are compiled out this returns
+ * HSAKMT_STATUS_SUCCESS without ever writing *Event -- confirm callers never
+ * reach this path.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
+                                          bool ManualReset, bool IsSignaled,
+                                          HsaEvent **Event) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Destroying a null event is a successful no-op. For any non-null event the
+ * operation is unsupported: it logs through pr_warn_once, asserts, and (with
+ * assertions compiled out) returns HSAKMT_STATUS_SUCCESS without releasing
+ * anything.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+  if (!Event)
+    return HSAKMT_STATUS_SUCCESS;
+
+  pr_warn_once("not supported\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Unsupported on this backend. The warning is logged before the argument
+ * check; a null Event yields HSAKMT_STATUS_INVALID_HANDLE, a non-null one
+ * asserts and (with assertions compiled out) returns HSAKMT_STATUS_SUCCESS
+ * without signalling anything.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Unsupported on this backend. The warning is logged before the argument
+ * check; a null Event yields HSAKMT_STATUS_INVALID_HANDLE, a non-null one
+ * asserts and (with assertions compiled out) returns HSAKMT_STATUS_SUCCESS
+ * without resetting anything.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Unsupported on this backend. The warning is logged before the argument
+ * check; a null Event yields HSAKMT_STATUS_INVALID_HANDLE, a non-null one
+ * asserts and (with assertions compiled out) returns HSAKMT_STATUS_SUCCESS
+ * without inspecting the event.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Same as hsaKmtWaitOnEvent_Ext with no event-age pointer (NULL). */
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
+                                          HSAuint32 Milliseconds) {
+  return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL);
+}
+
+/*
+ * Waits on a single event by treating it as a one-element, wait-for-all list
+ * and delegating to hsaKmtWaitOnMultipleEvents_Ext. A null Event is rejected
+ * up front with HSAKMT_STATUS_INVALID_HANDLE.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
+                                              HSAuint32 Milliseconds,
+                                              uint64_t *event_age) {
+  if (Event == nullptr) {
+    return HSAKMT_STATUS_INVALID_HANDLE;
+  }
+
+  const HSAuint32 event_count = 1;
+  const bool wait_on_all = true;
+  return hsaKmtWaitOnMultipleEvents_Ext(&Event, event_count, wait_on_all,
+                                        Milliseconds, event_age);
+}
+
+/* Same as hsaKmtWaitOnMultipleEvents_Ext with no event-age pointer (NULL). */
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
+                                                   HSAuint32 NumEvents,
+                                                   bool WaitOnAll,
+                                                   HSAuint32 Milliseconds) {
+  return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll,
+                                        Milliseconds, NULL);
+}
+
+/*
+ * Event waiting is not implemented on this backend. Behaviour after the
+ * CHECK_DXG_OPEN() guard:
+ *   - Events == null: returns HSAKMT_STATUS_INVALID_HANDLE.
+ *   - exactly one entry and that entry is null: sleeps 20 microseconds and
+ *     returns HSAKMT_STATUS_SUCCESS.
+ *   - anything else: asserts; with assertions compiled out, returns
+ *     HSAKMT_STATUS_SUCCESS without waiting.
+ * WaitOnAll, Milliseconds and event_age are not used.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
+                                                       HSAuint32 NumEvents,
+                                                       bool WaitOnAll,
+                                                       HSAuint32 Milliseconds,
+                                                       uint64_t *event_age) {
+  CHECK_DXG_OPEN();
+
+  if (Events == nullptr) {
+    return HSAKMT_STATUS_INVALID_HANDLE;
+  }
+
+  const bool lone_null_event = (NumEvents == 1) && (Events[0] == nullptr);
+  if (!lone_null_event) {
+    /* Real event waits are not implemented. */
+    assert(false);
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  const std::chrono::microseconds nap(20);
+  std::this_thread::sleep_for(nap);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Not implemented on this backend: after the CHECK_DXG_OPEN() guard it logs
+ * the node id and asserts. With assertions compiled out it returns
+ * HSAKMT_STATUS_SUCCESS without writing *fd.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) {
+  CHECK_DXG_OPEN();
+  /* NodeId is unsigned, so print it with %u rather than %d. */
+  pr_debug("node id %u\n", NodeId);
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,137 @@
+#include <dlfcn.h>
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/hsa_ven_amd_loader.h"
+
+/* Serializes the lazy resolution of the HSA runtime entry points below.
+ * NOTE(review): heap-allocated and not deleted anywhere in this file —
+ * presumably so it stays valid during static destruction; confirm. */
+static std::mutex* lock_ = new std::mutex();
+
+#if 1
+/*
+ * _HSAKMT_LOOKUP_SYMS(_sym)
+ *   Resolves the symbol named _sym with dlsym(RTLD_DEFAULT, ...) and stores it
+ *   in the caller-declared function pointer fn_<_sym>. The pointer is tested
+ *   once without the lock and again while holding lock_, so the dlsym() call
+ *   happens only while the pointer is still null. A failed lookup is logged
+ *   and leaves the pointer null, so the next call retries.
+ *
+ *   dlerror() may return NULL (for example when no error is pending), so a
+ *   fallback string is logged instead of passing NULL to "%s".
+ *
+ *   Wrapped in do { } while (0) so that the macro followed by ';' is a single
+ *   statement and stays safe inside unbraced if/else.
+ */
+#define _HSAKMT_LOOKUP_SYMS(_sym)                                              \
+do {                                                                           \
+  if (fn_##_sym == nullptr) {                                                  \
+    std::lock_guard<std::mutex> guard(*lock_);                                 \
+    if (fn_##_sym == nullptr) {                                                \
+      fn_##_sym =                                                              \
+        reinterpret_cast<decltype(fn_##_sym)>(dlsym(RTLD_DEFAULT, #_sym));     \
+      if (!fn_##_sym) {                                                        \
+        const char *lookup_err = dlerror();                                    \
+        pr_err("%s not found - %s\n", #_sym,                                   \
+               lookup_err ? lookup_err : "unknown error");                     \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+} while (0)
+
+/*
+ * _HSAKMT_EXEC_API(_sym, args...)
+ *   If fn_<_sym> has been resolved, calls it with the given arguments and
+ *   returns its result from the enclosing function; otherwise falls through
+ *   so the caller can return its own failure value.
+ *
+ *   No trailing ';' after while (0): the call site supplies it.
+ */
+#define _HSAKMT_EXEC_API(_sym, ...)                                            \
+do {                                                                           \
+  if (fn_##_sym != nullptr) {                                                  \
+    return fn_##_sym(__VA_ARGS__);                                             \
+  }                                                                            \
+} while (0)
+
+/*
+ * Checks that libhsa-runtime64.so can be opened with
+ * RTLD_NOW | RTLD_GLOBAL. The handle is closed again immediately; only the
+ * success or failure of dlopen() is reported.
+ *
+ * Returns true when dlopen() succeeded, false (after logging dlerror())
+ * otherwise.
+ */
+bool hsakmt_hsa_loader_init() {
+  void *runtime_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
+  if (runtime_lib != nullptr) {
+    dlclose(runtime_lib);
+    return true;
+  }
+
+  pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror());
+  return false;
+}
+
+/* Calls hsa_signal_load_relaxed through a function pointer that is resolved
+ * on first use. Returns the callee's result, or 0 when the symbol could not
+ * be resolved. */
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {
+  using load_relaxed_fn = hsa_signal_value_t (*)(hsa_signal_t);
+  static load_relaxed_fn fn_hsa_signal_load_relaxed = nullptr;
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed);
+  _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal);
+
+  /* Only reached when the pointer is still null after the lookup. */
+  return 0;
+}
+
+/* Calls hsa_signal_wait_relaxed through a function pointer that is resolved
+ * on first use, passing all arguments through unchanged. Returns the
+ * callee's result, or 0 when the symbol could not be resolved. */
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {
+  using wait_relaxed_fn = hsa_signal_value_t (*)(
+      hsa_signal_t, hsa_signal_condition_t, hsa_signal_value_t, uint64_t,
+      hsa_wait_state_t);
+  static wait_relaxed_fn fn_hsa_signal_wait_relaxed = nullptr;
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed);
+  _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value,
+                   timeout_hint, wait_state_hint);
+
+  /* Only reached when the pointer is still null after the lookup. */
+  return 0;
+}
+
+/* Calls hsa_signal_store_screlease through a function pointer that is
+ * resolved on first use. Does nothing beyond the logged lookup failure when
+ * the symbol could not be resolved. */
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value){
+  using store_screlease_fn = void (*)(hsa_signal_t, hsa_signal_value_t);
+  static store_screlease_fn fn_hsa_signal_store_screlease = nullptr;
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease);
+  _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value);
+}
+
+/*
+ * Calls hsa_ven_amd_loader_query_host_address from the AMD loader extension
+ * (version 1.3) of the HSA runtime.
+ *
+ * On first use the function looks up hsa_system_get_extension_table with
+ * dlsym(RTLD_DEFAULT, ...), queries the loader extension table and caches
+ * the query_host_address entry. Resolution runs while holding lock_.
+ *
+ * Returns the callee's status on success. Returns HSA_STATUS_ERROR when
+ * hsa_system_get_extension_table cannot be found or the table entry is null,
+ * and the status reported by hsa_system_get_extension_table when the table
+ * query itself fails. Nothing is cached on failure, so a later call retries.
+ */
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;
+
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> guard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+      hsa_status_t (*fn_hsa_system_get_extension_table)(
+          uint16_t extension, uint16_t version_major, uint16_t version_minor,
+          void *table);
+      fn_hsa_system_get_extension_table =
+          reinterpret_cast<decltype(fn_hsa_system_get_extension_table)>(
+              dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table"));
+      if (fn_hsa_system_get_extension_table == nullptr) {
+        /* dlerror() may return NULL; never hand NULL to "%s". */
+        const char *lookup_err = dlerror();
+        pr_err("%s not found - %s\n", "hsa_system_get_extension_table",
+               lookup_err ? lookup_err : "unknown error");
+        return HSA_STATUS_ERROR;
+      }
+
+      /* Zero the table so a failed or partial fill can never leave an
+       * indeterminate function pointer behind. */
+      hsa_ven_amd_loader_1_03_pfn_t table = {};
+      const hsa_status_t status = fn_hsa_system_get_extension_table(
+          HSA_EXTENSION_AMD_LOADER, 1, 3, &table);
+      if (status != HSA_STATUS_SUCCESS) {
+        pr_err("hsa_system_get_extension_table failed - status 0x%x\n",
+               static_cast<unsigned int>(status));
+        return status;
+      }
+
+      fn_hsa_ven_amd_loader_query_host_address =
+          table.hsa_ven_amd_loader_query_host_address;
+    }
+  }
+
+  _HSAKMT_EXEC_API(hsa_ven_amd_loader_query_host_address, device_address, host_address);
+  return HSA_STATUS_ERROR;
+}
+
+#else
+/* Direct-call variant (in the #else branch of the "#if 1" above, so it is
+ * not compiled as the file stands): forwards straight to
+ * hsa_signal_load_relaxed and returns its result. */
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {
+  return hsa_signal_load_relaxed(signal);
+}
+
+/* Direct-call variant (in the #else branch of the "#if 1" above, so it is
+ * not compiled as the file stands): forwards all arguments unchanged to
+ * hsa_signal_wait_relaxed and returns its result. */
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {
+  return hsa_signal_wait_relaxed(signal, condition, compare_value, timeout_hint,
+                                 wait_state_hint);
+}
+
+/* Direct-call variant (in the #else branch of the "#if 1" above, so it is
+ * not compiled as the file stands): forwards straight to
+ * hsa_signal_store_screlease. */
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value) {
+  hsa_signal_store_screlease(hsa_signal, value);
+}
+
+/*
+ * Direct-call variant (in the #else branch of the "#if 1" above, so it is
+ * not compiled as the file stands).
+ *
+ * On first use, queries the AMD loader extension table (version 1.3) with
+ * hsa_system_get_extension_table while holding lock_ and caches the
+ * query_host_address entry; afterwards calls through the cached pointer.
+ *
+ * Returns the callee's status on success, the table-query status when that
+ * query fails, and HSA_STATUS_ERROR when the table entry is null.
+ */
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;
+
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> guard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+      /* Zero the table and check the status so a failed query can never
+       * cache an indeterminate function pointer. */
+      hsa_ven_amd_loader_1_03_pfn_t table = {};
+      const hsa_status_t status = hsa_system_get_extension_table(
+          HSA_EXTENSION_AMD_LOADER, 1, 3, &table);
+      if (status != HSA_STATUS_SUCCESS)
+        return status;
+
+      fn_hsa_ven_amd_loader_query_host_address =
+          table.hsa_ven_amd_loader_query_host_address;
+    }
+  }
+
+  if (fn_hsa_ven_amd_loader_query_host_address)
+    return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address);
+
+  return HSA_STATUS_ERROR;
+}
+#endif
@@ -0,0 +1,31 @@
+/*
+* Copyright © 2025 Advanced Micro Devices, Inc.
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including
+* the next paragraph) shall be included in all copies or substantial
+* portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Reports whether model mode is enabled. This backend does not support it:
+ * *enable is always set to false and a one-time warning is emitted.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when enable is null (instead of
+ * dereferencing it), HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable)
+{
+  if (enable == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  *enable = false;
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,182 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+#include <cstdint>
+
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+
+/* Looks up the WDDM device for NodeId with get_wddmdev() and hands it back
+ * through *DeviceHandle, reinterpreted as an HsaAMDGPUDeviceHandle.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS when a device was found; otherwise
+ * HSAKMT_STATUS_ERROR, leaving *DeviceHandle untouched. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(
+    HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) {
+  CHECK_DXG_OPEN();
+
+  wsl::thunk::WDDMDevice *wddm_dev = get_wddmdev(NodeId);
+  if (wddm_dev == nullptr)
+    return HSAKMT_STATUS_ERROR;
+
+  *DeviceHandle = reinterpret_cast<HsaAMDGPUDeviceHandle>(wddm_dev);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Stub: ignores fd, writes none of the output pointers (major_version,
+ * minor_version, device_handle) and always returns 0. */
+HSAKMTAPI int amdgpu_device_initialize(int fd,
+                                       uint32_t *major_version,
+                                       uint32_t *minor_version,
+                                       amdgpu_device_handle *device_handle) {
+  return 0;
+}
+
+/* Stub: ignores device_handle and always returns 0. */
+HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle) {
+  return 0;
+}
+
+/* Fills *info for the device behind dev, which is interpreted as a
+ * wsl::thunk::WDDMDevice pointer. The whole structure is zeroed and only
+ * gpu_counter_freq is populated, with GPUCounterFrequency() divided by 1000.
+ * Neither pointer is checked for null. Always returns 0. */
+HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev,
+                                    struct amdgpu_gpu_info *info) {
+  memset(info, 0, sizeof(*info));
+
+  auto *wddm_dev = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);
+  const auto counter_freq = wddm_dev->GPUCounterFrequency();
+  info->gpu_counter_freq = counter_freq / 1000ull;
+  return 0;
+}
+
+/* Returns the process-wide dxg_runtime->dxg_fd; dev is ignored. */
+HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) {
+  return dxg_runtime->dxg_fd;
+}
+
+/* Reports the CPU address of a buffer object, where bo is interpreted as a
+ * wsl::thunk::GpuMemory pointer. *cpu is written only when IsSysMemFd() is
+ * true; otherwise it is left untouched. Returns 0 in both cases. */
+HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) {
+  auto *mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+  if (!mem->IsSysMemFd())
+    return 0;
+
+  *cpu = mem->CpuAddress();
+  return 0;
+}
+
+/* Frees a buffer object through hsaKmtFreeMemory(), where buf_handle is
+ * interpreted as a wsl::thunk::GpuMemory pointer. The address passed on is
+ * GpuAddress() when IsVaAllocated() is true and HandleApeAddress()
+ * otherwise; the size is Size().
+ * Returns 0 on HSAKMT_STATUS_SUCCESS and -1 for any other status. */
+HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle) {
+  auto *mem = reinterpret_cast<wsl::thunk::GpuMemory *>(buf_handle);
+
+  void *base;
+  if (mem->IsVaAllocated())
+    base = (void*)mem->GpuAddress();
+  else
+    base = (void*)mem->HandleApeAddress();
+
+  const HSAKMT_STATUS status = hsaKmtFreeMemory(base, mem->Size());
+  if (status == HSAKMT_STATUS_SUCCESS)
+    return 0;
+  return -1;
+}
+
+/* Stub: ignores bo and type, sets *shared_handle to 0 and returns 0.
+ * shared_handle is not checked for null. */
+HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t *shared_handle) {
+  *shared_handle = 0;
+  return 0;
+}
+
+/* Imports a shared buffer for the device behind dev, which is interpreted
+ * as a wsl::thunk::WDDMDevice pointer.
+ *
+ * Only amdgpu_bo_handle_type_dma_buf_fd is accepted; any other type logs
+ * "not implemented" and returns -1. shared_handle is passed to
+ * import_dmabuf_fd() together with the device's node id; a VA allocation is
+ * requested exactly when is_ipc_sysmemfd(shared_handle) is true.
+ *
+ * On success the resulting GpuMemory handle is stored in output->buf_handle
+ * and 0 is returned; on failure -1 is returned and output is untouched. */
+HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t shared_handle,
+                               struct amdgpu_bo_import_result *output) {
+  if (type != amdgpu_bo_handle_type_dma_buf_fd) {
+    pr_err("not implemented\n");
+    return -1;
+  }
+
+  auto *wddm_dev = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);
+  wsl::thunk::GpuMemoryHandle imported;
+  const bool from_ipc_memfd = is_ipc_sysmemfd(shared_handle);
+  const bool want_va = from_ipc_memfd;
+
+  const HSAKMT_STATUS status = import_dmabuf_fd(
+      shared_handle, wddm_dev->NodeId(), want_va, from_ipc_memfd, &imported);
+  if (status != HSAKMT_STATUS_SUCCESS)
+    return -1;
+
+  /* The GpuMemory object handle doubles as the buffer-object handle. */
+  output->buf_handle = reinterpret_cast<amdgpu_bo_handle>(imported);
+  return 0;
+}
+
+/* Maps or unmaps a buffer object at a GPU virtual address, where bo is
+ * interpreted as a wsl::thunk::GpuMemory pointer (asserted non-null).
+ *
+ * AMDGPU_VA_OP_MAP:
+ *   - returns 0 right away if the object's GpuAddress() already equals addr;
+ *   - returns -1 if it already has a different, non-zero GpuAddress();
+ *   - otherwise calls MapGpuVirtualAddress(addr, size, offset) and then
+ *     MakeResident(); a failure of either returns -1.
+ * AMDGPU_VA_OP_UNMAP:
+ *   - calls UnmapGpuVirtualAddress(addr, size, offset), returning -1 on
+ *     failure, then Evict() (its result is not examined).
+ * Any other ops value does nothing and returns 0. flags is not used. */
+HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
+                              uint64_t offset,
+                              uint64_t size,
+                              uint64_t addr,
+                              uint64_t flags,
+                              uint32_t ops) {
+  auto *mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+  assert(mem != nullptr);
+
+  if (ops == AMDGPU_VA_OP_MAP) {
+    if (mem->GpuAddress() == addr) {
+      pr_info("bo is mapped already\n");
+      return 0;
+    }
+    if (mem->GpuAddress()) {
+      pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n",
+             reinterpret_cast<void *>(mem->GpuAddress()), reinterpret_cast<void *>(addr));
+      return -1;
+    }
+
+    auto map_result = mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+    if (map_result != ErrorCode::Success)
+      return -1;
+
+    auto resident_result = mem->MakeResident();
+    if (resident_result != ErrorCode::Success)
+      return -1;
+
+    return 0;
+  }
+
+  if (ops == AMDGPU_VA_OP_UNMAP) {
+    auto unmap_result = mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+    if (unmap_result != ErrorCode::Success)
+      return -1;
+
+    mem->Evict();
+    return 0;
+  }
+
+  return 0;
+}
+
+/* Stub: ignores bo, leaves *info untouched and always returns 0. */
+HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) {
+  return 0;
+}
+
+/* Stub: ignores bo and info and always returns 0. */
+HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) {
+  return 0;
+}
+
+/* Stub: ignores all arguments, leaves the data buffer untouched and always
+ * returns 0. */
+HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
+                                  void *data, unsigned long size) {
+  return 0;
+}
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIBHSAKMT_H_INCLUDED
+#define LIBHSAKMT_H_INCLUDED
+
+#include <pthread.h>
+#include <stdint.h>
+#include <limits.h>
+#include "hsakmt/hsakmt.h"
+#include "hsakmt/hsakmt_drm.h"
+
+#include "impl/wddm/va_mgr.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "dxcore_loader.h"
+
+wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id);
+uint32_t get_num_wddmdev();
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress);
+
+/* Log verbosity levels. hsakmt_print() emits a message when the message's
+ * level is <= hsakmtRuntime::hsakmt_debug_level, so a larger configured
+ * level lets more messages through. HSAKMT_DEBUG_LEVEL_DEFAULT is the
+ * initial value of hsakmt_debug_level. */
+#define HSAKMT_DEBUG_LEVEL_ERR      -1
+#define HSAKMT_DEBUG_LEVEL_DEFAULT  3
+#define HSAKMT_DEBUG_LEVEL_WARNING  4
+#define HSAKMT_DEBUG_LEVEL_INFO     6
+#define HSAKMT_DEBUG_LEVEL_DEBUG    7
+
+/*
+ * Process-wide state of the thunk: the /dev/dxg file descriptor and open
+ * count, fork tracking, logging level, feature switches, and the bookkeeping
+ * for the reserved local-heap, system-heap and handle-aperture address
+ * ranges. A single instance is reachable through the global dxg_runtime.
+ */
+struct hsakmtRuntime {
+  /*
+   * Mem-initializers are listed in member declaration order, which is the
+   * order in which C++ performs the initialization anyway. page_size and
+   * page_shift start at 0 instead of being left indeterminate.
+   * NOTE(review): presumably both are filled in when the device is opened —
+   * confirm against the open path.
+   */
+  hsakmtRuntime()
+    : hsakmt_mutex(PTHREAD_MUTEX_INITIALIZER),
+    page_size(0),
+    page_shift(0),
+    dxg_fd(-1),
+    parent_pid(getpid()),
+    is_forked(false),
+    hsakmt_debug_level(HSAKMT_DEBUG_LEVEL_DEFAULT),
+    dxg_open_count(0),
+    hsakmt_is_dgpu(false),
+    is_svm_api_supported(false),
+    zfb_support(0),
+    vendor_packet_process(0),
+    check_avail_sysram(false),
+    max_single_alloc_size(0),
+    enable_thunk_sub_allocator(0),
+    default_node(1),
+    local_heap_space_start_(0),
+    local_heap_space_size_(0),
+    system_heap_space_start_(0),
+    system_heap_space_size_(0),
+    handle_aperture_start_(0),
+    handle_aperture_size_(0) {}
+
+  void HeapInit();
+  void HeapFini();
+  bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align);
+  bool FreeSvmSpace(uint64_t &base, uint64_t &size);
+  bool ReserveLocalHeapSpace();
+  bool FreeLocalHeapSpace();
+  void InitLocalHeapMgr();
+  bool ReserveSystemHeapSpace();
+  /* Size of the reserved system heap range, in bytes as stored. */
+  uint64_t SystemHeapSize() { return system_heap_space_size_; }
+  bool FreeSystemHeapSpace();
+  bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock);
+  bool DecommitSystemHeapSpace(void* addr, int64_t size);
+  void InitSystemHeapMgr();
+  ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize hit_base_addr, gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment, bool lock);
+  ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize gpu_addr, gpusize size);
+  bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false);
+  bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd);
+  ErrorCode ReserveIPCSysMem(gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment,
+          int &memfd, bool lock);
+  ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd);
+  bool InitHandleApertureSpace();
+  void InitHandleApertureMgr();
+  ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr);
+  void HandleApertureFree(gpusize gpu_addr);
+
+  pthread_mutex_t hsakmt_mutex;
+  const char *dxg_device_name = "/dev/dxg";
+  long page_size;
+  int page_shift;
+  int dxg_fd = -1;
+  pid_t parent_pid = -1;
+  bool is_forked = false;
+  int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;
+  unsigned long dxg_open_count;
+  bool hsakmt_is_dgpu;
+  bool is_svm_api_supported;
+  int zfb_support;
+  int vendor_packet_process;
+  bool check_avail_sysram;
+  size_t max_single_alloc_size;
+  int enable_thunk_sub_allocator;
+  uint32_t default_node;
+
+  /* local heap means the bo's backing store is the VRAM of all GPUs */
+  uint64_t local_heap_space_start_;
+  uint64_t local_heap_space_size_;
+
+  /* manages the reserved local heap space, which is shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> local_heap_mgr_;
+
+  /* system heap means the bo's backing store is system RAM */
+  uint64_t system_heap_space_start_;
+  uint64_t system_heap_space_size_;
+
+  /* manages the reserved system heap space, which is shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> system_heap_mgr_;
+
+  /* handle aperture range and the manager that carves allocations from it */
+  uint64_t handle_aperture_start_;
+  uint64_t handle_aperture_size_;
+  std::unique_ptr<wsl::thunk::VaMgr> handle_aperture_mgr_;
+};
+
+extern hsakmtRuntime *dxg_runtime;
+
+#undef HSAKMTAPI
+#define HSAKMTAPI __attribute__((visibility ("default")))
+
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define SANITIZER_AMDGPU 1
+#endif
+#endif
+
+/* Pointer -> uint64_t, going through unsigned long to avoid a
+ * pointer-to-int-cast warning. */
+#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
+
+/* uint64_t -> pointer, going through unsigned long to avoid an
+ * int-to-pointer-cast warning. */
+#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
+
+/* Guard for API entry points: returns
+ * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED from the enclosing function when
+ * the runtime has no open reference (dxg_open_count == 0) or is marked as
+ * forked. Dereferences dxg_runtime unconditionally. */
+#define CHECK_DXG_OPEN() \
+	do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
+
+/* 64KB BigK fragment size for TLB efficiency */
+#define GPU_BIGK_PAGE_SIZE (1 << 16)
+
+/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
+#define GPU_HUGE_PAGE_SIZE (2 << 20)
+
+/* Returns HSAKMT_STATUS_INVALID_PARAMETER from the enclosing function when x
+ * is not a multiple of dxg_runtime->page_size. */
+#define CHECK_PAGE_MULTIPLE(x) \
+	do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
+
+/* Round x up to a multiple of align using a bit mask; the result is only
+ * correct when align is a power of two. ALIGN_UP works in 64 bits,
+ * ALIGN_UP_32 in 32 bits. */
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
+/* Round x up to a multiple of dxg_runtime->page_size. */
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size)
+/* 64-bit value with the low n bits set; 0 when n is 0. n is evaluated twice. */
+#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
+/* Element count of a true array (not of a pointer). */
+#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
+
+/* HSA Thunk logging usage */
+/* Evaluates to a std::string holding the current thread's id formatted in
+ * hexadecimal. */
+#define get_thread_id()                                                                                                          \
+    ([]() -> std::string {                                                                                                       \
+        std::stringstream str_thrd_id;                                                                                           \
+        str_thrd_id << std::hex << std::this_thread::get_id();                                                                   \
+        return str_thrd_id.str();                                                                                                \
+    })()
+/* Writes one record to stream, prefixed with the pid, the thread id and the
+ * name of the calling function, then flushes the stream. fmt must be a
+ * string literal because it is concatenated with the prefix. */
+#define hsakmt_print_common(stream, fmt, ...)                                                                                    \
+    do {                                                                                                                         \
+        fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__);            \
+        fflush(stream);                                                                                                          \
+    } while (false)
+/* Level-gated print to stdout. With NDEBUG defined it expands to nothing, so
+ * pr_warn/pr_info/pr_debug produce no output and their arguments are not
+ * evaluated; otherwise the message is printed when level is <=
+ * dxg_runtime->hsakmt_debug_level. */
+#ifdef NDEBUG
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do { } while (false)
+#else
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do {                                                                                                                         \
+        if (level <= dxg_runtime->hsakmt_debug_level) {                                                                          \
+            hsakmt_print_common(stdout, fmt, ##__VA_ARGS__);                                                                     \
+        }                                                                                                                        \
+    } while (false)
+#endif
+
+/* pr_err always prints to stderr, regardless of NDEBUG and of the configured
+ * level; the other pr_* macros go through hsakmt_print(). */
+#define pr_err(fmt, ...) \
+	hsakmt_print_common(stderr, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
+/* "Once" variants: each expansion site owns a static bool that is set before
+ * the first message is printed, suppressing later ones from the same site.
+ * The flag is a plain bool. Written as a statement expression ({ ... }). */
+#define pr_err_once(fmt, ...)                   \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_err(fmt, ##__VA_ARGS__);     \
+        }                                       \
+})
+#define pr_warn_once(fmt, ...)                  \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_warn(fmt, ##__VA_ARGS__);    \
+        }                                       \
+})
+
+/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */
+#define HSA_GET_GFX_VERSION_FULL(ui32) \
+	(((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
+
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
+bool prefer_ats(HSAuint32 node_id);
+uint16_t get_device_id_by_node_id(HSAuint32 node_id);
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+uint32_t get_direct_link_cpu(uint32_t gpu_node);
+
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props);
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+				      HsaNodeProperties *NodeProperties);
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
+					HSAuint32 NumIoLinks,
+					HsaIoLinkProperties *IoLinkProperties);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
+
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
+
+uint32_t get_num_sysfs_nodes(void);
+
+bool is_forked_child(void);
+
+void clear_allocation_map(void);
+
+/* Block-granular allocator interface with a fixed block size of 128 MB.
+ * alloc() and free() are only declared here; their behavior is defined
+ * elsewhere. NOTE(review): presumably the backing allocator of the thunk
+ * sub-allocator (see reset_suballocator/trim_suballocator) — confirm. */
+class BlockAllocator {
+private:
+    static const size_t block_size_ = 128 * 1024 * 1024;  // 128MB blocks.
+
+public:
+    /* Takes the requested size and reports the size actually allocated
+     * through allocated_size. */
+    void* alloc(size_t request_size, size_t& allocated_size) const;
+    /* Releases length bytes starting at ptr. */
+    void free(void* ptr, size_t length) const;
+    /* Returns the fixed block size in bytes. */
+    size_t block_size() const { return block_size_; }
+};
+
+void reset_suballocator(void);
+void trim_suballocator(void);
+
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                            HSAuint64 SizeInBytes,
+                                            HSAuint64 Alignment,
+                                            HsaMemFlags MemFlags,
+                                            void **MemoryAddress,
+                                            bool SkipSubAlloc = false);
+
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                    HSAuint64 SizeInBytes,
+                                    bool SkipSubAlloc = false);
+
+bool queue_acquire_buffer(void *MemoryAddress);
+bool queue_release_buffer(void *MemoryAddress);
+/* Calculate VGPR and SGPR register file size per CU */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
+#define SGPR_SIZE_PER_CU 0x4000
+
+bool is_ipc_sysmemfd(int fd);
+
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle);
+
+bool hsakmt_hsa_loader_init();
+#endif
@@ -0,0 +1,113 @@
+HSAKMT_1
+{
+global:
+hsaKmtOpenKFD;
+hsaKmtCloseKFD;
+hsaKmtGetVersion;
+hsaKmtAcquireSystemProperties;
+hsaKmtReleaseSystemProperties;
+hsaKmtGetNodeProperties;
+hsaKmtGetNodeMemoryProperties;
+hsaKmtGetNodeCacheProperties;
+hsaKmtGetNodeIoLinkProperties;
+hsaKmtCreateEvent;
+hsaKmtDestroyEvent;
+hsaKmtSetEvent;
+hsaKmtResetEvent;
+hsaKmtQueryEventState;
+hsaKmtWaitOnEvent;
+hsaKmtWaitOnMultipleEvents;
+hsaKmtCreateQueue;
+hsaKmtCreateQueueExt;
+hsaKmtUpdateQueue;
+hsaKmtDestroyQueue;
+hsaKmtSetQueueCUMask;
+hsaKmtSetMemoryPolicy;
+hsaKmtAllocMemory;
+hsaKmtAllocMemoryAlign;
+hsaKmtFreeMemory;
+hsaKmtAvailableMemory;
+hsaKmtRegisterMemory;
+hsaKmtRegisterMemoryToNodes;
+hsaKmtRegisterMemoryWithFlags;
+hsaKmtRegisterGraphicsHandleToNodes;
+hsaKmtRegisterGraphicsHandleToNodesExt;
+hsaKmtShareMemory;
+hsaKmtRegisterSharedHandle;
+hsaKmtRegisterSharedHandleToNodes;
+hsaKmtProcessVMRead;
+hsaKmtProcessVMWrite;
+hsaKmtDeregisterMemory;
+hsaKmtMapMemoryToGPU;
+hsaKmtMapMemoryToGPUNodes;
+hsaKmtUnmapMemoryToGPU;
+hsaKmtDbgRegister;
+hsaKmtDbgUnregister;
+hsaKmtDbgWavefrontControl;
+hsaKmtDbgAddressWatch;
+hsaKmtDbgEnable;
+hsaKmtDbgDisable;
+hsaKmtDbgGetDeviceData;
+hsaKmtDbgGetQueueData;
+hsaKmtGetClockCounters;
+hsaKmtPmcGetCounterProperties;
+hsaKmtPmcRegisterTrace;
+hsaKmtPmcUnregisterTrace;
+hsaKmtPmcAcquireTraceAccess;
+hsaKmtPmcReleaseTraceAccess;
+hsaKmtPmcStartTrace;
+hsaKmtPmcQueryTrace;
+hsaKmtPmcStopTrace;
+hsaKmtMapGraphicHandle;
+hsaKmtUnmapGraphicHandle;
+hsaKmtSetTrapHandler;
+hsaKmtGetTileConfig;
+hsaKmtQueryPointerInfo;
+hsaKmtSetMemoryUserData;
+hsaKmtGetQueueInfo;
+hsaKmtAllocQueueGWS;
+hsaKmtRuntimeEnable;
+hsaKmtRuntimeDisable;
+hsaKmtCheckRuntimeDebugSupport;
+hsaKmtGetRuntimeCapabilities;
+hsaKmtDebugTrapIoctl;
+hsaKmtSPMAcquire;
+hsaKmtSPMRelease;
+hsaKmtSPMSetDestBuffer;
+hsaKmtSVMSetAttr;
+hsaKmtSVMGetAttr;
+hsaKmtSetXNACKMode;
+hsaKmtGetXNACKMode;
+hsaKmtOpenSMI;
+hsaKmtExportDMABufHandle;
+hsaKmtGetMemoryHandle;
+hsaKmtWaitOnEvent_Ext;
+hsaKmtWaitOnMultipleEvents_Ext;
+hsaKmtReplaceAsanHeaderPage;
+hsaKmtReturnAsanHeaderPage;
+hsaKmtGetAMDGPUDeviceHandle;
+hsaKmtPcSamplingQueryCapabilities;
+hsaKmtPcSamplingCreate;
+hsaKmtPcSamplingDestroy;
+hsaKmtPcSamplingStart;
+hsaKmtPcSamplingStop;
+hsaKmtPcSamplingSupport;
+hsaKmtAisReadWriteFile;
+hsaKmtModelEnabled;
+hsaKmtQueueRingDoorbell;
+amdgpu_device_initialize;
+amdgpu_device_deinitialize;
+amdgpu_query_gpu_info;
+amdgpu_bo_import;
+amdgpu_bo_va_op;
+amdgpu_device_get_fd;
+amdgpu_bo_cpu_map;
+amdgpu_bo_free;
+amdgpu_bo_export;
+amdgpu_bo_query_info;
+amdgpu_bo_set_metadata;
+drmCommandWriteRead;
+
+local: *;
+};
+
@@ -0,0 +1,989 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "impl/wddm/gpu_memory.h"
+#include "util/simple_heap.h"
+
+/*
+ * Bookkeeping record stored in allocation_map_ for one tracked buffer.
+ *
+ * The constructor initializer lists follow the member declaration order
+ * (userptr is declared before size), which is the order C++ actually
+ * initializes members in; this keeps -Wreorder quiet.
+ */
+struct Allocation {
+  Allocation()
+      : handle(0), cpu_addr(nullptr), gpu_addr(0), userptr(false), size(0),
+        user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0),
+        dmabuf_fd(-1), rocr_userdata(nullptr) {}
+  Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
+             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
+             void *user_data_arg = nullptr, size_t user_size_arg = 0,
+             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
+      : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg),
+        userptr(userptr_arg), size(size_arg), user_data(user_data_arg),
+        size_requested(user_size_arg), node_id(node_id_arg),
+        mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1), rocr_userdata(nullptr) {}
+
+  wsl::thunk::GpuMemoryHandle handle; /* converted with GpuMemory::Convert() by users of the map */
+  void *cpu_addr;
+  uint64_t gpu_addr;
+  bool userptr; /* true for entries created by the userptr mapping path */
+  size_t size; /* actual size = align_up(size_requested, granularity) */
+  void *user_data;
+  size_t size_requested; /* size requested by user */
+  HSAuint32 node_id;
+  HSAuint32 mem_flags_value; /* raw flags value reported back by hsaKmtQueryPointerInfo */
+  int dmabuf_fd; /* cached export fd, -1 while nothing has been exported */
+  void *rocr_userdata; /* opaque pointer stored by hsaKmtSetMemoryUserData */
+};
+
+/*
+ * Process-wide table of tracked buffers, keyed by the address handed back to
+ * the caller, and the mutex that guards it. Both are heap-allocated and held
+ * through raw pointers; clear_allocation_map() replaces them.
+ */
+static std::map<const void *, Allocation>* allocation_map_ = new std::map<const void *, Allocation>();
+static std::mutex* allocation_map_lock_ = new std::mutex();
+
+/*
+ * Discard every tracked allocation record and start over with an empty map
+ * and a brand-new mutex.
+ *
+ * The previous mutex object is intentionally not deleted (the delete below is
+ * commented out), so it is leaked on each call.
+ * NOTE(review): presumably this is meant for situations where the old mutex
+ * may still be locked or otherwise unusable - confirm with the callers.
+ * Only the map entries are dropped here; nothing in this function releases
+ * the GPU memory objects the entries referred to.
+ */
+void clear_allocation_map(void)
+{
+  //delete allocation_map_lock_;
+  allocation_map_lock_ = new std::mutex();
+  std::lock_guard<std::mutex> lock(*allocation_map_lock_);
+  delete allocation_map_;
+  allocation_map_ = new std::map<const void *, Allocation>();
+}
+
+/*
+ * Stub: memory policies are not implemented here. All arguments are ignored.
+ * After CHECK_DXG_OPEN() the call logs a one-time warning and asserts; in
+ * builds where assert() is compiled out it reports HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+                                              HSAuint32 DefaultPolicy,
+                                              HSAuint32 AlternatePolicy,
+                                              void *MemoryAddressAlternate,
+                                              HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Convert an HSA_PAGE_SIZE_* selector into a page size in bytes.
+ * An unrecognized selector asserts; with asserts disabled it falls back to
+ * the 4 KiB page size.
+ */
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
+  const HSAuint32 kib = 1024;
+  const HSAuint32 mib = kib * 1024;
+
+  if (pageSizeFlags == HSA_PAGE_SIZE_4KB)
+    return 4 * kib;
+  if (pageSizeFlags == HSA_PAGE_SIZE_64KB)
+    return 64 * kib;
+  if (pageSizeFlags == HSA_PAGE_SIZE_2MB)
+    return 2 * mib;
+  if (pageSizeFlags == HSA_PAGE_SIZE_1GB)
+    return 1024 * mib;
+
+  /* Unknown selector: loud in debug builds, smallest page size otherwise. */
+  assert(false);
+  return 4 * kib;
+}
+
+/*
+ * Allocate memory without an alignment request: forwards to
+ * hsaKmtAllocMemoryAlign() with Alignment set to 0 and returns its status.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+                                          HSAuint64 SizeInBytes,
+                                          HsaMemFlags MemFlags,
+                                          void **MemoryAddress) {
+  return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags,
+                                MemoryAddress);
+}
+
+/*
+ * Evaluates to 1 when x is a non-zero power of two, 0 otherwise.
+ * Every use of the argument is parenthesized so that expression arguments
+ * such as POWER_OF_2(a | b) are tested as a whole. The argument is evaluated
+ * more than once, so it must not have side effects.
+ */
+#define POWER_OF_2(x) ((((x) != 0) && (((x) & ((x) - 1)) == 0)) ? 1 : 0)
+
+/*
+ * Report whether the system currently has at least SizeInBytes of free RAM
+ * according to sysinfo(2). Returns false when sysinfo() itself fails.
+ *
+ * sysinfo() reports freeram in multiples of mem_unit bytes, so the value is
+ * scaled before it is compared against a byte count.
+ */
+bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
+  struct sysinfo info;
+  if (sysinfo(&info) != 0)
+    return false;
+
+  /* Guard against a zero unit so the scaling can never wipe out freeram. */
+  const HSAuint64 unit = info.mem_unit ? info.mem_unit : 1;
+  const HSAuint64 free_bytes = static_cast<HSAuint64>(info.freeram) * unit;
+  return SizeInBytes <= free_bytes;
+}
+
+/*
+ * Obtain one backing block for the sub-allocator.
+ * The request is rounded up to a multiple of block_size() and the rounded
+ * size is reported through allocated_size. The block is requested from node 1
+ * as coarse-grained, NoSubstitute memory, with sub-allocation skipped so the
+ * request goes to the direct allocation path.
+ * Returns the block address, or nullptr when the allocation fails.
+ */
+void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
+  HsaMemFlags block_flags;
+  block_flags.Value = 0;
+  block_flags.ui32.CoarseGrain = 1;
+  block_flags.ui32.NoSubstitute = 1;
+
+  allocated_size = wsl::AlignUp(request_size, block_size());
+
+  void *block = nullptr;
+  const HSAKMT_STATUS status =
+      hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, block_flags, &block, true);
+  return (status == HSAKMT_STATUS_SUCCESS) ? block : nullptr;
+}
+
+/*
+ * Return a backing block to the direct free path (sub-allocator lookup is
+ * skipped). A failure is only logged; nothing is reported to the caller.
+ */
+void BlockAllocator::free(void* ptr, size_t length) const {
+  const HSAKMT_STATUS status = hsaKmtFreeMemoryInternal(ptr, length, true);
+  if (status == HSAKMT_STATUS_SUCCESS)
+    return;
+
+  pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length);
+}
+
+/* Sub-allocator that carves small requests out of blocks supplied by BlockAllocator. */
+static wsl::SimpleHeap<BlockAllocator> fragment_allocator_;
+
+/* Forward to SimpleHeap::reset() on the file-local sub-allocator. */
+void reset_suballocator(void) {
+  fragment_allocator_.reset();
+}
+
+/* Forward to SimpleHeap::trim() on the file-local sub-allocator. */
+void trim_suballocator(void) {
+  fragment_allocator_.trim();
+}
+
+/*
+ * Core allocation routine behind hsaKmtAllocMemoryAlign().
+ *
+ * PreferredNode  node to allocate on; 0 selects dxg_runtime->default_node.
+ * SizeInBytes    requested size.
+ * Alignment      forwarded to the device through create_info.alignment.
+ * MemFlags       HSA memory flags; translated below into GpuMemoryCreateInfo.
+ * MemoryAddress  in: VA hint (must be non-null when FixedAddress is set);
+ *                out: address or handle of the new allocation.
+ * SkipSubAlloc   true to bypass the fragment sub-allocator.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER for bad
+ * pointer arguments, HSAKMT_STATUS_NO_MEMORY when a system-memory request
+ * exceeds the configured limits, or HSAKMT_STATUS_ERROR otherwise.
+ *
+ * Successful direct allocations are recorded in allocation_map_; addresses
+ * handed out by the sub-allocator are not.
+ */
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                             HSAuint64 SizeInBytes,
+                                             HSAuint64 Alignment,
+                                             HsaMemFlags MemFlags,
+                                             void **MemoryAddress,
+                                             bool SkipSubAlloc) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (MemFlags.ui32.FixedAddress) {
+    if (*MemoryAddress == nullptr)
+      return HSAKMT_STATUS_INVALID_PARAMETER;
+  } else
+    *MemoryAddress = nullptr;
+
+  uint32_t node = (PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode;
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(node);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.size = SizeInBytes;
+
+  /* If initialize scratch pool of GpuAgent, treat it as SVM reserve */
+  if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000)
+    MemFlags.ui32.OnlyAddress = 1;
+
+  create_info.alignment = Alignment;
+  create_info.va_hint = reinterpret_cast<gpusize>(*MemoryAddress);
+  /* System-memory domain: host-accessible node-0 requests, ZFB mode, or GTT. */
+  if ((PreferredNode == 0 && MemFlags.ui32.HostAccess)
+    || dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) {
+    if (SizeInBytes > dxg_runtime->max_single_alloc_size)
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes))
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    /* If allocate VRAM under ZFB mode */
+    if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1)
+      MemFlags.ui32.CoarseGrain = 1;
+
+    // AllocateNonPaged == AllocateIPC
+    create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
+
+    create_info.domain = thunk_proxy::AllocDomain::kSystem;
+  } else {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+  }
+
+  if (!MemFlags.ui32.CoarseGrain)
+    create_info.mem_flags = thunk_proxy::kFineGrain;
+
+  //In hsa-runtime, only kernarg region set Uncached.
+  if (MemFlags.ui32.Uncached)
+    create_info.mem_flags |= thunk_proxy::kKernarg;
+
+  create_info.flags.physical_only = MemFlags.ui32.NoAddress;
+  create_info.flags.alloc_va = !create_info.flags.physical_only;
+  create_info.flags.interprocess = MemFlags.ui32.NoAddress;
+  create_info.flags.interprocess |= MemFlags.ui32.Contiguous;
+  create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous;
+  create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned
+  create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress;
+  create_info.flags.blit_kernel_object =
+      (MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess &&
+      (create_info.domain == thunk_proxy::AllocDomain::kSystem));
+  /*when only alloc virtual or only physical, it's vmm allocation, force to local*/
+  if (create_info.flags.virtual_alloc || create_info.flags.physical_only
+        || create_info.flags.physical_contiguous) {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+    SkipSubAlloc = true;
+  }
+
+  /* Only allow using the suballocator for ordinary VRAM.*/
+  bool trim_safe = false;
+  if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) {
+    /* just quickly skip SA if size is bigger than SA block size.*/
+    gpusize real_size;
+    if (create_info.size > GPU_HUGE_PAGE_SIZE)
+      real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE);
+    else
+      real_size = wsl::AlignUp(create_info.size, getpagesize());
+
+    /* NOTE(review): Alignment is not passed to the sub-allocator on this path. */
+    if (real_size < fragment_allocator_.default_block_size()) {
+      *MemoryAddress = fragment_allocator_.alloc(real_size);
+      if (*MemoryAddress)
+        return HSAKMT_STATUS_SUCCESS;
+    }
+
+    /* SA might keep a lot of free blocks as *cache*.
+       * We can trim them if direct allocation fails at first time.
+       */
+    trim_safe = true;
+  }
+
+  /* Direct allocation; reached a second time via goto after one trim. */
+after_trim:
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    /* For these physical allocations, use GpuMemory object's address as thunk handle*/
+    if (create_info.flags.physical_only || create_info.dmabuf_fd > 0)
+      *MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+    else
+      *MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+
+    /* Recorded node is 0 for GTT requests, the caller's PreferredNode otherwise. */
+    (*allocation_map_)[*MemoryAddress] = Allocation(
+        gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
+        create_info.size, false, nullptr, SizeInBytes,
+        MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (trim_safe) {
+    /* attempt to release memory from the block allocator and retry */
+    fragment_allocator_.trim();
+    trim_safe = false;
+    goto after_trim;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/*
+ * Public aligned-allocation entry point. Forwards to
+ * hsaKmtAllocMemoryAlignInternal(); the sub-allocator is skipped unless
+ * dxg_runtime->enable_thunk_sub_allocator is set.
+ * NOTE(review): dxg_runtime is read here before the callee's CHECK_DXG_OPEN().
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+                                               HSAuint64 SizeInBytes,
+                                               HSAuint64 Alignment,
+                                               HsaMemFlags MemFlags,
+                                               void **MemoryAddress) {
+  return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes,
+                                        Alignment, MemFlags,
+                                        MemoryAddress,
+                                        !dxg_runtime->enable_thunk_sub_allocator);
+}
+
+/*
+ * Core free routine behind hsaKmtFreeMemory().
+ *
+ * MemoryAddress  address previously returned by an allocation or import.
+ * SizeInBytes    not used by this implementation.
+ * SkipSubAlloc   true to go straight to the allocation map instead of first
+ *                offering the address to the fragment sub-allocator.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address,
+ * HSAKMT_STATUS_ERROR when the address is unknown or the buffer is still
+ * referenced by a queue, and HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                       HSAuint64 SizeInBytes,
+                                       bool SkipSubAlloc) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (!SkipSubAlloc) {
+    /* A true result means the sub-allocator owned the address and released it. */
+    if (fragment_allocator_.free(MemoryAddress))
+      return HSAKMT_STATUS_SUCCESS;
+  }
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->find(MemoryAddress);
+    if (it == allocation_map_->end()) {
+      return HSAKMT_STATUS_ERROR;
+    }
+
+    gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    if (gpu_mem->IsQueueReferenced())
+      return HSAKMT_STATUS_ERROR;
+
+    wsl::thunk::GpuMemoryDescFlags flags;
+    flags.reserved = gpu_mem->Flags();
+    /* Imported IPC VRAM: drop one shared reference; keep the entry and the
+     * object while DecSharedReference() still reports a non-zero value. */
+    if (flags.is_imported_vram_ipc &&
+      gpu_mem->DecSharedReference()) {
+      pr_info("memory is still referenced\n");
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    /* Close the export fd cached by hsaKmtExportDMABufHandle, if any. */
+    if (it->second.dmabuf_fd >= 0) {
+      close(it->second.dmabuf_fd);
+      it->second.dmabuf_fd = -1;
+    }
+    allocation_map_->erase(it);
+  }
+
+  /* The object is destroyed after the map lock has been released. */
+  delete gpu_mem;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Public free entry point: forwards to hsaKmtFreeMemoryInternal(), leaving
+ * its third argument at the default declared elsewhere.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+                     HSAuint64 SizeInBytes) {
+  return hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes);
+}
+
+/*
+ * Take a queue reference on the GPU memory object tracked at MemoryAddress.
+ *
+ * Returns true when the reference was taken, false when the address is null,
+ * is not a key of allocation_map_, or its handle converts to a null object.
+ *
+ * The not-found case used to return HSAKMT_STATUS_ERROR, which converts to
+ * `true` in a bool function and therefore reported success without taking
+ * any reference.
+ * NOTE(review): addresses served by the fragment sub-allocator are not keys
+ * of allocation_map_; confirm no caller relied on the old accidental `true`
+ * for such buffers.
+ */
+bool queue_acquire_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Validate before use; the reference is taken while the map lock is held. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->GetQueueReference();
+  return true;
+}
+
+/*
+ * Drop a queue reference on the GPU memory object tracked at MemoryAddress.
+ *
+ * Returns true when the reference was released, false when the address is
+ * null, is not a key of allocation_map_, or its handle converts to a null
+ * object.
+ *
+ * The not-found case used to return HSAKMT_STATUS_ERROR, which converts to
+ * `true` in a bool function and therefore reported success without touching
+ * any reference.
+ * NOTE(review): addresses served by the fragment sub-allocator are not keys
+ * of allocation_map_; confirm no caller relied on the old accidental `true`
+ * for such buffers.
+ */
+bool queue_release_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Validate before use; the reference is dropped while the map lock is held. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->PutQueueReference();
+  return true;
+}
+
+/*
+ * Look up the GPU memory object tracked under exactly MemoryAddress.
+ * Returns nullptr when the address is not a key of allocation_map_.
+ */
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) {
+  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+
+  auto entry = allocation_map_->find(MemoryAddress);
+  if (entry != allocation_map_->end())
+    return wsl::thunk::GpuMemory::Convert(entry->second.handle);
+
+  return nullptr;
+}
+
+/*
+ * Report the value of VramAvail() for the device behind Node.
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when AvailableBytes is null and
+ * HSAKMT_STATUS_ERROR when no device is found for Node.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
+                                              HSAuint64 *AvailableBytes) {
+  CHECK_DXG_OPEN();
+
+  if (AvailableBytes == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (wsl::thunk::WDDMDevice *device = get_wddmdev(Node)) {
+    *AvailableBytes = device->VramAvail();
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/*
+ * Stub: not implemented; both arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: all arguments are ignored. Asserts unconditionally (without the
+ * warning its sibling stubs log); with assert() compiled out it reports
+ * HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+                                                    HSAuint64 MemorySizeInBytes,
+                                                    HSAuint64 NumberOfNodes,
+                                                    HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Validate a host-memory registration request. No state is recorded here;
+ * the function only checks its arguments.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address or for the
+ * ExtendedCoherent + CoarseGrain combination, HSAKMT_STATUS_NOT_SUPPORTED
+ * when the flags do not describe paged host memory or the runtime is not
+ * flagged as dGPU, and HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  const bool conflicting_coherency =
+      MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain;
+  if (conflicting_coherency)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  // Registered memory should be ordinary paged host memory.
+  const bool paged_host_memory =
+      (MemFlags.ui32.HostAccess == 1) && (MemFlags.ui32.NonPaged != 1);
+  if (!paged_host_memory)
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  /* TODO: support mixed APU and dGPU configurations */
+  return dxg_runtime->hsakmt_is_dgpu ? HSAKMT_STATUS_SUCCESS
+                                     : HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Tell whether fd refers to one of the "rocr4wsl_gtt" objects by resolving
+ * the /proc/self/fd/<fd> symlink and searching its target for that marker.
+ * Returns false when the link cannot be read.
+ */
+bool is_ipc_sysmemfd(int fd) {
+  char target[256];
+  const std::string link_path = std::string("/proc/self/fd/") + std::to_string(fd);
+
+  /* readlink() does not terminate the buffer; keep one byte for the NUL. */
+  const ssize_t length = readlink(link_path.c_str(), target, sizeof(target) - 1);
+  if (length == -1)
+    return false;
+
+  target[length] = '\0';
+  const char *marker = strstr(target, "rocr4wsl_gtt");
+  return marker != nullptr;
+}
+
+/*
+ * Register a graphics resource handle with all register flags cleared:
+ * forwards to hsaKmtRegisterGraphicsHandleToNodesExt() and returns its status.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
+                                                            HsaGraphicsResourceInfo *GraphicsResourceInfo,
+                                                            HSAuint64 NumberOfNodes,
+                                                            HSAuint32 *NodeArray) {
+  HSA_REGISTER_MEM_FLAGS regFlags;
+  regFlags.Value = 0;
+
+  return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
+            GraphicsResourceInfo,
+            NumberOfNodes,
+            NodeArray,
+            regFlags);
+}
+
+/*
+ * Import a graphics resource (passed as a file descriptor in
+ * GraphicsResourceHandle) and describe it in *GraphicsResourceInfo.
+ *
+ * GraphicsResourceInfo  output record; must not be null.
+ * NumberOfNodes         0 selects dxg_runtime->default_node and clears
+ *                       RegisterFlags.ui32.requiresVAddr.
+ * NodeArray             only NodeArray[0] is used; must not be null when
+ *                       NumberOfNodes is non-zero.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for missing pointers, the status
+ * of import_dmabuf_fd() when the import fails, HSAKMT_STATUS_SUCCESS
+ * otherwise. For descriptors recognised by is_ipc_sysmemfd() nothing is
+ * imported and only NodeId is filled in.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
+							       HsaGraphicsResourceInfo *GraphicsResourceInfo,
+							       HSAuint64 NumberOfNodes,
+							       HSAuint32 *NodeArray,
+							       HSA_REGISTER_MEM_FLAGS RegisterFlags) {
+  CHECK_DXG_OPEN();
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  /* The record is written on every path below, so reject null up front. */
+  if (!GraphicsResourceInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (is_ipc_sysmemfd(GraphicsResourceHandle)) {
+    GraphicsResourceInfo->NodeId = dxg_runtime->default_node;
+    pr_info("skip register sysmemfd. It would be released in next step\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  if (NumberOfNodes == 0) {
+    RegisterFlags.ui32.requiresVAddr = 0;
+    NumberOfNodes = 1;
+    NodeArray = (HSAuint32*)&(dxg_runtime->default_node);
+  } else if (!NodeArray) {
+    /* A node count without a node list cannot be honoured. */
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  pr_debug("number of nodes %lu\n", NumberOfNodes);
+  wsl::thunk::GpuMemoryHandle mem_handle;
+  ret = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0],
+                          RegisterFlags.ui32.requiresVAddr,
+                          false, &mem_handle);
+  if (ret != HSAKMT_STATUS_SUCCESS) {
+    pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, "
+           "GraphicsResourceHandle: %lu, NodeId: %u\n",
+           GraphicsResourceHandle, NodeArray[0]);
+    return ret;
+  }
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle);
+  GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId();
+  GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize();
+  /* With a VA the GPU address is reported, otherwise the handle-aperture address. */
+  GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ?
+                                          reinterpret_cast<void *>(gpu_mem->GpuAddress()):
+                                          reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  return ret;
+}
+
+/*
+ * Export the tracked allocation containing MemoryAddress as a dma-buf fd.
+ *
+ * MemoryAddress      any address inside a tracked allocation.
+ * MemorySizeInBytes  not used by this implementation.
+ * DMABufFd           out: a dup() of the export fd; the caller owns it.
+ * Offset             out: MemoryAddress minus the allocation's gpu_addr.
+ *
+ * The first export of an allocation is cached in its map entry (and closed
+ * when the allocation is freed); every call hands out a fresh duplicate.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for null pointers,
+ * HSAKMT_STATUS_ERROR when no tracked allocation contains the address or the
+ * export/dup fails, HSAKMT_STATUS_SUCCESS otherwise. The outputs are only
+ * written on success.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+                                                 HSAuint64 MemorySizeInBytes,
+                                                 int *DMABufFd,
+                                                 HSAuint64 *Offset) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress || !DMABufFd || !Offset)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+
+  /* Last entry whose key is <= MemoryAddress. */
+  auto it = allocation_map_->upper_bound(MemoryAddress);
+  if (it == allocation_map_->begin())
+    return HSAKMT_STATUS_ERROR;
+  --it;
+
+  /* Being the nearest lower entry is not enough: the address must fall
+   * inside that allocation, otherwise an unrelated buffer would be exported. */
+  const uintptr_t base = reinterpret_cast<uintptr_t>(it->first);
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(MemoryAddress);
+  if (addr - base >= it->second.size)
+    return HSAKMT_STATUS_ERROR;
+
+  if (it->second.dmabuf_fd == -1) {
+    auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    int exported_fd = -1;
+    auto code = gpu_mem->ExportPhysicalHandle(&exported_fd);
+    if (code != ErrorCode::Success)
+      return HSAKMT_STATUS_ERROR;
+    it->second.dmabuf_fd = exported_fd;
+  }
+
+  const int caller_fd = dup(it->second.dmabuf_fd);
+  if (caller_fd < 0)
+    return HSAKMT_STATUS_ERROR;
+
+  *DMABufFd = caller_fd;
+  *Offset = reinterpret_cast<uint64_t>(MemoryAddress) - it->second.gpu_addr;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Not supported by this implementation: all arguments are ignored and the
+ * call reports HSAKMT_STATUS_NOT_SUPPORTED after CHECK_DXG_OPEN().
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes,
+                      uint64_t *SharedMemoryHandle) {
+	CHECK_DXG_OPEN();
+
+	return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Import the buffer behind DMABufFd on node NodeId and track it in
+ * allocation_map_.
+ *
+ * DMABufFd      descriptor to import.
+ * NodeId        node whose device performs the import.
+ * alloc_va      true to give the import a GPU virtual address; the map key is
+ *               then the GPU address, otherwise the handle-aperture address.
+ * is_ipc_memfd  true when the descriptor may be a system-memory IPC object;
+ *               its size is then taken from fstat().
+ * GpuMemHandle  out: handle of the imported (or already known) object.
+ *
+ * When the device reports that the buffer originates from this process on
+ * the same device, the existing entry is reused and its shared reference
+ * count is incremented.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null GpuMemHandle,
+ * HSAKMT_STATUS_ERROR when the node has no device, the descriptor cannot be
+ * inspected or the import fails, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle) {
+  CHECK_DXG_OPEN();
+
+  if (!GpuMemHandle)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  *GpuMemHandle = nullptr;
+  wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.dmabuf_fd = DMABufFd;
+  create_info.flags.alloc_va = alloc_va;
+
+  if (is_ipc_memfd) {
+    struct stat st;
+    /* Without a successful fstat() the size below would be read from an
+     * uninitialized structure. */
+    if (fstat(DMABufFd, &st) != 0) {
+      pr_err("fail to stat DMABufFd %d\n", DMABufFd);
+      return HSAKMT_STATUS_ERROR;
+    }
+    uint64_t sz = st.st_size;
+    /* Only page-multiple sizes below the system heap size are treated as sysmem. */
+    if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
+      pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
+      create_info.flags.sysmem_ipc_sig_importer = 1;        // set to 1 when backend is system memory
+      create_info.size = st.st_size;
+    }
+  }
+
+  gpusize gpu_va = 0;
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va);
+  if (code == ErrorCode::SameProcessSameDevice) {
+    /* Unit_hipMemPoolExportToShareableHandle_SameProc */
+    pr_info("imported from same process, use the old one\n");
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+    auto it = allocation_map_->find((void*)gpu_va);
+    if (it == allocation_map_->end()) {
+      /* Report the address that was actually looked up. */
+      pr_err("where's the conflict buffer? va %#lx\n", (unsigned long)gpu_va);
+      return HSAKMT_STATUS_ERROR;
+    }
+    wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    conflict_mem->IncSharedReference();
+    *GpuMemHandle = it->second.handle;
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (code != ErrorCode::Success) {
+    pr_err("fail to import fd, ret %d\n", (int)code);
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  void *MemoryAddress;
+  if (alloc_va)
+    MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+  else
+    MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  *GpuMemHandle = gpu_mem->GetGpuMemoryHandle();
+
+  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+  /*
+   * the gpu_mem->Flags() need convert back from GpuMemoryCreateFlags to
+   * HsaMemFlags, reference hsaKmtAllocMemoryAlign
+   * */
+  (*allocation_map_)[MemoryAddress] = Allocation(
+    *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress,
+    gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(),
+    NodeId, gpu_mem->Flags());
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+
+/*
+ * Stub: not implemented; all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS
+ * without writing *SharedMemoryHandle.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes,
+                  HsaSharedMemoryHandle *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented; all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS
+ * without writing either output.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
+                           void **MemoryAddress, HSAuint64 *SizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented; all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS
+ * without writing either output.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
+    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
+    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Deprecated entry point: all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports
+ * HSAKMT_STATUS_NOT_IMPLEMENTED and leaves *SizeCopied untouched.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
+                                            HsaMemoryRange *LocalMemoryArray,
+                                            HSAuint64 LocalMemoryArrayCount,
+                                            HsaMemoryRange *RemoteMemoryArray,
+                                            HSAuint64 RemoteMemoryArrayCount,
+                                            HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Deprecated entry point: all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports
+ * HSAKMT_STATUS_NOT_IMPLEMENTED and leaves *SizeCopied untouched.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
+                                             HsaMemoryRange *LocalMemoryArray,
+                                             HSAuint64 LocalMemoryArrayCount,
+                                             HsaMemoryRange *RemoteMemoryArray,
+                                             HSAuint64 RemoteMemoryArrayCount,
+                                             HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Drop the registration tracked under MemoryAddress.
+ *
+ * - Unknown addresses are accepted and report success.
+ * - Imported IPC VRAM: one shared reference is released; the entry and the
+ *   GPU memory object are destroyed once the count reaches zero.
+ * - Userptr mappings: the entry found under MemoryAddress and the companion
+ *   entry keyed by its GPU address are removed and the object is destroyed.
+ * - Any other tracked allocation is left untouched.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address and
+ * HSAKMT_STATUS_SUCCESS in every other case.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  {
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+
+    auto it = allocation_map_->find(MemoryAddress);
+    if (it == allocation_map_->end()) {
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    wsl::thunk::GpuMemoryDescFlags flags;
+    flags.reserved = gpu_mem->Flags();
+    // IPC mem(vram)
+    if (flags.is_imported_vram_ipc &&
+      gpu_mem->DecSharedReference() == 0) {
+      allocation_map_->erase(it);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+    if (it->second.userptr) {
+      /* Read the companion key before erasing: `it` is invalid afterwards. */
+      const void *gpu_key = reinterpret_cast<const void *>(it->second.gpu_addr);
+      allocation_map_->erase(it);
+      allocation_map_->erase(gpu_key);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Map memory on the default node only: forwards to
+ * hsaKmtMapMemoryToGPUNodes() with a one-element node list holding
+ * dxg_runtime->default_node and with all map flags cleared.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes,
+                                             HSAuint64 *AlternateVAGPU) {
+
+  HSAuint64 NumberOfNodes = 1;
+  HSAuint32 NodeArray[] = {dxg_runtime->default_node};
+  HsaMemMapFlags MemMapFlags;
+  MemMapFlags.Value = 0;
+
+  return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes, AlternateVAGPU,
+    MemMapFlags, NumberOfNodes, NodeArray);
+}
+/*
+ * Make a memory range usable by the GPU and report its GPU address.
+ *
+ * The range is handled by the first matching case:
+ *  1. inside a sub-allocator block      -> nothing to do, success;
+ *  2. tracked imported IPC VRAM         -> map its VA, make it resident and
+ *                                          wait for the paging fence;
+ *  3. tracked non-userptr allocation    -> GPU address equals the CPU
+ *                                          address (size must fit);
+ *  4. already-mapped userptr            -> reuse the existing mapping;
+ *  5. anything else                     -> create a new userptr mapping on
+ *                                          NodeArray[0] and track it under
+ *                                          both the CPU and the GPU address.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS or HSAKMT_STATUS_ERROR. MemMapFlags and
+ * NumberOfNodes are not used.
+ * NOTE(review): cases 1 and 2 return success without writing
+ * *AlternateVAGPU - confirm callers do not read it on those paths.
+ * NOTE(review): NodeArray[0] is read in case 5 without a null check.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
+    HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress || !AlternateVAGPU) {
+    pr_err("FIXME: mapping NULL pointer\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* Widen the range to 4 KiB boundaries. */
+  uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096);
+  uint64_t end =
+      wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
+
+  void *aligned_ptr = (void *)start;
+  size_t aligned_size = end - start;
+
+  {
+    if (nullptr != fragment_allocator_.block_base(aligned_ptr))
+      return HSAKMT_STATUS_SUCCESS;
+  }
+
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->find(aligned_ptr);
+    if (it != allocation_map_->end()) {
+      wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+      wsl::thunk::GpuMemoryDescFlags flags;
+      flags.reserved = gpu_mem->Flags();
+      // IPC mem
+      if (flags.is_imported_vram_ipc) {
+
+        auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size());
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        code = gpu_mem->MakeResident();
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice();
+        if (!dev->WaitOnPagingFenceFromCpu())
+          return HSAKMT_STATUS_ERROR;
+
+        return HSAKMT_STATUS_SUCCESS;
+      }
+
+      if (!it->second.userptr) {
+      // GTT/Local mem
+        if (it->second.size >= MemorySizeInBytes) {
+          *AlternateVAGPU = (uint64_t)MemoryAddress;
+          return HSAKMT_STATUS_SUCCESS;
+        } else {
+          return HSAKMT_STATUS_ERROR;
+        }
+      }
+    }
+
+    // userptr mem
+    /* Userptr entries are keyed by the unaligned address the caller passed. */
+    it = allocation_map_->find(MemoryAddress);
+    if (it != allocation_map_->end()) {
+      if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
+        *AlternateVAGPU =
+            (uintptr_t)it->second.gpu_addr +
+            ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
+        return HSAKMT_STATUS_SUCCESS;
+      }
+    }
+  }
+
+  // map userptr
+  /* The map lock is not held while the new GPU memory object is created. */
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryHandle handle = 0;
+  uint64_t addr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.domain = thunk_proxy::kUserMemory;
+  create_info.size = aligned_size;
+  create_info.user_ptr = aligned_ptr;
+
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    addr = gpu_mem->GpuAddress();
+    handle = gpu_mem->GetGpuMemoryHandle();
+  } else {
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  {
+    /* Two entries share one handle: one keyed by the CPU address, one by the GPU address. */
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+   (*allocation_map_)[MemoryAddress] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
+                   MemorySizeInBytes);
+    (*allocation_map_)[(void *)addr] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
+                   MemorySizeInBytes);
+  }
+
+  /* Re-apply the caller's offset inside the aligned range. */
+  *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Undo a GPU mapping.
+ *
+ * Real work is only done for imported IPC VRAM that was not shared from this
+ * process: its GPU virtual address is unmapped and the memory is evicted.
+ * Sub-allocated addresses and every other tracked allocation report success
+ * without any change; the map entry itself is never removed here.
+ *
+ * Returns HSAKMT_STATUS_ERROR when the address is unknown, the buffer is
+ * still referenced by a queue, or the unmap fails; HSAKMT_STATUS_SUCCESS
+ * otherwise (including for a null address).
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress) {
+    /* Workaround for runtime bug */
+    pr_err("FIXME: Unmapping NULL pointer\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  {
+    if (nullptr != fragment_allocator_.block_base(MemoryAddress))
+      return HSAKMT_STATUS_SUCCESS;
+  }
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    auto it = allocation_map_->find(MemoryAddress);
+    if (it == allocation_map_->end()) {
+      return HSAKMT_STATUS_ERROR;
+    }
+
+    gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    if (gpu_mem->IsQueueReferenced())
+      return HSAKMT_STATUS_ERROR;
+
+    // IPC mem
+    wsl::thunk::GpuMemoryDescFlags flags;
+    flags.reserved = gpu_mem->Flags();
+    if (flags.is_imported_vram_ipc &&
+        !gpu_mem->IsSharedFromSameProcess()) {
+      auto code = gpu_mem->UnmapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size());
+      if (code != ErrorCode::Success)
+        return HSAKMT_STATUS_ERROR;
+      /* The result of Evict() is not checked. */
+      gpu_mem->Evict();
+
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Not implemented: all arguments are ignored. Logs a one-time warning and
+ * reports HSAKMT_STATUS_NOT_IMPLEMENTED; unlike its sibling stubs it does
+ * not assert.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
+                                               HSAuint64 GraphicDeviceHandle,
+                                               HSAuint64 GraphicResourceHandle,
+                                               HSAuint64 GraphicResourceOffset,
+                                               HSAuint64 GraphicResourceSize,
+                                               HSAuint64 *FlatMemoryAddress) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  /* This API was only ever implemented in KFD for Kaveri and
+   * was never upstreamed. There are no open-source users of
+   * this interface. It has been superseded by
+   * RegisterGraphicsHandleToNodes.
+   */
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Stub: not implemented; all arguments are ignored. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
+                                                 HSAuint64 FlatMemoryAddress,
+                                                 HSAuint64 SizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented; *config is never written. Logs a one-time warning
+ * and asserts; with assert() compiled out it reports HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId,
+                                            HsaGpuTileConfig *config) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Describe the tracked allocation that contains Pointer.
+ *
+ * *PointerInfo is zeroed first. The containing allocation is the last map
+ * entry whose key is <= Pointer, provided Pointer lies within that entry's
+ * size_requested bytes.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for null arguments,
+ * HSAKMT_STATUS_ERROR (with Type = HSA_POINTER_UNKNOWN) when no allocation
+ * contains the pointer, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+                                               HsaPointerInfo *PointerInfo) {
+  CHECK_DXG_OPEN();
+
+  if (!Pointer || !PointerInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("pointer %p\n", Pointer);
+
+  memset(PointerInfo, 0, sizeof(HsaPointerInfo));
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  Allocation allocation_info;
+  bool found = false;
+  {
+    /* The entry is copied so the lock can be dropped before filling the result. */
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->upper_bound(Pointer);
+    if (it != allocation_map_->begin()) {
+      --it;
+      if (Pointer >= it->first &&
+        (Pointer < reinterpret_cast<const uint8_t*>(it->first) + it->second.size_requested)) {
+        allocation_info = it->second;
+        gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+        found = true;
+      }
+    }
+  }
+
+  if (!found) {
+    pr_debug("can't found allocation for %p\n", Pointer);
+    PointerInfo->Type = HSA_POINTER_UNKNOWN;
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* NOTE(review): gpu_mem is used below after the map lock was released. */
+  if (allocation_info.userptr) {
+    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
+    PointerInfo->SizeInBytes = allocation_info.size;
+  } else if (gpu_mem->IsVirtual()) {
+    /* SizeInBytes stays 0 (from the memset) for reserved address ranges. */
+    PointerInfo->Type = HSA_POINTER_RESERVED_ADDR;
+  } else {
+    PointerInfo->Type = HSA_POINTER_ALLOCATED;
+    PointerInfo->SizeInBytes = allocation_info.size_requested;
+  }
+
+  PointerInfo->Node = allocation_info.node_id;
+  PointerInfo->MemFlags.Value = allocation_info.mem_flags_value;
+  PointerInfo->CPUAddress = allocation_info.cpu_addr;
+  PointerInfo->GPUAddress = allocation_info.gpu_addr;
+  PointerInfo->UserData = allocation_info.rocr_userdata;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Attach an opaque UserData pointer to the allocation tracked under Pointer
+ * rounded down to a 4 KiB boundary. The value is later reported by
+ * hsaKmtQueryPointerInfo().
+ * Returns HSAKMT_STATUS_ERROR when no entry is keyed by that address.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+                                                void *UserData) {
+  CHECK_DXG_OPEN();
+
+  const uint64_t page_base = wsl::AlignDown((uint64_t)Pointer, 4096);
+  void *const key = (void *)page_base;
+
+  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+  auto entry = allocation_map_->find(key);
+  if (entry == allocation_map_->end())
+    return HSAKMT_STATUS_ERROR;
+
+  entry->second.rocr_userdata = UserData;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Not supported here: logs a one-time warning and asserts regardless of
+ * SANITIZER_AMDGPU. With assert() compiled out, the result is
+ * HSAKMT_STATUS_SUCCESS when SANITIZER_AMDGPU is defined and
+ * HSAKMT_STATUS_NOT_SUPPORTED otherwise; no page is replaced either way.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  assert(false);
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
+
+/*
+ * Not supported here: logs a one-time warning and asserts regardless of
+ * SANITIZER_AMDGPU. With assert() compiled out, the result is
+ * HSAKMT_STATUS_SUCCESS when SANITIZER_AMDGPU is defined and
+ * HSAKMT_STATUS_NOT_SUPPORTED otherwise; no page is returned either way.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  assert(false);
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  return HSAKMT_STATUS_SUCCESS;
+#else
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+#endif
+}
@@ -0,0 +1,626 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <cstring>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstdio>
+#include <strings.h>
+#include <cassert>
+
+
+/* Global runtime state object, heap-allocated during static initialization.
+ * clear_after_fork() deletes it and allocates a fresh instance. */
+hsakmtRuntime *dxg_runtime = new hsakmtRuntime();
+
+/* Reserve the local and system heap ranges, choose the handle aperture, and
+ * then create one VA manager for each of the three ranges.
+ * NOTE(review): the bool results of ReserveLocalHeapSpace(),
+ * ReserveSystemHeapSpace() and InitHandleApertureSpace() are discarded, so
+ * the managers are created even when a reservation failed. */
+void hsakmtRuntime::HeapInit() {
+    ReserveLocalHeapSpace();
+    ReserveSystemHeapSpace();
+    InitHandleApertureSpace();
+    InitLocalHeapMgr();
+    InitSystemHeapMgr();
+    InitHandleApertureMgr();
+}
+
+/* Release the system heap range and then the local heap range. The results
+ * of both calls are ignored. */
+void hsakmtRuntime::HeapFini() {
+    FreeSystemHeapSpace();
+    FreeLocalHeapSpace();
+}
+
+/*
+ * Reserve a range of `size` bytes that is valid both as a CPU virtual range
+ * (PROT_NONE anonymous mapping) and as a GPU virtual range on every adapter.
+ *
+ * base  [out]    start of the reserved range; 0 when the reservation fails.
+ * size  [in/out] requested size in bytes; set to 0 when the reservation fails.
+ * align [in]     slack added to the CPU mapping (size + align) so the GPU
+ *                reservation can be placed inside it.
+ *
+ * Returns true when, within one of up to 16 attempts, every adapter accepted
+ * the reservation.
+ *
+ * NOTE(review): when num_adapters is 0, match_cnt == num_adapters holds on the
+ * first attempt while local_va is still 0 - confirm callers never get here
+ * without adapters.
+ * NOTE(review): GPU ranges reserved during an attempt that only partially
+ * matched are not released in this function, and `base` keeps the value from
+ * that attempt for the next one - confirm this is intended.
+ */
+bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) {
+    uint64_t sys_va[16] = {0};
+    uint64_t local_va;
+    uint64_t sys_va_size;
+    int match_index = -1;
+    void* ptr = NULL;
+
+    wsl::thunk::WDDMDevice* device;
+    size_t num_adapters = get_num_wddmdev();
+
+    base = 0;
+    sys_va_size = size + align;
+
+    /* Retry up to 16 times to find an available range. */
+    for (int i = 0; i < 16; i++) {
+        local_va = 0;
+        ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+        if (ptr == MAP_FAILED) {
+            pr_err("fail to reserve cpu va in %d time!\n", i);
+            break;
+        }
+
+        /* Remember every CPU mapping so the unused ones can be unmapped below. */
+        sys_va[i] = (uint64_t)ptr;
+
+        int match_cnt = 0;
+        for (uint32_t j = 0; j < num_adapters; j++) {
+            device = get_wddmdev(j+1);
+            /* Until a GPU address has been chosen, search inside the whole CPU
+             * mapping; afterwards ask for exactly the chosen range. */
+            uint64_t start = (base == 0) ? (uint64_t)ptr : base;
+            uint64_t end = start + ((base == 0) ? sys_va_size : size) + 1;
+
+            if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress(
+                        device->GetAdapter(), size,
+                        start,
+                        end, &local_va) == ErrorCode::Success) {
+
+                match_cnt++;
+                base = local_va;
+                pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n",
+                        local_va, ptr, i);
+            } else {
+                pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n",
+                        __FUNCTION__, ptr, i);
+            }
+        }
+
+        if (match_cnt == num_adapters) {
+                match_index = i;
+                break;
+        }
+    }
+
+    if (match_index >= 0) {
+        /* Release the unused CPU parts before and after the chosen range. */
+        uint64_t left_size = local_va - sys_va[match_index];
+        uint64_t right_size = align - left_size;
+        if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size))
+            pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size);
+        if ((right_size > 0) && munmap((void*)(local_va + size), right_size))
+            pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size);
+    } else {
+        pr_err("fail to reserve Local Heap Space!\n");
+        base = 0;
+        size = 0;
+    }
+
+    /* Unmap the CPU mappings of the attempts that did not match: those before
+     * match_index on success, all 16 slots on failure. */
+    int free = match_index >= 0 ? match_index : 16;
+    for (int j = 0; j < free; j++) {
+        if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) {
+            pr_err("fail to unmap %d %lx\n", j, sys_va[j]);
+        }
+    }
+
+    return match_index >= 0;
+}
+
+/*
+ * Reserve the local heap range: one range that is available in both the CPU
+ * virtual address space and the GPU virtual address space.
+ *
+ * The CPU reservation made by ReserveSvmSpace() is `align` (1G) larger than
+ * the GPU range; otherwise ReserveGPUVirtualAddress returns an error.
+ *
+ * The requested size is four times the largest per-device heap size, rounded
+ * up to 1G. Returns true on success and false when a device cannot be looked
+ * up or the reservation fails.
+ */
+bool hsakmtRuntime::ReserveLocalHeapSpace() {
+    wsl::thunk::WDDMDevice* device;
+    uint64_t total_local_size = 0;
+    uint64_t align = 0x40000000; /* 1G */
+    size_t num_adapters = get_num_wddmdev();
+
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        device = get_wddmdev(j+1);
+        /* Was `return -1`, which converts to true in a bool function. */
+        if (device == nullptr)
+            return false;
+        /*
+         * For APU, use non local memory(shared GPU memory) as GPU memory,
+         * because it has small local memory
+        */
+        if (device->IsDgpu())
+          total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size);
+        else
+          total_local_size = wsl::Max(device->LocalHeapSize(), device->NonLocalHeapSize(), total_local_size);
+    }
+
+    total_local_size = wsl::AlignUp(total_local_size, align) * 4;
+    local_heap_space_start_ = 0;
+    local_heap_space_size_ = total_local_size;
+
+    return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align);
+}
+
+/*
+ * Release a range obtained from ReserveSvmSpace(): free the GPU virtual range
+ * on every adapter, then unmap the CPU range.
+ *
+ * base [in/out] start of the range; reset to 0 once the CPU range was handled.
+ * size [in/out] size of the range; reset to 0 once the CPU range was handled.
+ *
+ * Returns true when munmap() succeeds. Returns false when a device cannot be
+ * looked up (base and size are then left untouched) or when munmap() fails.
+ */
+bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) {
+    wsl::thunk::WDDMDevice* device;
+    size_t num_adapters = get_num_wddmdev();
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        device = get_wddmdev(j+1);
+        /* Was `return -1`, which converts to true in a bool function. */
+        if (device == nullptr)
+            return false;
+        wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size);
+    }
+
+    void *cpu = (void *)base;
+    auto r = (munmap(cpu, size) == 0);
+    base = 0;
+    size = 0;
+    return r;
+}
+
+/* Release the local heap range via FreeSvmSpace() and return its result. */
+bool hsakmtRuntime::FreeLocalHeapSpace() {
+    return FreeSvmSpace(local_heap_space_start_, local_heap_space_size_);
+}
+
+/* Create the VA manager for the local heap from the current start/size
+ * members, constructed with DEFAULT_GPU_PAGE_SIZE as its third argument. */
+void hsakmtRuntime::InitLocalHeapMgr() {
+  local_heap_mgr_ = std::make_unique<wsl::thunk::VaMgr>(local_heap_space_start_,
+                                          local_heap_space_size_,
+                                          DEFAULT_GPU_PAGE_SIZE);
+}
+
+/*
+ * Reserve the system heap range, sized from the amount of system RAM.
+ *
+ * The size is twice the total RAM rounded up to 4G, capped at 1T; the minimum
+ * is therefore 8G. Returns true when the reservation succeeds. Returns false,
+ * with start and size set to 0, when sysinfo() fails, and false when the
+ * reservation itself fails.
+ */
+bool hsakmtRuntime::ReserveSystemHeapSpace() {
+    struct sysinfo info;
+    uint64_t max_ram = 0x10000000000;
+    uint64_t alignment = 0x100000000;
+
+    /* Check the result at run time: an assert is compiled out with NDEBUG and
+     * would leave `info` uninitialized. */
+    if (sysinfo(&info) != 0) {
+        pr_err("sysinfo failed\n");
+        system_heap_space_start_ = 0;
+        system_heap_space_size_ = 0;
+        return false;
+    }
+
+    /* sysinfo reports memory sizes in multiples of mem_unit bytes. */
+    uint64_t total_ram = (uint64_t)info.totalram * (uint64_t)info.mem_unit;
+
+    // minimum of reserve size is 8G, maximum of reserve size is 1T.
+    system_heap_space_size_ = std::min(wsl::AlignUp(total_ram, alignment) * 2, max_ram);
+
+    return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment);
+}
+
+/* Release the system heap range via FreeSvmSpace() and return its result. */
+bool hsakmtRuntime::FreeSystemHeapSpace(void) {
+    return FreeSvmSpace(system_heap_space_start_, system_heap_space_size_);
+}
+
+/*
+ * Make [addr, addr + size) usable by remapping it in place (MAP_FIXED) as a
+ * private anonymous mapping with read, write and execute permission.
+ *
+ * addr  start of the range to commit.
+ * size  length of the range in bytes.
+ * lock  when true, MAP_LOCKED is added to the mapping flags.
+ *
+ * Returns false when mmap() fails, true otherwise. A failing madvise() is
+ * only logged.
+ */
+bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) {
+    int32_t protFlags = PROT_READ | PROT_WRITE | PROT_EXEC;
+    int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|
+        MAP_NORESERVE|MAP_UNINITIALIZED;
+    if (lock)
+        mapFlags |= MAP_LOCKED;
+    void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0);
+    if (paddr == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
+        return false;
+    }
+    assert(addr == paddr);
+
+    /*if (!Runtime::runtime_singleton_->PinWARequired())
+      return true;*/
+
+    /*
+     * Do not make the pages in this range available to the child
+     * after a fork(2).  This is useful to prevent copy-on-write
+     * semantics from changing the physical location of a page if
+     * the parent writes to it after a fork(2).  (Such page
+     * relocations cause problems for hardware that DMAs into the
+     * page.)
+     *
+     * https://man7.org/linux/man-pages/man2/madvise.2.html
+     */
+    if (madvise(addr, size, MADV_DONTFORK))
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+
+    return true;
+}
+
+/*
+ * Return [addr, addr + size) to the reserved state by remapping it in place
+ * (MAP_FIXED) as a PROT_NONE private anonymous mapping.
+ *
+ * Returns false when mmap() fails, true otherwise.
+ */
+bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) {
+    int32_t protFlags = PROT_NONE;
+    int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|
+        MAP_NORESERVE|MAP_UNINITIALIZED;
+    void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0);
+    if (paddr == MAP_FAILED) {
+        pr_err("fail to decommit addr = %p, paddr = %p\n", addr, paddr);
+        return false;
+    }
+    assert(addr == paddr);
+    return true;
+}
+
+/* Create the VA manager for the system heap from the current start/size
+ * members, constructed with DEFAULT_GPU_PAGE_SIZE as its third argument. */
+void hsakmtRuntime::InitSystemHeapMgr() {
+  system_heap_mgr_ = std::make_unique<wsl::thunk::VaMgr>(system_heap_space_start_,
+                                          system_heap_space_size_,
+                                          DEFAULT_GPU_PAGE_SIZE);
+}
+
+/*
+ * Allocate a virtual address range from the system or local heap manager.
+ *
+ * domain            thunk_proxy::kSystem selects the system heap (the range
+ *                   is also committed on the CPU side); any other value
+ *                   selects the local heap.
+ * hit_base_addr     address hint passed through to the VA manager.
+ * size              size of the range in bytes.
+ * out_gpu_virt_addr [out] allocated address, or 0 on failure.
+ * alignment         requested alignment; 0 means 64K. Ranges of at least
+ *                   GPU_HUGE_PAGE_SIZE are aligned to GPU_HUGE_PAGE_SIZE.
+ * lock              forwarded to CommitSystemHeapSpace() for system ranges.
+ *
+ * Returns ErrorCode::Success, ErrorCode::OutOfMemory (system heap exhausted),
+ * ErrorCode::SyscallFail (commit failed) or ErrorCode::OutOfGpuMemory (local
+ * heap exhausted).
+ */
+ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize hit_base_addr, gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) {
+    gpusize gpu_addr = 0;
+    ErrorCode code = ErrorCode::Success;
+
+    uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment
+    if (size >= GPU_HUGE_PAGE_SIZE)
+        align = GPU_HUGE_PAGE_SIZE;
+
+    if (domain == thunk_proxy::kSystem) {
+        gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0) {
+            /* Nothing was allocated: do not commit or free address 0 and do
+             * not let a later step overwrite the out-of-memory code. */
+            code = ErrorCode::OutOfMemory;
+        } else if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) {
+            system_heap_mgr_->Free(gpu_addr);
+            code = ErrorCode::SyscallFail;
+        }
+    } else {
+        gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0)
+            code = ErrorCode::OutOfGpuMemory;
+    }
+
+    *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0;
+    return code;
+}
+
+/*
+ * Give a range back to the heap manager it came from. For
+ * thunk_proxy::kSystem the CPU range is decommitted first; any other domain
+ * is returned to the local heap manager.
+ *
+ * Always returns ErrorCode::Success; the result of
+ * DecommitSystemHeapSpace() is not checked.
+ */
+ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize gpu_addr, gpusize size) {
+    auto code = ErrorCode::Success;
+
+    if (domain == thunk_proxy::kSystem) {
+        DecommitSystemHeapSpace((void *)gpu_addr, size);
+        system_heap_mgr_->Free(gpu_addr);
+    } else {
+        local_heap_mgr_->Free(gpu_addr);
+    }
+
+    return code;
+}
+
+/*
+ * Map a memfd over [addr, addr + size) as a shared mapping with read and
+ * write permission, placed with MAP_FIXED.
+ *
+ * addr   start of the range to commit.
+ * size   length of the range in bytes.
+ * memfd  [in/out] descriptor to map. When it is -1 on entry, a new memfd is
+ *        created, sized to `size`, and stored here on success.
+ * lock   when true, MAP_LOCKED is added to the mapping flags.
+ *
+ * Returns false when memfd_create(), ftruncate() or mmap() fails; a
+ * descriptor created by this call is closed again in that case and memfd is
+ * left unchanged. A failing madvise() is only logged.
+ */
+bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
+    int fd = -1;
+
+    if (memfd == -1) {
+        fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
+        if (fd < 0) {
+            pr_err("memfd_create failed\n");
+            return false;
+        }
+
+        /* Without a successful resize the file stays empty and cannot back
+         * the mapping, so treat a failure as fatal for this call. */
+        if (ftruncate(fd, size) != 0) {
+            pr_err("ftruncate failed\n");
+            close(fd);
+            return false;
+        }
+    } else {
+        fd = memfd;
+    }
+
+    int32_t protFlags = PROT_READ | PROT_WRITE;
+    int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
+        MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);
+
+    void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
+    if (paddr == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
+        /* Only close the descriptor when this call created it. */
+        if (memfd == -1)
+            close(fd);
+        return false;
+    }
+    assert(addr == paddr);
+
+    memfd = fd;
+
+    if (madvise(addr, size, MADV_DONTFORK))
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+
+    return true;
+}
+
+/*
+ * Unmap [addr, addr + size) and close the backing memfd.
+ *
+ * memfd [in/out] descriptor to close; set to -1 on success.
+ *
+ * Returns false when munmap() fails; the descriptor is then left open and
+ * memfd is unchanged.
+ */
+bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
+    if (munmap(addr, size) != 0) {
+        pr_err("fail to unmap = %p \n", addr);
+        return false;
+    }
+    close(memfd);
+    memfd = -1;
+    return true;
+}
+
+/*
+ * Allocate a system-heap range and back it with a shareable memfd mapping.
+ *
+ * size              size of the range in bytes.
+ * out_gpu_virt_addr [out] allocated address on success, 0 when the commit
+ *                   fails; not written when the heap manager has no space.
+ * alignment         alignment passed to the VA manager.
+ * memfd             [in/out] forwarded to CommitSystemHeapSpaceIPC().
+ * lock              forwarded to CommitSystemHeapSpaceIPC().
+ *
+ * Returns ErrorCode::Success, ErrorCode::OutOfMemory or
+ * ErrorCode::SyscallFail.
+ */
+ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment,
+        int &memfd, bool lock) {
+    const gpusize va = system_heap_mgr_->Alloc(size, alignment, 0);
+    if (va == 0)
+        return ErrorCode::OutOfMemory;
+
+    if (CommitSystemHeapSpaceIPC((void*)va, size, memfd, lock)) {
+        *out_gpu_virt_addr = va;
+        return ErrorCode::Success;
+    }
+
+    /* Commit failed: hand the range back and report no address. */
+    system_heap_mgr_->Free(va);
+    *out_gpu_virt_addr = 0;
+    return ErrorCode::SyscallFail;
+}
+
+/*
+ * Undo ReserveIPCSysMem(): unmap the range and close the memfd, then return
+ * the range to the system heap manager.
+ *
+ * Always returns ErrorCode::Success; the result of
+ * DecommitSystemHeapSpaceIPC() is not checked.
+ */
+ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
+    auto code = ErrorCode::Success;
+
+    DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd);
+
+    system_heap_mgr_->Free(gpu_addr);
+    return code;
+}
+
+/*
+ * Choose the handle aperture: a window of 2^47 bytes, starting at
+ * START_NON_CANONICAL_ADDR, that overlaps neither the private nor the shared
+ * aperture of any device.
+ *
+ * The candidate start is advanced in steps of 2^47. After every step all
+ * devices are checked again from the first one, and the search stops once
+ * the start reaches END_NON_CANONICAL_ADDR - 1.
+ *
+ * Returns true when a window was found. Returns false when a device cannot
+ * be looked up, or when no window fits (handle_aperture_start_ is then 0).
+ */
+bool hsakmtRuntime::InitHandleApertureSpace() {
+    const size_t num_adapters = get_num_wddmdev();
+    handle_aperture_start_ = START_NON_CANONICAL_ADDR;
+    handle_aperture_size_ = 1ULL << 47;
+
+    while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) {
+        bool overlaps = false;
+
+        for (uint32_t j = 0; j < num_adapters; j++) {
+            wsl::thunk::WDDMDevice* device = get_wddmdev(j+1);
+            /* Was `return -1`, which converts to true in a bool function. */
+            if (device == nullptr)
+                return false;
+
+            if (device->PrivateApertureBase() &&
+                    IS_OVERLAPPING(device->PrivateApertureBase(),
+                        device->PrivateApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_)) {
+                overlaps = true;
+                break;
+            }
+
+            if (device->SharedApertureBase() &&
+                    IS_OVERLAPPING(device->SharedApertureBase(),
+                        device->SharedApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_)) {
+                overlaps = true;
+                break;
+            }
+        }
+
+        if (!overlaps) {
+            pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_);
+            return true;
+        }
+
+        /* Try the next window; the while condition re-checks the bound and
+         * the for loop re-checks every device against the new window. */
+        handle_aperture_start_ += (1ULL << 47);
+    }
+
+    handle_aperture_start_ = 0;
+    pr_err("fail\n");
+
+    return false;
+}
+
+/* Create the VA manager for the handle aperture from the current start/size
+ * members, constructed with DEFAULT_GPU_PAGE_SIZE as its third argument. */
+void hsakmtRuntime::InitHandleApertureMgr() {
+  handle_aperture_mgr_ = std::make_unique<wsl::thunk::VaMgr>(handle_aperture_start_,
+                                                 handle_aperture_size_,
+                                                 DEFAULT_GPU_PAGE_SIZE);
+}
+
+/*
+ * Allocate `size` bytes from the handle aperture.
+ *
+ * The alignment is DEFAULT_GPU_PAGE_SIZE, or GPU_HUGE_PAGE_SIZE for requests
+ * of at least GPU_HUGE_PAGE_SIZE. The manager's result is stored in
+ * *out_gpu_virt_addr; a result of 0 is reported as
+ * ErrorCode::OutOfHandleApeMemory, anything else as ErrorCode::Success.
+ */
+ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
+    uint64_t alloc_align = DEFAULT_GPU_PAGE_SIZE;
+    if (!(size < GPU_HUGE_PAGE_SIZE)) {
+        alloc_align = GPU_HUGE_PAGE_SIZE;
+    }
+
+    *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, alloc_align);
+
+    const bool exhausted = (*out_gpu_virt_addr == 0);
+    return exhausted ? ErrorCode::OutOfHandleApeMemory : ErrorCode::Success;
+}
+
+/* Return gpu_addr to the handle aperture manager. */
+void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) {
+    handle_aperture_mgr_->Free(gpu_addr);
+}
+
+/* Report whether this process is a forked child of the process that last
+ * recorded its pid in dxg_runtime.
+ *
+ * pthread_atfork alone is not enough: a process can fork through clone or a
+ * raw system call without going through libc's fork, so the recorded pid is
+ * compared with the current one as well. Once a fork has been seen,
+ * is_forked stays set and the recorded pid is updated to the current one.
+ */
+bool is_forked_child(void) {
+  if (!dxg_runtime->is_forked) {
+    const pid_t current = getpid();
+    if (dxg_runtime->parent_pid == current)
+      return false;
+
+    /* First call in a new process: remember it for later calls. */
+    dxg_runtime->is_forked = true;
+    dxg_runtime->parent_pid = current;
+  }
+
+  return true;
+}
+
+/* Callbacks from pthread_atfork (registered in hsaKmtOpenKFD):
+ *  - prepare: take hsakmt_mutex before the fork;
+ *  - parent:  release it again in the parent after the fork;
+ *  - child:   re-initialize the mutex in the child and mark the runtime as
+ *             forked.
+ */
+static void prepare_fork_handler(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); }
+static void parent_fork_handler(void) { pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); }
+static void child_fork_handler(void) {
+  pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL);
+  dxg_runtime->is_forked = true;
+}
+
+/* Call this from the child process after fork. This will clear all
+ * data that is duplicated from the parent process, that is not valid
+ * in the child.
+ * The topology information is duplicated from the parent is valid
+ * in the child process so it is not cleared
+ *
+ * Steps: reset the sub-allocator, clear the allocation map, close the DXG
+ * file descriptor if one is open, then replace dxg_runtime with a new object.
+ *
+ * NOTE(review): hsaKmtOpenKFD calls this while holding
+ * dxg_runtime->hsakmt_mutex. That mutex lives in the object deleted here,
+ * and the caller's later unlock acts on the mutex of the new object -
+ * confirm this is safe.
+ */
+static void clear_after_fork(void) {
+  reset_suballocator();
+  clear_allocation_map();
+
+  if (dxg_runtime->dxg_fd >= 0) {
+    close(dxg_runtime->dxg_fd);
+    dxg_runtime->dxg_fd = -1;
+  }
+  delete dxg_runtime;
+  dxg_runtime = new hsakmtRuntime();
+
+}
+
+/* Cache the system page size and its shift in dxg_runtime. ffs() returns the
+ * 1-based index of the lowest set bit, hence the "- 1". */
+static inline void init_page_size(void) {
+  dxg_runtime->page_size = sysconf(_SC_PAGESIZE);
+  dxg_runtime->page_shift = ffs(dxg_runtime->page_size) - 1;
+}
+
+/* Read the optional environment variables and store their values in
+ * dxg_runtime. A variable that is not set leaves its field untouched.
+ *
+ *   HSAKMT_DEBUG_LEVEL              -> hsakmt_debug_level (atoi)
+ *   HSA_ZFB                         -> zfb_support (atoi)
+ *   WSLKMT_VENDOR_PACKET            -> vendor_packet_process (atoi)
+ *   WSL_CHECK_AVAIL_SYSRAM          -> check_avail_sysram (true only for "1")
+ *   WSL_ENABLE_THUNK_SUB_ALLOCATOR  -> enable_thunk_sub_allocator (atoi)
+ *   ROCR_VISIBLE_DEVICES            -> default_node (first number found + 1)
+ *
+ * Always returns HSAKMT_STATUS_SUCCESS.
+ *
+ * NOTE(review): std::stoi throws std::out_of_range when the number in
+ * ROCR_VISIBLE_DEVICES does not fit in an int; nothing here catches it.
+ */
+static HSAKMT_STATUS init_vars_from_env(void) {
+  char *envvar;
+
+  /* Normally libraries don't print messages. For debugging purpose, we'll
+   * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set.
+   */
+  envvar = getenv("HSAKMT_DEBUG_LEVEL");
+  if (envvar) {
+    dxg_runtime->hsakmt_debug_level = atoi(envvar);
+  }
+
+  /* Check whether to support Zero frame buffer */
+  envvar = getenv("HSA_ZFB");
+  if (envvar)
+    dxg_runtime->zfb_support = atoi(envvar);
+
+  /* Check whether to handle vendor specific aql packet */
+  envvar = getenv("WSLKMT_VENDOR_PACKET");
+  if (envvar)
+    dxg_runtime->vendor_packet_process = atoi(envvar);
+
+  /* Decide whether to check available system memory before allocation */
+  envvar = getenv("WSL_CHECK_AVAIL_SYSRAM");
+  if (envvar)
+    dxg_runtime->check_avail_sysram = !strcmp(envvar, "1");
+
+  /* Decide whether the thunk sub-allocator is enabled */
+  envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR");
+  if (envvar)
+    dxg_runtime->enable_thunk_sub_allocator = atoi(envvar);
+
+  /* The default node is the first number in the list, shifted by one. */
+  envvar = getenv("ROCR_VISIBLE_DEVICES");
+  if (envvar) {
+    std::string devices(envvar);
+    size_t first_num_pos = devices.find_first_of("0123456789");
+    if (first_num_pos != std::string::npos)
+      dxg_runtime->default_node = std::stoi(devices.substr(first_num_pos)) + 1;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Open the connection to the DXG device, or add a reference to an already
+ * open one.
+ *
+ * On the first open this reads the environment, opens the device node,
+ * initializes the dxcore loader and the HSA loader, caches the page size and
+ * installs the fork handlers once per process.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS for the first open,
+ * HSAKMT_STATUS_KERNEL_ALREADY_OPENED for later ones,
+ * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the device node cannot be
+ * opened, and HSAKMT_STATUS_ERROR when the dxcore loader fails.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) {
+  HSAKMT_STATUS result;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  /* If the process has forked, the child process must re-initialize
+   * its connection to DXG. Any references tracked by dxg_open_count
+   * belong to the parent
+   */
+  if (is_forked_child())
+    clear_after_fork();
+
+  if (dxg_runtime->dxg_open_count == 0) {
+    static bool atfork_installed = false;
+
+    result = init_vars_from_env();
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto open_failed;
+
+    if (dxg_runtime->dxg_fd < 0) {
+      int fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC);
+
+      if (fd == -1) {
+        result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+        goto open_failed;
+      }
+
+      dxg_runtime->dxg_fd = fd;
+    }
+    if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) {
+        pr_err("Failed to load libdxcore.so\n");
+        result = HSAKMT_STATUS_ERROR;
+        goto dxcore_loader_failed;
+    }
+
+    hsakmt_hsa_loader_init();
+    init_page_size();
+
+    char *useSvmStr = getenv("HSA_USE_SVM");
+    /* The trailing "&& false" keeps the SVM API disabled whatever the
+     * environment says. */
+    dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false;
+
+    dxg_runtime->dxg_open_count = 1;
+
+    if (!atfork_installed) {
+      /* Atfork handlers cannot be uninstalled and
+       * must be installed only once. Otherwise
+       * prepare will deadlock when trying to take
+       * the same lock multiple times.
+       */
+      pthread_atfork(prepare_fork_handler, parent_fork_handler,
+                     child_fork_handler);
+      atfork_installed = true;
+    }
+  } else {
+    dxg_runtime->dxg_open_count++;
+    result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
+  }
+
+  reset_suballocator();
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return result;
+dxcore_loader_failed:
+  /* Close the descriptor held by the runtime and forget it, so a later open
+   * does not reuse a closed descriptor. */
+  if (dxg_runtime->dxg_fd >= 0) {
+    close(dxg_runtime->dxg_fd);
+    dxg_runtime->dxg_fd = -1;
+  }
+open_failed:
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
+
+/* Drop one reference to the DXG connection. When the last reference goes
+ * away, the device descriptor is closed and the dxcore loader is shut down.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS when a reference was dropped and
+ * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when nothing was open.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) {
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  HSAKMT_STATUS result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+
+  if (dxg_runtime->dxg_open_count > 0) {
+    result = HSAKMT_STATUS_SUCCESS;
+
+    dxg_runtime->dxg_open_count -= 1;
+    const bool last_reference = (dxg_runtime->dxg_open_count == 0);
+    if (last_reference) {
+      close(dxg_runtime->dxg_fd);
+      dxg_runtime->dxg_fd = -1;
+      wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown();
+    }
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Stub: PC sampling is not available here. After CHECK_DXG_OPEN() it logs
+ * through pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: all parameters are ignored and no output is written. After
+ * CHECK_DXG_OPEN() it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
+                                  HSAuint32 sample_info_sz, HSAuint32 *size) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: all parameters are ignored and traceId is not written. After
+ * CHECK_DXG_OPEN() it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId,
+                                               HsaPcSamplingInfo *sample_info,
+                                               HsaPcSamplingTraceId *traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: both parameters are ignored. After CHECK_DXG_OPEN() it logs through
+ * pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId,
+                                                HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: both parameters are ignored. After CHECK_DXG_OPEN() it logs through
+ * pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId,
+                                              HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: both parameters are ignored. After CHECK_DXG_OPEN() it logs through
+ * pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId,
+                                             HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Stub: CounterProperties is not written. After CHECK_DXG_OPEN() it logs
+ * through pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
+    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Registers a set of (HW) counters to be used for tracing/profiling.
+ * Stub here: TraceRoot is not written; after CHECK_DXG_OPEN() it logs
+ * through pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
+                                               HSAuint32 NumberOfCounters,
+                                               HsaCounter *Counters,
+                                               HsaPmcTraceRoot *TraceRoot) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Unregisters a set of (HW) counters used for tracing/profiling.
+ * Stub here: after CHECK_DXG_OPEN() it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
+                                                 HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: both parameters are ignored. After CHECK_DXG_OPEN() it logs through
+ * pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stub: both parameters are ignored. After CHECK_DXG_OPEN() it logs through
+ * pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Starts tracing operation on a previously established set of performance
+ * counters.
+ * Stub here: TraceBuffer is not touched; after CHECK_DXG_OPEN() it logs
+ * through pr_warn_once and returns HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
+                                            void *TraceBuffer,
+                                            HSAuint64 TraceBufferSizeBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Forces an update of all the counters that a previously started trace
+ * operation has registered.
+ * Stub here: after CHECK_DXG_OPEN() it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stops tracing operation on a previously established set of performance
+ * counters.
+ * Stub here: after CHECK_DXG_OPEN() it logs through pr_warn_once and returns
+ * HSAKMT_STATUS_NOT_SUPPORTED. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
@@ -0,0 +1,216 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <cinttypes>
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+#include "impl/hsa/amd_hsa_signal.h"
+
+/* Return the VGPR size per CU for the given engine id: 0x60000 for GFX
+ * versions 0x1100, 0x1101, 0x1151, 0x1200 and 0x1201, and 0x40000 for every
+ * other version. */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) {
+  const uint32_t gfx_version = HSA_GET_GFX_VERSION_FULL(id.ui32);
+
+  switch (gfx_version) {
+  case 0x1100:
+  case 0x1101:
+  case 0x1151:
+  case 0x1200:
+  case 0x1201:
+    return 0x60000;
+  default:
+    return 0x40000;
+  }
+}
+
+/* Create a queue without choosing an SDMA engine: forwards to
+ * hsaKmtCreateQueueExt() with SDMA engine id 0.
+ *
+ * HSA_QUEUE_SDMA_BY_ENG_ID needs an engine id and is rejected here with
+ * HSAKMT_STATUS_ERROR. Every other type returns whatever
+ * hsaKmtCreateQueueExt() returns.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
+                                          HSA_QUEUE_TYPE Type,
+                                          HSAuint32 QueuePercentage,
+                                          HSA_QUEUE_PRIORITY Priority,
+                                          void *QueueAddress,
+                                          HSAuint64 QueueSizeInBytes,
+                                          HsaEvent *Event,
+                                          HsaQueueResource *QueueResource)
+{
+  const bool needs_engine_id = (Type == HSA_QUEUE_SDMA_BY_ENG_ID);
+  if (needs_engine_id) {
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority,
+                              /* SdmaEngineId */ 0,
+                              QueueAddress, QueueSizeInBytes,
+                              Event, QueueResource);
+}
+
+/* Create a compute (AQL) or SDMA queue on node NodeId.
+ *
+ * NodeId           node whose device hosts the queue.
+ * Type             HSA_QUEUE_COMPUTE_AQL, HSA_QUEUE_SDMA or
+ *                  HSA_QUEUE_SDMA_BY_ENG_ID; other types are rejected.
+ * QueuePercentage  not used here.
+ * Priority         must lie within HSA_QUEUE_PRIORITY_MINIMUM..MAXIMUM.
+ * SdmaEngineId     only logged; engine 0 is used for SDMA queues (see TODO).
+ * QueueAddress     ring buffer; acquired through queue_acquire_buffer().
+ * QueueSizeInBytes size of the ring buffer.
+ * Event            expected to be nullptr (asserted).
+ * QueueResource    [in/out] receives the queue id and doorbell pointer; for
+ *                  AQL queues the read/write pointers and ErrorReason are
+ *                  read from it, for SDMA queues the ring pointers are
+ *                  written to it.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_INVALID_PARAMETER for a
+ * null QueueResource, an out-of-range priority, an unknown node, a buffer
+ * that cannot be acquired, or an unsupported queue type.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
+                                             HSA_QUEUE_TYPE Type,
+                                             HSAuint32 QueuePercentage,
+                                             HSA_QUEUE_PRIORITY Priority,
+                                             HSAuint32 SdmaEngineId,
+                                             void *QueueAddress,
+                                             HSAuint64 QueueSizeInBytes,
+                                             HsaEvent *Event,
+                                             HsaQueueResource *QueueResource) {
+  CHECK_DXG_OPEN();
+  assert(Event == nullptr);
+
+  if (QueueResource == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Check at run time: an assert alone is compiled out with NDEBUG and the
+   * device pointer is dereferenced below. */
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  if (device_ == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (queue_acquire_buffer(QueueAddress) == false)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  switch (Type) {
+  case HSA_QUEUE_COMPUTE_AQL: {
+    assert(QueueResource->ErrorReason == nullptr);
+    /* QueueSizeInBytes / 64 is the packet count handed to the queue. */
+    uint64_t pkg_num = QueueSizeInBytes / 64;
+    uint32_t cmdbuf_size = device_->GetCmdbufSize();
+    uint32_t queue_engine = device_->GetComputeEngine();
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::ComputeQueue(
+        device_, QueueAddress, pkg_num,
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_write_ptr_aql),
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_read_ptr_aql),
+        QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);
+
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    // for doorbell_signal.hardware_doorbell_ptr
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+  } break;
+  case HSA_QUEUE_SDMA:
+  case HSA_QUEUE_SDMA_BY_ENG_ID: {
+    pr_debug("create sdma queue in engine %d\n", SdmaEngineId);
+    uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::SDMAQueue(
+        device_, QueueAddress, QueueSizeInBytes,
+        queue_engine, use_hws);
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+    QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr();
+    QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr();
+  } break;
+  default:
+    /* Unsupported type: no queue was created, so give the buffer back and
+     * report the failure instead of returning success with a null id. */
+    queue_release_buffer(QueueAddress);
+    QueueResource->QueueId = 0;
+    QueueResource->Queue_DoorBell = nullptr;
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validate an update request. Only the priority range and a non-null
+ * QueueId are checked; nothing is changed on the queue itself.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for an out-of-range priority or a
+ * null QueueId, otherwise HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
+    HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
+    void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Destroy the queue identified by QueueId and release its ring buffer.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueId, otherwise
+ * HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+
+  /* Check before the first use; the buffer address was previously read
+   * through queue_ ahead of this test. */
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Read the buffer address before the queue object is deleted. */
+  void *QueueAddress = queue_->GetHsaQueueAddr();
+
+  delete queue_;
+  queue_release_buffer(QueueAddress);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Validate a CU mask request; the mask itself is not applied (the function
+ * logs "not implemented" through pr_warn_once).
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueId, a zero
+ * CUMaskCount, a null QueueCUMask, or a CUMaskCount that is not a multiple
+ * of 32; otherwise HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
+                                             HSAuint32 CUMaskCount,
+                                             HSAuint32 *QueueCUMask) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0))
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_warn_once("not implemented\n");
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Zero-fill *QueueInfo; no real queue information is provided.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueInfo, otherwise
+ * HSAKMT_STATUS_SUCCESS.
+ * NOTE(review): assert(false) makes assert-enabled builds stop here, while
+ * other builds report success with an all-zero structure - confirm intent.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
+                                           HsaQueueInfo *QueueInfo) {
+  CHECK_DXG_OPEN();
+
+  if (QueueInfo == NULL)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  memset(QueueInfo, 0, sizeof(*QueueInfo));
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Stub: all parameters are ignored and no trap handler is installed. After
+ * CHECK_DXG_OPEN() it logs "not implemented" through pr_warn_once and
+ * returns HSAKMT_STATUS_SUCCESS. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
+                                             void *TrapHandlerBaseAddress,
+                                             HSAuint64 TrapHandlerSizeInBytes,
+                                             void *TrapBufferBaseAddress,
+                                             HSAuint64 TrapBufferSizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n");
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Placeholder for GWS allocation: nothing is allocated and firstGWS is not
+ * written.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueId, otherwise
+ * HSAKMT_STATUS_SUCCESS.
+ * NOTE(review): assert(false) makes assert-enabled builds stop here, while
+ * other builds report success - confirm intent.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
+                                            HSAuint32 *firstGWS) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  assert(false);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Ring the doorbell of the queue identified by QueueId.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null QueueId, otherwise
+ * HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  queue_->RingDoorbell();
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+// SPM acquire stub: PreferredNode is ignored, a "not supported" warning is
+// logged and HSAKMT_STATUS_NOT_SUPPORTED is returned.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+// SPM destination-buffer stub: no argument is read and none of the output
+// pointers (timeout, SizeCopied, isSPMDataLoss) is written. Logs a
+// "not supported" warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(
+    HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout,
+    HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+// SPM release stub: PreferredNode is ignored, a "not supported" warning is
+// logged and HSAKMT_STATUS_NOT_SUPPORTED is returned.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Helper functions for calling KFD SVM ioctl */
+
+// SVM set-attribute stub: the range and attribute list are ignored; logs a
+// "not supported" warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+// SVM get-attribute stub: `attrs` is never written; logs a "not supported"
+// warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+// XNACK mode cannot be changed here: `enable` is ignored, a "not supported"
+// warning is logged and HSAKMT_STATUS_NOT_SUPPORTED is returned.
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+// Reports the XNACK mode, which this implementation always reports as
+// disabled.
+//
+// @param enable  Output; set to false (0) on success. Must not be null.
+// @return HSAKMT_STATUS_INVALID_PARAMETER when `enable` is null, otherwise
+//         HSAKMT_STATUS_SUCCESS.
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) {
+  CHECK_DXG_OPEN();
+
+  // Reject a null output pointer instead of dereferencing it below.
+  if (enable == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_warn_once("not supported\n");
+  *enable = false;
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <iostream>
+#include <ctime>
+#include <cstring>
+#include <cassert>
+#include "impl/wddm/device.h"
+
+// Fills `Counters` with the GPU/CPU clock counters of node `NodeId` and a
+// system timestamp.
+//
+// The GPU and CPU counters come from WDDMDevice::GetClockCounters(). The
+// system counter is CLOCK_MONOTONIC_RAW expressed in nanoseconds, so the
+// reported system clock frequency is 1 GHz. If clock_gettime() fails the
+// system counter stays zero.
+//
+// @param NodeId    Node whose device is looked up through get_wddmdev().
+// @param Counters  Output structure; zero-filled before being populated.
+// @return HSAKMT_STATUS_INVALID_PARAMETER when `Counters` is null or no
+//         device is found for `NodeId`, otherwise HSAKMT_STATUS_SUCCESS.
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
+                                               HsaClockCounters *Counters) {
+  constexpr HSAuint64 kNanosecondsPerSecond = 1000000000ULL;
+
+  CHECK_DXG_OPEN();
+
+  if (Counters == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  std::memset(Counters, 0, sizeof(*Counters));
+
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  assert(device_);
+  // The assert disappears under NDEBUG, so keep a runtime guard as well.
+  if (device_ == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter);
+
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0) {
+    // Pure integer arithmetic: going through double (tv_sec * 1e9) starts
+    // dropping low-order nanoseconds once the value exceeds 2^53.
+    Counters->SystemClockCounter =
+        static_cast<HSAuint64>(ts.tv_sec) * kNanosecondsPerSecond +
+        static_cast<HSAuint64>(ts.tv_nsec);
+  }
+  Counters->SystemClockFrequencyHz = kNanosecondsPerSecond;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,519 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+  Helpers to use native types with C++11 atomic operations.
+  Fixes GCC builtin functionality for x86 with respect to WC and non-temporal
+  stores.
+*/
+#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+
+#include <atomic>
+#include "utils.h"
+
+//ALWAYS_CONSERVATIVE will very likely overfence your code.
+//For use as a debugging aid only.
+#define ALWAYS_CONSERVATIVE 0
+
+#if !ALWAYS_CONSERVATIVE
+#if defined(__x86_64__) || defined(_M_X64)
+#define X64_ORDER_WC 1
+#endif
+#if X64_ORDER_WC
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace wsl {
+namespace atomic {
+
+/// @brief: Map a C++11 memory order to the __ATOMIC_* constant passed to the
+/// compiler's __atomic builtins.
+/// When ALWAYS_CONSERVATIVE or X64_ORDER_WC is active the result is always
+/// __ATOMIC_RELAXED; the helpers in this file bracket each builtin with
+/// PreFence()/PostFence() instead. Otherwise the order is translated
+/// one-to-one, with any unrecognised value mapped to __ATOMIC_SEQ_CST.
+/// @param: order(Input), requested memory order.
+/// @return: int, the corresponding __ATOMIC_* constant.
+static constexpr int c11ToBuiltInFlags(std::memory_order order)
+{
+#if ALWAYS_CONSERVATIVE
+  return __ATOMIC_RELAXED;
+#elif X64_ORDER_WC
+  return __ATOMIC_RELAXED;
+#else
+  return (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED :
+    (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE :
+    (order == std::memory_order_release) ? __ATOMIC_RELEASE :
+    (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST :
+    (order == std::memory_order_consume) ? __ATOMIC_CONSUME :
+    (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL :
+    __ATOMIC_SEQ_CST;
+#endif
+}
+
+/// @brief: Fence issued immediately before an atomic operation.
+/// For release, seq_cst and acq_rel orders this emits a seq_cst thread fence
+/// (ALWAYS_CONSERVATIVE) or _mm_sfence() (X64_ORDER_WC). Every other order,
+/// and every order when neither macro is active, emits nothing.
+/// @param: order(Input), memory order requested for the operation.
+/// @return: void.
+static __forceinline void PreFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  switch (order) {
+    case std::memory_order_release:
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      __atomic_thread_fence(__ATOMIC_SEQ_CST);
+    // Falls through into the empty default; no break is needed.
+    default:;
+  }
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_release:
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      _mm_sfence();
+    // Falls through into the empty default; no break is needed.
+    default:;
+  }
+#endif
+}
+
+/// @brief: Fence issued immediately after an atomic operation.
+/// ALWAYS_CONSERVATIVE: seq_cst, acq_rel and acquire emit a seq_cst thread
+/// fence. X64_ORDER_WC: seq_cst emits _mm_mfence(), acq_rel and acquire emit
+/// _mm_lfence(). Every other order, and every order when neither macro is
+/// active, emits nothing.
+/// @param: order(Input), memory order requested for the operation.
+/// @return: void.
+static __forceinline void PostFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  switch (order) {
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+    case std::memory_order_acquire:
+      __atomic_thread_fence(__ATOMIC_SEQ_CST);
+    // Falls through into the empty default; no break is needed.
+    default:;
+  }
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_seq_cst:
+      return _mm_mfence();
+    case std::memory_order_acq_rel:
+    case std::memory_order_acquire:
+      return _mm_lfence();
+    default:;
+  }
+#endif
+}
+
+/// @brief: Stand-alone memory fence.
+/// ALWAYS_CONSERVATIVE: always a seq_cst thread fence, whatever the order.
+/// X64_ORDER_WC: seq_cst and acq_rel emit _mm_mfence(), acquire emits
+/// _mm_lfence(), release emits _mm_sfence(); other orders emit nothing.
+/// Otherwise: forwards to std::atomic_thread_fence(order).
+/// @param: order(Input), memory order of the fence, seq_cst by default.
+/// @return: void.
+static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) {
+#if ALWAYS_CONSERVATIVE
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif X64_ORDER_WC
+  switch (order) {
+    case std::memory_order_seq_cst:
+    case std::memory_order_acq_rel:
+      return _mm_mfence();
+    case std::memory_order_acquire:
+      return _mm_lfence();
+    case std::memory_order_release:
+      return _mm_sfence();
+    default:;
+  }
+#else
+  std::atomic_thread_fence(order);
+#endif
+}
+
+/// @brief: Compile-time check that objects of sizeof(T) are always lock-free
+/// for the __atomic builtins; instantiation fails with a static_assert
+/// otherwise. Nothing happens at run time.
+/// @param: ptr(Input), used only for overload selection; never dereferenced.
+/// @return: void.
+template <class T>
+static __forceinline void BasicCheck(const T* ptr) {
+  constexpr bool value = __atomic_always_lock_free(sizeof(T), 0);
+  static_assert(value, "Atomic type may not be compatible with peripheral atomics.");
+};
+
+/// @brief: Overload of the lock-free size check for pointers to volatile T.
+/// Fails to compile (static_assert) when sizeof(T) is not always lock-free.
+/// @param: ptr(Input), used only for overload selection; never dereferenced.
+/// @return: void.
+template <class T>
+static __forceinline void BasicCheck(const volatile T* ptr) {
+  constexpr bool value = __atomic_always_lock_free(sizeof(T), 0);
+  static_assert(value, "Atomic type may not be compatible with peripheral atomics.");
+};
+
+/// @brief: Load value of type T atomically with specified memory order.
+/// The builtin load is bracketed by PreFence(order) and PostFence(order).
+/// @param: ptr(Input), a pointer to type T.
+/// @param: order(Input), memory order with atomic load, relaxed by default.
+/// @return: T, loaded value.
+template <class T>
+static __forceinline T
+    Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_load(ptr, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Load value of type T atomically through a pointer to volatile;
+/// otherwise identical to the non-volatile overload.
+/// @param: ptr(Input), a pointer to volatile type T.
+/// @param: order(Input), memory order with atomic load, relaxed by default.
+/// @return: T, loaded value.
+template <class T>
+static __forceinline T
+    Load(const volatile T* ptr,
+         std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_load(ptr, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Store value of type T atomically with specified memory order.
+/// The builtin store is bracketed by PreFence(order) and PostFence(order).
+/// @param: ptr(Input), a pointer to the object that is written.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order with atomic store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Store value of type T atomically through a pointer to volatile;
+/// otherwise identical to the non-volatile overload.
+/// @param: ptr(Input), a pointer to the volatile object that is written.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order with atomic store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void Store(
+    volatile T* ptr, T val,
+    std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Compare and swap value atomically with specified memory order.
+/// Uses the strong form of the builtin (weak == false); the failure memory
+/// order is always __ATOMIC_RELAXED.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be stored if condition is satisfied.
+/// @param: expected(Input), value which is expected.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed at *ptr: equal to the caller's expected
+/// value when the swap happened, otherwise the differing current value.
+template <class T>
+static __forceinline T
+    Cas(T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  // On failure the builtin overwrites `expected` with the current value.
+  __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return expected;
+}
+
+/// @brief: Compare and swap through a pointer to volatile; otherwise
+/// identical to the non-volatile overload (strong form, relaxed on failure).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be stored if condition is satisfied.
+/// @param: expected(Input), value which is expected.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed at *ptr: equal to the caller's expected
+/// value when the swap happened, otherwise the differing current value.
+template <class T>
+static __forceinline T
+    Cas(volatile T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  // On failure the builtin overwrites `expected` with the current value.
+  __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+  return expected;
+}
+
+/// @brief: Exchange the value atomically with specified memory order.
+/// The builtin exchange is bracketed by PreFence(order) and PostFence(order).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Exchange the value atomically through a pointer to volatile;
+/// otherwise identical to the non-volatile overload.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be stored.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value prior to the exchange.
+template <class T>
+static __forceinline T
+    Exchange(volatile T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T ret;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Add value to variable atomically with specified memory order
+/// (fetch-and-add).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Subtract value from the variable atomically with specified memory
+/// order (fetch-and-subtract).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise AND on variable atomically with specified memory order
+/// (fetch-and-and).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is ANDed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise OR on variable atomically with specified memory order
+/// (fetch-and-or).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is ORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise XOR on variable atomically with specified memory order
+/// (fetch-and-xor).
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: val(Input), value which is XORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Increase the value of variable by one atomically with specified
+/// memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Decrease the value of the variable by one atomically with
+/// specified memory order.
+/// @param: ptr(Input), a pointer to variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Add value to a volatile variable atomically with specified memory
+/// order (fetch-and-add).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be added.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, the value of the variable prior to the addition.
+template <class T>
+static __forceinline T
+    Add(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Subtract value from a volatile variable atomically with specified
+/// memory order (fetch-and-subtract).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value to be subtracted.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of the variable prior to the subtraction.
+template <class T>
+static __forceinline T
+    Sub(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise AND on a volatile variable atomically with specified
+/// memory order (fetch-and-and).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ANDed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    And(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise OR on a volatile variable atomically with specified memory
+/// order (fetch-and-or).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is ORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T Or(volatile T* ptr, T val,
+                          std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Bitwise XOR on a volatile variable atomically with specified
+/// memory order (fetch-and-xor).
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: val(Input), value which is XORed with variable.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Xor(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Increase the value of a volatile variable by one atomically with
+/// specified memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Increment(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+
+/// @brief: Decrease the value of a volatile variable by one atomically with
+/// specified memory order.
+/// @param: ptr(Input), a pointer to volatile variable which is operated on.
+/// @param: order(Input), memory order which is relaxed by default.
+/// @return: T, value of variable prior to the operation.
+template <class T>
+static __forceinline T
+    Decrement(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return ret;
+}
+}   //  namespace atomic
+}   //  namespace wsl
+
+#ifdef X64_ORDER_WC
+#undef X64_ORDER_WC
+#endif
+
+#ifdef ALWAYS_CONSERVATIVE
+#undef ALWAYS_CONSERVATIVE
+#endif
+
+#endif  // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
@@ -0,0 +1,155 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+
+#include <memory>
+#include <utility>
+#include <functional>
+
+#include "core/util/locks.h"
+#include "core/util/utils.h"
+
+namespace wsl {
+
+/*
+ * Wrapper for a std::unique_ptr that initializes its object at first use.
+ *
+ * A lazy_ptr holds an optional factory (`func`). The first dereference, or a
+ * call to touch(), runs the factory under `lock`, stores the result in `obj`
+ * and clears the factory. A cleared factory is what created() reports, so a
+ * default-constructed lazy_ptr counts as "created" even though it is empty.
+ *
+ * Movable but not copyable. The move operations transfer only `obj` and
+ * `func`; they do not take either object's lock.
+ */
+template <typename T> class lazy_ptr {
+ public:
+  // Creates an empty lazy_ptr with no factory.
+  lazy_ptr() {}
+
+  // Creates a lazy_ptr whose object is produced by Constructor on first use.
+  explicit lazy_ptr(std::function<T*()> Constructor) { reset(Constructor); }
+
+  lazy_ptr(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+  }
+
+  // Move assignment. Returns *this: flowing off the end of a function with a
+  // non-void return type is undefined behaviour.
+  lazy_ptr& operator=(lazy_ptr&& rhs) {
+    if (this != &rhs) {
+      obj = std::move(rhs.obj);
+      func = std::move(rhs.func);
+    }
+    return *this;
+  }
+
+  lazy_ptr(lazy_ptr&) = delete;
+  lazy_ptr& operator=(lazy_ptr&) = delete;
+
+  // Destroys any held object and installs a new factory (or none).
+  void reset(std::function<T*()> Constructor = nullptr) {
+    obj.reset();
+    func = Constructor;
+  }
+
+  // Takes ownership of ptr directly and drops any pending factory.
+  void reset(T* ptr) {
+    obj.reset(ptr);
+    func = nullptr;
+  }
+
+  // Pointer comparisons look at the current object only; they never trigger
+  // construction.
+  bool operator==(T* rhs) const { return obj.get() == rhs; }
+  bool operator!=(T* rhs) const { return obj.get() != rhs; }
+
+  // Constructs the object if needed (blocking on the lock) and asserts that
+  // the result is non-null.
+  const std::unique_ptr<T>& operator->() const {
+    make(true);
+    assert(obj != nullptr && "Null dereference through lazy_ptr.");
+    return obj;
+  }
+
+  // Constructs the object if needed and returns the owning unique_ptr, which
+  // may still be null if the factory returned nullptr or none was set.
+  std::unique_ptr<T>& operator*() {
+    make(true);
+    return obj;
+  }
+
+  const std::unique_ptr<T>& operator*() const {
+    make(true);
+    return obj;
+  }
+
+  /*
+   * Ensures that the object is created or is being created.
+   * This is useful when early construction of the object is required.
+   * Does not block: if the lock cannot be taken immediately it returns.
+   */
+  void touch() const { make(false); }
+
+  // Tells if the lazy object has been constructed or not.
+  // Construction may fail silently (return nullptr).
+  bool created() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return func == nullptr;
+  }
+
+  // Tells if the lazy object exists or not.
+  bool empty() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return obj == nullptr;
+  }
+
+ private:
+  mutable std::unique_ptr<T> obj;
+  mutable std::function<T*(void)> func;
+  mutable KernelMutex lock;
+
+  // Separated from make to improve inlining.
+  // Takes the lock (blocking, or a single try when !block), re-checks the
+  // factory under the lock, runs it, publishes the object and only then
+  // clears the factory so created() flips after obj is set.
+  void make_body(bool block) const {
+    if (block) {
+      lock.Acquire();
+    } else if (!lock.Try()) {
+      return;
+    }
+    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    if (func == nullptr) return;
+    T* ptr = func();
+    obj.reset(ptr);
+    std::atomic_thread_fence(std::memory_order_release);
+    func = nullptr;
+  }
+
+  // Fast path: skip the lock entirely once the factory has been consumed.
+  __forceinline void make(bool block) const {
+    if (!created()) {
+      make_body(block);
+    }
+  }
+
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
@@ -0,0 +1,769 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __linux__
+#include "core/util/os.h"
+#include "core/util/utils.h"
+
+#include <link.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sched.h>
+#include <sys/sysinfo.h>
+#include <sys/time.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <errno.h>
+#include <cstring>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+#include <semaphore.h>
+#include "core/inc/runtime.h"
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
+namespace wsl {
+namespace os {
+
+// Bundle handed to ThreadTrampoline through pthread_create: the user's entry
+// point and the argument to call it with. ThreadTrampoline deletes it.
+struct ThreadArgs {
+  void* entry_args;            // Argument forwarded to entry_function.
+  ThreadEntry entry_function;  // User entry point run on the new thread.
+};
+
+// pthread start routine. Copies the entry point and its argument out of the
+// heap-allocated ThreadArgs, frees the ThreadArgs, then runs the entry point.
+// Always returns nullptr.
+void* __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* packed = static_cast<ThreadArgs*>(arg);
+  void* const user_data = packed->entry_args;
+  const ThreadEntry user_entry = packed->entry_function;
+  delete packed;
+  user_entry(user_data);
+  return nullptr;
+}
+
+// Thread container allows multiple waits and separate close (destroy).
+class os_thread {
+ public:
+  // Creates and starts a thread that runs function(threadArgument).
+  //
+  // stackSize == 0 keeps the default stack size. Otherwise the size is raised
+  // to at least PTHREAD_STACK_MIN and rounded up to a multiple of 4096; if
+  // pthread_create still rejects it with EINVAL the size is doubled (up to
+  // 20 MiB) and creation is retried.
+  //
+  // On any failure the object is left with thread == 0, so Valid() is false.
+  explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize)
+      : thread(0), lock(nullptr), state(RUNNING) {
+    int err;
+    std::unique_ptr<ThreadArgs> args(new ThreadArgs);
+    lock = CreateMutex();
+    if (lock == nullptr) return;
+
+    args->entry_args = threadArgument;
+    args->entry_function = function;
+
+    pthread_attr_t attrib;
+    err = pthread_attr_init(&attrib);
+    if (err != 0) {
+      pr_err("pthread_attr_init failed: %s\n", strerror(err));
+      return;
+    }
+
+    // attrib is initialized from here on. Launch() never destroys it, so it is
+    // destroyed exactly once below no matter how the launch went.
+    err = Launch(&attrib, args.get(), stackSize);
+
+    if (err == 0)
+      args.release();  // The new thread owns (and deletes) the arguments.
+    else
+      thread = 0;  // pthread_create leaves *thread undefined on failure.
+
+    err = pthread_attr_destroy(&attrib);
+    if (err != 0) {
+      pr_err("pthread_attr_destroy failed: %s\n", strerror(err));
+    }
+  }
+
+  // Transfers the thread handle, lock and state; rhs is left without a thread
+  // or lock so its destructor does nothing.
+  os_thread(os_thread&& rhs) {
+    thread = rhs.thread;
+    lock = rhs.lock;
+    state = int(rhs.state);
+    rhs.thread = 0;
+    rhs.lock = nullptr;
+  }
+
+  os_thread(os_thread&) = delete;
+
+  // Destroys the lock and detaches the thread if it was never joined.
+  ~os_thread() {
+    if (lock != nullptr) DestroyMutex(lock);
+    if ((state == RUNNING) && (thread != 0)) {
+      int err = pthread_detach(thread);
+      if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err));
+    }
+  }
+
+  // True when both the lock and the thread were created successfully.
+  bool Valid() { return (lock != nullptr) && (thread != 0); }
+
+  // Joins the thread. May be called several times: once a join has succeeded
+  // later calls return true immediately. Returns false if pthread_join fails.
+  bool Wait() {
+    if (state == FINISHED) return true;
+    AcquireMutex(lock);
+    if (state == FINISHED) {
+      ReleaseMutex(lock);
+      return true;
+    }
+    int err = pthread_join(thread, NULL);
+    bool success = (err == 0);
+    if (success) state = FINISHED;
+    ReleaseMutex(lock);
+    return success;
+  }
+
+ private:
+  // Applies the stack size and CPU affinity settings to attrib and starts the
+  // thread. Returns 0 on success or a non-zero error code. attrib stays
+  // initialized in every case; the caller is responsible for destroying it.
+  int Launch(pthread_attr_t* attrib, ThreadArgs* args, uint stackSize) {
+    int err;
+
+    if (stackSize != 0) {
+      stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize);
+      stackSize = AlignUp(stackSize, 4096);
+      err = pthread_attr_setstacksize(attrib, stackSize);
+      if (err != 0) {
+        pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+        return err;
+      }
+    }
+
+    if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) {
+      // Allow the new thread to run on every configured CPU.
+      const int cores = get_nprocs_conf();
+      cpu_set_t* cpuset = CPU_ALLOC(cores);
+      if (cpuset == nullptr) {
+        pr_err("CPU_ALLOC failed: %s\n", strerror(errno));
+        return ENOMEM;
+      }
+      const size_t cpuset_size = CPU_ALLOC_SIZE(cores);
+      CPU_ZERO_S(cpuset_size, cpuset);
+      for (int i = 0; i < cores; i++) {
+        CPU_SET_S(i, cpuset_size, cpuset);
+      }
+      err = pthread_attr_setaffinity_np(attrib, cpuset_size, cpuset);
+      CPU_FREE(cpuset);
+      if (err != 0) {
+        pr_err("pthread_setaffinity_np failed: %s\n", strerror(err));
+        return err;
+      }
+    }
+
+    err = pthread_create(&thread, attrib, ThreadTrampoline, args);
+
+    // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN
+    // Attempt to grow the stack within reason.
+    if ((err == EINVAL) && stackSize != 0) {
+      while (stackSize < 20 * 1024 * 1024) {
+        stackSize *= 2;
+        err = pthread_attr_setstacksize(attrib, stackSize);
+        if (err != 0) {
+          pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+          return err;
+        }
+        err = pthread_create(&thread, attrib, ThreadTrampoline, args);
+        if (err != EINVAL) break;
+        pr_debug("pthread_create returned EINVAL, doubling stack size\n");
+      }
+    }
+
+    return err;
+  }
+
+  pthread_t thread;
+  Mutex lock;
+  std::atomic<int> state;
+  enum { FINISHED = 0, RUNNING = 1 };
+};
+
+// The functions in this file convert between the opaque handle types and the
+// native pointer types by reinterpreting their storage, so each pair must
+// have the same size.
+static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch");
+static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch");
+
+// Opens the shared library `filename` with dlopen(RTLD_LAZY). When dlopen
+// fails the dlerror() text is logged and the null result is returned.
+LibHandle LoadLib(std::string filename) {
+  void* handle = dlopen(filename.c_str(), RTLD_LAZY);
+  if (handle == nullptr) {
+    pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror());
+  }
+  return *(LibHandle*)&handle;
+}
+
+// Looks up `export_name` in `lib` and returns its address, or null when the
+// symbol is missing, is provided by a different library, or the dl* queries
+// fail.
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  void* const native = *(void**)&lib;
+  void* symbol = dlsym(native, export_name.c_str());
+  if (symbol == NULL) return symbol;
+
+  // dlsym searches the given library and all the library's load dependencies.
+  // The checks below limit symbol lookup to only the library handle given:
+  // the file that defines the symbol must be the file behind `lib`.
+  // This lookup pattern matches Windows.
+  link_map* lib_map;
+  if (dlinfo(native, RTLD_DI_LINKMAP, &lib_map) == -1) {
+    pr_err("dlinfo failed: %s\n", dlerror());
+    return nullptr;
+  }
+
+  Dl_info owner;
+  if (dladdr(symbol, &owner) == 0) {
+    pr_err("dladdr failed.\n");
+    return nullptr;
+  }
+
+  const bool defined_in_lib = (strcmp(owner.dli_fname, lib_map->l_name) == 0);
+  return defined_in_lib ? symbol : nullptr;
+}
+
+// Closes a library handle obtained from LoadLib. The dlclose result is ignored.
+void CloseLib(LibHandle lib) {
+  void* native = *(void**)&lib;
+  dlclose(native);
+}
+
+/*
+ * @brief dl_iterate_phdr() callback. Scans the dynamic string table of one
+ * loaded shared library for the string "HSA_AMD_TOOL_PRIORITY" and, if it is
+ * present, records the library's name.
+ *
+ * @param[in] info A dl_phdr_info struct pointer, which contains information
+ * about the library's load address, program headers, and name.
+ *
+ * @param[in] size Size of the dl_phdr_info struct (unused).
+ *
+ * @param[in,out] data The data argument given to dl_iterate_phdr(); must
+ * point to a std::vector<std::string> that receives matching library names.
+ *
+ * @retval Always 0. A non-zero value would make dl_iterate_phdr() stop
+ * processing, even if there are unprocessed shared objects.
+ */
+
+static int callback(struct dl_phdr_info* info, size_t size, void* data) {
+  auto* tool_libs = static_cast<std::vector<std::string>*>(data);
+  assert(tool_libs != nullptr);
+
+  // Entries without a name are skipped.
+  if (!info || info->dlpi_name[0] == '\0') return 0;
+
+  /*
+   * The vDSO is a special shared object file that is built into the Linux kernel.
+   * It is not a regular shared library and thus does not have all the properties
+   * of regular shared libraries. The way the vDSO is loaded and organized in memory
+   * is different from regular shared libraries and it's not guaranteed that it
+   * will have a specific segment or section. Hence it is skipped.
+   */
+  if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0;
+
+  for (int ph = 0; ph < info->dlpi_phnum; ph++) {
+    // Only the PT_DYNAMIC program header leads to the dynamic section.
+    if (info->dlpi_phdr[ph].p_type != PT_DYNAMIC) continue;
+
+    // dlpi_addr + p_vaddr is the address of the dynamic section in memory.
+    Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[ph].p_vaddr);
+
+    char* cursor = nullptr;
+    Elf64_Xword table_bytes = 0;
+
+    // Walk the _DYNAMIC array up to its DT_NULL terminator, picking up
+    // DT_STRTAB (address of string table) and DT_STRSZ (size of string table).
+    for (Elf64_Dyn* entry = dyn_section; entry->d_tag != DT_NULL; ++entry) {
+      if (entry->d_tag == DT_STRTAB) cursor = (char*)(entry->d_un.d_ptr);
+      if (entry->d_tag == DT_STRSZ) table_bytes = entry->d_un.d_val;
+    }
+
+    if (cursor == nullptr) {
+      pr_debug("String table not found\n");
+      continue;
+    }
+
+    // Hacky lookup: step through the NUL-separated strings of the table and
+    // record the library as soon as one equals "HSA_AMD_TOOL_PRIORITY".
+    for (char* const end = cursor + table_bytes; cursor < end; cursor += strlen(cursor) + 1) {
+      if (strcmp(cursor, "HSA_AMD_TOOL_PRIORITY") == 0) {
+        tool_libs->push_back(info->dlpi_name);
+        return 0;
+      }
+    }
+  }
+  return 0;
+}
+
+// Returns a handle for every loaded shared library that `callback` flagged
+// (its string table contains "HSA_AMD_TOOL_PRIORITY"). Each flagged library is
+// opened again with LoadLib; the result of each LoadLib call is stored as-is.
+std::vector<LibHandle> GetLoadedToolsLib() {
+  std::vector<std::string> tool_lib_names;
+
+  /* Iterate through all of the loaded shared libraries in the process */
+  dl_iterate_phdr(callback, &tool_lib_names);
+
+  std::vector<LibHandle> handles;
+  for (auto& lib_name : tool_lib_names) {
+    handles.push_back(LoadLib(lib_name));
+  }
+  return handles;
+}
+
+// Returns the l_name recorded in the link map of `lib`, or an empty string
+// when dlinfo() fails.
+std::string GetLibraryName(LibHandle lib) {
+  link_map* lib_map;
+  const bool found = (dlinfo(lib, RTLD_DI_LINKMAP, &lib_map) == 0);
+  if (!found) {
+    return "";
+  }
+  return lib_map->l_name;
+}
+
+// Allocates an unnamed, process-private POSIX semaphore with an initial count
+// of zero. The sem_init result is not checked.
+Semaphore CreateSemaphore() {
+  sem_t* native = new sem_t;
+  sem_init(native, /*pshared=*/0, /*value=*/0);
+  return *(Semaphore*)&native;
+}
+
+// Waits on the semaphore, retrying when the wait is interrupted by a signal
+// (EINTR). Returns false on any other error.
+bool WaitSemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  for (;;) {
+    if (sem_wait(native) == 0) return true;
+    if (errno != EINTR) return false;
+  }
+}
+
+// Posts the semaphore; a failing sem_post trips an assert.
+void PostSemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  if (sem_post(native) != 0) {
+    assert(false && "Failed to post semaphore");
+  }
+}
+
+// Destroys the semaphore and frees the storage allocated by CreateSemaphore.
+void DestroySemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  sem_destroy(native);
+  delete native;
+}
+
+// Allocates a pthread mutex with default attributes. The pthread_mutex_init
+// result is not checked.
+Mutex CreateMutex() {
+  pthread_mutex_t* native = new pthread_mutex_t;
+  pthread_mutex_init(native, NULL);
+  return *(Mutex*)&native;
+}
+
+// Attempts to lock without blocking; true when the lock was obtained.
+bool TryAcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  return pthread_mutex_trylock(native) == 0;
+}
+
+// Locks the mutex, blocking if necessary; true when pthread_mutex_lock succeeds.
+bool AcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  return pthread_mutex_lock(native) == 0;
+}
+
+// Unlocks the mutex; the pthread_mutex_unlock result is ignored.
+void ReleaseMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_unlock(native);
+}
+
+// Destroys the mutex and frees the storage allocated by CreateMutex.
+void DestroyMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_destroy(native);
+  delete native;
+}
+
+// Suspends the calling thread for delay_in_millisec milliseconds via usleep.
+void Sleep(int delay_in_millisec) {
+  const int delay_in_us = delay_in_millisec * 1000;
+  usleep(delay_in_us);
+}
+
+// Suspends the calling thread for delayInUs microseconds via usleep.
+void uSleep(int delayInUs) {
+  usleep(delayInUs);
+}
+
+// Gives up the processor via sched_yield.
+void YieldThread() {
+  sched_yield();
+}
+
+// Starts a thread running function(threadArgument) with the requested stack
+// size. Returns nullptr when the thread could not be set up; otherwise the
+// handle must eventually be passed to CloseThread.
+Thread CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) {
+  std::unique_ptr<os_thread> created(new os_thread(function, threadArgument, stackSize));
+  if (!created->Valid()) return nullptr;
+  return reinterpret_cast<Thread>(created.release());
+}
+
+// Destroys the os_thread object behind the handle.
+void CloseThread(Thread thread) {
+  os_thread* impl = reinterpret_cast<os_thread*>(thread);
+  delete impl;
+}
+
+// Waits for the thread behind the handle; returns the result of os_thread::Wait.
+bool WaitForThread(Thread thread) {
+  os_thread* impl = reinterpret_cast<os_thread*>(thread);
+  return impl->Wait();
+}
+
+// Waits for each of the threadCount threads in `threads`, in order.
+// Every thread is waited on even if an earlier wait fails. Returns true only
+// when all waits succeeded (previously failures were silently dropped and
+// true was always returned).
+bool WaitForAllThreads(Thread* threads, uint threadCount) {
+  bool all_joined = true;
+  for (uint i = 0; i < threadCount; i++) {
+    if (!WaitForThread(threads[i])) all_joined = false;
+  }
+  return all_joined;
+}
+
+// True when the environment variable exists (even with an empty value).
+bool IsEnvVarSet(std::string env_var_name) {
+  return getenv(env_var_name.c_str()) != NULL;
+}
+
+// Sets the environment variable, overwriting any existing value.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  const int overwrite = 1;
+  setenv(env_var_name.c_str(), env_var_value.c_str(), overwrite);
+}
+
+// Returns the id of the calling process.
+int GetProcessId() {
+  return ::getpid();
+}
+
+// Returns the value of the environment variable, or an empty string when it
+// is not set.
+std::string GetEnvVar(std::string env_var_name) {
+  const char* value = getenv(env_var_name.c_str());
+  if (value == nullptr) {
+    return std::string();
+  }
+  return std::string(value);
+}
+
+// Size of the user-mode virtual address range assumed by the runtime.
+size_t GetUserModeVirtualMemorySize() {
+#ifdef _LP64
+  // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt :
+  // user space is 0000000000000000 - 00007fffffffffff (=47 bits)
+  const size_t user_space_bytes = (size_t)(0x800000000000);
+#else
+  const size_t user_space_bytes = (size_t)(0xffffffff);  // ~4GB
+#endif
+  return user_space_bytes;
+}
+
+// Total RAM reported by sysinfo() (totalram * mem_unit), capped at the
+// user-mode virtual memory size. Returns 0 when sysinfo() fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  struct sysinfo mem_info = {0};
+  if (sysinfo(&mem_info) != 0) return 0;
+
+  const size_t total_ram_bytes =
+      static_cast<size_t>(mem_info.totalram * mem_info.mem_unit);
+  const size_t address_space_bytes = GetUserModeVirtualMemorySize();
+  return std::min(address_space_bytes, total_ram_bytes);
+}
+
+// Lowest user-mode virtual address; always 0 here.
+uintptr_t GetUserModeVirtualMemoryBase() {
+  return (uintptr_t)0;
+}
+
+// Os event implementation
+// An event is a boolean state guarded by a mutex, plus a condition variable
+// that waiters block on.
+typedef struct EventDescriptor_ {
+  pthread_cond_t event;   // Signaled by SetOsEvent.
+  pthread_mutex_t mutex;  // Guards state.
+  bool state;             // true = signaled.
+  bool auto_reset;        // When true, a successful wait clears state.
+} EventDescriptor;
+
+// Creates an event object.
+//   auto_reset: when true, a successful wait resets the event to non-signaled.
+//   init_state: initial signaled state.
+// Returns a null handle when the allocation or the initialization of the
+// underlying mutex / condition variable fails; the other event functions
+// reject a null handle with -1.
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  EventDescriptor* eventDescrp =
+      static_cast<EventDescriptor*>(malloc(sizeof(EventDescriptor)));
+
+  if (eventDescrp != NULL) {
+    if (pthread_mutex_init(&eventDescrp->mutex, NULL) != 0) {
+      free(eventDescrp);
+      eventDescrp = NULL;
+    } else if (pthread_cond_init(&eventDescrp->event, NULL) != 0) {
+      pthread_mutex_destroy(&eventDescrp->mutex);
+      free(eventDescrp);
+      eventDescrp = NULL;
+    } else {
+      eventDescrp->auto_reset = auto_reset;
+      eventDescrp->state = init_state;
+    }
+  }
+
+  EventHandle handle = reinterpret_cast<EventHandle>(eventDescrp);
+
+  return handle;
+}
+
+// Destroys an event created by CreateOsEvent and frees its storage.
+// Returns -1 for a null handle, otherwise the OR of the two pthread destroy
+// results (0 when both succeed).
+int DestroyOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+  const int cond_result = pthread_cond_destroy(&descriptor->event);
+  const int mutex_result = pthread_mutex_destroy(&descriptor->mutex);
+  free(descriptor);
+  return cond_result | mutex_result;
+}
+
+// Waits until the event is signaled or milli_seconds have elapsed.
+//
+// Returns:
+//   -1       event is a null handle.
+//    0       the event was signaled (and was reset if it is auto-reset).
+//    1       milli_seconds == 0 and the event was not signaled, or its mutex
+//            was busy.
+//    0x14003 the timed wait expired.
+//    other   error code from pthread_cond_timedwait.
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+
+  if (milli_seconds == 0) {
+    // Polling must not block: give up if the mutex is busy.
+    const int try_ret = pthread_mutex_trylock(&eventDescrp->mutex);
+    if (try_ret == EBUSY) {
+      // Timeout
+      return 1;
+    }
+    // try_ret == 0 means this thread now owns the mutex. Locking it a second
+    // time would self-deadlock on a default mutex, so only fall back to a
+    // blocking lock when trylock failed for another reason.
+    if (try_ret != 0) pthread_mutex_lock(&eventDescrp->mutex);
+  } else {
+    pthread_mutex_lock(&eventDescrp->mutex);
+  }
+
+  int ret_code = 0;
+  if (!eventDescrp->state) {
+    if (milli_seconds == 0) {
+      ret_code = 1;
+    } else {
+      // Absolute deadline = now + milli_seconds. The condition variable was
+      // created with default attributes, so wall-clock time is used.
+      struct timeval tp;
+      gettimeofday(&tp, NULL);
+
+      struct timespec ts;
+      ts.tv_sec = tp.tv_sec + milli_seconds / 1000;
+      ts.tv_nsec = tp.tv_usec * 1000 + long(milli_seconds % 1000) * 1000000L;
+
+      // tv_nsec must stay below one second; carry the excess into tv_sec.
+      if (ts.tv_nsec >= 1000000000) {
+        ts.tv_sec += 1;
+        ts.tv_nsec = ts.tv_nsec - 1000000000;
+      }
+
+      // Re-check the state after every wakeup: a condition wait may return
+      // without the event having been set.
+      while (!eventDescrp->state && ret_code == 0) {
+        ret_code =
+            pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts);
+      }
+
+      // Time out
+      if (ret_code == ETIMEDOUT) {
+        ret_code = 0x14003;
+      }
+
+      if (ret_code == 0 && eventDescrp->auto_reset) {
+        eventDescrp->state = false;
+      }
+    }
+  } else if (eventDescrp->auto_reset) {
+    eventDescrp->state = false;
+  }
+  pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Marks the event as signaled and signals its condition variable.
+// Returns -1 for a null handle, otherwise the OR of the unlock and signal
+// results; the result of taking the lock is not reported.
+int SetOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+
+  pthread_mutex_lock(&descriptor->mutex);
+  descriptor->state = true;
+  const int unlock_result = pthread_mutex_unlock(&descriptor->mutex);
+
+  const int signal_result = pthread_cond_signal(&descriptor->event);
+  return unlock_result | signal_result;
+}
+
+// Marks the event as non-signaled.
+// Returns -1 for a null handle, otherwise the result of unlocking the event's
+// mutex; the result of taking the lock is not reported.
+int ResetOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+
+  pthread_mutex_lock(&descriptor->mutex);
+  descriptor->state = false;
+  return pthread_mutex_unlock(&descriptor->mutex);
+}
+
+// Reciprocal of the clock period in nanoseconds; 0.0 until
+// AccurateClockFrequency() has computed it.
+static double invPeriod = 0.0;
+
+// Reads CLOCK_MONOTONIC_RAW and returns the time in clock ticks
+// (nanoseconds scaled by invPeriod). Aborts if the clock cannot be read.
+uint64_t ReadAccurateClock() {
+  if (invPeriod == 0.0) AccurateClockFrequency();
+
+  timespec now;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
+    pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno));
+    abort();
+  }
+
+  const uint64_t nanoseconds = uint64_t(now.tv_sec) * 1000000000ull + uint64_t(now.tv_nsec);
+  return nanoseconds * invPeriod;
+}
+
+// Returns the frequency, in Hz, of the clock used by ReadAccurateClock,
+// derived from clock_getres(). Also initializes invPeriod on first use.
+// Aborts if the resolution cannot be queried or is one second or coarser.
+uint64_t AccurateClockFrequency() {
+  static clockid_t clock = CLOCK_MONOTONIC;
+  static std::atomic<bool> first(true);
+  // Check kernel version - not a concurrency concern.
+  // use non-RAW for getres due to bug in older 2.6.x kernels
+  if (first.load(std::memory_order_acquire)) {
+    utsname kernelInfo;
+    if (uname(&kernelInfo) == 0) {
+      try {
+        std::string ver = kernelInfo.release;
+        size_t idx;
+        int major = std::stoi(ver, &idx);
+        int minor = std::stoi(ver.substr(idx + 1));
+        // Kernel 4.4 or newer. The major and minor numbers must be compared
+        // as a pair: testing each against 4 separately would reject e.g. 5.0
+        // or 6.1.
+        if ((major > 4) || ((major == 4) && (minor >= 4))) {
+          clock = CLOCK_MONOTONIC_RAW;
+        }
+      } catch (...) {
+        // Kernel version string doesn't conform to the standard pattern.
+        // Keep using the "safe" (non-RAW) clock.
+      }
+    }
+    first.store(false, std::memory_order_release);
+  }
+  timespec time;
+  int err = clock_getres(clock, &time);
+  if (err != 0) {
+    pr_err("clock_getres failed %s\n", strerror(errno));
+    abort();
+  }
+  if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) {
+    pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n");
+    abort();
+  }
+  if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
+  return 1000000000ull / uint64_t(time.tv_nsec);
+}
+
+// Creates a reader/writer lock configured with
+// PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP.
+// Returns nullptr on failure; the attribute object and the lock storage are
+// released on every path.
+SharedMutex CreateSharedMutex() {
+  pthread_rwlockattr_t attrib;
+  int err = pthread_rwlockattr_init(&attrib);
+  if (err != 0) {
+    pr_err("rw lock attribute init failed: %s\n", strerror(err));
+    return nullptr;
+  }
+
+#ifdef __GLIBC__
+  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#else
+  err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#endif
+  if (err != 0) {
+    pr_err("Set rw lock attribute failure: %s\n", strerror(err));
+    pthread_rwlockattr_destroy(&attrib);
+    return nullptr;
+  }
+
+  pthread_rwlock_t* lock = new pthread_rwlock_t;
+  err = pthread_rwlock_init(lock, &attrib);
+  // The attribute object is no longer needed once the lock has been
+  // initialized (or has failed to initialize).
+  pthread_rwlockattr_destroy(&attrib);
+  if (err != 0) {
+    pr_err("rw lock init failed: %s\n", strerror(err));
+    delete lock;
+    return nullptr;
+  }
+
+  return lock;
+}
+
+// Tries to take the lock in exclusive (write) mode without blocking.
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_trywrlock(native) == 0;
+}
+
+// Takes the lock in exclusive (write) mode, blocking if necessary.
+bool AcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_wrlock(native) == 0;
+}
+
+// Releases an exclusive hold. Aborts if the unlock fails.
+void ReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  const int result = pthread_rwlock_unlock(native);
+  if (result == 0) return;
+  pr_err("SharedMutex unlock failed: %s\n", strerror(result));
+  abort();
+}
+
+// Tries to take the lock in shared (read) mode without blocking.
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_tryrdlock(native) == 0;
+}
+
+// Takes the lock in shared (read) mode, blocking if necessary.
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_rdlock(native) == 0;
+}
+
+// Releases a shared hold. Aborts if the unlock fails.
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  const int result = pthread_rwlock_unlock(native);
+  if (result == 0) return;
+  pr_err("SharedMutex unlock failed: %s\n", strerror(result));
+  abort();
+}
+
+// Destroys the lock and frees the storage allocated by CreateSharedMutex.
+void DestroySharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  pthread_rwlock_destroy(native);
+  delete native;
+}
+
+// Period of CLOCK_BOOTTIME in nanoseconds; 0 until it has been queried.
+static uint64_t sys_clock_period_ = 0;
+
+// Reads CLOCK_BOOTTIME and returns the time in units of the clock period.
+// If the period has not been queried yet it is obtained here first, so the
+// function no longer divides by zero when it is called before
+// SystemClockFrequency(). A period that cannot be determined is treated as
+// 1 ns.
+uint64_t ReadSystemClock() {
+  struct timespec ts;
+  clock_gettime(CLOCK_BOOTTIME, &ts);
+  uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+
+  uint64_t period = sys_clock_period_;
+  if (period == 0) {
+    struct timespec res;
+    if (clock_getres(CLOCK_BOOTTIME, &res) == 0) {
+      period = (uint64_t(res.tv_sec) * 1000000000 + uint64_t(res.tv_nsec));
+    }
+    if (period == 0) period = 1;
+    sys_clock_period_ = period;
+  }
+
+  if (period != 1)
+    return time / period;
+  else
+    return time;
+}
+
+// Queries the resolution of CLOCK_BOOTTIME, stores it (in nanoseconds) in
+// sys_clock_period_, and returns the matching frequency in Hz.
+uint64_t SystemClockFrequency() {
+  struct timespec resolution;
+  clock_getres(CLOCK_BOOTTIME, &resolution);
+
+  const uint64_t period_ns =
+      (uint64_t(resolution.tv_sec) * 1000000000 + uint64_t(resolution.tv_nsec));
+  sys_clock_period_ = period_ns;
+  return 1000000000 / period_ns;
+}
+
+// Fills *cpuinfo from the CPUID instruction: the manufacturer id string and,
+// on AMD processors, the mwaitx flag (CPUID 0x80000001, ECX bit 29).
+// Returns false on non-x86 targets or when the required CPUID leaves are
+// unavailable.
+bool ParseCpuID(cpuid_t* cpuinfo) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t eax, ebx, ecx, edx, max_eax = 0;
+  memset(cpuinfo, 0, sizeof(*cpuinfo));
+
+  /* Make sure current CPU supports extended leaves up to 0x80000004.
+   * __get_cpuid_max() takes the base of the range (0 or 0x80000000) and
+   * returns the highest supported leaf in that range. */
+  if (__get_cpuid_max(0x80000000, NULL) < 0x80000004) return false;
+
+  // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX.
+  if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0],
+                   (uint32_t*)&cpuinfo->ManufacturerID[8],
+                   (uint32_t*)&cpuinfo->ManufacturerID[4])) {
+    return false;
+  }
+
+  if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) {
+    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
+      cpuinfo->mwaitx = !!((ecx >> 29) & 0x1);
+    }
+  }
+  return true;
+#else
+  return false;
+#endif
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
@@ -0,0 +1,290 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Library of synchronization primitives - to be added to as needed.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+
+#include "utils.h"
+#include "os.h"
+
+namespace wsl {
+
+/// @brief: a mutex that spins before it sleeps.
+/// A contended Acquire() first retries with a CPU pause, then retries while
+/// yielding the thread, and finally blocks on a semaphore that Release()
+/// posts.
+class HybridMutex {
+ public:
+  HybridMutex() : lock_(0) {
+    sem_ = os::CreateSemaphore();
+  }
+
+  ~HybridMutex() {
+    os::DestroySemaphore(sem_);
+  }
+
+  /// @brief: Single attempt to take the lock; never blocks.
+  bool Try() {
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+
+  /// @brief: Takes the lock, waiting as long as necessary. Always returns true.
+  bool Acquire() {
+    const int spinBudget = int(maxSpinIterPause + maxSpinIterYield);
+    int cnt = spinBudget;
+
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1)) {
+      cnt--;
+      if (cnt > int(maxSpinIterYield)) {
+        // Phase 1: the first maxSpinIterPause retries only pause the CPU.
+        _mm_pause();
+      } else if (cnt > 0) {
+        // Phase 2: the next maxSpinIterYield retries give up the time slice.
+        // (Previously this phase was unreachable because both phases were
+        // tested against the same threshold.)
+        os::YieldThread();
+      } else {
+        // Phase 3: sleep until a Release() posts, then start spinning again.
+        os::WaitSemaphore(sem_);
+        cnt = spinBudget;
+      }
+      // A failed compare_exchange overwrote `old` with the current value.
+      old = 0;
+    }
+    return true;
+  }
+
+  /// @brief: Releases the lock and posts the semaphore so that a sleeping
+  /// waiter, if any, retries.
+  void Release() {
+    int old = 1;
+    if (lock_.compare_exchange_strong(old, 0))
+      os::PostSemaphore(sem_);
+  }
+
+ private:
+  std::atomic<int> lock_;  // 0 = free, 1 = held.
+  os::Semaphore sem_;
+  const uint32_t maxSpinIterPause = 55;
+  const uint32_t maxSpinIterYield = 55;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(HybridMutex);
+};
+
+
+/// @brief: a mutex backed by an OS mutex object.
+/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
+/// until the lock is released (Best for long waits, though anything using
+/// a kernel object is a long wait).
+class KernelMutex {
+ public:
+  KernelMutex() : lock_(os::CreateMutex()) {}
+
+  ~KernelMutex() {
+    os::DestroyMutex(lock_);
+  }
+
+  /// @brief: Non-blocking attempt; true when the lock was obtained.
+  bool Try() {
+    return os::TryAcquireMutex(lock_);
+  }
+
+  /// @brief: Blocking acquire; true when the lock was obtained.
+  bool Acquire() {
+    return os::AcquireMutex(lock_);
+  }
+
+  /// @brief: Releases the lock.
+  void Release() {
+    os::ReleaseMutex(lock_);
+  }
+
+ private:
+  os::Mutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
+};
+
+/// @brief: represents a spin lock.
+/// For very short hold durations on the order of the thread scheduling
+/// quanta or less.
+class SpinMutex {
+ public:
+  SpinMutex() { lock_.store(0); }
+
+  /// @brief: Single attempt to take the lock; never waits.
+  bool Try() {
+    int expected = 0;
+    return lock_.compare_exchange_strong(expected, 1);
+  }
+
+  /// @brief: Retries until the lock is taken, yielding the thread after each
+  /// failed attempt. Always returns true.
+  bool Acquire() {
+    for (int expected = 0; !lock_.compare_exchange_strong(expected, 1); expected = 0) {
+      os::YieldThread();
+    }
+    return true;
+  }
+
+  /// @brief: Marks the lock as free.
+  void Release() { lock_.store(0); }
+
+ private:
+  std::atomic<int> lock_;  // 0 = free, 1 = held.
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
+};
+
+/// @brief: wrapper around an OS event that is created as auto-reset and
+/// initially signaled.
+class KernelEvent {
+ public:
+  KernelEvent() : evt_(os::CreateOsEvent(true, true)) {}
+
+  ~KernelEvent() {
+    os::DestroyOsEvent(evt_);
+  }
+
+  /// @brief: Zero-timeout wait; true when that wait reports success.
+  bool IsSet() {
+    const int status = os::WaitForOsEvent(evt_, 0);
+    return status == 0;
+  }
+
+  /// @brief: Waits with a timeout of 0xFFFFFFFF milliseconds; true when the
+  /// wait reports success.
+  bool WaitForSet() {
+    const int status = os::WaitForOsEvent(evt_, 0xFFFFFFFF);
+    return status == 0;
+  }
+
+  /// @brief: Signals the event.
+  void Set() {
+    os::SetOsEvent(evt_);
+  }
+
+  /// @brief: Clears the event.
+  void Reset() {
+    os::ResetOsEvent(evt_);
+  }
+
+ private:
+  os::EventHandle evt_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
+};
+
+/// @brief: represents a yielding shared mutex.
+/// aka read/write mutex
+class KernelSharedMutex {
+ public:
+  /// @brief: Adapter exposing the shared-mode operations through the
+  /// Try/Acquire/Release names that ScopedAcquire uses.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+
+    bool Try() {
+      return lock_->TryShared();
+    }
+    bool Acquire() {
+      return lock_->AcquireShared();
+    }
+    void Release() {
+      lock_->ReleaseShared();
+    }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() : lock_(os::CreateSharedMutex()) {}
+
+  ~KernelSharedMutex() {
+    os::DestroySharedMutex(lock_);
+  }
+
+  // Exclusive mode operations
+  bool Try() {
+    return os::TryAcquireSharedMutex(lock_);
+  }
+  bool Acquire() {
+    return os::AcquireSharedMutex(lock_);
+  }
+  void Release() {
+    os::ReleaseSharedMutex(lock_);
+  }
+
+  // Shared mode operations
+  bool TryShared() {
+    return os::TrySharedAcquireSharedMutex(lock_);
+  }
+  bool AcquireShared() {
+    return os::SharedAcquireSharedMutex(lock_);
+  }
+  void ReleaseShared() {
+    os::SharedReleaseSharedMutex(lock_);
+  }
+
+  // Return shared operations interface
+  Shared shared() {
+    Shared view(this);
+    return view;
+  }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+/// @brief: Type trait to identify mutex types.
+/// The primary template reports false; each mutex class defined in this
+/// header has a specialization that reports true. ScopedAcquire uses the
+/// value to choose how it stores the lock it is given.
+template <class T> class isMutex { public: enum { value = false }; };
+
+template <> class isMutex<HybridMutex> { public: enum { value = true }; };
+
+template <> class isMutex<KernelMutex> { public: enum { value = true }; };
+
+template <> class isMutex<SpinMutex> { public: enum { value = true }; };
+
+template <> class isMutex<KernelSharedMutex> { public: enum { value = true }; };
+
+/// @brief: Holds a lock for the duration of a scope. To enter a critical
+/// section, create an object of this class; the lock is acquired in the
+/// constructor and released automatically when control leaves the scope.
+template <class LockType> class ScopedAcquire {
+ public:
+  /// @brief: Acquires a mutex given by pointer.
+  /// @param: lock(Input), pointer to an existing lock.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+
+  /// @brief: Acquires through a lock-like object passed by value (any type
+  /// that is not one of the mutex classes).
+  /// @param: lock(Input), object providing Acquire() and Release().
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: Releases the lock unless Release() was already called.
+  ~ScopedAcquire() {
+    if (!doRelease) return;
+    lock_.Release();
+  }
+
+  /// @brief: Release the lock early.  Avoid using when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
+  /// Primary template: stores a pointer to the mutex and forwards through it.
+  template <class T, bool B> class container {
+   public:
+    container(T* lock) : target_(lock) {}
+    __forceinline bool Acquire() { return target_->Acquire(); }
+    __forceinline void Release() { return target_->Release(); }
+
+   private:
+    T* target_;
+  };
+
+  /// @brief: Specialization for mutex pointer types: stores the object itself
+  /// by value and forwards to it directly.
+  template <class T> class container<T, false> {
+   public:
+    container(T lock) : target_(lock) {}
+    __forceinline bool Acquire() { return target_.Acquire(); }
+    __forceinline void Release() { return target_.Release(); }
+
+   private:
+    T target_;
+  };
+
+  container<LockType, isMutex<LockType>::value> lock_;
+  bool doRelease;  // false once Release() has been called explicitly.
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Minimal operating system abstraction interfaces.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
+#define HSA_RUNTIME_CORE_UTIL_OS_H_
+
+#include <string>
+#include <vector>
+#include "utils.h"
+
+namespace wsl {
+namespace os {
+typedef void* LibHandle;
+typedef void* Semaphore;
+typedef void* Mutex;
+typedef void* SharedMutex;
+typedef void* Thread;
+typedef void* EventHandle;
+
+enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
+static __forceinline std::underlying_type<os_t>::type os_index(os_t val) {
+  return static_cast<std::underlying_type<os_t>::type>(val);  // Enumerator as its integer value.
+}
+
+#ifdef _WIN32
+static const os_t current_os = os_t::OS_WIN;
+#elif __linux__
+static const os_t current_os = os_t::OS_LINUX;
+#else
+static_assert(false, "Operating System not detected!");
+#endif
+
+/// @brief: Loads dynamic library based on file name. Return value will be NULL
+/// if failed.
+/// @param: filename(Input), file name of the library.
+/// @return: LibHandle.
+LibHandle LoadLib(std::string filename);
+
+/// @brief: Gets the address of an exported symbol. Returns NULL on failure.
+/// @param: lib(Input), handle of the library that exports the symbol.
+/// @param: export_name(Input), the name of the exported symbol.
+/// @return: void*.
+void* GetExportAddress(LibHandle lib, std::string export_name);
+
+/// @brief: Unloads the dynamic library.
+/// @param: lib(Input), library handle which will be unloaded.
+void CloseLib(LibHandle lib);
+
+/// @brief: Lists loaded tool libraries that contain
+/// symbol HSA_AMD_TOOL_PRIORITY
+/// @return: List of library handles
+std::vector<LibHandle> GetLoadedToolsLib();
+
+/// @brief: Returns the library's path name.
+/// @param: lib(Input), library handle
+/// @return: Path name of library
+std::string GetLibraryName(LibHandle lib);
+
+/// @brief: Creates a Semaphore, will return NULL if failed.
+/// @param: void.
+/// @return: Semaphore.
+Semaphore CreateSemaphore();
+
+/// @brief: Waits for the semaphore. This is a blocking wait.
+/// If the Semaphore is signalled, this function will return.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: bool (presumably true when the wait succeeded; confirm against the implementation).
+bool WaitSemaphore(Semaphore sem);
+
+/// @brief: Post/Signal/Wake-up the semaphore
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void PostSemaphore(Semaphore sem);
+
+/// @brief: Destroys the semaphore.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void DestroySemaphore(Semaphore sem);
+
+/// @brief: Creates a mutex, will return NULL if failed.
+/// @param: void.
+/// @return: Mutex.
+Mutex CreateMutex();
+
+/// @brief: Tries to acquire the mutex once; returns true on success.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TryAcquireMutex(Mutex lock);
+
+/// @brief: Acquires the mutex. If the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireMutex(Mutex lock);
+
+/// @brief: Releases the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseMutex(Mutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroyMutex(Mutex lock);
+
+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once; returns true on success.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode. If the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once; returns true on success.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode. If the mutex is held in exclusive mode, it will
+/// wait until it is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInMs(Input), time in millisecond for sleeping.
+/// @return: void.
+void Sleep(int delayInMs);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInUs(Input), time in microseconds for sleeping.
+/// @return: void.
+void uSleep(int delayInUs);
+
+/// @brief: Yields current thread.
+/// @param: void.
+/// @return: void.
+void YieldThread();
+
+typedef void (*ThreadEntry)(void*);
+
+/// @brief: Creates a thread; returns NULL on failure.
+/// @param: entry_function(Input), a pointer to the function which the thread
+/// starts from.
+/// @param: entry_argument(Input), a pointer to the argument of the thread
+/// function.
+/// @param: stack_size(Input), size of the thread's stack, 0 by default.
+/// @return: Thread, a handle to thread created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size = 0);
+
+/// @brief: Destroys the thread.
+/// @param: thread(Input), thread handle to what will be destroyed.
+/// @return: void.
+void CloseThread(Thread thread);
+
+/// @brief: Waits for specific thread to finish, if successful, return true.
+/// @param: thread(Input), handle to waiting thread.
+/// @return: bool.
+bool WaitForThread(Thread thread);
+
+/// @brief: Waits for multiple threads to finish, if successful, return true.
+/// @param: threads(Input), a pointer to a list of thread handles.
+/// @param: thread_count(Input), number of threads to be waited on.
+/// @return: bool.
+bool WaitForAllThreads(Thread* threads, uint thread_count);
+
+/// @brief: Determines if environment key is set.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: bool, true for binding any value to environment key,
+/// including an empty string. False otherwise
+bool IsEnvVarSet(std::string env_var_name);
+
+/// @brief: Sets the environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @param: env_var_value(Input), value of the environment variable.
+/// @return: void.
+void SetEnvVar(std::string env_var_name, std::string env_var_value);
+
+/// @brief: Gets the value of environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: std::string, value of the environment value, returned as string.
+std::string GetEnvVar(std::string env_var_name);
+
+/// @brief: Gets the process ID.
+/// @param: void
+/// @return: int, process ID returned as int.
+int GetProcessId();
+
+/// @brief: Gets the max virtual memory size accessible to the application.
+/// @param: void.
+/// @return: size_t, size of the accessible memory to the application.
+size_t GetUserModeVirtualMemorySize();
+
+/// @brief: Gets the max physical host system memory size.
+/// @param: void.
+/// @return: size_t, size of the physical host system memory.
+size_t GetUsablePhysicalHostMemorySize();
+
+/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
+/// @param: void.
+/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase();
+
+/// @brief os event api, create an event
+/// @param: auto_reset whether an event can reset the status automatically
+/// @param: init_state initial state of the event
+/// @return: event handle
+EventHandle CreateOsEvent(bool auto_reset, bool init_state);
+
+/// @brief os event api, destroy an event
+/// @param: event handle
+/// @return: whether destroy is correct
+int DestroyOsEvent(EventHandle event);
+
+/// @brief os event api, wait on event
+/// @param: event Event handle
+/// @param: milli_seconds wait time
+/// @return: Indicate success or timeout
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
+
+/// @brief os event api, set event state
+/// @param: event Event handle
+/// @return: Whether event set is correct
+int SetOsEvent(EventHandle event);
+
+/// @brief os event api, reset event state
+/// @param: event Event handle
+/// @return: Whether event reset is correct
+int ResetOsEvent(EventHandle event);
+
+/// @brief reads a clock which is deemed to be accurate for elapsed time
+/// measurements, though not necessarily fast to query
+/// @return clock counter value
+uint64_t ReadAccurateClock();
+
+/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
+/// It does not necessarily reflect the resolution of the clock, but is the
+/// value needed to convert a difference in the clock's counter value to elapsed
+/// seconds.  This frequency does not change at runtime.
+/// @return returns the frequency
+uint64_t AccurateClockFrequency();
+
+/// @brief read the system clock which serves as the HSA system clock
+/// counter in KFD.
+uint64_t ReadSystemClock();
+
+/// @brief read the system clock frequency
+uint64_t SystemClockFrequency();
+
+typedef struct cpuid_s {
+  char ManufacturerID[13];  // 12 char, NULL terminated
+  bool mwaitx;  // NOTE(review): presumably set when the CPU reports MWAITX support - confirm.
+} cpuid_t;
+
+/// @brief parse CPUID
+/// @param: cpuinfo struct to be filled
+bool ParseCpuID(cpuid_t* cpuinfo);
+
+}   //  namespace os
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_OS_H_
@@ -0,0 +1,394 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple best fit memory allocator with eager compaction.  Manages block sub-allocation.
+// For use when memory efficiency is more important than allocation speed.
+// O(log n) time.
+
+#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+
+#include <map>
+#include <deque>
+#include <utility>
+
+
+namespace wsl {
+
+template <typename Allocator> class SimpleHeap {
+ private:
+  struct Fragment_T {
+    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
+    ptr_t free_list_entry_;  // This fragment's record in free_list_, or free_list_.end() if none.
+    struct {
+      size_t size : 62;  // Fragment length in bytes.
+      bool discard : 1;  // Set by discard(); constructors start it as false.
+      bool free : 1;     // True while the fragment is not handed out to a caller.
+    };
+
+    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
+        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
+    Fragment_T() = default;
+  };
+
+  struct Block {
+    uintptr_t base_ptr_;  // Start address of the block; passed back to block_allocator_.free().
+    size_t length_;       // Block length in bytes; passed back to block_allocator_.free().
+
+    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
+    Block() = default;
+  };
+
+  Allocator block_allocator_;
+
+  std::multimap<size_t, uintptr_t> free_list_;
+  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;
+  std::deque<Block> block_cache_;
+
+  // Size of blocks that are at least partially in use.
+  size_t in_use_size_;
+  // Total size of block cache
+  size_t cache_size_;
+
+  __forceinline bool isFree(const Fragment_T& node) { return node.free; }
+  __forceinline void setUsed(Fragment_T& node) {
+    node.free = false;
+    node.free_list_entry_ = free_list_.end();  // end() is the "no free-list record" sentinel.
+  }
+  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
+    node.free_list_entry_ = Iterator;  // Caller supplies the fragment's free_list_ record.
+    node.free = true;
+  }
+  __forceinline Fragment_T makeFragment(size_t Len) {  // Builds an in-use fragment of Len bytes.
+    return Fragment_T(free_list_.end(), Len, false);
+  }
+  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
+    return Fragment_T(Iterator, Len, true);  // Free fragment tied to its free_list_ record.
+  }
+  __forceinline void removeFreeListEntry(Fragment_T& node) {
+    // A fragment without a free-list record carries the end() sentinel; nothing to erase then.
+    if (node.free_list_entry_ == free_list_.end()) return;
+    free_list_.erase(node.free_list_entry_);
+    node.free_list_entry_ = free_list_.end();
+  }
+  __forceinline void discard(Fragment_T& node) {
+    removeFreeListEntry(node);  // alloc() searches free_list_, so the fragment is not reused.
+    node.discard = true;
+  }
+
+ public:
+  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
+      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}
+  ~SimpleHeap() {
+    trim();  // Frees cached blocks only; blocks still tracked in block_list_ are not released.
+    // Leak here may be due to the user.  Check is for debugging only.
+    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
+  }
+
+  SimpleHeap(const SimpleHeap& rhs) = delete;
+  SimpleHeap(SimpleHeap&& rhs) = delete;
+  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
+  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;
+
+  void* alloc(size_t bytes) {  // Sub-allocates `bytes`; returns nullptr if block allocation fails.
+    // Find best fit: the smallest free fragment that holds at least align_bytes.
+    uintptr_t base;
+    size_t size;
+    // Huge-page-sized requests align to GPU_HUGE_PAGE_SIZE and get one retry; others use default.
+    size_t align_bytes = bytes;
+    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;
+    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;
+
+    for (int i = 0; i <= retry; i++) {
+      auto free_fragment = free_list_.lower_bound(align_bytes);
+      if (free_fragment == free_list_.end()) break;
+
+      uintptr_t addr = free_fragment->second;
+      size = free_fragment->first;
+
+      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");
+
+      // Find the containing block and fragment
+      auto it = block_list_.upper_bound(addr);
+      it--;
+      auto& frag_map = it->second;
+      const auto& fragment = frag_map.find(addr);
+
+      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
+      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");
+
+      size_t delta = addr & (align - 1);  // Misalignment of addr (align assumed a power of two).
+      if (!delta) {
+        // Fragment already starts at an aligned address.
+        base = addr;
+        free_list_.erase(free_fragment);
+        // Sub-allocate from fragment.
+        fragment->second.size = bytes;
+        setUsed(fragment->second);
+        // Record remaining free space.
+        if (size > bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
+        }
+      } else {
+        // On the first pass, if the fragment cannot hold the request after aligning its start,
+        // search again for a hole that is one alignment unit larger.
+        if (i == 0 && size < bytes + align - delta) {
+          align_bytes += align;
+          continue;
+        }
+
+        uintptr_t aligned_base = addr + align - delta;
+        base = aligned_base;
+
+        // Erase the old free-list entry.
+        free_list_.erase(free_fragment);
+
+        // fragment 1 - free (the gap in front of the aligned address)
+        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
+        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);
+
+        // fragment 2 - used (the allocation itself)
+        frag_map[base] = makeFragment(bytes);
+
+        // fragment 3 - free (whatever is left behind the allocation)
+        if (size > aligned_base - addr + bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
+          frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
+        }
+      }
+      return reinterpret_cast<void*>(base);
+    }
+
+    // No usable fragment, check block cache
+    if (bytes < default_block_size() && !block_cache_.empty()) {
+      const auto& block = block_cache_.back();
+      base = block.base_ptr_;
+      size = block.length_;
+      block_cache_.pop_back();  // base and size were copied above, before the entry is removed.
+      cache_size_ -= size;
+    } else {  // Alloc new block - new block may be larger than default.
+      void* ptr = block_allocator_.alloc(bytes, size);
+      if (ptr == nullptr) {
+        fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n");
+        return nullptr;
+      }
+      base = reinterpret_cast<uintptr_t>(ptr);
+    }
+
+    in_use_size_ += size;
+    assert(size >= bytes && "Alloc exceeds block size.");
+    // Sub alloc and insert free region.
+    if (size > bytes) {
+      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
+    }
+    // Track used region
+    block_list_[base][base] = makeFragment(bytes);
+
+    // Disallow multiple suballocation from large blocks.
+    // Prevents a small allocation from retaining a large block.
+    if (bytes > default_block_size()) {
+      bool err = discardBlock(reinterpret_cast<void*>(base));
+      assert(err && "Large block discard failed.");
+    }
+
+    return reinterpret_cast<void*>(base);
+  }
+
+  /* Maps a pointer handed out by this heap to the base address of its containing block.
+   * Returns nullptr when ptr is null, lies below every tracked block, is not the start
+   * address of a fragment, or names a fragment that is currently free. */
+  void* block_base(void* ptr) {
+    if (!ptr) return nullptr;
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+
+    // The candidate block is the one with the greatest base address <= addr.
+    auto owner = block_list_.upper_bound(addr);
+    if (owner == block_list_.begin()) return nullptr;
+    --owner;
+
+    // Only the exact start of an in-use fragment is accepted.
+    const auto& fragments = owner->second;
+    const auto hit = fragments.find(addr);
+    const bool in_use = (hit != fragments.end()) && !isFree(hit->second);
+    if (!in_use) return nullptr;
+
+    return reinterpret_cast<void*>(owner->first);
+  }
+
+  void reset() {  // Drops all bookkeeping; block_allocator_.free() is not called for any block.
+    free_list_.clear();
+    block_list_.clear();
+    block_cache_.clear();
+    in_use_size_ = 0;
+    cache_size_ = 0;
+  }
+
+  /// @brief: Frees a fragment from alloc(), merging it with free neighbours in the same block.
+  /// @param: ptr(Input), fragment start address; nullptr is accepted and reported as success.
+  /// @return: bool, false if ptr is not the start of an in-use fragment of this heap.
+  bool free(void* ptr) {
+    if (ptr == nullptr) return true;
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second)) return false;
+
+    // Every fragment of a discarded block carries the flag, so it stays valid after merging.
+    bool discard = fragment->second.discard;
+
+    // Merge lower
+    if (fragment != frag_map.begin()) {
+      auto lower = fragment;
+      lower--;
+      if (isFree(lower->second)) {
+        removeFreeListEntry(lower->second);
+        lower->second.size += fragment->second.size;
+        frag_map.erase(fragment);
+        fragment = lower;
+      }
+    }
+
+    // Merge upper
+    {
+      auto upper = fragment;
+      upper++;
+      if ((upper != frag_map.end()) && isFree(upper->second)) {
+        removeFreeListEntry(upper->second);
+        fragment->second.size += upper->second.size;
+        frag_map.erase(upper);
+      }
+    }
+
+    // Release whole free blocks.
+    if (frag_map.size() == 1) {
+      Block block(fragment->first, fragment->second.size);
+      block_list_.erase(frag_map_it);
+
+      // Discard or add to the block cache.  Discarded blocks already left in_use_size_.
+      if (discard) {
+        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+      } else {
+        block_cache_.push_back(block);
+        cache_size_ += block.length_;
+        in_use_size_ -= block.length_;
+      }
+      balance();
+
+      // Don't publish free space since block was moved to the cache.
+      return true;
+    }
+
+    // Fragments of a discarded block are not published in free_list_, but they must still be
+    // flagged free: otherwise later frees in the block cannot merge and it is never released.
+    auto freeEntry = free_list_.end();
+    if (!discard)
+      freeEntry = free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
+    setFree(fragment->second, freeEntry);
+    return true;
+  }
+
+  void balance() {
+    // Evict cached blocks from the front of the cache until it holds a single block or its
+    // total size is no more than twice the size of the blocks in use.
+    for (; block_cache_.size() > 1 && cache_size_ > in_use_size_ * 2; block_cache_.pop_front()) {
+      const Block& victim = block_cache_.front();
+      block_allocator_.free(reinterpret_cast<void*>(victim.base_ptr_), victim.length_);
+      cache_size_ -= victim.length_;
+    }
+  }
+
+  void trim() {  // Returns every cached block to the block allocator and empties the cache.
+    for (auto it = block_cache_.begin(); it != block_cache_.end(); ++it)
+      block_allocator_.free(reinterpret_cast<void*>(it->base_ptr_), it->length_);
+    block_cache_.clear();
+    cache_size_ = 0;
+  }
+
+  size_t cache_size() const { return cache_size_; }
+
+  size_t default_block_size() const { return block_allocator_.block_size(); }
+
+  // Retires the block that contains ptr: its free fragments are withdrawn from the free list, so
+  // nothing more is allocated from it, and free() returns it to the allocator instead of caching.
+  bool discardBlock(void* ptr) {
+    if (!ptr) return true;
+
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+
+    // Locate the block with the greatest base <= addr and check that addr lies inside it.
+    auto owner = block_list_.upper_bound(addr);
+    if (owner == block_list_.begin()) return false;
+    --owner;
+    auto& fragments = owner->second;
+    const auto& first = *fragments.begin();
+    const auto& last = *fragments.rbegin();
+    const bool inside = (first.first <= addr) && (addr < last.first + last.second.size);
+    if (!inside) return false;
+
+    // Nothing to do when the block was discarded earlier.
+    if (first.second.discard) return true;
+
+    // Flag each fragment (which also drops its free-list record) and total up the block size.
+    size_t block_bytes = 0;
+    for (auto entry = fragments.begin(); entry != fragments.end(); ++entry) {
+      discard(entry->second);
+      block_bytes += entry->second.size;
+    }
+
+    // The block no longer counts as in use; the smaller in-use total may shrink the cache.
+    in_use_size_ -= block_bytes;
+    balance();
+
+    return true;
+  }
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
@@ -0,0 +1,185 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "small_heap.h"
+
+namespace wsl {
+
+// Links node into the doubly linked freelist directly behind place.
+// Relies on the guard nodes: place always has a successor, so node never becomes a list end.
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) {
+  assert(place->first < node->first && "Order violation");
+  assert(isfree(place->second) && "Freelist operation error.");
+  const iterator_t successor = place->second.next;  // Result: place <-> node <-> successor.
+  node->second.next = successor;
+  node->second.prior = place;
+  place->second.next = node;
+  successor->second.prior = node;
+}
+
+// Unlinks node from the freelist and marks it as used.
+// Relies on the guard nodes: node always has both a predecessor and a successor.
+void SmallHeap::remove(SmallHeap::iterator_t node) {
+  assert(isfree(node->second) && "Freelist operation error.");
+  Node& self = node->second;
+  self.prior->second.next = self.next;
+  self.next->second.prior = self.prior;
+  setused(self); }
+
+// Coalesces two free nodes when high starts exactly where low ends.
+// Returns the merged node (low) on success, or high unchanged when the two are not adjacent.
+SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low,
+                                               SmallHeap::memory_t::iterator high) {
+  assert(isfree(low->second) && "Merge with allocated block");
+  assert(isfree(high->second) && "Merge with allocated block");
+
+  char* const low_end = (char*)low->first + low->second.len;
+  if (low_end != (char*)high->first) return high;
+  assert(!islastfree(high->second) && "Illegal merge.");
+
+  low->second.len += high->second.len;
+  low->second.next = high->second.next;
+  high->second.next->second.prior = low;
+  memory.erase(high);
+  return low;
+}
+
+// Returns a block to the heap and coalesces it with adjacent free nodes.  nullptr is ignored.
+void SmallHeap::free(void* ptr) {
+  if (ptr == nullptr) return;
+  auto iterator = memory.find(ptr);
+
+  // Check for illegal free: an address this heap does not track, or a node that is already
+  // free (double free).  Re-linking a free node would corrupt the freelist and total_free.
+  if (iterator == memory.end() || isfree(iterator->second)) {
+    assert(false && "Illegal free.");
+    return;
+  }
+
+  // Return memory to total and link node into free list
+  total_free += iterator->second.len;
+
+  // Walk back to the nearest free node at a lower address (a guard node is expected to end it).
+  auto before = iterator;
+  before--;
+  while (!isfree(before->second)) before--;
+  assert(before->second.next->first > iterator->first && "Inconsistency in small heap.");
+  insertafter(before, iterator);
+
+  // Attempt compaction with the lower, then the upper, free neighbour.
+  iterator = merge(before, iterator);
+  merge(iterator, iterator->second.next);
+  // Drop the record alloc_high() may have made for this address.
+  high.erase(ptr);
+}
+
+// First-fit allocation from the low end of the heap.
+// Returns nullptr for a zero-byte request or when no free node is large enough.
+void* SmallHeap::alloc(size_t bytes) {
+  // Reject empty requests and requests larger than the total free space.
+  if ((bytes == 0) || (bytes > total_free)) return nullptr;
+
+  // Scan the freelist upward from its first node; the last node is never used.
+  iterator_t cursor = firstfree();
+  for (; !islastfree(cursor->second); cursor = cursor->second.next) {
+    const auto available = cursor->second.len;
+    if (available < bytes) continue;
+
+    total_free -= bytes;
+
+    // An oversized node is split; the tail stays on the freelist right behind the head.
+    if (available != bytes) {
+      void* tail = (char*)cursor->first + bytes;
+      Node& tail_node = memory[tail];
+      tail_node.len = available - bytes;
+      cursor->second.len = bytes;
+      insertafter(cursor, memory.find(tail));
+    }
+
+    // The head (now exactly `bytes` long) leaves the freelist and is handed to the caller.
+    remove(cursor);
+    return cursor->first;
+  }
+  assert(cursor->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+// First-fit allocation from the high end of the heap; results are recorded in `high`.
+// Returns nullptr for a zero-byte request or when no free node is large enough.
+void* SmallHeap::alloc_high(size_t bytes) {
+  // Reject empty requests and requests larger than the total free space.
+  if ((bytes == 0) || (bytes > total_free)) return nullptr;
+
+  // Scan the freelist downward from its last node; the first node is never used.
+  iterator_t cursor = lastfree();
+  for (; !isfirstfree(cursor->second); cursor = cursor->second.prior) {
+    const auto available = cursor->second.len;
+    if (available < bytes) continue;
+
+    total_free -= bytes;
+
+    void* result;
+    if (available == bytes) {
+      // Exact fit: the whole node leaves the freelist.
+      result = cursor->first;
+      remove(cursor);
+    } else {
+      // Carve the request off the top of the node; the new node is created directly as used.
+      result = (char*)cursor->first + available - bytes;
+      cursor->second.len -= bytes;
+      Node& carved = memory[result];
+      carved.len = bytes;
+      setused(carved);
+    }
+
+    high.insert(result);
+    return result;
+  }
+  assert(cursor->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+} // namespace wsl
@@ -0,0 +1,131 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple first fit memory allocator with eager compaction.  For use with few
+// items (where list iteration is faster than trees).
+// Not thread safe!
+
+#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+
+#include <map>
+#include <set>
+
+#include "utils.h"
+
+namespace wsl {
+
+// First-fit allocator over the address range [base, base + length) handed to
+// the constructor. Every block (free or used) has a Node in `memory`, keyed by
+// its start address; free blocks are additionally chained into a doubly linked
+// list through Node::next / Node::prior. Not thread safe.
+class SmallHeap {
+ private:
+  struct Node;
+  typedef std::map<void*, Node> memory_t;
+  typedef memory_t::iterator iterator_t;
+
+  // Bookkeeping record for one block.
+  struct Node {
+    size_t len;        // Block size in bytes (0 for the two list sentinels).
+    iterator_t next;   // Next free block; memory.begin() tags a used block.
+    iterator_t prior;  // Previous free block.
+  };
+
+  SmallHeap(const SmallHeap& rhs) = delete;
+  SmallHeap& operator=(const SmallHeap& rhs) = delete;
+
+  void* const pool;     // Base address of the managed range.
+  const size_t length;  // Size of the managed range in bytes.
+
+  size_t total_free;     // Bytes currently available for allocation.
+  memory_t memory;       // All blocks plus the two list sentinels.
+  std::set<void*> high;  // Addresses handed out by alloc_high(), plus the
+                         // sentinel inserted by the two-argument constructor.
+
+  // Free-list state is encoded in the link fields: a used node has
+  // next == memory.begin(), the tail sentinel has next == memory.end() and the
+  // head sentinel has prior == memory.end().
+  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
+  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
+  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
+  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
+  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
+  __forceinline void setused(Node& node) { node.next = memory.begin(); }
+
+  // First / last real entry of the free list, reached through the sentinels
+  // stored at the lowest and highest keys of `memory`.
+  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
+  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }
+  void insertafter(iterator_t place, iterator_t node);
+  void remove(iterator_t node);
+  iterator_t merge(iterator_t low, iterator_t high);
+
+ public:
+  // Creates an empty heap that manages no memory; alloc() and alloc_high()
+  // return nullptr for every request because total_free is 0.
+  SmallHeap() : pool(nullptr), length(0), total_free(0) {}
+
+  // Creates a heap managing `length` bytes starting at `base`.
+  SmallHeap(void* base, size_t length)
+      : pool(base), length(length), total_free(length) {
+    assert(pool != nullptr && "Invalid base address.");
+    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
+    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");
+
+    // Head sentinel at address 0, one free block covering the whole pool and
+    // a tail sentinel at the all-ones address.
+    Node& start = memory[0];
+    Node& node = memory[pool];
+    Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull];
+
+    start.len = 0;
+    start.next = memory.find(pool);
+    setfirstfree(start);
+
+    node.len = length;
+    node.prior = memory.begin();
+    node.next = --memory.end();
+
+    end.len = 0;
+    end.prior = start.next;
+    setlastfree(end);
+
+    // Sentinel entry: high_split() returns it while no high allocation exists.
+    high.insert((void*)0xFFFFFFFFFFFFFFFFull);
+  }
+
+  void* alloc(size_t bytes);
+  void* alloc_high(size_t bytes);
+  void free(void* ptr);
+
+  void* base() const { return pool; }
+  size_t size() const { return length; }
+  size_t remaining() const { return total_free; }
+
+  // Lowest address recorded in `high`. A default-constructed heap never
+  // inserts the sentinel, so an empty set is reported as the sentinel value
+  // instead of dereferencing high.begin() (undefined behaviour on an empty set).
+  void* high_split() const {
+    if (high.empty()) return (void*)0xFFFFFFFFFFFFFFFFull;
+    return *high.begin();
+  }
+};
+
+} // namespace wsl
+
+#endif
@@ -0,0 +1,111 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/timer.h"
+
+namespace wsl {
+namespace timer {
+
+// Stores the value reported by os::AccurateClockFrequency() in
+// accurate_clock::freq and derives period_ns as 1e9 divided by that value.
+accurate_clock::init::init() {
+  accurate_clock::freq = os::AccurateClockFrequency();
+  const double ticks_per_second = double(accurate_clock::freq);
+  accurate_clock::period_ns = 1e9 / ticks_per_second;
+}
+
+// Calibrates the fast clock using the accurate clock.
+// Measures how many raw fast-clock ticks elapse over an interval timed with
+// accurate_clock, then stores the resulting tick rate in fast_clock::freq and
+// the tick length in picoseconds in fast_clock::period_ps.
+fast_clock::init::init() {
+  typedef accurate_clock clock;
+  // Length of the busy-wait interval; starts at 1 ms and doubles every round.
+  clock::duration delay(std::chrono::milliseconds(1));
+
+  // calibrate clock
+  // min is the raw tick delta of the best accepted sample, elapsed the
+  // accurate-clock duration that belongs to it.
+  fast_clock::raw_rep min = 0;
+  clock::duration elapsed;
+
+  do {
+    elapsed = clock::duration::max();
+
+    // Ten samples per round; the shortest accepted one wins.
+    for (int t = 0; t < 10; t++) {
+      fast_clock::raw_rep r1, r2;
+      clock::time_point t0, t1, t2, t3;
+
+      // Bracket the first fast-clock read between two accurate-clock reads.
+      // The signal fences keep the compiler from reordering the reads.
+      t0 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r1 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t1 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+
+      // Busy-wait until at least `delay` has passed on the accurate clock.
+      do {
+        t2 = clock::now();
+      } while (t2 - t1 < delay);
+
+      // Bracket the second fast-clock read the same way.
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r2 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t3 = clock::now();
+
+      // If elapsed time is shorter than last recorded time and both the start
+      // and end times are confirmed correlated then record the clock readings.
+      // This protects against inaccuracy due to thread switching
+      // "Correlated" here means each bracket (t1 - t0 and t3 - t2) took less
+      // than a tenth of the waited interval (t2 - t1).
+      if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
+          ((t3 - t2) * 10 < (t2 - t1))) {
+        elapsed = t3 - t1;
+        min = r2 - r1;
+      }
+    }
+    // Double the interval; repeat until an accepted sample spans at least
+    // 1000 fast-clock ticks.
+    delay += delay;
+  } while (min < 1000);
+
+  // Ticks per second, and the length of one tick in picoseconds.
+  fast_clock::freq = double(min) / duration_in_seconds(elapsed);
+  fast_clock::period_ps = 1e12 / fast_clock::freq;
+  // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
+  // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
+}
+
+// Definitions of the clocks' static members. The order matters: objects in one
+// translation unit are dynamically initialized in definition order, so
+// accurate_clock_init runs before fast_clock_init, whose constructor reads
+// accurate_clock::now() during calibration.
+double accurate_clock::period_ns;
+accurate_clock::raw_frequency accurate_clock::freq;
+accurate_clock::init accurate_clock::accurate_clock_init;
+
+double fast_clock::period_ps;
+fast_clock::raw_frequency fast_clock::freq;
+fast_clock::init fast_clock::fast_clock_init;
+}   //  namespace timer
+}   // namespace wsl
@@ -0,0 +1,173 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_
+#define HSA_RUNTIME_CORE_UTIL_TIMER_H_
+
+#include "core/util/utils.h"
+#include "core/util/os.h"
+#include <chrono>
+#include <time.h>
+#include <type_traits>
+
+namespace wsl {
+namespace timer {
+
+// Works around a mixed arithmetic bug in MSVC's duration_cast (as of VS 2013).
+// wide_type picks the intermediate representation used by the replacement
+// duration_cast: double by default, uintmax_t for unsigned non-floating types
+// and intmax_t for signed non-floating types.
+template <bool isFloat, bool isSigned>
+struct wide_type {
+  using type = double;
+};
+template <>
+struct wide_type<false, false> {
+  using type = uintmax_t;
+};
+template <>
+struct wide_type<false, true> {
+  using type = intmax_t;
+};
+
+// Replacement for std::chrono::duration_cast: converts `d` to To's period using
+// the wide intermediate representation chosen by wide_type, then narrows the
+// count to To's representation with a static_cast.
+template <typename To, typename Rep, typename Period>
+static __forceinline To
+    duration_cast(const std::chrono::duration<Rep, Period>& d) {
+  using wide = typename wide_type<std::is_floating_point<Rep>::value,
+                                  std::is_signed<Rep>::value>::type;
+  using widened_t = std::chrono::duration<wide, typename To::period>;
+
+  const widened_t widened = std::chrono::duration_cast<widened_t>(d);
+  return To(static_cast<typename To::rep>(widened.count()));
+}
+// End patch
+
+// Converts a duration of any representation and period to a number of seconds
+// expressed as a double.
+template <typename Rep, typename Period>
+static __forceinline double duration_in_seconds(
+    std::chrono::duration<Rep, Period> delta) {
+  using seconds_t = std::chrono::duration<double, std::ratio<1, 1>>;
+  const seconds_t as_seconds(delta);
+  return as_seconds.count();
+}
+
+// Converts a number of seconds to the duration type given as the template
+// argument (despite its name, `rep` is a std::chrono::duration type).
+template <typename rep>
+static __forceinline rep duration_from_seconds(double delta) {
+  using seconds_t = std::chrono::duration<double, std::ratio<1, 1>>;
+  const seconds_t as_seconds(delta);
+  return std::chrono::duration_cast<rep>(as_seconds);
+}
+
+// Provides a C++11 standard clock interface to the os::AccurateClock functions
+class accurate_clock {
+ public:
+  using rep = double;
+  using period = std::nano;
+  using duration = std::chrono::duration<rep, period>;
+  using time_point = std::chrono::time_point<accurate_clock>;
+
+  static const bool is_steady = true;
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  using raw_rep = uint64_t;
+  using raw_frequency = uint64_t;
+
+  // Raw reading returned by os::ReadAccurateClock().
+  static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); }
+  // Value cached in `freq` by the init helper.
+  static __forceinline raw_frequency raw_freq() { return freq; }
+
+  // Current time: the raw reading scaled by period_ns.
+  static __forceinline time_point now() {
+    const duration since_epoch(raw_now() * period_ns);
+    return time_point(since_epoch);
+  }
+
+ private:
+  static double period_ns;
+  static raw_frequency freq;
+
+  // Helper whose constructor fills in freq and period_ns; a single static
+  // instance is declared below.
+  class init {
+   public:
+    init();
+  };
+  static init accurate_clock_init;
+};
+
+// Provides a C++11 standard clock interface to the lowest latency approximate
+// clock
+class fast_clock {
+ public:
+  typedef double rep;
+  typedef std::pico period;
+  typedef std::chrono::duration<rep, period> duration;
+  typedef std::chrono::time_point<fast_clock> time_point;
+
+  static const bool is_steady = true;
+
+  // Current time: the raw reading scaled by period_ps.
+  static __forceinline time_point now() {
+    return time_point(duration(raw_now() * period_ps));
+  }
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  typedef uint64_t raw_rep;
+  typedef double raw_frequency;
+
+#if defined(__x86_64__) || defined(_M_X64)
+  // x86-64: the raw counter is the value returned by __rdtsc(); its rate is
+  // the calibrated value stored in `freq`.
+  static __forceinline raw_rep raw_now() { return __rdtsc(); }
+  static __forceinline raw_frequency raw_freq() { return freq; }
+#else
+  // Other targets: CLOCK_MONOTONIC_RAW expressed as a nanosecond count.
+  static __forceinline raw_rep raw_now() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+    return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec));
+  }
+  // raw_now() counts nanoseconds, so the raw clock ticks 1e9 times per second
+  // (1e-9 would be the tick period in seconds, not the frequency).
+  static __forceinline raw_frequency raw_freq() { return 1.e9; }
+#endif
+
+ private:
+  static double period_ps;
+  static raw_frequency freq;
+
+  // Helper whose constructor calibrates freq and period_ps; a single static
+  // instance is declared below.
+  class init {
+   public:
+    init();
+  };
+  static init fast_clock_init;
+};
+}   //  namespace timer
+}   //  namespace wsl
+
+#endif
@@ -0,0 +1,389 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Generally useful utility functions
+
+#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_
+#define HSA_RUNTIME_CORE_UTIL_UTILS_H_
+
+#include "stdint.h"
+#include "stddef.h"
+#include "stdlib.h"
+#include "stdarg.h"
+#include "unistd.h"
+#include <assert.h>
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <sstream>
+#include <thread>
+
+// Compiler intrinsics headers are included at global scope, before the
+// namespace is opened, so that their declarations are not nested inside wsl
+// (a header included inside a namespace hides its contents from every other
+// includer once its include guard is set).
+#if defined(__GNUC__)
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include "intrin.h"
+#endif
+
+namespace wsl {
+// Logging state defined elsewhere; log_flags is the bit set tested by LogPrint.
+extern FILE* log_file;
+extern uint8_t log_flags[8];
+
+typedef unsigned int uint;
+typedef uint64_t uint64;
+
+#if defined(__GNUC__)
+
+// 2MB huge page size
+#define GPU_HUGE_PAGE_SIZE    (2 << 20)
+
+// 4KB page size
+#define DEFAULT_GPU_PAGE_SIZE (1 << 12)
+
+// Map the MSVC-style keywords used throughout the code base onto their GCC
+// attribute equivalents; __stdcall expands to nothing.
+#define __forceinline __inline__ __attribute__((always_inline))
+#define __declspec(x) __attribute__((x))
+#undef __stdcall
+#define __stdcall  // __attribute__((__stdcall__))
+#define __ALIGNED__(x) __attribute__((aligned(x)))
+
+void log_printf(const char* file, int line, const char* format, ...);
+
+// Allocates `size` bytes aligned to `alignment`; returns NULL when
+// posix_memalign fails (or whatever aligned_alloc returns when
+// _ISOC11_SOURCE is defined). Release the memory with _aligned_free.
+static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {
+#ifdef _ISOC11_SOURCE
+  return aligned_alloc(alignment, size);
+#else
+  void *mem = NULL;
+  if (0 != posix_memalign(&mem, alignment, size)) return NULL;
+  return mem;
+#endif
+}
+// Releases memory obtained from _aligned_malloc by passing it to free().
+static __forceinline void _aligned_free(void* ptr) { return free(ptr); }
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define __ALIGNED__(x) __declspec(align(x))
+#if (_MSC_VER < 1800)  // < VS 2013
+// strtoull replacement built on _strtoui64 for toolchains older than VS 2013.
+static __forceinline unsigned long long int strtoull(const char* str,
+                                                     char** endptr, int base) {
+  return static_cast<unsigned long long>(_strtoui64(str, endptr, base));
+}
+#endif
+#if (_MSC_VER < 1900)  // < VS 2015
+#define thread_local __declspec(thread)
+#endif
+#else
+#error "Compiler and/or processor not identified."
+#endif
+
+// Two-level stringification: STRING(x) macro-expands x before turning it into
+// a string literal; STRING2 stringifies its argument exactly as written.
+#define STRING2(x) #x
+#define STRING(x) STRING2(x)
+
+// Two-level token pasting: PASTE(x, y) macro-expands both arguments before
+// joining them; PASTE2 joins them exactly as written.
+#define PASTE2(x, y) x##y
+#define PASTE(x, y) PASTE2(x, y)
+
+// __FILE__ with everything up to and including the last '/' removed (the
+// whole of __FILE__ when it contains no '/').
+#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+
+// Calls wsl::log_printf with the current file name and line number when
+// `flag` is set in log_flags (tested with hsa_flag_isset64).
+// NOTE: the expansion already ends with ';', so writing `LogPrint(...);`
+// leaves an extra empty statement - use braces when it is the body of an
+// if that has an else.
+#define LogPrint(flag, format, ...)                                                                \
+  do {                                                                                             \
+    if (hsa_flag_isset64(log_flags, flag))                                                         \
+      wsl::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__);                             \
+  } while (false);
+
+// A macro to disallow the copy and move constructor and operator= functions
+#define DISALLOW_COPY_AND_ASSIGN(TypeName)                                                         \
+  TypeName(const TypeName&) = delete;                                                              \
+  TypeName(TypeName&&) = delete;                                                                   \
+  void operator=(const TypeName&) = delete;                                                        \
+  void operator=(TypeName&&) = delete;
+
+/// @brief: Runs a callable when the guard is destroyed, unless the guard has
+/// been dismissed or its responsibility has been handed to another guard.
+template <typename lambda>
+class ScopeGuard {
+ public:
+  /// @brief: Creates an armed guard that stores a copy of @p release.
+  explicit __forceinline ScopeGuard(const lambda& release)
+      : release_(release), dismiss_(false) {}
+
+  /// @brief: Takes over responsibility from @p rhs: copies its callable and
+  /// dismissed state, then dismisses @p rhs so the callable runs at most once.
+  /// The members are copy-constructed rather than default-constructed and
+  /// assigned, because lambda closure types provide neither a default
+  /// constructor nor copy assignment.
+  ScopeGuard(ScopeGuard& rhs) : release_(rhs.release_), dismiss_(rhs.dismiss_) {
+    rhs.dismiss_ = true;
+  }
+
+  /// @brief: Same hand-over for temporaries, so a guard can be returned by
+  /// value (as MakeScopeGuard does) on compilers without guaranteed elision.
+  ScopeGuard(ScopeGuard&& rhs) : release_(rhs.release_), dismiss_(rhs.dismiss_) {
+    rhs.dismiss_ = true;
+  }
+
+  /// @brief: Invokes the stored callable unless the guard was dismissed.
+  __forceinline ~ScopeGuard() {
+    if (!dismiss_) release_();
+  }
+
+  /// @brief: Replaces this guard's callable and dismissed state with those of
+  /// @p rhs and dismisses @p rhs. The callable previously held by this guard
+  /// is overwritten without being invoked. Requires a copy-assignable callable.
+  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
+    dismiss_ = rhs.dismiss_;
+    release_ = rhs.release_;
+    rhs.dismiss_ = true;
+    return *this;
+  }
+
+  /// @brief: Disarms the guard; the callable will not run on destruction.
+  __forceinline void Dismiss() { dismiss_ = true; }
+
+ private:
+  lambda release_;
+  bool dismiss_;
+};
+
+/// @brief: Builds a ScopeGuard around @p rel, deducing the callable's type.
+/// @param: rel(Input), callable to run when the returned guard is destroyed.
+/// @return: ScopeGuard<lambda>.
+template <typename lambda>
+static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
+  using guard_t = ScopeGuard<lambda>;
+  return guard_t(rel);
+}
+
+// Declares a variable `lname` holding the callable given in __VA_ARGS__ and a
+// ScopeGuard named `sname` that runs it at scope exit.
+#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
+  auto lname = __VA_ARGS__;                        \
+  ScopeGuard<decltype(lname)> sname(lname);
+// Anonymous guard: both the callable and the guard get names generated from
+// __COUNTER__, so the guard cannot be dismissed by name.
+#define MAKE_SCOPE_GUARD(...)                                   \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
+                          PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
+// Named guard: the guard variable is called `name`, so callers can invoke
+// name.Dismiss().
+#define MAKE_NAMED_SCOPE_GUARD(name, ...)                             \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
+                          __VA_ARGS__)
+
+/// @brief: Returns the smaller of two inputs; the type must support the ">"
+/// operator. When neither input compares greater, the first one is returned.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T.
+template <class T>
+static __forceinline T Min(const T& a, const T& b) {
+  if (a > b) return b;
+  return a;
+}
+
+/// @brief: Returns the smallest of three or more inputs by folding the
+/// two-input Min over the arguments from right to left.
+template <class T, class... Arg>
+static __forceinline T Min(const T& a, const T& b, Arg... args) {
+  const T smallest_of_rest = Min(b, args...);
+  return Min(a, smallest_of_rest);
+}
+
+/// @brief: Returns the larger of two inputs; the type must support the ">"
+/// operator. When neither input compares greater, the first one is returned.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T.
+template <class T>
+static __forceinline T Max(const T& a, const T& b) {
+  if (b > a) return b;
+  return a;
+}
+
+/// @brief: Returns the largest of three or more inputs by folding the
+/// two-input Max over the arguments from right to left.
+template <class T, class... Arg>
+static __forceinline T Max(const T& a, const T& b, Arg... args) {
+  const T largest_of_rest = Max(b, args...);
+  return Max(a, largest_of_rest);
+}
+
+/// @brief: Function object that releases an object previously allocated with
+/// new by applying delete to it (e.g. as the functor of std::for_each).
+/// @param: ptr(Input), pointer to the object to delete. Deleting a null
+/// pointer is a no-op.
+/// @return: void.
+struct DeleteObject {
+  template <typename T>
+  void operator()(const T* ptr) const {
+    delete ptr;
+  }
+};
+
+/// @brief: Checks whether a value has at most one bit set. Note that 0 also
+/// yields true, so callers must treat 0 separately if it is not acceptable.
+/// @param: val(Input), the data to be checked.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsPowerOfTwo(T val) {
+  // Clearing the lowest set bit leaves nothing when at most one bit was set.
+  const auto without_lowest_bit = val & (val - 1);
+  return without_lowest_bit == 0;
+}
+
+/// @brief: Rounds a value down to the nearest multiple of the alignment.
+/// A value that already is a multiple of the alignment is returned unchanged.
+/// @param: value(Input), value to be rounded.
+/// @param: alignment(Input), alignment value.
+/// @return: T.
+template <typename T>
+static __forceinline T AlignDown(T value, size_t alignment) {
+  const auto whole_units = value / alignment;
+  return (T)(whole_units * alignment);
+}
+
+/// @brief: Pointer overload of AlignDown: rounds an address down to the
+/// nearest multiple of the alignment.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignDown(T* value, size_t alignment) {
+  const intptr_t address = (intptr_t)value;
+  return (T*)AlignDown(address, alignment);
+}
+
+/// @brief: Rounds a value up to the nearest multiple of the alignment.
+/// A value that already is a multiple of the alignment is returned unchanged.
+/// @param: value(Input), value to be rounded.
+/// @param: alignment(Input), alignment value.
+/// @return: T.
+template <typename T>
+static __forceinline T AlignUp(T value, size_t alignment) {
+  // Adding alignment - 1 first turns the round-down into a round-up.
+  const T bumped = (T)(value + alignment - 1);
+  return AlignDown(bumped, alignment);
+}
+
+/// @brief: Pointer overload of AlignUp: rounds an address up to the nearest
+/// multiple of the alignment.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignUp(T* value, size_t alignment) {
+  // Byte-wise bump by alignment - 1, then round the address down.
+  uint8_t* bumped = (uint8_t*)value + alignment - 1;
+  return (T*)AlignDown((intptr_t)bumped, alignment);
+}
+
+/// @brief: Checks whether the input value is a multiple of the alignment,
+/// i.e. whether rounding it up with AlignUp leaves it unchanged.
+/// @param: value(Input), value to be checked.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T value, size_t alignment) {
+  const T rounded = AlignUp(value, alignment);
+  return rounded == value;
+}
+
+/// @brief: Pointer overload of IsMultipleOf: checks whether an address is a
+/// multiple of the alignment.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T* value, size_t alignment) {
+  T* const rounded = AlignUp(value, alignment);
+  return rounded == value;
+}
+
+/// @brief: Returns the smallest power of two that is not less than the input
+/// (1 for an input of 0). The result wraps to 0 when it does not fit 32 bits.
+static __forceinline uint32_t NextPow2(uint32_t value) {
+  if (value == 0) return 1;
+  uint32_t v = value - 1;
+  // Smear the highest set bit into every lower position.
+  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
+    v |= v >> shift;
+  }
+  return v + 1;
+}
+
+/// @brief: 64-bit overload of NextPow2 (1 for an input of 0). The result wraps
+/// to 0 when it does not fit 64 bits.
+static __forceinline uint64_t NextPow2(uint64_t value) {
+  if (value == 0) return 1;
+  uint64_t v = value - 1;
+  // Smear the highest set bit into every lower position.
+  for (uint32_t shift = 1; shift < 64; shift <<= 1) {
+    v |= v >> shift;
+  }
+  return v + 1;
+}
+
+/// @brief: Returns true when the C string starts with its terminator.
+/// @param: str(Input), pointer to a C string; it is dereferenced unchecked.
+static __forceinline bool strIsEmpty(const char* str) noexcept { return *str == '\0'; }
+
+/// @brief: Removes leading whitespace (as classified by the classic locale)
+/// from the string in place.
+/// @return: reference to the modified input string.
+static __forceinline std::string& ltrim(std::string& s) {
+  const auto not_space = [](char c) { return !std::isspace<char>(c, std::locale::classic()); };
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), not_space));
+  return s;
+}
+
+/// @brief: Removes trailing whitespace (as classified by the classic locale)
+/// from the string in place.
+/// @return: reference to the modified input string.
+static __forceinline std::string& rtrim(std::string& s) {
+  const auto not_space = [](char c) { return !std::isspace<char>(c, std::locale::classic()); };
+  // The reverse search stops on the last non-space character; base() points
+  // just past it.
+  const auto last_kept = std::find_if(s.rbegin(), s.rend(), not_space);
+  s.erase(last_kept.base(), s.end());
+  return s;
+}
+
+/// @brief: Removes trailing and then leading whitespace from the string in
+/// place.
+/// @return: reference to the modified input string.
+static __forceinline std::string& trim(std::string& s) {
+  rtrim(s);
+  return ltrim(s);
+}
+
+}  // namespace wsl
+
+/// @brief: Extracts bits [lowBit, highBit] of an integer value and returns
+/// them shifted down to bit 0, truncated to 32 bits.
+template <uint32_t lowBit, uint32_t highBit, typename T>
+static __forceinline uint32_t BitSelect(T p) {
+  static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range.");
+  static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range.");
+
+  uintptr_t bits = p;
+  // Bits above highBit are masked off unless highBit already is the top bit.
+  const bool keeps_top_bit = (highBit == (sizeof(uintptr_t) * 8 - 1));
+  if (!keeps_top_bit) {
+    bits &= ((1ull << (highBit + 1)) - 1);
+  }
+  return (uint32_t)(bits >> lowBit);
+}
+
+/// @brief: Bits [8, 15] of the address.
+inline uint32_t PtrLow16Shift8(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  const auto low16 = address & 0xFFFFULL;
+  return static_cast<uint32_t>(low16 >> 8);
+}
+
+/// @brief: Address bits from bit 16 upwards, truncated to 32 bits.
+inline uint32_t PtrHigh64Shift16(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  const auto above16 = address & 0xFFFFFFFFFFFF0000ULL;
+  return static_cast<uint32_t>(above16 >> 16);
+}
+
+/// @brief: Bits [8, 39] of the address.
+inline uint32_t PtrLow40Shift8(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  const auto low40 = address & 0xFFFFFFFFFFULL;
+  return static_cast<uint32_t>(low40 >> 8);
+}
+
+/// @brief: Bits [40, 63] of the address.
+inline uint32_t PtrHigh64Shift40(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  const auto above40 = address & 0xFFFFFF0000000000ULL;
+  return static_cast<uint32_t>(above40 >> 40);
+}
+
+/// @brief: Bits [40, 47] of the address.
+static inline uint8_t Ptr48High8(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  const auto top_byte = address & 0xFF0000000000ULL;
+  return static_cast<uint8_t>(top_byte >> 40);
+}
+
+/// @brief: Bits [8, 39] of the address. Asserts that the address has no bits
+/// set outside [8, 47], i.e. that it is 256-byte aligned and fits 48 bits.
+static inline uint32_t Ptr48Low32(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  assert((address & 0xFFFFFFFFFF00ULL) == address);
+  const auto low40 = address & 0xFFFFFFFFFFULL;
+  return static_cast<uint32_t>(low40 >> 8);
+}
+
+/// @brief: Low 32 bits of the address.
+inline uint32_t PtrLow32(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>(address);
+}
+
+/// @brief: Bits [32, 63] of the address when HSA_LARGE_MODEL is defined;
+/// always 0 otherwise.
+inline uint32_t PtrHigh32(const void* p) {
+#ifdef HSA_LARGE_MODEL
+  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
+#else
+  (void)p;
+  return 0;
+#endif
+}
+
+/// @brief: Upper 32 bits of a 64-bit value.
+inline uint32_t HighPart(uint64_t value) {
+  const uint64_t upper = value & 0xFFFFFFFF00000000;
+  return upper >> 32;
+}
+
+/// @brief: Lower 32 bits of a 64-bit value.
+inline uint32_t LowPart(uint64_t value) {
+  const uint64_t lower = value & 0x00000000FFFFFFFF;
+  return lower;
+}
+
+#include "atomic_helpers.h"
+
+#endif  // HSA_RUNTIME_CORE_UTIL_UTILS_H_
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32  // Are we compiling for windows?
+#define NOMINMAX
+
+#include "core/util/os.h"
+
+#include <algorithm>
+#include <process.h>
+#include <string>
+#include <windows.h>
+
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+// <windows.h> defines these names as macros. Undefine them so the functions
+// of the same name defined in wsl::os below keep their own names.
+#undef Yield
+#undef CreateMutex
+#undef CreateSemaphore
+
+namespace wsl {
+namespace os {
+
+// The functions below convert between the OS abstraction handle types and
+// Win32 handles by reinterpreting their storage, so the sizes must match.
+static_assert(sizeof(LibHandle) == sizeof(HMODULE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(LibHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(EventHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+
+/// Loads the library named by filename with LoadLibrary and returns the
+/// module handle reinterpreted as a LibHandle.
+LibHandle LoadLib(std::string filename) {
+  HMODULE module = LoadLibrary(filename.c_str());
+  LibHandle* as_lib_handle = (LibHandle*)&module;
+  return *as_lib_handle;
+}
+
+/// Looks up export_name in the library lib with GetProcAddress and returns
+/// the result as a data pointer.
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  // GetProcAddress returns a function pointer (FARPROC); converting it to
+  // void* needs an explicit cast.
+  return reinterpret_cast<void*>(
+      GetProcAddress(*(HMODULE*)&lib, export_name.c_str()));
+}
+
+/// Releases the library lib with FreeLibrary.
+void CloseLib(LibHandle lib) {
+  ::HMODULE module = *(::HMODULE*)&lib;
+  FreeLibrary(module);
+}
+
+/// Not implemented on Windows: asserts in debug builds and aborts.
+std::vector<LibHandle> GetLoadedLibs() {
+  // A real implementation would use EnumProcessModulesEx.
+  // static_assert(false) would fail every build of this file, so fail at
+  // run time like the other unimplemented functions here.
+  assert(false && "Not implemented.");
+  abort();
+  return std::vector<LibHandle>();
+}
+
+/// Not implemented on Windows: asserts in debug builds and aborts.
+std::string GetLibraryName(LibHandle lib) {
+  // static_assert(false) would fail every build of this file, so fail at
+  // run time like the other unimplemented functions here.
+  assert(false && "Not implemented.");
+  abort();
+  return std::string();
+}
+
+/// Creates an unnamed Win32 semaphore with an initial count of 0 and a
+/// maximum count of LONG_MAX, returned as a Semaphore.
+Semaphore CreateSemaphore() {
+  // Call the Win32 function by its real, globally qualified name; an
+  // unqualified CreateSemaphore here would resolve to this function.
+  ::HANDLE sem = ::CreateSemaphoreA(NULL, 0, LONG_MAX, NULL);
+  assert(sem != NULL && "CreateSemaphore failed");
+
+  return *(Semaphore*)&sem;
+}
+
+/// Blocks until the semaphore sem is signaled.
+/// @return true if the wait completed with WAIT_OBJECT_0.
+bool WaitSemaphore(Semaphore sem) {
+  return WaitForSingleObject(*(::HANDLE*)&sem, INFINITE) == WAIT_OBJECT_0;
+}
+
+/// Increments the count of the semaphore sem by one.
+void PostSemaphore(Semaphore sem) {
+  // Reinterpret the handle storage, as the rest of this file does.
+  ReleaseSemaphore(*(::HANDLE*)&sem, 1, NULL);
+}
+
+/// Closes the Win32 handle behind the semaphore sem.
+/// Debug builds assert if CloseHandle reports failure.
+void DestroySemaphore(Semaphore sem) {
+  if (!CloseHandle(*(::HANDLE*)&sem)) {
+    // A bare string literal is always true inside assert(); prefix it with
+    // false so the assertion fires.
+    assert(false && "CloseHandle() failed");
+  }
+  // sem is passed by value, so the caller's copy cannot be cleared here.
+}
+
+/// Creates the mutex as an unnamed auto-reset event that starts signaled.
+Mutex CreateMutex() {
+  return CreateEvent(NULL,   // default security attributes
+                     false,  // auto-reset
+                     true,   // initially signaled
+                     NULL);  // unnamed
+}
+
+/// Attempts to take lock without blocking (zero timeout).
+/// @return true if the wait returned WAIT_OBJECT_0.
+bool TryAcquireMutex(Mutex lock) {
+  ::HANDLE handle = *(::HANDLE*)&lock;
+  const DWORD wait_result = WaitForSingleObject(handle, 0);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+/// Blocks until lock can be taken.
+/// @return true if the wait returned WAIT_OBJECT_0.
+bool AcquireMutex(Mutex lock) {
+  ::HANDLE handle = *(::HANDLE*)&lock;
+  const DWORD wait_result = WaitForSingleObject(handle, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+/// Releases lock by signaling its underlying event.
+void ReleaseMutex(Mutex lock) {
+  ::HANDLE handle = *(::HANDLE*)&lock;
+  SetEvent(handle);
+}
+
+/// Closes the handle behind lock.
+void DestroyMutex(Mutex lock) {
+  ::HANDLE handle = *(::HANDLE*)&lock;
+  CloseHandle(handle);
+}
+
+/// Suspends the calling thread for delay_in_millisecond milliseconds by
+/// forwarding to the Win32 ::Sleep.
+void Sleep(int delay_in_millisecond) {
+  ::Sleep(delay_in_millisecond);
+}
+
+/// Suspends the calling thread for delayInUs microseconds, truncated to
+/// whole milliseconds (delays under 1000 us become ::Sleep(0)).
+void uSleep(int delayInUs) {
+  const int whole_milliseconds = delayInUs / 1000;
+  ::Sleep(whole_milliseconds);
+}
+
+/// Yields the calling thread by calling ::Sleep with a zero delay.
+void YieldThread() {
+  ::Sleep(0);
+}
+
+// Heap-allocated bundle handed from CreateThread to ThreadTrampoline, which
+// deletes it before invoking the entry function.
+struct ThreadArgs {
+  // Argument passed to entry_function.
+  void* entry_args;
+  // Function the new thread runs.
+  ThreadEntry entry_function;
+};
+
+/// Thread start routine given to _beginthreadex. Unpacks the ThreadArgs in
+/// arg, frees it, runs the entry function with its argument, then ends the
+/// thread with exit code 0.
+unsigned __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* packed = (ThreadArgs*)arg;
+  void* const user_data = packed->entry_args;
+  const ThreadEntry user_entry = packed->entry_function;
+  // Copy both fields out first so the bundle can be freed before user code
+  // runs.
+  delete packed;
+
+  user_entry(user_data);
+  _endthreadex(0);
+  return 0;
+}
+
+/// Starts a thread that runs entry_function(entry_argument).
+///
+/// @param entry_function Function the new thread executes.
+/// @param entry_argument Opaque argument passed to entry_function.
+/// @param stack_size     Stack size passed to _beginthreadex.
+/// @return The value returned by _beginthreadex reinterpreted as a Thread;
+///         this is 0 when the thread could not be created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size) {
+  ThreadArgs* thread_args = new ThreadArgs();
+  thread_args->entry_args = entry_argument;
+  thread_args->entry_function = entry_function;
+  uintptr_t ret =
+      _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
+  if (ret == 0) {
+    // The trampoline never ran, so nothing else will free the bundle.
+    delete thread_args;
+  }
+  return *(Thread*)&ret;
+}
+
+/// Closes the handle behind thread.
+void CloseThread(Thread thread) {
+  ::HANDLE handle = *(::HANDLE*)&thread;
+  CloseHandle(handle);
+}
+
+/// Blocks until thread is signaled.
+/// @return true if the wait returned WAIT_OBJECT_0.
+bool WaitForThread(Thread thread) {
+  ::HANDLE handle = *(::HANDLE*)&thread;
+  const DWORD wait_result = WaitForSingleObject(handle, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+/// Blocks until all thread_count handles in threads are signaled.
+/// @return true if WaitForMultipleObjects returned WAIT_OBJECT_0.
+bool WaitForAllThreads(Thread* threads, uint thread_count) {
+  const DWORD wait_result =
+      WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE);
+  return wait_result == WAIT_OBJECT_0;
+}
+
+/// Sets the environment variable env_var_name to env_var_value for the
+/// current process.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  const char* name = env_var_name.c_str();
+  const char* value = env_var_value.c_str();
+  SetEnvironmentVariable(name, value);
+}
+
+/// Returns the value of the environment variable env_var_name, or an empty
+/// string when the size query reports 0.
+std::string GetEnvVar(std::string env_var_name) {
+  const char* name = env_var_name.c_str();
+
+  // First call: ask for the required buffer size.
+  const DWORD required = GetEnvironmentVariable(name, NULL, 0);
+  if (required == 0) return "";
+
+  // Second call: fetch the value into a stack buffer of that size.
+  char* storage = (char*)alloca(sizeof(char) * required);
+  GetEnvironmentVariable(name, storage, required);
+  storage[required - 1] = '\0';  // force termination
+  return std::string(storage);
+}
+
+/// Returns lpMaximumApplicationAddress from GetSystemInfo, plus one.
+size_t GetUserModeVirtualMemorySize() {
+  SYSTEM_INFO info = {0};
+  GetSystemInfo(&info);
+  const size_t highest_address = (size_t)info.lpMaximumApplicationAddress;
+  return highest_address + 1;
+}
+
+/// Returns the smaller of the total physical memory reported by
+/// GlobalMemoryStatusEx and the user-mode virtual memory size, or 0 if the
+/// query fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  MEMORYSTATUSEX status = {0};
+  status.dwLength = sizeof(status);
+  if (!GlobalMemoryStatusEx(&status)) {
+    return 0;
+  }
+
+  const size_t installed = static_cast<size_t>(status.ullTotalPhys);
+  const size_t addressable = GetUserModeVirtualMemorySize();
+  return std::min(addressable, installed);
+}
+
+/// Returns the base of the user-mode virtual address range; always 0 here.
+uintptr_t GetUserModeVirtualMemoryBase() {
+  return static_cast<uintptr_t>(0);
+}
+
+// Os event wrappers
+/// Creates an unnamed Win32 event.
+/// @param auto_reset When true the event is auto-reset, otherwise
+///                   manual-reset.
+/// @param init_state Initial signaled state.
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  // CreateEvent takes a manual-reset flag, the inverse of auto_reset.
+  const BOOL manual_reset = (BOOL)(!auto_reset);
+  const BOOL start_signaled = (BOOL)init_state;
+  return reinterpret_cast<EventHandle>(
+      CreateEvent(NULL, manual_reset, start_signaled, NULL));
+}
+
+/// Closes the event handle.
+/// @return -1 when event is NULL, otherwise the result of CloseHandle.
+int DestroyOsEvent(EventHandle event) {
+  if (event != NULL) {
+    ::HANDLE handle = reinterpret_cast<::HANDLE>(event);
+    return CloseHandle(handle);
+  }
+  return -1;
+}
+
+/// Waits up to milli_seconds for event to be signaled.
+/// @return -1 when event is NULL, 0x14003 on timeout, otherwise the value
+///         returned by WaitForSingleObject.
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  ::HANDLE handle = reinterpret_cast<::HANDLE>(event);
+  const int wait_status = WaitForSingleObject(handle, milli_seconds);
+  // 0x14003 indicates timeout
+  return (wait_status == WAIT_TIMEOUT) ? 0x14003 : wait_status;
+}
+
+/// Signals event.
+/// @return -1 when event is NULL, otherwise the result of SetEvent.
+int SetOsEvent(EventHandle event) {
+  if (event != NULL) {
+    ::HANDLE handle = reinterpret_cast<::HANDLE>(event);
+    return SetEvent(handle);
+  }
+  return -1;
+}
+
+/// Resets event to the non-signaled state.
+/// @return -1 when event is NULL, otherwise the result of ResetEvent.
+int ResetOsEvent(EventHandle event) {
+  if (event != NULL) {
+    ::HANDLE handle = reinterpret_cast<::HANDLE>(event);
+    return ResetEvent(handle);
+  }
+  return -1;
+}
+
+/// Returns the current value of the Win32 performance counter.
+uint64_t ReadAccurateClock() {
+  LARGE_INTEGER ticks;
+  QueryPerformanceCounter(&ticks);
+  return (uint64_t)ticks.QuadPart;
+}
+
+/// Returns the frequency of the Win32 performance counter.
+uint64_t AccurateClockFrequency() {
+  LARGE_INTEGER ticks_per_second;
+  QueryPerformanceFrequency(&ticks_per_second);
+  return (uint64_t)ticks_per_second.QuadPart;
+}
+
+// Shared (reader/writer) mutex API. None of these functions is implemented
+// in this file: each one asserts in debug builds and then aborts. The return
+// statements after abort() only satisfy the function signatures.
+SharedMutex CreateSharedMutex() {
+  assert(false && "Not implemented.");
+  abort();
+  return nullptr;
+}
+
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool AcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void ReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+void DestroySharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+// System clock and CPUID queries. None of these functions is implemented in
+// this file: each one asserts in debug builds and then aborts.
+uint64_t ReadSystemClock() {
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+uint64_t SystemClockFrequency() {
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+bool ParseCpuID(cpuid_t* cpuinfo) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+// Build-identification string. The `used` attribute tells the compiler to
+// emit it even though nothing references it.
+const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION);
+
+/*
+ * Reports the kernel interface version exposed by this implementation
+ * (currently 1.17).
+ *
+ * VersionInfo: output structure; must not be NULL.
+ *
+ * Returns HSAKMT_STATUS_SUCCESS on success, or
+ * HSAKMT_STATUS_INVALID_PARAMETER when VersionInfo is NULL.
+ * CHECK_DXG_OPEN() runs first; presumably it returns early when the device
+ * has not been opened - confirm against the macro definition.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) {
+  CHECK_DXG_OPEN();
+
+  if (!VersionInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  VersionInfo->KernelInterfaceMajorVersion = 1;
+  VersionInfo->KernelInterfaceMinorVersion = 17;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
@@ -0,0 +1,320 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+/*
+ * Builds a COPY_DATA packet that copies data.
+ */
+size_t CmdUtil::BuildCopyData(
+  uint64_t  *pDstAddr,
+  void      *pBuffer,
+  uint32_t  dstSel,
+  uint32_t  dstCachePolicy,
+  uint32_t  srcSel,
+  uint32_t  srcCachePolicy,
+  uint32_t  countSel,
+  uint32_t  wrConfirm) {
+  // Assemble the packet locally, then copy it into the caller's buffer.
+  PM4MEC_COPY_DATA packet = {0};
+  GenerateCmdHeader(&packet, IT_COPY_DATA);
+
+  // Source selection.
+  packet.bitfields2.src_sel = srcSel;
+  packet.bitfields2.src_cache_policy = srcCachePolicy;
+
+  // Destination selection and address. The low address field stores the
+  // address shifted right by 3.
+  packet.bitfields2.dst_sel = dstSel;
+  packet.bitfields2.dst_cache_policy = dstCachePolicy;
+  packet.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3);
+  packet.dst_addr_hi = PtrHigh32(pDstAddr);
+
+  // Transfer width and write confirmation.
+  packet.bitfields2.count_sel = countSel;
+  packet.bitfields2.wr_confirm = wrConfirm;
+
+  const size_t packet_bytes = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_bytes);
+  return packet_bytes;
+}
+
+/*
+ * Builds a EVENT_WRITE packet.
+ * Applications can use Barrier command to ensure their
+ * command is executed only after all other commands have
+ * completed their execution.
+ */
+size_t CmdUtil::BuildBarrier(
+  void      *pBuffer,
+  uint32_t  eventIndex,
+  uint32_t  eventType) {
+  // Assemble the EVENT_WRITE packet locally, then copy it out.
+  BarrierTemplate packet = {0};
+  GenerateCmdHeader(&packet.event_write, IT_EVENT_WRITE);
+
+  packet.event_write.bitfields2.event_type = eventType;
+  packet.event_write.bitfields2.event_index = eventIndex;
+
+  const size_t packet_bytes = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_bytes);
+  return packet_bytes;
+}
+
+/**
+ * Builds a WRITE_DATA packet.
+ * Writes two DWORDs into the GPU memory address "write_addr"
+ */
+
+size_t CmdUtil::BuildWriteData64Command(
+  void*     pBuffer,
+  uint64_t* write_addr,
+  uint64_t  write_value) {
+  WriteDataTemplate packet = {0};
+  GenerateCmdHeader(&packet.write_data, IT_WRITE_DATA);
+
+  // The destination address must be 4-byte aligned.
+  uint64_t addr = uintptr_t(write_addr);
+  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");
+
+  // Destination: memory at write_addr. The low address field stores the
+  // address shifted right by 2.
+  packet.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory;
+  packet.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
+  packet.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);
+
+  // Advance the address when more than one DWORD is written.
+  packet.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
+
+  // Wait for write confirmation and bypass the cache.
+  packet.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
+  packet.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass;
+
+  // Payload.
+  packet.write_data.write_data_value = write_value;
+
+  const size_t packet_bytes = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_bytes);
+  return packet_bytes;
+}
+
+/*
+ * Builds a ACQUIRE_MEM packet.
+ * Users can submit this command to
+ * invalidate Gpu caches - L1 and or L2.
+ */
+size_t CmdUtil::BuildAcquireMem(
+  uint8_t major,
+  void    *pBuffer) {
+  // Number of bytes written to pBuffer. It stays 0, and pBuffer is left
+  // untouched, when `major` matches neither branch below (major < 9).
+  size_t ret = 0;
+  if (major == 9) {
+    gfx9::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    // Specify the size of memory to invalidate. Size is
+    // specified in terms of 256 byte chunks. A coher_size
+    // of 0xFFFFFFFF actually specifies 0xFFFFFFFF00 (40 bits)
+    // of memory. The field coher_size_hi specifies memory from
+    // bits 40-64 for a total of 256 TB.
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    // Specify the address of memory to invalidate. The
+    // address must be 256 byte aligned.
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    // Specify the poll interval for determining if operation is complete
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields2.coher_cntl =
+      (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK
+      (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK
+      (1 << 28);  // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  } else if (major >= 10) {
+    // Same size, base and poll-interval settings as the gfx9 packet; the
+    // cache actions are selected through gcr_cntl instead of coher_cntl.
+    gfx10::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields8.gcr_cntl =
+      (1 << 16) | // SEQ = FORWARD
+      (1 << 15) | // GL2_WB
+      (1 << 14) | // GL2_INV
+      (1 << 9) |  // GL1_INV
+      (1 << 8) |  // GLV_INV
+      (1 << 7) |  // GLK_INV
+      (1 << 6) |  // GLK_WB
+      (1 << 5) |  // GLM_INV
+      (1 << 4) |  // GLM_WB
+      (1 << 0);   // GLI_INV = ALL
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  }
+
+  return ret;
+}
+
+/*
+ * Builds a scratch packet.
+ */
+size_t CmdUtil::BuildScratch(
+  void  *pScratchBase,
+  void  *pBuffer) {
+  // Program COMPUTE_DISPATCH_SCRATCH_BASE_LO/HI with the scratch base.
+  struct SetScratchTemplate packet = {0};
+  GenerateSetShRegHeader(&packet, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
+
+  packet.scratch_lo = Ptr48Low32(pScratchBase);
+  packet.scratch_hi = Ptr48High8(pScratchBase);
+
+  const size_t packet_bytes = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_bytes);
+  return packet_bytes;
+}
+
+/**
+ * @ Set Compute Shader parameter for gfx11 and above
+ */
+size_t CmdUtil::BuildComputeShaderParams(void  *pBuffer) {
+  struct DispatchProgramResourceRegs compute_shader_params = {0};
+
+  GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
+  // IMAGE_OP: Indicates the compute program contains an image op
+  // instruction and should be stalled by its WAIT_SYNC fence.
+  // Use an unsigned literal: shifting a signed 1 into bit 31 is not portable.
+  compute_shader_params.compute_pgm_rsrc3 = (1u << 31);
+
+  memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
+
+  // Number of bytes written to pBuffer.
+  return sizeof(compute_shader_params);
+}
+
+
+/*
+ * Builds a dispatch packet.
+ */
+size_t CmdUtil::BuildDispatch(
+  struct DispatchInfo *pInfo,
+  void                *pBuffer) {
+  DispatchTemplate dispatch = {0};
+
+  // Shorthands for values that are read repeatedly below.
+  const auto kernel_props = pInfo->pKernelObject->kernel_code_properties;
+  auto &user_data = dispatch.compute_user_data_regs.compute_user_data;
+  const size_t user_data_offset =
+    offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]);
+
+  // Workgroup dimensions.
+  GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
+  dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x;
+  dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y;
+  dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z;
+
+  // Kernel entry point.
+  // TODO: Add AQL packet index for debugger
+  // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
+  GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO);
+  dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
+  dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
+
+  // Program resource registers; the PRIV bit is set on major version 11.
+  GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
+  dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1;
+  if (pInfo->major == 11) {
+    AMD_HSA_BITS_SET(dispatch.program_resource_regs.compute_pgm_rsrc1,
+        AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1);
+  }
+  dispatch.program_resource_regs.compute_pgm_rsrc2 =
+    (pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2;
+
+  // Resource limits, thread management masks and scratch ring size.
+  GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
+  dispatch.resource_regs.compute_resource_limits = 0x3ff;
+  dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size;
+
+  // User data registers, filled in the order of the enabled kernel code
+  // properties below. next_sgpr is the next free slot.
+  GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
+  uint32_t next_sgpr = 0;
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+    assert(pInfo->major < 11);
+    // Record the byte offset of this slot within the template.
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      user_data_offset + next_sgpr * sizeof(uint32_t);
+
+    user_data[next_sgpr++] = pInfo->pAmdQueue->scratch_resource_descriptor[0];
+    user_data[next_sgpr++] = pInfo->pAmdQueue->scratch_resource_descriptor[1];
+    user_data[next_sgpr++] = pInfo->pAmdQueue->scratch_resource_descriptor[2];
+    user_data[next_sgpr++] = pInfo->srd;
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) {
+    user_data[next_sgpr++] = PtrLow32(pInfo->pPacket);
+    user_data[next_sgpr++] = PtrHigh32(pInfo->pPacket);
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+    user_data[next_sgpr++] = PtrLow32(pInfo->pAmdQueue);
+    user_data[next_sgpr++] = PtrHigh32(pInfo->pAmdQueue);
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+    user_data[next_sgpr++] = PtrLow32(pInfo->pPacket->kernarg_address);
+    user_data[next_sgpr++] = PtrHigh32(pInfo->pPacket->kernarg_address);
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
+    // This feature may be enabled as a side effect of indirect calls.
+    // However, the compiler team confirmed that the dispatch id itself is not used,
+    // so safe to send 0 for each dispatch.
+    user_data[next_sgpr++] = 0;
+    user_data[next_sgpr++] = 0;
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
+    assert(pInfo->major < 11);
+    // Record the byte offset of this slot within the template.
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      user_data_offset + next_sgpr * sizeof(uint32_t);
+
+    user_data[next_sgpr++] = PtrLow32(pInfo->pScratchBase);
+    user_data[next_sgpr++] = PtrHigh32(pInfo->pScratchBase);
+  }
+
+  if (AMD_HSA_BITS_GET(kernel_props,
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) {
+    // Scratch size per wave divided by the wave width.
+    user_data[next_sgpr++] =
+      pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64);
+  }
+
+  // DISPATCH_DIRECT with the grid size.
+  GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT);
+  dispatch.dispatch_direct.dispatch_initiator =
+    (1 << 0) | // COMPUTE_SHADER_EN
+    (1 << 2) | // FORCE_START_AT_000
+    (1 << 5); // USE_THREAD_DIMENSIONS
+  if (pInfo->wave32) {
+    dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN
+  }
+  dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x;
+  dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y;
+  dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z;
+
+  const size_t packet_bytes = sizeof(dispatch);
+  memcpy(pBuffer, &dispatch, packet_bytes);
+  return packet_bytes;
+}
+
+/*
+ * Builds a ATOMIC_MEM packet.
+ * Users can submit this command
+ * to perform atomic operations.
+ */
+size_t CmdUtil::BuildAtomicMem(
+  uint64_t  *pAddr,
+  uint32_t  atomic,
+  void      *pBuffer,
+  uint32_t  cachePolicy,
+  uint64_t  srcData) {
+  // Assemble the ATOMIC_MEM packet locally, then copy it out.
+  AtomicTemplate packet = {0};
+  GenerateCmdHeader(&packet.atomic, IT_ATOMIC_MEM);
+
+  // Operation and cache policy.
+  packet.atomic.bitfields2.atomic = atomic;
+  packet.atomic.bitfields2.cache_policy = cachePolicy;
+
+  // Target address, split into 32-bit halves.
+  packet.atomic.addr_lo = PtrLow32(pAddr);
+  packet.atomic.addr_hi = PtrHigh32(pAddr);
+
+  // Source operand, split into 32-bit halves.
+  packet.atomic.src_data_lo = LowPart(srcData);
+  packet.atomic.src_data_hi = HighPart(srcData);
+
+  const size_t packet_bytes = sizeof(packet);
+  memcpy(pBuffer, &packet, packet_bytes);
+  return packet_bytes;
+}
+
+} // namespace thunk
+} // namespace wsl
@@ -0,0 +1,780 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cinttypes>
+#include <bitset>
+
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+
+namespace wsl {
+namespace thunk {
+
+// Out-of-class definition of the static constant cmdbuf_aql_frame_num_.
+// NOTE(review): presumably the number of AQL frames per command buffer -
+// confirm against its users.
+const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000;
+
+// Stores the adapter handle, adapter LUID and node id, zeroes device_info_,
+// then runs the setup steps below in order. The results of the setup calls
+// are not checked here.
+WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id)
+  : adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  ParseDeviceInfo();
+  CreateDevice();
+  // restore == false here; the destructor calls it with restore == true.
+  SetPowerOptimization(false);
+  CreatePagingQueue();
+  InitCmdbufInfo();
+  QuerySegmentInfo();
+}
+
+// Tears down what the constructor set up: destroys the paging queue, calls
+// SetPowerOptimization with restore == true, destroys the device, then
+// releases the device info.
+WDDMDevice::~WDDMDevice() {
+  DestroyPagingQueue();
+  SetPowerOptimization(true);
+  DestroyDevice();
+
+  DestroyDeviceInfo();
+}
+
+// Issues D3DKMTQueryAdapterInfo for `adapter`, requesting information of
+// kind `type` into the caller-provided buffer `data` of `size` bytes.
+// Returns the NTSTATUS produced by the call.
+static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type,
+                                 void *data, int size)
+{
+  D3DKMT_QUERYADAPTERINFO query = {0};
+
+  query.hAdapter = adapter;
+  query.Type = type;
+  query.pPrivateDriverData = data;
+  query.PrivateDriverDataSize = size;
+
+  return DXCORE_CALL(D3DKMTQueryAdapterInfo(&query));
+}
+
+// Rebuilds segment_infos_ from D3DKMTQueryStatistics: first queries the
+// adapter for its segment count, then queries every segment in turn.
+// Returns false as soon as any query fails; segment_infos_ then holds only
+// the segments gathered so far.
+bool WDDMDevice::QuerySegmentInfo()
+{
+  segment_infos_.clear();
+
+  // Get the number of segments
+  D3DKMT_QUERYSTATISTICS adapter_stats = {};
+  adapter_stats.Type = D3DKMT_QUERYSTATISTICS_ADAPTER;
+  adapter_stats.AdapterLuid = adapter_luid_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryStatistics(&adapter_stats));
+  if (status != STATUS_SUCCESS) {
+    pr_err("Failed to query adapter info\n");
+    return false;
+  }
+
+  const uint32_t segment_total =
+      adapter_stats.QueryResult.AdapterInformation.NbSegments;
+  pr_debug("Total Segments: %u\n", segment_total);
+
+  for (uint32_t index = 0; index < segment_total; ++index) {
+    D3DKMT_QUERYSTATISTICS segment_stats = {};
+    segment_stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    segment_stats.AdapterLuid = adapter_luid_;
+    segment_stats.QuerySegment.SegmentId = index;
+
+    status = DXCORE_CALL(D3DKMTQueryStatistics(&segment_stats));
+    if (status != STATUS_SUCCESS) {
+      pr_err("Failed to query segment %u info\n", index);
+      return false;
+    }
+
+    // Copy the fields of interest into our own record.
+    auto& reported = segment_stats.QueryResult.SegmentInformation;
+
+    SegmentInfo entry;
+    entry.segment_id = index;
+    entry.segment_type = reported.SegmentProperties.SegmentType;
+    entry.system_memory = reported.SegmentProperties.SystemMemory;
+    entry.aperture = reported.Aperture;
+    entry.commit_limit = reported.CommitLimit;
+
+    segment_infos_.push_back(entry);
+  }
+
+  return true;
+}
+
+// Finds the first entry of segment_infos_ whose type equals segment_type and
+// stores its id in segment_id. Returns true on a match; otherwise logs an
+// error, leaves segment_id untouched and returns false.
+bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type,
+                              uint32_t &segment_id)
+{
+  for (auto it = segment_infos_.begin(); it != segment_infos_.end(); ++it) {
+    if (it->segment_type != segment_type)
+      continue;
+
+    segment_id = it->segment_id;
+    return true;
+  }
+
+  pr_err("Failed to get segment id for type %u\n", segment_type);
+  return false;
+}
+
+/* The local heap (dedicated GPU memory) includes the visible heap and the
+ * invisible heap. The non-local heap refers to shared GPU memory, which is
+ * system memory.
+ */
+// Returns the number of bytes still available in the device's memory heap.
+//
+// For a dGPU this is LocalHeapSize() minus the resident bytes of the
+// CPU-visible local segment and, when an invisible heap exists, of the
+// segment that follows it. For an APU it is NonLocalHeapSize() minus the
+// resident bytes of the system-memory segment. A failed statistics query
+// counts as zero bytes used.
+//
+// NOTE(review): the early error paths return HSA_STATUS_ERROR through a
+// byte-count return type, so callers cannot tell an error from a small
+// amount of free memory. Kept as-is for compatibility - confirm what callers
+// expect.
+uint64_t WDDMDevice::VramAvail(void) {
+  D3DKMT_QUERYSTATISTICS stats;
+  NTSTATUS ret;
+  uint64_t usedVis = 0;
+  uint64_t usedInv = 0;
+  uint64_t usedNonLocal = 0;
+  uint32_t segmentId = 0;
+
+  // wait fence complete
+  uint64_t value = page_fence_value_.load();
+  if(!CpuWait(&page_syncobj_, &value, 1, false))
+    return HSA_STATUS_ERROR;
+
+  if (IsDgpu()) {
+    // local cpu-visible memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId))
+      return HSA_STATUS_ERROR;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedVis = stats.QueryResult.SegmentInformation.BytesResident;
+
+    // local invisible memory
+    if (device_info_.local_invisible_heap_size) {
+      // The invisible segment is taken to be the one right after the visible
+      // segment; query that id rather than a hard-coded 1.
+      segmentId++;
+      memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+      stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+      stats.AdapterLuid = adapter_luid_;
+      stats.QuerySegment.SegmentId = segmentId;
+
+      ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+      if (ret == 0)
+        usedInv = stats.QueryResult.SegmentInformation.BytesResident;
+    }
+
+    // Clamp at zero so an over-reported usage cannot wrap around.
+    const uint64_t total = LocalHeapSize();
+    const uint64_t used = usedVis + usedInv;
+    return used < total ? total - used : 0;
+  } else {
+    // APU - NonLocal memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId))
+      return HSA_STATUS_ERROR;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedNonLocal = stats.QueryResult.SegmentInformation.BytesResident;
+
+    // Clamp at zero so an over-reported usage cannot wrap around.
+    const uint64_t total = NonLocalHeapSize();
+    return usedNonLocal < total ? total - usedNonLocal : 0;
+  }
+}
+
+// Creates a D3DKMT device on adapter_ and stores its handle in device_.
+// Returns true on success; on failure logs the status and returns false.
+bool WDDMDevice::CreateDevice(void) {
+  D3DKMT_CREATEDEVICE create = {0};
+  create.hAdapter = adapter_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTCreateDevice(&create));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  device_ = create.hDevice;
+  return true;
+}
+
+// Destroys the D3DKMT device held in device_.
+// Returns true on success; on failure logs the status and returns false.
+bool WDDMDevice::DestroyDevice(void) {
+  D3DKMT_DESTROYDEVICE destroy = {0};
+  destroy.hDevice = device_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTDestroyDevice(&destroy));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a normal-priority paging queue on device_. On success records the
+// queue handle, its sync object and the CPU address of its fence value, and
+// resets page_fence_value_ to 0. On failure logs the status and returns
+// false.
+bool WDDMDevice::CreatePagingQueue(void) {
+  D3DKMT_CREATEPAGINGQUEUE create = {0};
+  create.hDevice = device_;
+  create.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTCreatePagingQueue(&create));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  page_queue_ = create.hPagingQueue;
+  page_syncobj_ = create.hSyncObject;
+  page_fence_addr_ = (uint64_t *)create.FenceValueCPUVirtualAddress;
+  page_fence_value_ = 0;
+  return true;
+}
+
+// Destroys the paging queue held in page_queue_.
+// Returns true on success; on failure logs the status and returns false.
+bool WDDMDevice::DestroyPagingQueue(void) {
+  D3DDDI_DESTROYPAGINGQUEUE destroy = {0};
+  destroy.hPagingQueue = page_queue_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTDestroyPagingQueue(&destroy));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Sends the driver-private "power optimization" escape to the KMD.
+//
+// The private payload is sized and filled by thunk_proxy; `restore` is
+// forwarded to thunk_proxy::FillinPowerOptPrivData unchanged. The resulting
+// NTSTATUS is only logged at debug level; nothing is returned to the caller.
+void WDDMDevice::SetPowerOptimization(bool restore) {
+  const int priv_size = thunk_proxy::GetPowerOptPrivDataSize();
+  void *priv_data = malloc(priv_size);
+  // An assert() disappears in NDEBUG builds, which would let the memset below
+  // write through a null pointer; check the allocation explicitly instead.
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinPowerOptPrivData(priv_data, restore);
+
+  D3DKMT_ESCAPE d3dkmt_escape;
+  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
+
+  d3dkmt_escape.hAdapter              = adapter_;
+  d3dkmt_escape.hDevice               = device_;
+  d3dkmt_escape.hContext              = 0; //KMD only use device to identify the process
+  d3dkmt_escape.Type                  = D3DKMT_ESCAPE_DRIVERPRIVATE;
+  d3dkmt_escape.pPrivateDriverData    = priv_data;
+  d3dkmt_escape.PrivateDriverDataSize = priv_size;
+  d3dkmt_escape.Flags.HardwareAccess  = true;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape));
+  pr_debug("status %d, restore %d\n", status, restore);
+  free(priv_data);
+}
+
+// Raises page_fence_value_ to `fence_value` if it is currently smaller.
+//
+// Implemented as a compare-exchange retry loop: a failed exchange refreshes
+// `observed` with the stored value, and the loop stops as soon as the stored
+// value is already >= fence_value or the exchange succeeds.
+void WDDMDevice::UpdatePageFence(uint64_t fence_value) {
+  uint64_t observed = page_fence_value_.load();
+
+  while (observed < fence_value &&
+         !page_fence_value_.compare_exchange_weak(observed, fence_value)) {
+    // `observed` was reloaded by the failed exchange; re-evaluate.
+  }
+}
+
+// Allocates a GpuMemory object for this device and initialises it.
+//
+// When create_info.dmabuf_fd is positive the memory is imported through
+// GpuMemory::ImportPhysicalHandle (which may report an address via gpu_va);
+// otherwise it is created with GpuMemory::Init. *gpu_mem receives the new
+// object on success and stays nullptr on failure, in which case the object
+// is deleted before returning the error code.
+ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info,
+                                        GpuMemory **gpu_mem, gpusize *gpu_va) {
+  *gpu_mem = nullptr;
+
+  GpuMemory *candidate = new GpuMemory(this);
+  const ErrorCode status = (create_info.dmabuf_fd > 0)
+                               ? candidate->ImportPhysicalHandle(create_info, gpu_va)
+                               : candidate->Init(create_info);
+
+  if (status != ErrorCode::Success) {
+    delete candidate;
+    return status;
+  }
+
+  *gpu_mem = candidate;
+  return status;
+}
+
+// Locks the allocation `handle` through D3DKMTLock2.
+//
+// Returns the pData pointer reported by the call, or NULL (after logging the
+// status) when the lock fails.
+void *WDDMDevice::Lock(D3DKMT_HANDLE handle) {
+  D3DKMT_LOCK2 lock_args = {0};
+  lock_args.hDevice = device_;
+  lock_args.hAllocation = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTLock2(&lock_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return NULL;
+  }
+
+  return lock_args.pData;
+}
+
+// Releases a lock previously taken on allocation `handle` (D3DKMTUnlock2).
+//
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
+  D3DKMT_UNLOCK2 unlock_args = {0};
+  unlock_args.hDevice = device_;
+  unlock_args.hAllocation = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTUnlock2(&unlock_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a virtual context on device_ for the given engine.
+//
+// @param engine  engine identifier; translated to a node ordinal with
+//                EngineOrdinal(). A negative ordinal aborts the call.
+// @param handle  receives the new context handle on success.
+// @return true on success, false when the ordinal lookup, the private-data
+//         allocation or D3DKMTCreateContextVirtual fails.
+bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
+  int ordinal = EngineOrdinal(engine, &device_info_);
+  if (ordinal < 0)
+    return false;
+
+  const int priv_size = thunk_proxy::GetContextPrivDataSize();
+  void *priv_data = malloc(priv_size);
+  // assert() is compiled out under NDEBUG, so a failed malloc must be handled
+  // explicitly before the buffer is written to.
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw());
+
+  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
+  args.hDevice = device_;
+  args.EngineAffinity = 1 << 0;
+  args.NodeOrdinal = ordinal;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
+
+  // Hardware-scheduled engines request HW queue support; otherwise the GPU
+  // timeout flag is decided by thunk_proxy for this engine/device.
+  if (IsHwsEnabled(engine))
+    args.Flags.HwQueueSupported = 1;
+  else
+    args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_);
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args));
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  *handle = args.hContext;
+  return true;
+}
+
+// Destroys the context identified by `handle` (D3DKMTDestroyContext).
+//
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYCONTEXT destroy_args = {0};
+  destroy_args.hContext = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyContext(&destroy_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Queues a GPU-side wait on `queue`'s context for `count` sync objects.
+//
+// @param queue     queue whose context performs the wait.
+// @param syncobjs  array of `count` sync object handles.
+// @param values    array of `count` monitored fence values to wait for.
+// @return true on STATUS_SUCCESS, false (status logged) otherwise.
+bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+			 uint64_t *values, int count) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU wait_args = {0};
+  wait_args.hContext = queue->context;
+  wait_args.ObjectCount = count;
+  wait_args.ObjectHandleArray = syncobjs;
+  wait_args.MonitoredFenceValueArray = values;
+
+  const NTSTATUS status =
+      DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&wait_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Queues a GPU-side signal of `count` sync objects on `context`.
+//
+// @param context   context that performs the signal.
+// @param syncobjs  array of `count` sync object handles.
+// @param value     array of `count` monitored fence values to signal.
+// @return true on STATUS_SUCCESS, false (status logged) otherwise.
+bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+			   uint64_t *value, int count) {
+  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU signal_args = {0};
+  signal_args.hContext = context;
+  signal_args.ObjectCount = count;
+  signal_args.ObjectHandleArray = syncobjs;
+  signal_args.MonitoredFenceValueArray = value;
+
+  const NTSTATUS status =
+      DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&signal_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Blocks the calling thread on `count` sync objects of device_.
+//
+// @param syncobjs  array of `count` sync object handles.
+// @param value     array of `count` fence values to wait for.
+// @param wait_any  forwarded to Flags.WaitAny of the wait request.
+// @return true on STATUS_SUCCESS, false (status logged) otherwise.
+bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+			 int count, bool wait_any) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU wait_args = {0};
+  wait_args.hDevice = device_;
+  wait_args.ObjectCount = count;
+  wait_args.ObjectHandleArray = syncobjs;
+  wait_args.FenceValueArray = value;
+  wait_args.Flags.WaitAny = wait_any;
+
+  const NTSTATUS status =
+      DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&wait_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  return true;
+}
+
+// Waits on the CPU until the paging queue's sync object reaches the fence
+// value currently recorded in page_fence_value_.
+//
+// Returns the result of CpuWait().
+bool WDDMDevice::WaitOnPagingFenceFromCpu() {
+  // Snapshot the atomic so CpuWait receives a stable address/value pair.
+  uint64_t target_fence = page_fence_value_.load();
+
+  return CpuWait(&page_syncobj_, &target_fence, 1, false);
+}
+
+// Creates a monitored-fence sync object on device_.
+//
+// @param handle  receives the sync object handle.
+// @param addr    receives the CPU virtual address of the fence value.
+// @return true on STATUS_SUCCESS, false (status logged) otherwise; the
+//         outputs are only written on success.
+bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
+  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 create_args = {0};
+  create_args.hDevice = device_;
+  create_args.Info.Type = D3DDDI_MONITORED_FENCE;
+  create_args.Info.MonitoredFence.EngineAffinity = 1 << 0;
+
+  const NTSTATUS status =
+      DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&create_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+
+  const auto &fence = create_args.Info.MonitoredFence;
+  *handle = create_args.hSyncObject;
+  *addr = (uint64_t *)fence.FenceValueCPUVirtualAddress;
+  pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
+           fence.FenceValueCPUVirtualAddress,
+           fence.FenceValueGPUVirtualAddress);
+
+  return true;
+}
+
+// Destroys the sync object `handle`; a failure is logged but not reported
+// to the caller.
+void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT destroy_args = {0};
+  destroy_args.hSyncObject = handle;
+
+  const NTSTATUS status =
+      DXCORE_CALL(D3DKMTDestroySynchronizationObject(&destroy_args));
+  if (status == STATUS_SUCCESS)
+    return;
+
+  pr_err("fail %x\n", status);
+}
+
+// Computes the per-AQL-packet command buffer frame size and the total
+// command buffer size from the PM4 packet templates used for this GFX major.
+//
+// NOTE(review): when device_info_.major is below 9 no base value is assigned
+// here, so the sums below start from whatever cmdbuf_aql_frame_size_ already
+// holds - confirm such devices are never reached or that the member has an
+// initial value.
+void WDDMDevice::InitCmdbufInfo(void) {
+  const auto gfx_major = device_info_.major;
+
+  // Base: two acquire-mem packets, whose layout depends on the generation.
+  if (gfx_major == 9)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
+  else if (gfx_major >= 10)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
+
+  // gfx11+ additionally emits scratch setup and program resource registers.
+  if (gfx_major >= 11) {
+    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate);
+    cmdbuf_aql_frame_size_ += sizeof(DispatchProgramResourceRegs); // BuildComputeShaderParams
+  }
+
+  // Packets common to every generation.
+  cmdbuf_aql_frame_size_ +=
+    sizeof(DispatchTemplate) +
+    2 * sizeof(PM4MEC_COPY_DATA) +
+    2 * sizeof(BarrierTemplate) +
+    2 * sizeof(AtomicTemplate);
+
+  // Add safety margin to account for alignment and future additions
+  cmdbuf_aql_frame_size_ += 128;
+
+  // Frames are 16-byte aligned; the whole buffer is rounded up to 4 KiB.
+  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);
+  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
+}
+
+// Returns how many 512-byte blocks are needed to hold the packet's
+// group_segment_size, rounding up.
+uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
+  static const uint32_t kBlockBytes = 512;
+  const uint32_t segment_bytes = pkt->group_segment_size;
+
+  return (segment_bytes + kBlockBytes - 1) / kBlockBytes;
+}
+
+// Enumerates the adapters and appends a WDDMDevice to `devices` for every
+// adapter whose vendor id is 0x1002 and whose device id is accepted by
+// thunk_proxy::QueryAdapterSupported().
+//
+// @param devices  output vector; new devices are appended with an index of
+//                 devices.size() + 1 at the time of creation.
+// @return STATUS_SUCCESS when enumeration completed (possibly with no
+//         devices added), otherwise the failing NTSTATUS. On a failure while
+//         querying an adapter every device in `devices` is deleted and the
+//         vector is emptied, so the caller is never left holding dangling
+//         pointers.
+NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices)
+{
+  // First call passes no buffer; NumAdapters is read back to size the array.
+  D3DKMT_ENUMADAPTERS2 args = {0};
+  NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  if (!args.NumAdapters)
+    return STATUS_SUCCESS;
+
+  // std::vector owns the adapter array, so every exit path releases it.
+  // (operator new reports failure by throwing, so no null check is needed.)
+  std::vector<D3DKMT_ADAPTERINFO> info(args.NumAdapters);
+
+  args.pAdapters = info.data();
+  ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  for (decltype(args.NumAdapters) i = 0; i < args.NumAdapters; i++) {
+    D3DKMT_QUERY_DEVICE_IDS query = {0};
+
+    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS,
+			   &query, sizeof(query));
+    if (ret != STATUS_SUCCESS) {
+      // Same cleanup as before (all devices are destroyed), but also drop
+      // the now-dangling pointers from the caller's vector.
+      for (auto *device : devices)
+        delete device;
+      devices.clear();
+      return ret;
+    }
+
+    // Only AMD adapters are considered.
+    if (query.DeviceIds.VendorID != 0x1002)
+      continue;
+
+    const bool supported = thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID);
+    if (!supported)
+      continue;
+
+    devices.push_back(new WDDMDevice(
+      info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1));
+  }
+
+  return STATUS_SUCCESS;
+}
+
+// Clears device_info_ and asks thunk_proxy to populate it from adapter_.
+//
+// Returns true when thunk_proxy::ParseAdapterInfo succeeds.
+bool WDDMDevice::ParseDeviceInfo() {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  const bool parsed = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_);
+  return parsed;
+}
+
+// Releases the adapter_info buffer held by device_info_.
+//
+// The pointer is cleared afterwards so that a repeated call (free(NULL) is a
+// no-op) cannot free the same buffer twice.
+void WDDMDevice::DestroyDeviceInfo() {
+  free(device_info_.adapter_info);
+  device_info_.adapter_info = nullptr;
+}
+
+// Reads the GPU/CPU clock calibration pair for the compute engine.
+//
+// @param gpu  if non-null, receives ClockData.GpuClockCounter.
+// @param cpu  if non-null, receives ClockData.CpuClockCounter.
+//
+// The outputs are written only when the query succeeds; on any failure they
+// are left untouched and the problem is logged at debug level.
+void WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
+
+  uint32_t engine = GetComputeEngine();
+  int ordinal = EngineOrdinal(engine, &device_info_);
+  // EngineOrdinal() reports failure with a negative value (CreateContext
+  // checks for it too); do not pass that on as an unsigned node ordinal.
+  if (ordinal < 0) {
+    pr_debug("no node ordinal for engine %u\n", engine);
+    return;
+  }
+
+  D3DKMT_QUERYCLOCKCALIBRATION args = {0};
+
+ /* LDA (Linked Display Adapter)
+  * In the LDA design multiple physical GPUs are linked together to be controlled
+  * as a single object from the point of view of power manager, GPU scheduler and
+  * GPU memory manager. The physical GPUs are represented by a single logical adapter
+  * object. There is a single DXGADAPTER object and a single KMD adapter object.
+  *
+  * PhysicalAdapterIndex is set to 0 by default, i.e. non-LDA mode.
+  */
+  args.hAdapter = adapter_;
+  args.NodeOrdinal = ordinal;
+  args.PhysicalAdapterIndex = 0;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&args));
+  if (status) {
+    pr_debug("status %d \n", status);
+  } else {
+    if (gpu)
+      *gpu = args.ClockData.GpuClockCounter;
+
+    if (cpu)
+      *cpu = args.ClockData.CpuClockCounter;
+  }
+}
+
+// Creates the kernel context for `queue`, allocates a system-memory command
+// buffer when the queue does not already have one (cmdbuf_addr == 0), and
+// runs queue->Init().
+//
+// @return true on success. On failure everything created here is undone:
+//         the context is destroyed and a command buffer allocated by this
+//         call is deleted, with the queue's references to it cleared.
+bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
+  if (!CreateContext(queue->queue_engine, &queue->context))
+    return false;
+
+  // Non-null only when the command buffer was allocated by this call.
+  GpuMemory *gpu_mem = nullptr;
+  if (queue->cmdbuf_addr == 0) {
+    GpuMemoryCreateInfo create_info{};
+    create_info.size = queue->cmdbuf_size;
+    create_info.domain = thunk_proxy::kSystem;
+
+    auto code = CreateGpuMemory(create_info, &gpu_mem);
+    if (code != ErrorCode::Success) {
+      DestroyContext(queue->context);
+      return false;
+    }
+
+    queue->cmdbuf = gpu_mem->GetGpuMemoryHandle();
+    queue->cmdbuf_addr = gpu_mem->GpuAddress();
+  }
+
+  // A non-zero/true result from queue->Init() is treated as failure here.
+  if (queue->Init()) {
+    if (gpu_mem) {
+      // DestroyQueue() deletes whatever queue->cmdbuf refers to, so the
+      // queue must not keep pointing at the buffer released below.
+      queue->cmdbuf = {};
+      queue->cmdbuf_addr = 0;
+      delete gpu_mem;
+    }
+    DestroyContext(queue->context);
+    return false;
+  }
+
+  return true;
+}
+
+// Tears down `queue`: finalises it, deletes the GpuMemory object behind its
+// command buffer handle, then destroys its kernel context.
+void WDDMDevice::DestroyQueue(WDDMQueue *queue) {
+  queue->Fini();
+
+  // The handle is converted back to the GpuMemory object that owns it.
+  delete GpuMemory::Convert(queue->cmdbuf);
+
+  DestroyContext(queue->context);
+}
+
+// Submits a command buffer through the software-scheduled path
+// (D3DKMTSubmitCommand) and then signals the queue's sync object.
+//
+// @param queue         target queue (its context and syncobj are used).
+// @param command_addr  address of the commands to execute.
+// @param command_size  length of the commands.
+// @param fence_value   value signalled on queue->syncobj after submission.
+// @return true when both the submission and the GPU signal succeed.
+bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+  void *priv_data = malloc(priv_size);
+  // assert() vanishes under NDEBUG; fail the submission instead of writing
+  // through a null pointer.
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false);
+
+  D3DKMT_SUBMITCOMMAND args = {0};
+  args.Commands = command_addr;
+  args.CommandLength = command_size;
+  args.BroadcastContextCount = 1;
+  args.BroadcastContext[0] = queue->context;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args));
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return GpuSignal(queue->context, &queue->syncobj, &fence_value, 1);
+}
+
+// Creates a hardware queue on queue->context.
+//
+// On success queue->queue, queue->syncobj and queue->sync_addr are filled
+// from the handle, progress fence and fence CPU address returned by
+// D3DKMTCreateHwQueue.
+//
+// @return true on success, false when the private-data allocation or the
+//         kernel call fails.
+bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) {
+  const int priv_size = thunk_proxy::GetHwQueuePrivDataSize();
+  void *priv_data = malloc(priv_size);
+  // assert() vanishes under NDEBUG; handle allocation failure explicitly.
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  bool FwManagedGfxState = SupportStateShadowingByCpFw();
+  thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio);
+
+  D3DKMT_CREATEHWQUEUE createHwQueue = {0};
+  createHwQueue.hHwContext = queue->context;
+  createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_);
+  createHwQueue.pPrivateDriverData = priv_data;
+  createHwQueue.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue));
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  queue->queue = createHwQueue.hHwQueue;
+  queue->syncobj = createHwQueue.hHwQueueProgressFence;
+  queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress;
+
+  return true;
+}
+
+// Destroys the hardware queue referenced by queue->queue.
+//
+// Returns true on STATUS_SUCCESS; otherwise logs the status and returns false.
+bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) {
+  // Zero-initialise and assign like the other D3DKMT wrappers in this file:
+  // designated initialisers are only standard C++ from C++20 on, and the
+  // previous local name shadowed this member function.
+  D3DKMT_DESTROYHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyHwQueue(&args));
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Submits a command buffer to a hardware queue
+// (D3DKMTSubmitCommandToHwQueue).
+//
+// @param queue         target queue (queue->queue is the HW queue handle).
+// @param command_addr  address of the commands to execute.
+// @param command_size  length of the commands.
+// @param fence_value   passed as the queue's progress fence id.
+// @return true on success, false when the private-data allocation or the
+//         kernel call fails.
+bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+  void *priv_data = malloc(priv_size);
+  // assert() vanishes under NDEBUG; fail the submission instead of writing
+  // through a null pointer.
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true);
+
+  D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+  args.HwQueueProgressFenceId = fence_value;
+  args.CommandBuffer = command_addr;
+  args.CommandLength = command_size;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args));
+  // The private data is only needed for the duration of the call.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace thunk
+} // namespace wsl
@@ -0,0 +1,594 @@
+#include <sys/stat.h>
+#include <cinttypes>
+#include <cassert>
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Returns how many chunks of WDDMDevice::GpuMemoryChunkSize are needed to
+// cover `size` bytes (ceiling division).
+size_t GpuMemory::CalcChunkNumbers(gpusize size) {
+  const auto bytes_per_chunk = WDDMDevice::GpuMemoryChunkSize;
+  const auto rounded_up = size + bytes_per_chunk - 1;
+  return rounded_up / bytes_per_chunk;
+}
+
+// Rounds `size` up to the allocation granularity that applies to this memory.
+//
+// Local-domain memory on devices with big-page alignment enabled is aligned
+// to the largest applicable big-page value (no rounding when that value is
+// 0); everything else is aligned to 4096 bytes.
+gpusize GpuMemory::AdjustSize(gpusize size) const {
+  const auto &device_info = device_->DeviceInfo();
+
+  const bool big_page_path =
+      device_info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal;
+  if (!big_page_path) {
+    const size_t min_size = 4096;
+    size = AlignUp(size, min_size);
+    return size;
+  }
+
+  uint32_t alignment = device_info.big_page_alignment_size;
+  // BigPage is only supported for allocations > bigPageMinAlignment.
+  // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD.
+  // We do either LargePage or BigPage alignment, whichever has a higher value.
+  const bool hw_big_page_applies =
+      (device_info.hw_big_page_min_alignment_size > 0) &&
+      (size > device_info.hw_big_page_min_alignment_size);
+  if (hw_big_page_applies) {
+    alignment = std::max(alignment, device_info.hw_big_page_min_alignment_size);
+    if (size > device_info.hw_big_page_alignment_size)
+      alignment = std::max(alignment, device_info.hw_big_page_alignment_size);
+  }
+
+  if (alignment > 0)
+    size = AlignUp(size, alignment);
+  return size;
+}
+
+// Builds an empty GpuMemory bound to `device`: no allocation handles, no
+// resource handle and no IPC memory fd (-1) yet.
+GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
+  // Allocation bookkeeping starts out empty.
+  alloc_handles_ptr_ = nullptr;
+  alloc_handle_ = 0;
+  num_allocations_ = 0;
+
+  // Neither a kernel resource nor an IPC fd is attached.
+  resource_ = 0;
+  mem_fd_ = -1;
+}
+
+// Releases, in order: the GPU virtual address range, the physical
+// allocations, and the handle-aperture address when one was assigned.
+GpuMemory::~GpuMemory() {
+  FreeGpuVirtualAddress(GpuAddress(), Size());
+  FreePhysicalMemory();
+
+  const bool has_aperture_addr = desc_.handle_ape_addr > 0;
+  if (has_aperture_addr) {
+    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
+  }
+}
+
+// Initialises this object from `create_info`.
+//
+// Steps:
+//   1. copy the request into desc_ and compute the adjusted size;
+//   2. set up the allocation-handle storage (inline for one chunk, heap
+//      array otherwise);
+//   3. physical-only requests: create the physical memory and obtain a
+//      handle-aperture address, then return;
+//   4. otherwise reserve a GPU VA range (virtual requests stop here), create
+//      the physical memory, map it, make it resident and wait for the paging
+//      fence.
+//
+// @return ErrorCode::Success, or the first failing step's error code. When a
+//         step after the VA reservation fails, the physical memory (if
+//         created) and the VA range are released before returning.
+ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
+  desc_.domain = create_info.domain;
+  desc_.adapter_luid = device_->GetLuid();
+  desc_.client_size = create_info.size;
+  desc_.alignment = create_info.alignment;
+  desc_.mem_flags = create_info.mem_flags;
+  desc_.engine_flag = create_info.engine_flag;
+  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+  desc_.flags.is_physical_only = create_info.flags.physical_only;
+  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
+  desc_.flags.is_va_required = create_info.flags.alloc_va;
+  desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object;
+
+  /* we can't tell the allocation is regular vmm or ipc mem at creation stage,
+     they share same creation parameters, so forcing all vram allocations to
+     sharable to support IPC mem */
+  if (create_info.flags.interprocess ||
+      desc_.domain == thunk_proxy::AllocDomain::kLocal)
+    desc_.flags.is_shared = true;
+
+  desc_.flags.is_locked = create_info.flags.locked;
+  desc_.size = AdjustSize(desc_.client_size);
+
+  if (IsUserMemory() || IsSystem())
+    desc_.cpu_addr = create_info.user_ptr;
+
+  // A single chunk uses the inline handle slot; several chunks need an array
+  // (released by FreePhysicalMemory()).
+  num_allocations_ = CalcChunkNumbers(Size());
+  if (num_allocations_ == 1)
+    alloc_handles_ptr_ = &alloc_handle_;
+  else
+    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+  auto code = ErrorCode::Success;
+
+  if (IsPhysicalOnly()) {
+    code = CreatePhysicalMemory();
+    if (code == ErrorCode::Success)
+      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    return code;
+  }
+
+  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+  if (IsVirtual() || (code != ErrorCode::Success))
+      return code;
+
+  bool physical_created = false;
+
+  // Runs on every exit below; undoes the work only when `code` is an error.
+  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+    if (code != ErrorCode::Success) {
+
+      if (physical_created) {
+        FreePhysicalMemory();
+      }
+      FreeGpuVirtualAddress(GpuAddress(), Size());
+
+      // The destructor releases the VA range too (WDDMDevice::CreateGpuMemory
+      // deletes the object right after a failed Init). Forget the range and
+      // the IPC fd so it is not released a second time.
+      // NOTE(review): assumes GpuAddress() reads desc_.gpu_addr - confirm.
+      desc_.gpu_addr = 0;
+      mem_fd_ = -1;
+    }
+  });
+  (void)guard;
+
+  code = CreatePhysicalMemory();
+  if (code != ErrorCode::Success)
+    return code;
+
+  physical_created = true;
+
+  code = MapGpuVirtualAddress(GpuAddress(), Size());
+  if (code != ErrorCode::Success)
+    return code;
+
+  code = MakeResident();
+  if (code != ErrorCode::Success)
+    return code;
+
+  // Mapping / residency may complete asynchronously; wait for the paging
+  // fence value recorded on the device.
+  if (!GetDevice()->WaitOnPagingFenceFromCpu())
+    code = ErrorCode::Unknown;
+
+  return code;
+}
+
+// Removes GPU access to [addr, addr + size) by re-mapping the range chunk by
+// chunk with Protection.NoAccess.
+//
+// @param addr    first GPU virtual address to unmap.
+// @param size    number of bytes to unmap.
+// @param offset  byte offset into this memory object; only used to pick the
+//                index of the first allocation chunk.
+// @return the error code of the last d3dthunk::MapGpuVirtualAddress call.
+//
+// NOTE(review): unlike MapGpuVirtualAddress(), ErrorCode::NotReady is not
+// converted to Success here, so NotReady can be returned when the final
+// chunk reports it - confirm callers expect that.
+ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+
+  // Skip whole chunks covered by `offset` to find the first chunk index.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.SizeInPages = block_size / 0x1000;
+    args.Protection.NoAccess = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    // NotReady: record the paging fence to wait on and keep going;
+    // any other error aborts the loop.
+    if (code == ErrorCode::NotReady)
+      device_->UpdatePageFence(args.PagingFenceValue);
+    else if (code != ErrorCode::Success)
+      break;
+
+    map_addr += block_size;
+    map_size -= block_size;
+    // NOTE(review): the remaining in-chunk offset is never passed to the
+    // kernel (OffsetInPages is left at 0), so this store has no effect.
+    offset = 0;   // reset second unmapped allocation offset to zero
+    i += 1;
+  }
+
+  return code;
+}
+
+// Maps this object's allocations into the GPU virtual range
+// [addr, addr + size) with write access, one allocation chunk per call.
+//
+// @param addr    first GPU virtual address of the mapping.
+// @param size    number of bytes to map.
+// @param offset  byte offset into this memory object at which mapping
+//                starts; whole chunks are skipped and the remainder is used
+//                as the page offset inside the first mapped chunk.
+// @return ErrorCode::Success when every chunk was mapped (a NotReady result
+//         is recorded on the device's page fence and treated as success),
+//         otherwise the failing code. On failure the chunks mapped so far
+//         are re-mapped as NoAccess.
+//
+// NOTE(review): each block is min(remaining, chunk size) bytes even when the
+// first chunk starts at a non-zero offset - confirm callers only pass
+// offsets for which that stays inside the chunk.
+ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+  const size_t _4K = 0x1000;
+
+  // Skip whole chunks covered by `offset` to find the first chunk index.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+  const size_t first_chunk = i;
+  const auto first_chunk_offset = offset;
+  /* Found two limitation for local vram:
+   * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail
+   * 2. visible vram can not be cpu mapped when command submission or after gpu mapped
+   */
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.OffsetInPages = offset / _4K;
+    args.SizeInPages = block_size / _4K;
+    args.Protection.Write = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code != ErrorCode::Success) {
+      if (code == ErrorCode::NotReady) {
+        // The mapping is queued; remember the fence to wait on later.
+        const uint64_t fence_value = args.PagingFenceValue;
+        device_->UpdatePageFence(fence_value);
+        code = ErrorCode::Success;
+      } else
+        break;
+    }
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0;  // reset second mapped allocation offset to zero
+    i++;
+  }
+
+  if (code != ErrorCode::Success) {
+    // Map failed, unmap partial mapped block.
+    // Chunks [first_chunk, i) were mapped; replay the same address/size walk.
+    offset = first_chunk_offset;
+    map_addr = addr;
+    map_size = size;
+    for (size_t j = first_chunk; j < i; j++) {
+      auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+      D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+      args.hPagingQueue = device_->PagingQueue();
+      args.BaseAddress = map_addr;
+      args.hAllocation = 0;
+      args.OffsetInPages = offset / _4K;
+      args.SizeInPages = block_size / _4K;
+      args.Protection.NoAccess = 1;
+
+      auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
+      if (unmap_code == ErrorCode::NotReady)
+        device_->UpdatePageFence(args.PagingFenceValue);
+
+      map_addr += block_size;
+      map_size -= block_size;
+      // Mirror the forward loop: only the first chunk carries an offset.
+      offset = 0;
+    }
+  }
+
+  return code;
+}
+
+// Reserves the GPU virtual address range for this memory object.
+//
+// System-domain memory that is an IPC sysmem exporter or an imported sysmem
+// fd goes through dxg_runtime->ReserveIPCSysMem (using Size(),
+// desc_.alignment and the current mem_fd_, which is updated on success).
+// Everything else uses dxg_runtime->ReserveGpuVirtualAddress with the
+// arguments passed in.
+//
+// On success desc_.gpu_addr is set, and for system memory desc_.cpu_addr is
+// set to the same numeric address.
+ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
+  gpusize reserved_addr = 0;
+  ErrorCode status;
+
+  const bool ipc_sysmem =
+      (desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd) &&
+      desc_.domain == thunk_proxy::AllocDomain::kSystem;
+
+  if (ipc_sysmem) {
+    // Pass the existing fd when there is one, otherwise -1.
+    int mfd = -1;
+    if (mem_fd_ > -1)
+      mfd = mem_fd_;
+
+    status = dxg_runtime->ReserveIPCSysMem(Size(), &reserved_addr, desc_.alignment, mfd, desc_.flags.is_locked);
+    if (status == ErrorCode::Success)
+      mem_fd_ = mfd;
+  } else {
+    status = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &reserved_addr, alignment,
+        desc_.flags.is_locked);
+  }
+
+  if (status != ErrorCode::Success)
+    return status;
+
+  desc_.gpu_addr = reserved_addr;
+  if (IsSystem())
+    desc_.cpu_addr = reinterpret_cast<void *>(desc_.gpu_addr);
+
+  return status;
+}
+
+// Releases a GPU virtual address range.
+//
+// When an IPC memory fd is attached (mem_fd_ > -1) the object's own range is
+// released through FreeIPCSysMem and the arguments are ignored. Otherwise
+// [base_addr, base_addr + size) is released, and a zero base_addr is
+// accepted as "nothing to free".
+ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
+  if (mem_fd_ > -1)
+    return dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
+
+  if (base_addr == 0)
+    return ErrorCode::Success;
+
+  return dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size);
+}
+
+// Creates the kernel allocations backing this memory object, one per chunk.
+//
+// A single heap buffer holds, in order: the driver private data, one private
+// allocation data block per chunk, and the D3DDDI_ALLOCATIONINFO2 array.
+// For shared memory a SharedHandleInfo describing this object is attached as
+// private runtime data so an importer can reconstruct it.
+//
+// @return ErrorCode::OutOfMemory when the scratch buffer cannot be
+//         allocated, otherwise the result of d3dthunk::CreateAllocation. On
+//         success the allocation handles and the resource handle are stored.
+ErrorCode GpuMemory::CreatePhysicalMemory() {
+
+  assert(!IsVirtual() && NumChunks() > 0);
+
+  const auto num_allocations = NumChunks();
+  void *priv_drv_data;
+  void *priv_alloc_data;
+  int priv_drv_data_size;
+  int priv_alloc_data_size;
+
+  thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size);
+  // Computed in size_t: the sum mixes size_t terms and must not be narrowed
+  // to int before being handed to malloc/memset.
+  const size_t total_size = static_cast<size_t>(priv_drv_data_size) +
+    static_cast<size_t>(num_allocations) * priv_alloc_data_size +
+    static_cast<size_t>(num_allocations) * sizeof(D3DDDI_ALLOCATIONINFO2);
+  priv_drv_data = malloc(total_size);
+  if (!priv_drv_data)
+    return ErrorCode::OutOfMemory;
+
+  memset(priv_drv_data, 0, total_size);
+  thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size);
+
+  // Carve the per-allocation private data and the allocation-info array out
+  // of the same buffer.
+  priv_alloc_data = static_cast<unsigned char*>(priv_drv_data) + priv_drv_data_size;
+  auto alloc_info = reinterpret_cast<D3DDDI_ALLOCATIONINFO2*>(
+       static_cast<unsigned char*>(priv_alloc_data) + priv_alloc_data_size * num_allocations);
+
+  size_t size = desc_.size;
+  uint64_t addr = desc_.gpu_addr;
+  char *cpu_addr = static_cast<char *>(desc_.cpu_addr);
+  const auto &device_info = GetDevice()->DeviceInfo();
+
+  for (size_t i = 0; i < num_allocations; i++) {
+
+    void* priv_data = (void*)((char*)priv_alloc_data + priv_alloc_data_size * i);
+    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);
+
+    if (IsUserMemory() || IsSystem()) {
+      // System/user memory: the chunk is backed by the CPU buffer, and no
+      // GPU address is passed in the private data.
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
+      alloc_info[i].pSystemMem = static_cast<void *>(cpu_addr);
+      cpu_addr += block_size;
+    } else {
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
+    }
+
+    size -= block_size;
+    addr += block_size;
+
+    alloc_info[i].pPrivateDriverData = priv_data;
+    alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size;
+    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
+  }
+
+  D3DKMT_CREATEALLOCATION args = {};
+  args.hDevice = device_->DeviceHandle();
+  args.pPrivateDriverData = priv_drv_data;
+  args.PrivateDriverDataSize = priv_drv_data_size;
+  args.NumAllocations = num_allocations;
+  args.pAllocationInfo2 = alloc_info;
+
+  /* The PhysicallyContiguous flag causes allocation failure
+   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
+   */
+
+  // Value-initialised so that padding and any field not assigned below are
+  // zero: the whole struct is handed over as private runtime data.
+  SharedHandleInfo shared_info{};
+  if (IsShared()) {
+    shared_info.size = desc_.size;
+    shared_info.client_size = desc_.client_size;
+    shared_info.domain = desc_.domain;
+    shared_info.adapter_luid = desc_.adapter_luid;
+    shared_info.flags = static_cast<uint32_t>(desc_.flags.reserved);
+    shared_info.mem_flags = desc_.mem_flags;
+    shared_info.pid = dxg_runtime->parent_pid;
+    shared_info.gpu_addr = desc_.gpu_addr;
+    args.pPrivateRuntimeData = &shared_info;
+    args.PrivateRuntimeDataSize = sizeof(shared_info);
+    args.Flags.NtSecuritySharing = 1;
+    args.Flags.CreateShared = 1;
+    args.Flags.CreateResource = 1;
+  }
+
+  auto status = d3dthunk::CreateAllocation(&args);
+  if (status == ErrorCode::Success) {
+    for (size_t i = 0; i < num_allocations; i++)
+      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;
+
+    resource_ = args.hResource;
+  }
+  free(priv_drv_data);
+  return status;
+}
+
+// Destroys the kernel allocations owned by this object.
+//
+// Nothing is done when no handle storage exists, or when the single inline
+// handle is still zero. Otherwise the allocations are destroyed, the handle
+// array is freed if it was heap-allocated (more than one chunk), and the
+// handle pointer is cleared so a later call is a no-op.
+ErrorCode GpuMemory::FreePhysicalMemory() {
+  if (alloc_handles_ptr_ == nullptr)
+    return ErrorCode::Success;
+
+  if (NumChunks() == 1 && *alloc_handles_ptr_ == 0)
+    return ErrorCode::Success;
+
+  const ErrorCode status = d3dthunk::DestroyAllocation(device_->DeviceHandle(),
+                                                       resource_,
+                                                       NumChunks(),
+                                                       alloc_handles_ptr_);
+
+  // Only the multi-chunk array lives on the heap.
+  if (NumChunks() > 1)
+    delete[] alloc_handles_ptr_;
+  alloc_handles_ptr_ = nullptr;
+
+  return status;
+}
+
+// Requests residency for all chunks of this object on the device's paging
+// queue.
+//
+// ErrorCode::NotReady is not treated as a failure: the reported paging fence
+// value is recorded on the device and Success is returned. Any other result
+// is returned unchanged.
+ErrorCode GpuMemory::MakeResident() {
+  D3DDDI_MAKERESIDENT resident_args = {};
+  resident_args.hPagingQueue = device_->PagingQueue();
+  resident_args.NumAllocations = NumChunks();
+  resident_args.AllocationList = alloc_handles_ptr_;
+  resident_args.Flags.CantTrimFurther = 1;
+
+  auto status = d3dthunk::MakeResident(&resident_args);
+  if (status != ErrorCode::NotReady)
+    return status;
+
+  device_->UpdatePageFence(resident_args.PagingFenceValue);
+  return ErrorCode::Success;
+}
+
+// Evicts all chunks of this object from the device and returns the result
+// of d3dthunk::Evict.
+ErrorCode GpuMemory::Evict() {
+  D3DKMT_EVICT evict_args = {};
+  evict_args.hDevice = device_->DeviceHandle();
+  evict_args.AllocationList = alloc_handles_ptr_;
+  evict_args.NumAllocations = NumChunks();
+
+  return d3dthunk::Evict(&evict_args);
+}
+
+// Produces a file descriptor through which this memory can be imported.
+//
+// @param dmabuf_fd  receives the descriptor.
+// @param flags      forwarded to d3dthunk::ShareObjects.
+// @return Success when an IPC memory fd is already attached (it is returned
+//         as-is) or when sharing the resource succeeds; UnSupported for
+//         memory that is not shared.
+ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
+  const bool has_ipc_fd = mem_fd_ > -1;
+  if (has_ipc_fd) {
+    *dmabuf_fd = mem_fd_;
+    return ErrorCode::Success;
+  }
+
+  if (!IsShared())
+    return ErrorCode::UnSupported;
+
+  return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd);
+}
+
+
+// Initialises this object from a descriptor exported by another GpuMemory.
+//
+// Two kinds of descriptor are handled:
+//   * create_info.flags.sysmem_ipc_sig_importer set: the fd refers to IPC
+//     system memory. It is dup()'ed, the range is reserved, backed by new
+//     allocations, mapped and made resident.
+//   * otherwise the fd is treated as an NT handle of a shared resource which
+//     is queried and opened; the exporter's SharedHandleInfo restores the
+//     description. Depending on create_info.flags.alloc_va either a GPU VA
+//     range or a handle-aperture address is obtained.
+//
+// @param create_info  import request; dmabuf_fd must be > 0.
+// @param gpu_addr     optional; receives the exporter's GPU address when
+//                     SameProcessSameDevice is returned.
+// @return Success, SameProcessSameDevice (the buffer already exists in this
+//         process on this adapter and a VA was requested), or an error code.
+ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) {
+  D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args;
+  int dmabuf_fd = create_info.dmabuf_fd;
+
+  if (dmabuf_fd <= 0)
+    return ErrorCode::InvalidateParams;
+
+  if(create_info.flags.sysmem_ipc_sig_importer) {
+    // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
+    mem_fd_ = dup(dmabuf_fd);
+    if (mem_fd_ < 0) {
+      // Without a valid fd the reservation below would be asked for fd -1
+      // instead of the memory being imported.
+      pr_err("dup of ipc sys mem fd %d failed\n", dmabuf_fd);
+      return ErrorCode::InvalidateParams;
+    }
+    desc_.client_size = create_info.size;
+    desc_.size = AdjustSize(desc_.client_size);
+    desc_.domain = thunk_proxy::AllocDomain::kSystem;
+    desc_.adapter_luid = device_->GetLuid();
+    desc_.alignment = 0x1000;
+    desc_.mem_flags = create_info.mem_flags;
+    desc_.engine_flag = create_info.engine_flag;
+    desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+    desc_.flags.is_physical_only = create_info.flags.physical_only;
+    desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+    desc_.flags.is_locked = create_info.flags.locked;
+
+    auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+    if (code != ErrorCode::Success)
+      return code;
+
+    bool physical_created = false;
+    // Runs on every exit below; undoes the work only when `code` is an error.
+    auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+          if (code != ErrorCode::Success) {
+            if (physical_created)
+              FreePhysicalMemory();
+            FreeGpuVirtualAddress(GpuAddress(), Size());
+            // The destructor releases the range as well (the object is
+            // deleted by WDDMDevice::CreateGpuMemory after a failed import);
+            // forget it so it is not released twice.
+            // NOTE(review): assumes GpuAddress() reads desc_.gpu_addr - confirm.
+            desc_.gpu_addr = 0;
+            mem_fd_ = -1;
+          }
+        });
+    (void)guard;
+
+    num_allocations_ = CalcChunkNumbers(Size());
+    if (num_allocations_ == 1)
+      alloc_handles_ptr_ = &alloc_handle_;
+    else
+      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+    memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+    code = CreatePhysicalMemory();
+    if (code != ErrorCode::Success)
+      return code;
+
+    physical_created = true;
+
+    code = MapGpuVirtualAddress(GpuAddress(), Size());
+    if (code != ErrorCode::Success)
+      return code;
+
+    code = MakeResident();
+    if (code != ErrorCode::Success)
+      return code;
+
+    if (!GetDevice()->WaitOnPagingFenceFromCpu())
+      code = ErrorCode::Unknown;
+
+    return code;
+  } else {
+    // vmem importer / ipc vram importer
+    memset(&query_args, 0, sizeof(query_args));
+    query_args.hDevice = device_->DeviceHandle();
+    query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
+    auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args);
+    if (ret != ErrorCode::Success) {
+      pr_err("query resource info from nt handle failed %d\n", static_cast<int>(ret));
+      return ErrorCode::InvalidateParams;
+    }
+    pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d,"
+             " runtime data size %#x total driver data size %#x resource data size=%#x\n",
+             dmabuf_fd,
+             query_args.NumAllocations,
+             query_args.PrivateRuntimeDataSize,
+             query_args.TotalPrivateDriverDataSize,
+             query_args.ResourcePrivateDriverDataSize);
+
+    // The exporter stores exactly one SharedHandleInfo as runtime data;
+    // anything else cannot be interpreted.
+    SharedHandleInfo shared_info;
+    if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) {
+      pr_err("shared hanle info size mismatch:%u vs %zu\n",
+             query_args.PrivateRuntimeDataSize, sizeof(shared_info));
+      return ErrorCode::UnSupported;
+    }
+
+    // One buffer holds the open-allocation array followed by the total and
+    // the resource private driver data.
+    const size_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) +
+      query_args.TotalPrivateDriverDataSize +
+      query_args.ResourcePrivateDriverDataSize;
+    D3DDDI_OPENALLOCATIONINFO2 *open_info =
+      reinterpret_cast<D3DDDI_OPENALLOCATIONINFO2*> (calloc(1, total_size));
+    if (!open_info) {
+      pr_err("alloc open_info failed, NumAllocations:%d\n",
+             query_args.NumAllocations);
+      return ErrorCode::OutOfMemory;
+    }
+
+    auto guard = MakeScopeGuard([&open_info]() { free(open_info); });
+
+    D3DKMT_OPENRESOURCEFROMNTHANDLE open_args;
+    memset(&open_args, 0, sizeof(open_args));
+    open_args.hDevice = query_args.hDevice;
+    open_args.hNtHandle = query_args.hNtHandle;
+    open_args.NumAllocations = query_args.NumAllocations;
+    open_args.pOpenAllocationInfo2 = open_info;
+    open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize;
+    open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast<void*>
+      (open_args.pOpenAllocationInfo2 + open_args.NumAllocations);
+    open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize;
+    open_args.pResourcePrivateDriverData = reinterpret_cast<void*>
+      (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) +
+       open_args.TotalPrivateDriverDataBufferSize);
+    open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize;
+    open_args.pPrivateRuntimeData = reinterpret_cast<void*> (&shared_info);
+
+    ret = d3dthunk::OpenResourceFromNtHandle(&open_args);
+    if (ret != ErrorCode::Success) {
+      // Log the real failure before mapping it to the code callers expect.
+      pr_err("open resource failed %d\n", static_cast<int>(ret));
+      return ErrorCode::InvalidateParams;
+    }
+    if (shared_info.pid == dxg_runtime->parent_pid &&
+      create_info.flags.alloc_va &&
+      IsSameAdapter(shared_info.adapter_luid) &&
+      shared_info.gpu_addr) {
+      pr_info("import from same device and samve process, va is required. "
+               "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n");
+      // NOTE(review): the resource opened above (open_args.hResource and its
+      // allocation handles) is not recorded on this object, so nothing
+      // destroys it when the object is deleted - confirm this is intended.
+      if (gpu_addr)
+        *gpu_addr = shared_info.gpu_addr;
+      return ErrorCode::SameProcessSameDevice;
+    }
+
+    desc_.size = shared_info.size;
+    desc_.client_size = shared_info.client_size;
+    desc_.domain = shared_info.domain;
+    desc_.flags.reserved = shared_info.flags;
+    desc_.mem_flags = shared_info.mem_flags;
+    desc_.adapter_luid = shared_info.adapter_luid;
+    resource_ = open_args.hResource;
+    num_allocations_ = open_args.NumAllocations;
+
+    // Handle storage is set up only now that the import is known to proceed,
+    // so the early returns above cannot leak it. It follows the same rule as
+    // Init(): the inline slot for one allocation, a heap array otherwise,
+    // which is what FreePhysicalMemory() relies on when releasing it.
+    if (num_allocations_ == 1)
+      alloc_handles_ptr_ = &alloc_handle_;
+    else
+      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+    for (decltype(num_allocations_) i = 0; i < num_allocations_; i++)
+      alloc_handles_ptr_[i] = open_info[i].hAllocation;
+
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    if (desc_.flags.is_va_required) {
+      desc_.flags.is_imported_vram_ipc = 1;
+      ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment);
+      if (ret != ErrorCode::Success)
+        pr_err("failed to allocate svm range, error:%d\n", static_cast<int>(ret));
+
+      return ret;
+    } else {
+      desc_.flags.is_imported_vram_vmem = 1;
+      return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    }
+  }
+}
+
+} // namespace thunk
+} // namespace wsl
@@ -0,0 +1,165 @@
+#include <cassert>
+#include <map>
+#include <algorithm>
+#include "impl/wddm/va_mgr.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Builds a manager for the address range beginning at `start` and spanning
+// `size` bytes. `min_align` is stored and later used by AllocImpl when it
+// computes the padding for aligned requests.
+VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align)
+    : min_align_(min_align) {
+  // The whole range starts out as one free fragment: it is indexed by size in
+  // free_list_ and by base address in frag_map_.
+  auto whole_range = free_list_.emplace(size, start);
+  frag_map_[start] = make_fragment(whole_range, size);
+}
+
+// Releases the bookkeeping containers. Before doing so, warns when either
+// container does not hold exactly one entry, which is the state the
+// constructor leaves them in (a single free fragment covering the range).
+VaMgr::~VaMgr() {
+
+  // size() yields a size_t; %zu is the matching conversion on every data
+  // model (with %ld the argument width is wrong where long is 32-bit).
+  if (free_list_.size() != 1)
+    pr_warn("free_list_ size:%zu which should be 1.\n", free_list_.size());
+  if (frag_map_.size() != 1)
+    pr_warn("frag_map_ size:%zu which should be 1.\n", frag_map_.size());
+
+  free_list_.clear();
+  frag_map_.clear();
+}
+
+// Reserves `bytes` of address space, preferring the fixed address `addr`.
+//
+// bytes: size of the requested range.
+// align: required alignment, or 0 for no explicit alignment.
+// addr:  preferred start address, or 0 for "anywhere".
+//
+// The hint is honoured only when it is non-zero, satisfies `align`, and the
+// whole range [addr, addr + bytes) lies inside a single free fragment. In
+// every other case the request is forwarded to AllocImpl(bytes, align), so
+// the returned address may differ from `addr`. Returns the start address of
+// the reserved range (AllocImpl yields 0 when nothing fits).
+uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
+
+  // A zero-sized request cannot occupy a fragment, so it never takes the
+  // fixed-address path.
+  const bool hint_usable =
+      addr > 0 && bytes > 0 && (align == 0 || (addr % align) == 0);
+
+  if (hint_usable) {
+    // Scoped to this block: AllocImpl takes lock_ itself, so the guard must
+    // be released before falling through to it.
+    lock_guard<mutex> guard(lock_);
+
+    // Fragments do not overlap, so the only fragment that can contain `addr`
+    // is the one with the greatest base <= addr. If upper_bound() returns
+    // begin(), every fragment starts above `addr` and the hint is unusable.
+    auto frag_it = frag_map_.upper_bound(addr);
+    if (frag_it != frag_map_.begin()) {
+      --frag_it;
+
+      const uint64_t base = frag_it->first;
+      const uint64_t size = frag_it->second.size;
+      const uint64_t offset = addr - base;  // base <= addr by construction
+
+      // Overflow-safe form of "addr + bytes <= base + size".
+      if (is_free(frag_it->second) && offset < size && bytes <= size - offset) {
+        auto free_it = frag_it->second.free_list_entry_;
+        assert(free_it != free_list_.end());
+        free_list_.erase(free_it);
+
+        if (offset == 0) {
+          // The used range starts at the fragment base: reuse the map node.
+          frag_it->second.size = bytes;
+          set_used(frag_it->second);
+        } else {
+          // The used range starts inside the fragment. It needs its own entry
+          // keyed by `addr` so that Free(addr) can find it; the leading part
+          // [base, addr) stays free. Same split order as AllocImpl uses for a
+          // misaligned base.
+          add_used_fragment(bytes, addr);
+          add_free_fragment(offset, base);
+        }
+
+        // Trailing remainder [addr + bytes, base + size) stays free.
+        const uint64_t tail = size - offset - bytes;
+        if (tail > 0) add_free_fragment(tail, addr + bytes);
+
+        return addr;
+      }
+    }
+  }
+
+  // No usable hint: allocate wherever a suitable free fragment exists.
+  return AllocImpl(bytes, align);
+}
+
+// Allocates `bytes` from the free list without a fixed address.
+//
+// The first pass looks up the smallest free fragment that can hold `bytes`.
+// If its base is already a multiple of `align` (or `align` is 0) the range is
+// carved from the front of that fragment. Otherwise, when `align` is non-zero,
+// a second pass searches for a fragment large enough for `bytes` plus padding
+// and carves an aligned range out of it. Returns the start address, or 0 when
+// no free fragment is large enough.
+uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) {
+  const bool want_align = (align != 0);
+  const int last_pass = want_align ? 1 : 0;
+  // Extra room requested on the second pass.
+  const uint64_t padding = want_align ? AlignUp(align, min_align_) : min_align_;
+
+  uint64_t wanted = bytes;
+  uint64_t result = 0;
+
+  lock_guard<mutex> guard(lock_);
+  for (int pass = 0; pass <= last_pass; ++pass) {
+    auto candidate = free_list_.lower_bound(wanted);
+    if (candidate == free_list_.end()) break;
+
+    const uint64_t frag_size = candidate->first;
+    const uint64_t frag_base = candidate->second;
+    assert(frag_size >= wanted);
+
+    auto frag_it = frag_map_.find(frag_base);
+    assert(frag_it != frag_map_.end());
+    assert(frag_size == frag_it->second.size);
+
+    const uint64_t misalign = want_align ? frag_base % align : 0;
+
+    // Misaligned on the first pass: retry with a padded size.
+    if (misalign != 0 && pass == 0) {
+      wanted += padding;
+      continue;
+    }
+
+    // From here on the candidate is consumed.
+    free_list_.erase(candidate);
+
+    if (misalign == 0) {
+      // Base is suitably aligned: take the front of the fragment.
+      frag_it->second.size = bytes;
+      set_used(frag_it->second);
+
+      if (frag_size > bytes) add_free_fragment(frag_size - bytes, frag_base + bytes);
+
+      result = frag_base;
+    } else {
+      // Skip `head` bytes to reach the next multiple of `align`.
+      const uint64_t head = align - misalign;
+      const uint64_t aligned_base = frag_base + head;
+
+      add_used_fragment(bytes, aligned_base);
+      add_free_fragment(head, frag_base);
+
+      if (frag_size > head + bytes)
+        add_free_fragment(frag_size - head - bytes, aligned_base + bytes);
+
+      result = aligned_base;
+    }
+    break;
+  }
+  return result;
+}
+
+// Returns the used fragment that starts at `addr` to the free list, merging
+// it with a free neighbour on either side. Does nothing when `addr` is 0,
+// when no fragment starts at `addr`, or when that fragment is already free.
+void VaMgr::Free(uint64_t addr) {
+  if (addr == 0) return;
+
+  lock_guard<mutex> guard(lock_);
+
+  auto target = frag_map_.find(addr);
+  if (frag_map_.end() == target) return;
+  if (is_free(target->second)) return;
+
+  // Start address of the merged free range; moves down if the lower
+  // neighbour is absorbed.
+  uint64_t merged_base = addr;
+
+  // Absorb into the lower neighbour when it is free: the neighbour's node
+  // survives and grows, the freed node is dropped.
+  if (frag_map_.begin() != target) {
+    auto below = target;
+    --below;
+    if (is_free(below->second)) {
+      remove_free_list_entry(below->second);
+      merged_base -= below->second.size;
+      below->second.size += target->second.size;
+      frag_map_.erase(target);
+      target = below;
+    }
+  }
+
+  // Absorb the upper neighbour when it is free.
+  auto above = target;
+  ++above;
+  if (above != frag_map_.end() && is_free(above->second)) {
+    remove_free_list_entry(above->second);
+    target->second.size += above->second.size;
+    frag_map_.erase(above);
+  }
+
+  // Publish the (possibly merged) fragment on the free list.
+  const uint64_t merged_size = target->second.size;
+  auto entry = free_list_.insert(make_pair(merged_size, merged_base));
+  set_free(target->second, entry);
+}
+
+} // namespace thunk
+} // namespace wsl