diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp
new file mode 100644
index 0000000000..e32c28b1d4
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes,
+					      HSAint32 fd,
+					      HSAint64 file_offset,
+					      HsaAisFlags AisFlags,
+					      HSAuint64 *SizeCopiedInBytes,
+					      HSAint32 *status)
+{
+	CHECK_DXG_OPEN();
+	/* Stub: no argument is read and neither output pointer is written. */
+	pr_warn_once("not implemented\n");
+	return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp
new file mode 100644
index 0000000000..2b4425599a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cassert>
+#include <cstring>
+
+
+static uint32_t runtime_capabilities_mask = 0;  // value handed out by hsaKmtGetRuntimeCapabilities(); never written in this file
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) {  // stub: NodeId is ignored
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // the only result this function produces itself
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) {  // stub: NodeId is ignored
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // the only result this function produces itself
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(
+    HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode,
+    HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: all five arguments are ignored
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(
+    HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[],
+    void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: none of the watch arrays is read
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) {  // probe used by hsaKmtRuntimeEnable/Disable
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // runtime debug support is never reported as available
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) {
+  // A non-zero result from the support probe is handed straight back.
+  const HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();
+  if (support)
+    return support;
+
+  assert(false);  // no enable path exists past the probe; rDebug/setupTtmp are unused
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) {
+  // Unlike hsaKmtRuntimeEnable, a non-zero probe result is mapped to success.
+  const HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();
+  if (support)
+    return HSAKMT_STATUS_SUCCESS;
+
+  assert(false);  // no disable path exists past the probe
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  *caps_mask = runtime_capabilities_mask;  // NOTE(review): caps_mask is dereferenced without a null check
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
+                                        HSAuint32 *data_size) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: neither output pointer is written
+}
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) {  // stub counterpart of hsaKmtDbgEnable
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // the only result this function produces itself
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
+                                               HSAuint32 *n_entries,
+                                               HSAuint32 *entry_size) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: data, n_entries and entry_size are left untouched
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries,
+                                              HSAuint32 *entry_size,
+                                              bool suspend_queues) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: outputs are left untouched, suspend_queues is ignored
+}
+
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues,
+                     HSAuint64 *DebugReturn) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;  // stub: args, Queues and DebugReturn are never touched
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp
new file mode 100644
index 0000000000..5d38d69c8d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#include "dxcore_loader.h"
+#include "librocdxg.h"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <ntstatus.h>
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+DxcoreLoader::DxcoreLoader()  // starts in the unloaded state: null library handle, every pfn_ entry null
+    : dxcore_handle_(nullptr)
+    , init_flag_()  // default-constructed; not referenced by any other member function in this file
+    , pfn_D3DKMTCreateAllocation2(nullptr)
+    , pfn_D3DKMTDestroyAllocation2(nullptr)
+    , pfn_D3DKMTMapGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTReserveGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTFreeGpuVirtualAddress(nullptr)
+    , pfn_D3DKMTCreateDevice(nullptr)
+    , pfn_D3DKMTDestroyDevice(nullptr)
+    , pfn_D3DKMTEnumAdapters2(nullptr)
+    , pfn_D3DKMTQueryAdapterInfo(nullptr)
+    , pfn_D3DKMTCreateContextVirtual(nullptr)
+    , pfn_D3DKMTDestroyContext(nullptr)
+    , pfn_D3DKMTSubmitCommand(nullptr)
+    , pfn_D3DKMTCreateSynchronizationObject2(nullptr)
+    , pfn_D3DKMTDestroySynchronizationObject(nullptr)
+    , pfn_D3DKMTQueryStatistics(nullptr)
+    , pfn_D3DKMTEscape(nullptr)
+    , pfn_D3DKMTLock2(nullptr)
+    , pfn_D3DKMTUnlock2(nullptr)
+    , pfn_D3DKMTCreatePagingQueue(nullptr)
+    , pfn_D3DKMTDestroyPagingQueue(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr)
+    , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr)
+    , pfn_D3DKMTQueryClockCalibration(nullptr)
+    , pfn_D3DKMTMakeResident(nullptr)
+    , pfn_D3DKMTEvict(nullptr)
+    , pfn_D3DKMTShareObjects(nullptr)
+    , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr)
+    , pfn_D3DKMTOpenResourceFromNtHandle(nullptr)
+    , pfn_D3DKMTCreateHwQueue(nullptr)
+    , pfn_D3DKMTDestroyHwQueue(nullptr)
+    , pfn_D3DKMTSubmitCommandToHwQueue(nullptr) {
+}
+
+DxcoreLoader::~DxcoreLoader() {
+    Shutdown();  // closes the libdxcore.so handle if one is still held
+}
+
+bool DxcoreLoader::Initialize() {
+    // Opens libdxcore.so and resolves every D3DKMT entry point; true when the loader is usable.
+    if (dxcore_handle_) return true;  // already loaded: a second dlopen() would add a reference Shutdown() never drops
+
+    dlerror(); // Clear error
+    dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY);
+    if (!dxcore_handle_) {
+        pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror());
+        return false;
+    }
+
+    pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n");
+    if (!LoadDxcoreApis()) {
+        dlclose(dxcore_handle_);  // incomplete symbol set: release the library and report failure
+        dxcore_handle_ = nullptr;
+        return false;
+    }
+    return true;  // handle is non-null here, so this equals IsLoaded()
+}
+
+void DxcoreLoader::Shutdown() {
+    if (!dxcore_handle_) return;  // nothing is loaded
+    // The handle is forgotten whether or not dlclose() succeeds; pfn_ members are left as they are.
+    if (dlclose(dxcore_handle_) == 0) {
+        pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n");
+    } else {
+        pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror());
+    }
+    dxcore_handle_ = nullptr;
+}
+
+bool DxcoreLoader::LoadDxcoreApis() {
+    if (dxcore_handle_ == nullptr) {
+        pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n");
+        return false;
+    }
+
+    dlerror(); // drop any stale error text before the dlsym() calls
+    // Fills every pfn_ member from the opened library; stops at the first unresolved symbol.
+    // Entries resolved before a failure keep their values.
+    #define LOAD_DXCORE_API(func_name) \
+        DXCORE_PFN(func_name) = reinterpret_cast<DXCORE_DEF(func_name)*>(dlsym(dxcore_handle_, #func_name)); \
+        if (DXCORE_PFN(func_name) == nullptr) { \
+            pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \
+            goto load_failed; \
+        }
+
+    LOAD_DXCORE_API(D3DKMTCreateAllocation2);
+    LOAD_DXCORE_API(D3DKMTDestroyAllocation2);
+    LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress);
+    LOAD_DXCORE_API(D3DKMTCreateDevice);
+    LOAD_DXCORE_API(D3DKMTDestroyDevice);
+    LOAD_DXCORE_API(D3DKMTEnumAdapters2);
+    LOAD_DXCORE_API(D3DKMTQueryAdapterInfo);
+    LOAD_DXCORE_API(D3DKMTCreateContextVirtual);
+    LOAD_DXCORE_API(D3DKMTDestroyContext);
+    LOAD_DXCORE_API(D3DKMTSubmitCommand);
+    LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2);
+    LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject);
+    LOAD_DXCORE_API(D3DKMTQueryStatistics);
+    LOAD_DXCORE_API(D3DKMTEscape);
+    LOAD_DXCORE_API(D3DKMTLock2);
+    LOAD_DXCORE_API(D3DKMTUnlock2);
+    LOAD_DXCORE_API(D3DKMTCreatePagingQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyPagingQueue);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu);
+    LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu);
+    LOAD_DXCORE_API(D3DKMTQueryClockCalibration);
+    LOAD_DXCORE_API(D3DKMTMakeResident);
+    LOAD_DXCORE_API(D3DKMTEvict);
+    LOAD_DXCORE_API(D3DKMTShareObjects);
+    LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle);
+    LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle);
+    LOAD_DXCORE_API(D3DKMTCreateHwQueue);
+    LOAD_DXCORE_API(D3DKMTDestroyHwQueue);
+    LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue);
+
+    #undef LOAD_DXCORE_API
+
+    pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n");
+    return true;
+load_failed:
+    pr_err("[DxcoreLoader] Failed to load DXCore APIs\n");
+    return false;
+}
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h
new file mode 100644
index 0000000000..3f649a4da0
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBROCDXG_DXCORE_LOADER_H
+#define LIBROCDXG_DXCORE_LOADER_H
+
+#include "impl/wddm/types.h"
+#include <dlfcn.h>
+#include <mutex>
+
+#define DXCORE_CALL(function_name)  wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name  // names the singleton's pfn_<function_name> member; performs no null check
+
+namespace wsl {
+namespace thunk {
+namespace dxcore {
+
+/**
+ * @brief DxcoreLoader: singleton that dynamically loads libdxcore.so.
+ *
+ * Loading is meant to be optional, keyed on the environment variable
+ * LIBROCDXG_ENABLE_DXCORE ("1", "true" or "yes", case-sensitive), with a
+ * fallback to stub implementations when it is unset or invalid. The class
+ * itself does not read that variable; the caller decides whether to call
+ * Initialize(). Instance() is built through a function-local static;
+ * Initialize() does not use init_flag_/std::call_once and is not serialized.
+ */
+
+// Macro definitions mimicking HSAKMT design
+#define DXCORE_DEF(function_name)   PFN##function_name  // function type name, e.g. PFND3DKMTEscape
+#define DXCORE_PFN(function_name)   pfn_##function_name  // pointer member name, e.g. pfn_D3DKMTEscape
+
+class DxcoreLoader {
+public:
+    // D3DKMT function type definitions (all but D3DKMTShareObjects take one untyped argument block)
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommand))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args);
+    typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args);
+
+    static DxcoreLoader& Instance() {  // process-wide singleton accessor
+        static DxcoreLoader* instance = new DxcoreLoader();  // allocated on first call and never deleted here
+        return (*instance);
+    }
+
+    bool Initialize();  // opens libdxcore.so and resolves the entry points below
+    void Shutdown();  // closes the library handle if one is held
+    bool IsLoaded() const { return dxcore_handle_ != nullptr; }  // true while a library handle is held
+
+    // Function pointer declarations (public so DXCORE_CALL can reach them; null until resolved)
+    DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2);
+    DXCORE_DEF(D3DKMTDestroyAllocation2)* DXCORE_PFN(D3DKMTDestroyAllocation2);
+    DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress);
+    DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice);
+    DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice);
+    DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2);
+    DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo);
+    DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual);
+    DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext);
+    DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand);
+    DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2);
+    DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject);
+    DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics);
+    DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape);
+    DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2);
+    DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2);
+    DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue);
+    DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu);
+    DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu);
+    DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration);
+    DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident);
+    DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict);
+    DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects);
+    DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle);
+    DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle);
+    DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue);
+    DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue);
+    DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue);
+
+private:
+    DxcoreLoader();  // private: instances are obtained through Instance() only
+    ~DxcoreLoader();
+
+    bool LoadDxcoreApis();  // resolves every pfn_ member from dxcore_handle_
+
+    void* dxcore_handle_;  // dlopen() handle for libdxcore.so, or nullptr when not loaded
+    std::once_flag init_flag_;  // Declared for thread-safe initialization; NOTE(review): no member function uses it yet
+
+    // Disable copy
+    DxcoreLoader(const DxcoreLoader&) = delete;
+    DxcoreLoader& operator=(const DxcoreLoader&) = delete;
+};
+
+} // namespace dxcore
+} // namespace thunk
+} // namespace wsl
+
+#endif // LIBROCDXG_DXCORE_LOADER_H
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp
new file mode 100644
index 0000000000..1a360832de
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdio>
+#include <cassert>
+#include <thread>
+#include <chrono>
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
+                                          bool ManualReset, bool IsSignaled,
+                                          HsaEvent **Event) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // NOTE(review): reached only with NDEBUG; *Event is never written
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  if (!Event)
+    return HSAKMT_STATUS_SUCCESS;  // a null event is accepted without a warning
+
+  pr_warn_once("not supported\n");
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG; nothing is released
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");  // emitted before the null check, so it also fires for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG; the event is not modified
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");  // emitted before the null check, so it also fires for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG; the event is not modified
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+  pr_warn_once("not supported\n");  // emitted before the null check, so it also fires for a null Event
+  if (!Event)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG; no event state is inspected
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
+                                          HSAuint32 Milliseconds) {
+  return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, nullptr);  // same wait, no event age requested
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
+                                              HSAuint32 Milliseconds,
+                                              uint64_t *event_age) {
+  // A null event is rejected here, before the multi-event path is entered.
+  if (Event == nullptr)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, event_age);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
+                                                   HSAuint32 NumEvents,
+                                                   bool WaitOnAll,
+                                                   HSAuint32 Milliseconds) {
+  // Same wait as the _Ext variant, with no event-age array supplied.
+  return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, Milliseconds, nullptr);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
+                                                       HSAuint32 NumEvents,
+                                                       bool WaitOnAll,
+                                                       HSAuint32 Milliseconds,
+                                                       uint64_t *event_age) {
+  CHECK_DXG_OPEN();  // macro defined elsewhere; its behavior is not visible here
+
+  if (!Events)
+    return HSAKMT_STATUS_INVALID_HANDLE;
+
+  if (NumEvents == 1 && Events[0] == nullptr) {  // the only case handled: a single null event
+    std::this_thread::sleep_for(std::chrono::microseconds(20));  // fixed 20 us pause; Milliseconds is not consulted
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  assert(false);  // any other combination is unimplemented; WaitOnAll and event_age are never read
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG, without having waited
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) {  // stub: *fd is never written
+  CHECK_DXG_OPEN();
+  pr_debug("node id %u\n", NodeId);  // %u: NodeId is an unsigned 32-bit value
+  assert(false);  // stub: aborts when assertions are enabled
+  return HSAKMT_STATUS_SUCCESS;  // reached only with NDEBUG
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp
new file mode 100755
index 0000000000..431e7bb91a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp
@@ -0,0 +1,137 @@
+#include <dlfcn.h>
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/hsa_ven_amd_loader.h"
+
+static std::mutex* lock_ = new std::mutex();  // serializes the lazy symbol lookups below; heap-allocated and never deleted in this file
+
+#if 1
+#define _HSAKMT_LOOKUP_SYMS(_sym)                                              \
+do { /* resolve fn_<sym> via dlsym on first use; retried while still null */   \
+  if (fn_##_sym == nullptr) {                                                  \
+    std::lock_guard<std::mutex> guard(*lock_);                                 \
+    if (fn_##_sym == nullptr) {                                                \
+      fn_##_sym =                                                              \
+        reinterpret_cast<decltype(fn_##_sym)>(dlsym(RTLD_DEFAULT, #_sym));     \
+      if (!fn_##_sym) { pr_err("%s not found - %s\n", #_sym, dlerror()); }     \
+    }                                                                          \
+  }                                                                            \
+} while (0)
+
+#define _HSAKMT_EXEC_API(_sym, ...) \
+do { /* forward to fn_<sym> and return its result once it is resolved */ \
+    if (fn_##_sym != nullptr) {    \
+        return fn_##_sym(__VA_ARGS__);   \
+    } \
+} while (0)
+
+bool hsakmt_hsa_loader_init() {
+  // Opens libhsa-runtime64.so with RTLD_GLOBAL and drops the handle right away.
+  if (void *probe = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL)) {
+    dlclose(probe);  // NOTE(review): presumably the library stays loaded through another reference - confirm
+    return true;
+  }
+  pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror());
+  return false;
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {
+  // Forwards to hsa_signal_load_relaxed, looked up by name on first use.
+  using load_fn_t = hsa_signal_value_t (*)(hsa_signal_t);
+  static load_fn_t fn_hsa_signal_load_relaxed = nullptr;
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed);
+  _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal);
+  return 0;  // symbol could not be resolved
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {
+  // Forwards to hsa_signal_wait_relaxed, looked up by name on first use.
+  using wait_fn_t = hsa_signal_value_t (*)(hsa_signal_t, hsa_signal_condition_t,
+                                           hsa_signal_value_t, uint64_t,
+                                           hsa_wait_state_t);
+  static wait_fn_t fn_hsa_signal_wait_relaxed = nullptr;
+
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed);
+  _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value,
+                   timeout_hint, wait_state_hint);
+  return 0;  // symbol could not be resolved
+}
+
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value){
+  // Forwards to hsa_signal_store_screlease; does nothing when it cannot be resolved.
+  using store_fn_t = void (*)(hsa_signal_t, hsa_signal_value_t);
+  static store_fn_t fn_hsa_signal_store_screlease = nullptr;
+  _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease);
+  _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value);
+}
+
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;
+  // Resolves the AMD loader extension entry on first use and forwards to it; HSA_STATUS_ERROR when unavailable.
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> gard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+      hsa_status_t (*fn_hsa_system_get_extension_table)(
+      uint16_t extension, uint16_t version_major, uint16_t version_minor, void *table);
+      fn_hsa_system_get_extension_table =
+        reinterpret_cast<decltype(fn_hsa_system_get_extension_table)>(dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table"));
+      if (fn_hsa_system_get_extension_table == nullptr) {
+        pr_err("%s not found - %s\n", "hsa_system_get_extension_table", dlerror());
+        return HSA_STATUS_ERROR;
+      }
+      // Zero the table and cache the entry only on success, so a failed query
+      // never leaves an indeterminate pointer behind; the next call retries.
+      hsa_ven_amd_loader_1_03_pfn_t table = {};
+      if (fn_hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table) == HSA_STATUS_SUCCESS)
+        fn_hsa_ven_amd_loader_query_host_address = table.hsa_ven_amd_loader_query_host_address;
+    }
+  }
+
+  _HSAKMT_EXEC_API(hsa_ven_amd_loader_query_host_address, device_address, host_address);
+  return HSA_STATUS_ERROR;
+}
+
+#else
+hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) {  // direct-link variant; compiled out while the guard above is "#if 1"
+  return hsa_signal_load_relaxed(signal);
+}
+
+hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint) {  // direct-link variant; compiled out while the guard above is "#if 1"
+  return hsa_signal_wait_relaxed(signal, condition, compare_value, timeout_hint,
+                                 wait_state_hint);
+}
+
+void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value) {  // direct-link variant; compiled out while the guard above is "#if 1"
+  hsa_signal_store_screlease(hsa_signal, value);
+}
+
+hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address) {
+  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
+    const void *device_address, const void **host_address) = nullptr;
+  // Direct-link variant: caches the loader extension entry only when the table query succeeds.
+  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+    std::lock_guard<std::mutex> gard(*lock_);
+    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
+      hsa_ven_amd_loader_1_03_pfn_t table = {};  // zeroed so a failed query cannot expose garbage
+      if (hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table) == HSA_STATUS_SUCCESS)
+        fn_hsa_ven_amd_loader_query_host_address =
+          table.hsa_ven_amd_loader_query_host_address;
+    }
+  }
+
+  if (fn_hsa_ven_amd_loader_query_host_address)
+    return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address);
+
+  return HSA_STATUS_ERROR;
+}
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp
new file mode 100644
index 0000000000..6799f5d891
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp
@@ -0,0 +1,31 @@
+/*
+* Copyright © 2025 Advanced Micro Devices, Inc.
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including
+* the next paragraph) shall be included in all copies or substantial
+* portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*/
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable)
+{
+  *enable = false;  // always reports "disabled"; NOTE(review): enable is dereferenced without a null check
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_SUCCESS;  // success is returned even though the warning says "not supported"
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp
new file mode 100644
index 0000000000..2e125dfb3e
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp
@@ -0,0 +1,182 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+#include <cstdint>
+
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+
+/*
+ * Hand out an opaque device handle for the given node.
+ *
+ * The handle is the address of the node's WDDMDevice object; the amdgpu_*
+ * shims in this file cast it back to wsl::thunk::WDDMDevice.
+ *
+ * NodeId:       node to look up.
+ * DeviceHandle: out-parameter receiving the handle. Must not be NULL.
+ *
+ * Returns HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the thunk is not
+ * open, HSAKMT_STATUS_INVALID_PARAMETER for a NULL out-pointer,
+ * HSAKMT_STATUS_ERROR when no device exists for NodeId, and
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(
+    HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) {
+  CHECK_DXG_OPEN();
+
+  /* Reject a NULL out-pointer instead of dereferencing it below. */
+  if (!DeviceHandle)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  wsl::thunk::WDDMDevice *pDevice = get_wddmdev(NodeId);
+  if (pDevice == nullptr)
+    return HSAKMT_STATUS_ERROR;
+
+  *DeviceHandle = reinterpret_cast<HsaAMDGPUDeviceHandle>(pDevice);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * libdrm-named stub: does no work and always returns 0.
+ * None of the output parameters are written.
+ */
+HSAKMTAPI int amdgpu_device_initialize(int fd,
+                                       uint32_t *major_version,
+                                       uint32_t *minor_version,
+                                       amdgpu_device_handle *device_handle) {
+  /* Arguments are intentionally ignored. */
+  (void)fd;
+  (void)major_version;
+  (void)minor_version;
+  (void)device_handle;
+  return 0;
+}
+
+/* libdrm-named stub: nothing is released here; always returns 0. */
+HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle) {
+  (void)device_handle;  /* intentionally ignored */
+  return 0;
+}
+
+/*
+ * Fill *info for the device behind dev.
+ *
+ * dev is treated as a wsl::thunk::WDDMDevice pointer. The whole structure is
+ * zeroed first; the only field populated afterwards is gpu_counter_freq,
+ * which receives GPUCounterFrequency() divided by 1000.
+ * Always returns 0. Neither pointer is validated.
+ */
+HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev,
+                                    struct amdgpu_gpu_info *info) {
+  auto *wddm_dev = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);
+
+  memset(info, 0, sizeof(*info));
+
+  const auto counter_freq = wddm_dev->GPUCounterFrequency();
+  info->gpu_counter_freq = counter_freq / 1000ull;
+  return 0;
+}
+
+/*
+ * Return the file descriptor stored in the global runtime (dxg_fd).
+ * The device handle is not consulted.
+ */
+HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) {
+  (void)dev;  /* the same descriptor is returned for every handle */
+  return dxg_runtime->dxg_fd;
+}
+
+/*
+ * Report the CPU address of a buffer object.
+ *
+ * bo is treated as a wsl::thunk::GpuMemory pointer. *cpu is written only
+ * when IsSysMemFd() is true; otherwise it is left untouched. The function
+ * returns 0 in both cases, so callers should pre-initialize *cpu.
+ */
+HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) {
+  auto *mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+
+  if (!mem->IsSysMemFd())
+    return 0;
+
+  *cpu = mem->CpuAddress();
+  return 0;
+}
+
+/*
+ * Free a buffer object through hsaKmtFreeMemory().
+ *
+ * buf_handle is treated as a wsl::thunk::GpuMemory pointer. The address
+ * handed to hsaKmtFreeMemory() is the GPU address when the object has a VA
+ * allocated, and its handle-aperture address otherwise.
+ * Returns 0 on HSAKMT_STATUS_SUCCESS and -1 for any other status.
+ */
+HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle) {
+  auto *mem = reinterpret_cast<wsl::thunk::GpuMemory *>(buf_handle);
+
+  void *thunk_addr;
+  if (mem->IsVaAllocated())
+    thunk_addr = reinterpret_cast<void *>(mem->GpuAddress());
+  else
+    thunk_addr = reinterpret_cast<void *>(mem->HandleApeAddress());
+
+  if (hsaKmtFreeMemory(thunk_addr, mem->Size()) != HSAKMT_STATUS_SUCCESS)
+    return -1;
+  return 0;
+}
+
+/*
+ * libdrm-named stub: no export is performed.
+ * *shared_handle is set to 0 and 0 is returned.
+ */
+HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t *shared_handle) {
+  (void)bo;
+  (void)type;
+  *shared_handle = 0;
+  return 0;
+}
+
+/*
+ * Import a dma-buf file descriptor as a buffer object.
+ *
+ * Only amdgpu_bo_handle_type_dma_buf_fd is handled; any other type logs an
+ * error and returns -1. dev is treated as a wsl::thunk::WDDMDevice pointer
+ * and shared_handle as the file descriptor to import. On success the
+ * resulting GpuMemory handle is stored in output->buf_handle and 0 is
+ * returned; on failure -1 is returned and output is not written.
+ */
+HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev,
+                               enum amdgpu_bo_handle_type type,
+                               uint32_t shared_handle,
+                               struct amdgpu_bo_import_result *output) {
+  if (type != amdgpu_bo_handle_type_dma_buf_fd) {
+    pr_err("not implemented\n");
+    return -1;
+  }
+
+  auto *wddm_dev = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);
+
+  /* A VA is requested exactly when the fd is an IPC system-memory fd. */
+  const bool ipc_sysmem = is_ipc_sysmemfd(shared_handle);
+  const bool want_va = ipc_sysmem;
+
+  wsl::thunk::GpuMemoryHandle imported;
+  const HSAKMT_STATUS status = import_dmabuf_fd(
+      shared_handle, wddm_dev->NodeId(), want_va, ipc_sysmem, &imported);
+  if (status != HSAKMT_STATUS_SUCCESS)
+    return -1;
+
+  /* The GpuMemory object handle doubles as the drm buffer handle. */
+  output->buf_handle = reinterpret_cast<amdgpu_bo_handle>(imported);
+  return 0;
+}
+
+/*
+ * Map or unmap a buffer object at a GPU virtual address.
+ *
+ * bo:     treated as a wsl::thunk::GpuMemory pointer.
+ * offset: forwarded to Map/UnmapGpuVirtualAddress.
+ * size:   forwarded to Map/UnmapGpuVirtualAddress.
+ * addr:   GPU virtual address to map at / unmap from.
+ * flags:  not used.
+ * ops:    AMDGPU_VA_OP_MAP or AMDGPU_VA_OP_UNMAP; any other value is a
+ *         no-op that returns 0.
+ *
+ * MAP returns 0 right away when the object already reports addr as its GPU
+ * address, and -1 when it reports a different non-zero address. Otherwise
+ * the mapping is created and the memory made resident.
+ * UNMAP removes the mapping and then evicts the memory.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
+                              uint64_t offset,
+                              uint64_t size,
+                              uint64_t addr,
+                              uint64_t flags,
+                              uint32_t ops) {
+  wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
+  assert(gpu_mem != nullptr);
+
+  switch(ops) {
+    case AMDGPU_VA_OP_MAP:
+      {
+        if (gpu_mem->GpuAddress() == addr) {
+          pr_info("bo is mapped already\n");
+          return 0;
+        } else if (gpu_mem->GpuAddress()) {
+          pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n",
+                 reinterpret_cast<void *>(gpu_mem->GpuAddress()), reinterpret_cast<void *>(addr));
+          return -1;
+        }
+        auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+        if (code != ErrorCode::Success)
+          return -1;
+
+        code = gpu_mem->MakeResident();
+        if (code != ErrorCode::Success) {
+          /* Undo the mapping created above so a failed MAP does not leave
+           * the object mapped but not resident. Presumably a later MAP
+           * retry would otherwise hit the "mapped already" shortcut --
+           * TODO confirm against GpuMemory. */
+          if (gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset) != ErrorCode::Success)
+            pr_err("amdgpu_bo_va_op: failed to roll back mapping at %p\n",
+                   reinterpret_cast<void *>(addr));
+          return -1;
+        }
+      }
+      break;
+    case AMDGPU_VA_OP_UNMAP:
+      {
+        auto code = gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
+        if (code != ErrorCode::Success)
+          return -1;
+        gpu_mem->Evict();
+      }
+      break;
+  }
+  return 0;
+}
+
+/* libdrm-named stub: *info is not written; always returns 0. */
+HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) {
+  (void)bo;
+  (void)info;
+  return 0;
+}
+
+/* libdrm-named stub: the metadata is discarded; always returns 0. */
+HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) {
+  (void)bo;
+  (void)info;
+  return 0;
+}
+
+/* libdrm-named stub: no command is issued and data is untouched; returns 0. */
+HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
+                                  void *data, unsigned long size) {
+  (void)fd;
+  (void)drmCommandIndex;
+  (void)data;
+  (void)size;
+  return 0;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h
new file mode 100644
index 0000000000..02826b22b0
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIBHSAKMT_H_INCLUDED
+#define LIBHSAKMT_H_INCLUDED
+
+#include <pthread.h>
+#include <stdint.h>
+#include <limits.h>
+#include "hsakmt/hsakmt.h"
+#include "hsakmt/hsakmt_drm.h"
+
+#include "impl/wddm/va_mgr.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "dxcore_loader.h"
+
+/* Device / memory-object lookup helpers (defined outside this header).
+ * Callers in this change check the get_wddmdev() result against nullptr. */
+wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id);
+uint32_t get_num_wddmdev();
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress);
+
+/* Log levels for hsakmt_print(): a message is printed when its level is
+ * <= dxg_runtime->hsakmt_debug_level. With the DEFAULT level the warning,
+ * info and debug messages are therefore suppressed. */
+#define HSAKMT_DEBUG_LEVEL_ERR      -1
+#define HSAKMT_DEBUG_LEVEL_DEFAULT  3
+#define HSAKMT_DEBUG_LEVEL_WARNING  4
+#define HSAKMT_DEBUG_LEVEL_INFO     6
+#define HSAKMT_DEBUG_LEVEL_DEBUG    7
+
+/*
+ * Process-wide state of the DXG thunk, reached through the global
+ * dxg_runtime pointer: open/fork bookkeeping, tunables, and the address
+ * ranges plus VA managers for the local heap, system heap and handle
+ * aperture.
+ *
+ * Every data member has an in-class default, so members are initialized in
+ * declaration order with no reliance on the order of a constructor list.
+ */
+struct hsakmtRuntime {
+  /* Only the creating process id has to be sampled at construction time;
+   * everything else starts from the in-class defaults below. */
+  hsakmtRuntime() : parent_pid(getpid()) {}
+
+  /* Heap and virtual-address management; implemented outside this header. */
+  void HeapInit();
+  void HeapFini();
+  bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align);
+  bool FreeSvmSpace(uint64_t &base, uint64_t &size);
+  bool ReserveLocalHeapSpace();
+  bool FreeLocalHeapSpace();
+  void InitLocalHeapMgr();
+  bool ReserveSystemHeapSpace();
+  uint64_t SystemHeapSize() { return system_heap_space_size_; }
+  bool FreeSystemHeapSpace();
+  bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock);
+  bool DecommitSystemHeapSpace(void* addr, int64_t size);
+  void InitSystemHeapMgr();
+  ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize hit_base_addr, gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment, bool lock);
+  ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+          gpusize gpu_addr, gpusize size);
+  bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false);
+  bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd);
+  ErrorCode ReserveIPCSysMem(gpusize size,
+          gpusize *out_gpu_virt_addr, gpusize alignment,
+          int &memfd, bool lock);
+  ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd);
+  bool InitHandleApertureSpace();
+  void InitHandleApertureMgr();
+  ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr);
+  void HandleApertureFree(gpusize gpu_addr);
+
+  pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER;
+  const char *dxg_device_name = "/dev/dxg";
+  /* Zero until assigned; previously left uninitialized by the constructor. */
+  long page_size = 0;
+  int page_shift = 0;
+  int dxg_fd = -1;
+  pid_t parent_pid = -1;
+  bool is_forked = false;
+  int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;
+  unsigned long dxg_open_count = 0;
+  bool hsakmt_is_dgpu = false;
+  bool is_svm_api_supported = false;
+  int zfb_support = 0;
+  int vendor_packet_process = 0;
+  bool check_avail_sysram = false;
+  size_t max_single_alloc_size = 0;
+  int enable_thunk_sub_allocator = 0;
+  /* Node used when a caller passes PreferredNode == 0. */
+  uint32_t default_node = 1;
+
+  /* local heap means bo's backend is vram of all GPUs */
+  uint64_t local_heap_space_start_ = 0;
+  uint64_t local_heap_space_size_ = 0;
+
+  /* manage the reserved local heap space which shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> local_heap_mgr_;
+
+  /* system heap means bo's backend is system ram */
+  uint64_t system_heap_space_start_ = 0;
+  uint64_t system_heap_space_size_ = 0;
+
+  /* manage the reserved system heap space which shared by CPU and GPUs */
+  std::unique_ptr<wsl::thunk::VaMgr> system_heap_mgr_;
+
+  uint64_t handle_aperture_start_ = 0;
+  uint64_t handle_aperture_size_ = 0;
+  std::unique_ptr<wsl::thunk::VaMgr> handle_aperture_mgr_;
+};
+
+/* The single runtime instance; the macros below dereference it without a
+ * null check. */
+extern hsakmtRuntime *dxg_runtime;
+
+/* Replace any earlier HSAKMTAPI definition with default symbol visibility. */
+#undef HSAKMTAPI
+#define HSAKMTAPI __attribute__((visibility ("default")))
+
+/* SANITIZER_AMDGPU is defined only for clang builds with AddressSanitizer. */
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define SANITIZER_AMDGPU 1
+#endif
+#endif
+
+/*Avoid pointer-to-int-cast warning*/
+#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
+
+/*Avoid int-to-pointer-cast warning*/
+#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
+
+/* Early-return guard for API entry points: leaves the *enclosing function*
+ * with HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the thunk has not
+ * been opened or the runtime is flagged as forked. Usable only in functions
+ * returning HSAKMT_STATUS. */
+#define CHECK_DXG_OPEN() \
+	do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
+
+/* 64KB BigK fragment size for TLB efficiency */
+#define GPU_BIGK_PAGE_SIZE (1 << 16)
+
+/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
+#define GPU_HUGE_PAGE_SIZE (2 << 20)
+
+/* Early-return guard: leaves the enclosing function with
+ * HSAKMT_STATUS_INVALID_PARAMETER when x is not a multiple of
+ * dxg_runtime->page_size. */
+#define CHECK_PAGE_MULTIPLE(x) \
+	do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
+
+/* ALIGN_UP / ALIGN_UP_32 round x up using a mask, so align must be a power
+ * of two. BITMASK(n) yields the n low bits set, and 0 for n == 0. */
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size)
+#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
+#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
+
+/* HSA Thunk logging usage */
+/* get_thread_id(): the calling thread's id formatted in hex, as a std::string. */
+#define get_thread_id()                                                                                                          \
+    ([]() -> std::string {                                                                                                       \
+        std::stringstream str_thrd_id;                                                                                           \
+        str_thrd_id << std::hex << std::this_thread::get_id();                                                                   \
+        return str_thrd_id.str();                                                                                                \
+    })()
+/* Unconditional print: prefixes the message with pid, thread id and the
+ * calling function name, then flushes the stream. */
+#define hsakmt_print_common(stream, fmt, ...)                                                                                    \
+    do {                                                                                                                         \
+        fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__);            \
+        fflush(stream);                                                                                                          \
+    } while (false)
+/* Level-filtered print to stdout. Compiled out entirely when NDEBUG is
+ * defined, so pr_warn/pr_info/pr_debug are no-ops in such builds. */
+#ifdef NDEBUG
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do { } while (false)
+#else
+#define hsakmt_print(level, fmt, ...)                                                                                            \
+    do {                                                                                                                         \
+        if (level <= dxg_runtime->hsakmt_debug_level) {                                                                          \
+            hsakmt_print_common(stdout, fmt, ##__VA_ARGS__);                                                                     \
+        }                                                                                                                        \
+    } while (false)
+#endif
+
+/* pr_err always prints to stderr, regardless of debug level or NDEBUG. */
+#define pr_err(fmt, ...) \
+	hsakmt_print_common(stderr, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...) \
+	hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
+/* *_once variants: each expansion site owns a static flag and prints at
+ * most once. The flag is a plain bool with no synchronization, and the
+ * ({ ... }) form is a GNU statement expression. */
+#define pr_err_once(fmt, ...)                   \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_err(fmt, ##__VA_ARGS__);     \
+        }                                       \
+})
+#define pr_warn_once(fmt, ...)                  \
+({                                              \
+        static bool __print_once;               \
+        if (!__print_once) {                    \
+                __print_once = true;            \
+                pr_warn(fmt, ##__VA_ARGS__);    \
+        }                                       \
+})
+
+/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */
+/* Packs the fields as (Major << 16) | (Minor << 8) | Stepping. */
+#define HSA_GET_GFX_VERSION_FULL(ui32) \
+	(((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
+
+/* Node / GPU id translation helpers (defined outside this header). */
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
+bool prefer_ats(HSAuint32 node_id);
+uint16_t get_device_id_by_node_id(HSAuint32 node_id);
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+uint32_t get_direct_link_cpu(uint32_t gpu_node);
+
+/* Topology queries (defined outside this header). */
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props);
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+				      HsaNodeProperties *NodeProperties);
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
+					HSAuint32 NumIoLinks,
+					HsaIoLinkProperties *IoLinkProperties);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
+
+/* Maps an HSA_PAGE_SIZE_* flag to a size in bytes (memory.cpp). */
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
+
+uint32_t get_num_sysfs_nodes(void);
+
+bool is_forked_child(void);
+
+/* Replaces the allocation-tracking map and its mutex with fresh objects
+ * (memory.cpp). */
+void clear_allocation_map(void);
+
+/*
+ * Block source for the thunk sub-allocator (wsl::SimpleHeap<BlockAllocator>
+ * in memory.cpp). Memory is handed out in multiples of block_size().
+ */
+class BlockAllocator {
+public:
+    /* Allocate at least request_size bytes; the size actually obtained is
+     * written to allocated_size. Returns nullptr on failure. */
+    void* alloc(size_t request_size, size_t& allocated_size) const;
+
+    /* Release a range previously returned by alloc(). */
+    void free(void* ptr, size_t length) const;
+
+    /* Allocation granularity in bytes. */
+    size_t block_size() const { return block_size_; }
+
+private:
+    static const size_t block_size_ = 128 << 20;  // 128MB blocks.
+};
+
+/* Forward to reset() / trim() of the file-local sub-allocator in memory.cpp. */
+void reset_suballocator(void);
+void trim_suballocator(void);
+
+/* Internal allocation entry point behind hsaKmtAllocMemoryAlign().
+ * SkipSubAlloc = true bypasses the sub-allocator and always creates a
+ * dedicated GpuMemory object. */
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                            HSAuint64 SizeInBytes,
+                                            HSAuint64 Alignment,
+                                            HsaMemFlags MemFlags,
+                                            void **MemoryAddress,
+                                            bool SkipSubAlloc = false);
+
+/* Internal free entry point; SkipSubAlloc mirrors the allocation side. */
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                    HSAuint64 SizeInBytes,
+                                    bool SkipSubAlloc = false);
+
+bool queue_acquire_buffer(void *MemoryAddress);
+bool queue_release_buffer(void *MemoryAddress);
+/* Calculate VGPR and SGPR register file size per CU */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
+#define SGPR_SIZE_PER_CU 0x4000
+
+bool is_ipc_sysmemfd(int fd);
+
+/* Imports a dma-buf fd for NodeId and returns the resulting memory handle
+ * through GpuMemHandle; used by amdgpu_bo_import() in libdrm.cpp. */
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle);
+
+bool hsakmt_hsa_loader_init();
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
new file mode 100644
index 0000000000..d91b29ec90
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
@@ -0,0 +1,113 @@
+/*
+ * Linker version script: every symbol listed under "global:" is exported
+ * in version node HSAKMT_1; "local: *" hides all remaining symbols.
+ * The amdgpu_* and drmCommandWriteRead entries at the end are the
+ * libdrm-named entry points implemented in libdrm.cpp.
+ */
+HSAKMT_1
+{
+global:
+hsaKmtOpenKFD;
+hsaKmtCloseKFD;
+hsaKmtGetVersion;
+hsaKmtAcquireSystemProperties;
+hsaKmtReleaseSystemProperties;
+hsaKmtGetNodeProperties;
+hsaKmtGetNodeMemoryProperties;
+hsaKmtGetNodeCacheProperties;
+hsaKmtGetNodeIoLinkProperties;
+hsaKmtCreateEvent;
+hsaKmtDestroyEvent;
+hsaKmtSetEvent;
+hsaKmtResetEvent;
+hsaKmtQueryEventState;
+hsaKmtWaitOnEvent;
+hsaKmtWaitOnMultipleEvents;
+hsaKmtCreateQueue;
+hsaKmtCreateQueueExt;
+hsaKmtUpdateQueue;
+hsaKmtDestroyQueue;
+hsaKmtSetQueueCUMask;
+hsaKmtSetMemoryPolicy;
+hsaKmtAllocMemory;
+hsaKmtAllocMemoryAlign;
+hsaKmtFreeMemory;
+hsaKmtAvailableMemory;
+hsaKmtRegisterMemory;
+hsaKmtRegisterMemoryToNodes;
+hsaKmtRegisterMemoryWithFlags;
+hsaKmtRegisterGraphicsHandleToNodes;
+hsaKmtRegisterGraphicsHandleToNodesExt;
+hsaKmtShareMemory;
+hsaKmtRegisterSharedHandle;
+hsaKmtRegisterSharedHandleToNodes;
+hsaKmtProcessVMRead;
+hsaKmtProcessVMWrite;
+hsaKmtDeregisterMemory;
+hsaKmtMapMemoryToGPU;
+hsaKmtMapMemoryToGPUNodes;
+hsaKmtUnmapMemoryToGPU;
+hsaKmtDbgRegister;
+hsaKmtDbgUnregister;
+hsaKmtDbgWavefrontControl;
+hsaKmtDbgAddressWatch;
+hsaKmtDbgEnable;
+hsaKmtDbgDisable;
+hsaKmtDbgGetDeviceData;
+hsaKmtDbgGetQueueData;
+hsaKmtGetClockCounters;
+hsaKmtPmcGetCounterProperties;
+hsaKmtPmcRegisterTrace;
+hsaKmtPmcUnregisterTrace;
+hsaKmtPmcAcquireTraceAccess;
+hsaKmtPmcReleaseTraceAccess;
+hsaKmtPmcStartTrace;
+hsaKmtPmcQueryTrace;
+hsaKmtPmcStopTrace;
+hsaKmtMapGraphicHandle;
+hsaKmtUnmapGraphicHandle;
+hsaKmtSetTrapHandler;
+hsaKmtGetTileConfig;
+hsaKmtQueryPointerInfo;
+hsaKmtSetMemoryUserData;
+hsaKmtGetQueueInfo;
+hsaKmtAllocQueueGWS;
+hsaKmtRuntimeEnable;
+hsaKmtRuntimeDisable;
+hsaKmtCheckRuntimeDebugSupport;
+hsaKmtGetRuntimeCapabilities;
+hsaKmtDebugTrapIoctl;
+hsaKmtSPMAcquire;
+hsaKmtSPMRelease;
+hsaKmtSPMSetDestBuffer;
+hsaKmtSVMSetAttr;
+hsaKmtSVMGetAttr;
+hsaKmtSetXNACKMode;
+hsaKmtGetXNACKMode;
+hsaKmtOpenSMI;
+hsaKmtExportDMABufHandle;
+hsaKmtGetMemoryHandle;
+hsaKmtWaitOnEvent_Ext;
+hsaKmtWaitOnMultipleEvents_Ext;
+hsaKmtReplaceAsanHeaderPage;
+hsaKmtReturnAsanHeaderPage;
+hsaKmtGetAMDGPUDeviceHandle;
+hsaKmtPcSamplingQueryCapabilities;
+hsaKmtPcSamplingCreate;
+hsaKmtPcSamplingDestroy;
+hsaKmtPcSamplingStart;
+hsaKmtPcSamplingStop;
+hsaKmtPcSamplingSupport;
+hsaKmtAisReadWriteFile;
+hsaKmtModelEnabled;
+hsaKmtQueueRingDoorbell;
+amdgpu_device_initialize;
+amdgpu_device_deinitialize;
+amdgpu_query_gpu_info;
+amdgpu_bo_import;
+amdgpu_bo_va_op;
+amdgpu_device_get_fd;
+amdgpu_bo_cpu_map;
+amdgpu_bo_free;
+amdgpu_bo_export;
+amdgpu_bo_query_info;
+amdgpu_bo_set_metadata;
+drmCommandWriteRead;
+
+local: *;
+};
+
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp
new file mode 100644
index 0000000000..b6ef48cf29
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp
@@ -0,0 +1,989 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "impl/wddm/gpu_memory.h"
+#include "util/simple_heap.h"
+
+/*
+ * Bookkeeping record for one thunk allocation, stored in allocation_map_
+ * keyed by the address returned to the caller.
+ *
+ * Both constructors list their initializers in member declaration order,
+ * which is the order in which C++ actually initializes them.
+ */
+struct Allocation {
+  /* Empty record: null handle and addresses, zero sizes, no dma-buf fd. */
+  Allocation()
+      : handle(0), cpu_addr(nullptr), gpu_addr(0), userptr(false), size(0),
+        user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0),
+        dmabuf_fd(-1), rocr_userdata(nullptr) {}
+
+  /* Record for a live allocation. dmabuf_fd starts at -1 and rocr_userdata
+   * at nullptr; this constructor takes no argument for either. */
+  Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
+             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
+             void *user_data_arg = nullptr, size_t user_size_arg = 0,
+             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
+      : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg),
+        userptr(userptr_arg), size(size_arg), user_data(user_data_arg),
+        size_requested(user_size_arg), node_id(node_id_arg),
+        mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1), rocr_userdata(nullptr) {}
+
+  wsl::thunk::GpuMemoryHandle handle;
+  void *cpu_addr;
+  uint64_t gpu_addr;
+  bool userptr;
+  size_t size; /* actual size = align_up(size_requested, granularity) */
+  void *user_data;
+  size_t size_requested; /* size requested by user */
+  HSAuint32 node_id;
+  HSAuint32 mem_flags_value; /* raw HsaMemFlags::Value of the request */
+  int dmabuf_fd;
+  void *rocr_userdata;
+};
+
+/* Address -> Allocation table and the mutex guarding it. Both are heap
+ * objects held through raw pointers so that clear_allocation_map() can swap
+ * in fresh instances. */
+static std::map<const void *, Allocation>* allocation_map_ = new std::map<const void *, Allocation>();
+static std::mutex* allocation_map_lock_ = new std::mutex();
+
+/*
+ * Drop every tracked allocation record and start with an empty map.
+ *
+ * The mutex is replaced first and the previous one is intentionally NOT
+ * deleted (see the commented-out line), so one std::mutex is leaked per
+ * call. NOTE(review): presumably this runs in a forked child, where the old
+ * mutex may still be held by a thread that no longer exists -- confirm
+ * against the callers.
+ */
+void clear_allocation_map(void)
+{
+  //delete allocation_map_lock_;
+  allocation_map_lock_ = new std::mutex();
+  /* Hold the new lock while the map object itself is swapped. */
+  std::lock_guard<std::mutex> lock(*allocation_map_lock_);
+  delete allocation_map_;
+  allocation_map_ = new std::map<const void *, Allocation>();
+}
+
+/*
+ * Not implemented on the DXG thunk: all arguments are ignored.
+ *
+ * Once the open check passes, a one-time warning is logged. Builds with
+ * assertions enabled then abort at assert(false); builds with NDEBUG
+ * return HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+                                              HSAuint32 DefaultPolicy,
+                                              HSAuint32 AlternatePolicy,
+                                              void *MemoryAddressAlternate,
+                                              HSAuint64 MemorySizeInBytes) {
+  (void)Node;
+  (void)DefaultPolicy;
+  (void)AlternatePolicy;
+  (void)MemoryAddressAlternate;
+  (void)MemorySizeInBytes;
+
+  CHECK_DXG_OPEN();
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Translate an HSA_PAGE_SIZE_* flag into a page size in bytes.
+ *
+ * Unknown values trip assert(false); with assertions compiled out they
+ * fall back to 4 KiB.
+ */
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
+  if (pageSizeFlags == HSA_PAGE_SIZE_4KB)
+    return 1 << 12;   /* 4 KiB */
+  if (pageSizeFlags == HSA_PAGE_SIZE_64KB)
+    return 1 << 16;   /* 64 KiB */
+  if (pageSizeFlags == HSA_PAGE_SIZE_2MB)
+    return 1 << 21;   /* 2 MiB */
+  if (pageSizeFlags == HSA_PAGE_SIZE_1GB)
+    return 1 << 30;   /* 1 GiB */
+
+  /* Unrecognized flag value. */
+  assert(false);
+  return 1 << 12;
+}
+
+/*
+ * Allocate memory with no alignment request: forwards to
+ * hsaKmtAllocMemoryAlign() with an Alignment of 0 and returns its status.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+                                          HSAuint64 SizeInBytes,
+                                          HsaMemFlags MemFlags,
+                                          void **MemoryAddress) {
+  const HSAuint64 no_alignment = 0;
+  return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, no_alignment,
+                                MemFlags, MemoryAddress);
+}
+
+/* 1 when x is a non-zero power of two, 0 otherwise. Every use of x is
+ * parenthesized so expression arguments such as (a | b) evaluate correctly;
+ * x is still evaluated more than once, so avoid side effects. */
+#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 1 : 0)
+
+/*
+ * Check whether SizeInBytes currently fits into free system RAM.
+ *
+ * sysinfo() reports freeram in units of mem_unit bytes, so the value is
+ * scaled before the comparison. Returns false when sysinfo() fails.
+ */
+bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
+  struct sysinfo info;
+  if (sysinfo(&info) != 0)
+    return false;
+
+  /* Convert to bytes; comparing the raw freeram count is only correct
+   * when mem_unit happens to be 1. */
+  const uint64_t free_bytes =
+      static_cast<uint64_t>(info.freeram) * info.mem_unit;
+  return SizeInBytes <= free_bytes;
+}
+
+/*
+ * Obtain a new backing block for the sub-allocator.
+ *
+ * request_size is rounded up to a multiple of block_size() and the result
+ * is reported through allocated_size (written even when the allocation
+ * fails). The block is requested on node 1 with the CoarseGrain and
+ * NoSubstitute flags set, bypassing the sub-allocator itself.
+ * Returns the block address, or nullptr on failure.
+ */
+void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
+  HsaMemFlags block_flags;
+  block_flags.Value = 0;
+  block_flags.ui32.CoarseGrain = 1;
+  block_flags.ui32.NoSubstitute = 1;
+
+  allocated_size = wsl::AlignUp(request_size, block_size());
+
+  void *block = nullptr;
+  const HSAKMT_STATUS status = hsaKmtAllocMemoryAlignInternal(
+      1, allocated_size, 0, block_flags, &block, true);
+
+  return (status == HSAKMT_STATUS_SUCCESS) ? block : nullptr;
+}
+
+/*
+ * Return a backing block to the thunk, bypassing the sub-allocator.
+ * A failure is only logged; nothing is reported to the caller.
+ */
+void BlockAllocator::free(void* ptr, size_t length) const {
+  const HSAKMT_STATUS status = hsaKmtFreeMemoryInternal(ptr, length, true);
+  if (status == HSAKMT_STATUS_SUCCESS)
+    return;
+
+  pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length);
+}
+
+/* File-local sub-allocator that obtains its backing blocks through
+ * BlockAllocator. */
+static wsl::SimpleHeap<BlockAllocator> fragment_allocator_;
+
+/* Forward to fragment_allocator_.reset(). */
+void reset_suballocator(void) {
+  fragment_allocator_.reset();
+}
+
+/* Forward to fragment_allocator_.trim(); the allocation path uses trim()
+ * to release cached blocks before retrying a failed direct allocation. */
+void trim_suballocator(void) {
+  fragment_allocator_.trim();
+}
+
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                             HSAuint64 SizeInBytes,
+                                             HSAuint64 Alignment,
+                                             HsaMemFlags MemFlags,
+                                             void **MemoryAddress,
+                                             bool SkipSubAlloc) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (MemFlags.ui32.FixedAddress) {
+    if (*MemoryAddress == nullptr)
+      return HSAKMT_STATUS_INVALID_PARAMETER;
+  } else
+    *MemoryAddress = nullptr;
+
+  uint32_t node = (PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode;
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(node);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.size = SizeInBytes;
+
+  /* If initialize scratch pool of GpuAgent, treat it as SVM reserve */
+  if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000)
+    MemFlags.ui32.OnlyAddress = 1;
+
+  create_info.alignment = Alignment;
+  create_info.va_hint = reinterpret_cast<gpusize>(*MemoryAddress);
+  if ((PreferredNode == 0 && MemFlags.ui32.HostAccess)
+    || dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) {
+    if (SizeInBytes > dxg_runtime->max_single_alloc_size)
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes))
+      return HSAKMT_STATUS_NO_MEMORY;
+
+    /* If allocate VRAM under ZFB mode */
+    if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1)
+      MemFlags.ui32.CoarseGrain = 1;
+
+    // AllocateNonPaged == AllocateIPC
+    create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
+
+    create_info.domain = thunk_proxy::AllocDomain::kSystem;
+  } else {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+  }
+
+  if (!MemFlags.ui32.CoarseGrain)
+    create_info.mem_flags = thunk_proxy::kFineGrain;
+
+  //In hsa-runtime, only kernarg region set Uncached.
+  if (MemFlags.ui32.Uncached)
+    create_info.mem_flags |= thunk_proxy::kKernarg;
+
+  create_info.flags.physical_only = MemFlags.ui32.NoAddress;
+  create_info.flags.alloc_va = !create_info.flags.physical_only;
+  create_info.flags.interprocess = MemFlags.ui32.NoAddress;
+  create_info.flags.interprocess |= MemFlags.ui32.Contiguous;
+  create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous;
+  create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned
+  create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress;
+  create_info.flags.blit_kernel_object =
+      (MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess &&
+      (create_info.domain == thunk_proxy::AllocDomain::kSystem));
+  /*when only alloc virtual or only physical, it's vmm allocation, force to local*/
+  if (create_info.flags.virtual_alloc || create_info.flags.physical_only
+        || create_info.flags.physical_contiguous) {
+    create_info.domain = thunk_proxy::AllocDomain::kLocal;
+    SkipSubAlloc = true;
+  }
+
+  /* Only allow using the suballocator for ordinary VRAM.*/
+  bool trim_safe = false;
+  if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) {
+    /* just quickly skip SA if size is bigger than SA block size.*/
+    gpusize real_size;
+    if (create_info.size > GPU_HUGE_PAGE_SIZE)
+      real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE);
+    else
+      real_size = wsl::AlignUp(create_info.size, getpagesize());
+
+    if (real_size < fragment_allocator_.default_block_size()) {
+      *MemoryAddress = fragment_allocator_.alloc(real_size);
+      if (*MemoryAddress)
+        return HSAKMT_STATUS_SUCCESS;
+    }
+
+    /* SA might keep a lot of free blocks as *cache*.
+       * We can trim them if direct allocation fails at first time.
+       */
+    trim_safe = true;
+  }
+
+after_trim:
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    /* For these physical allocations, use the GpuMemory object's address as the thunk handle. */
+    if (create_info.flags.physical_only || create_info.dmabuf_fd > 0)
+      *MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+    else
+      *MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+
+    (*allocation_map_)[*MemoryAddress] = Allocation(
+        gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
+        create_info.size, false, nullptr, SizeInBytes,
+        MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (trim_safe) {
+    /* attempt to release memory from the block allocator and retry */
+    fragment_allocator_.trim();
+    trim_safe = false;
+    goto after_trim;
+  }
+
+  return HSAKMT_STATUS_ERROR;
+}
+
+/*
+ * Public allocation entry point.
+ *
+ * Forwards every argument to hsaKmtAllocMemoryAlignInternal() and asks it to
+ * skip the sub-allocator whenever dxg_runtime->enable_thunk_sub_allocator is
+ * false. Returns whatever status the internal helper produces.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+                                               HSAuint64 SizeInBytes,
+                                               HSAuint64 Alignment,
+                                               HsaMemFlags MemFlags,
+                                               void **MemoryAddress) {
+  const bool skip_sub_alloc = !dxg_runtime->enable_thunk_sub_allocator;
+
+  return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, Alignment,
+                                        MemFlags, MemoryAddress,
+                                        skip_sub_alloc);
+}
+
+/*
+ * Release memory previously handed out by this thunk.
+ *
+ * MemoryAddress  address returned by the allocation path; must be non-null.
+ * SizeInBytes    not consulted by this implementation.
+ * SkipSubAlloc   when false, the fragment allocator gets the first chance to
+ *                take the address back.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address,
+ * HSAKMT_STATUS_ERROR when the address is not tracked or the buffer is still
+ * referenced by a queue, and HSAKMT_STATUS_SUCCESS otherwise (including the
+ * case where an imported IPC buffer still has other shared references and is
+ * therefore kept alive).
+ */
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                       HSAuint64 SizeInBytes,
+                                       bool SkipSubAlloc) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Addresses owned by the fragment allocator are simply handed back to it. */
+  if (!SkipSubAlloc && fragment_allocator_.free(MemoryAddress))
+    return HSAKMT_STATUS_SUCCESS;
+
+  wsl::thunk::GpuMemory *victim = nullptr;
+  {
+    std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+
+    auto entry = allocation_map_->find(MemoryAddress);
+    if (entry == allocation_map_->end())
+      return HSAKMT_STATUS_ERROR;
+
+    victim = wsl::thunk::GpuMemory::Convert(entry->second.handle);
+    if (victim->IsQueueReferenced())
+      return HSAKMT_STATUS_ERROR;
+
+    wsl::thunk::GpuMemoryDescFlags desc_flags;
+    desc_flags.reserved = victim->Flags();
+    if (desc_flags.is_imported_vram_ipc && victim->DecSharedReference()) {
+      pr_info("memory is still referenced\n");
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    /* Drop the cached dma-buf fd, if one was exported for this entry. */
+    auto &cached_fd = entry->second.dmabuf_fd;
+    if (cached_fd >= 0) {
+      close(cached_fd);
+      cached_fd = -1;
+    }
+    allocation_map_->erase(entry);
+  }
+
+  /* The object is destroyed only after the map lock has been released. */
+  delete victim;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Public free entry point: delegates to hsaKmtFreeMemoryInternal() using that
+ * helper's default for the sub-allocator switch.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+                                         HSAuint64 SizeInBytes) {
+  const HSAKMT_STATUS status =
+      hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes);
+  return status;
+}
+
+/*
+ * Take a queue reference on the GpuMemory object tracked under MemoryAddress.
+ *
+ * Returns true when the address is tracked in allocation_map_ and the
+ * reference was taken; false for a null address, an untracked address, or a
+ * handle that does not convert to a GpuMemory object.
+ *
+ * Note: this function returns bool, so a lookup miss must report `false`.
+ * Returning an HSAKMT_STATUS error enumerator here would be converted to bool,
+ * and a non-zero enumerator would be reported as success.
+ */
+bool queue_acquire_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem =
+      wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Validate before dereferencing, not after. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->GetQueueReference();
+  return true;
+}
+
+/*
+ * Drop a queue reference on the GpuMemory object tracked under MemoryAddress.
+ *
+ * Returns true when the address is tracked in allocation_map_ and the
+ * reference was released; false for a null address, an untracked address, or
+ * a handle that does not convert to a GpuMemory object.
+ *
+ * Note: this function returns bool, so a lookup miss must report `false`.
+ * Returning an HSAKMT_STATUS error enumerator here would be converted to bool,
+ * and a non-zero enumerator would be reported as success.
+ */
+bool queue_release_buffer(void *MemoryAddress) {
+  if (!MemoryAddress)
+    return false;
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+  auto it = allocation_map_->find(MemoryAddress);
+  if (it == allocation_map_->end())
+    return false;
+
+  wsl::thunk::GpuMemory *gpu_mem =
+      wsl::thunk::GpuMemory::Convert(it->second.handle);
+  /* Validate before dereferencing, not after. */
+  if (gpu_mem == nullptr)
+    return false;
+
+  gpu_mem->PutQueueReference();
+  return true;
+}
+
+/*
+ * Look up the GpuMemory object tracked under exactly MemoryAddress.
+ * Returns nullptr when allocation_map_ has no entry with that key.
+ * The map lock is held for the duration of the lookup only.
+ */
+wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) {
+  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+
+  const auto entry = allocation_map_->find(MemoryAddress);
+  if (entry != allocation_map_->end())
+    return wsl::thunk::GpuMemory::Convert(entry->second.handle);
+
+  return nullptr;
+}
+
+/*
+ * Report the value of VramAvail() for the device behind Node.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when AvailableBytes is null,
+ * HSAKMT_STATUS_ERROR when get_wddmdev() yields no device for Node, and
+ * HSAKMT_STATUS_SUCCESS after writing *AvailableBytes.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
+                                              HSAuint64 *AvailableBytes) {
+  CHECK_DXG_OPEN();
+
+  if (AvailableBytes == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  wsl::thunk::WDDMDevice *device = get_wddmdev(Node);
+  if (device == nullptr)
+    return HSAKMT_STATUS_ERROR;
+
+  *AvailableBytes = device->VramAvail();
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. Both arguments are ignored.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)MemorySizeInBytes;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Hits assert(false) and - when the assert is compiled out - returns
+ * HSAKMT_STATUS_SUCCESS. All arguments are ignored.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+                                                    HSAuint64 MemorySizeInBytes,
+                                                    HSAuint64 NumberOfNodes,
+                                                    HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)MemorySizeInBytes;
+  (void)NumberOfNodes;
+  (void)NodeArray;
+
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Validate a host-memory registration request. No state is recorded here; the
+ * function only checks the arguments and the runtime configuration.
+ *
+ * Returns:
+ *   HSAKMT_STATUS_INVALID_PARAMETER  null address, or ExtendedCoherent and
+ *                                    CoarseGrain requested together
+ *   HSAKMT_STATUS_NOT_SUPPORTED      memory is not host-accessible, is
+ *                                    non-paged, or the runtime is not dGPU
+ *   HSAKMT_STATUS_SUCCESS            otherwise
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  const bool conflicting_coherence =
+      MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain;
+  if (conflicting_coherence)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  // Registered memory should be ordinary paged host memory.
+  const bool ordinary_host_memory =
+      (MemFlags.ui32.HostAccess == 1) && (MemFlags.ui32.NonPaged != 1);
+  if (!ordinary_host_memory)
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  /* TODO: support mixed APU and dGPU configurations */
+  if (!dxg_runtime->hsakmt_is_dgpu)
+    return HSAKMT_STATUS_NOT_SUPPORTED;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Tell whether fd refers to an IPC system-memory file.
+ *
+ * Resolves the /proc/self/fd/<fd> symlink and returns true when the link
+ * target contains the substring "rocr4wsl_gtt". Returns false when the link
+ * cannot be read. Targets longer than the 255-byte buffer are truncated
+ * before the substring search.
+ */
+bool is_ipc_sysmemfd(int fd) {
+  char target[256];
+  const std::string link_path = "/proc/self/fd/" + std::to_string(fd);
+
+  const ssize_t len = readlink(link_path.c_str(), target, sizeof(target) - 1);
+  if (len == -1)
+    return false;
+
+  /* readlink() does not NUL-terminate. */
+  target[len] = '\0';
+
+  const char *hit = strstr(target, "rocr4wsl_gtt");
+  return hit != nullptr;
+}
+
+/*
+ * Convenience wrapper around hsaKmtRegisterGraphicsHandleToNodesExt() that
+ * passes register flags with Value == 0.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
+                                                            HsaGraphicsResourceInfo *GraphicsResourceInfo,
+                                                            HSAuint64 NumberOfNodes,
+                                                            HSAuint32 *NodeArray) {
+  HSA_REGISTER_MEM_FLAGS no_flags;
+  no_flags.Value = 0;
+
+  const HSAKMT_STATUS status = hsaKmtRegisterGraphicsHandleToNodesExt(
+      GraphicsResourceHandle, GraphicsResourceInfo, NumberOfNodes, NodeArray,
+      no_flags);
+  return status;
+}
+
+/*
+ * Import a graphics resource (passed as a file descriptor in
+ * GraphicsResourceHandle) and describe it in *GraphicsResourceInfo.
+ *
+ * - If the fd is recognised by is_ipc_sysmemfd(), nothing is imported: only
+ *   GraphicsResourceInfo->NodeId is set (to dxg_runtime->default_node).
+ * - If NumberOfNodes is 0, the default node is used and no virtual address is
+ *   requested for the import.
+ * - Otherwise the fd is imported on NodeArray[0] via import_dmabuf_fd().
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when GraphicsResourceInfo is null,
+ * or when NumberOfNodes is non-zero but NodeArray is null; the status of
+ * import_dmabuf_fd() on import failure; HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
+                                                               HsaGraphicsResourceInfo *GraphicsResourceInfo,
+                                                               HSAuint64 NumberOfNodes,
+                                                               HSAuint32 *NodeArray,
+                                                               HSA_REGISTER_MEM_FLAGS RegisterFlags) {
+  CHECK_DXG_OPEN();
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  /* The result structure is written on every success path below. */
+  if (!GraphicsResourceInfo)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (is_ipc_sysmemfd(GraphicsResourceHandle)) {
+    GraphicsResourceInfo->NodeId = dxg_runtime->default_node;
+    pr_info("skip register sysmemfd. It would be released in next step\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  if (NumberOfNodes == 0) {
+    RegisterFlags.ui32.requiresVAddr = 0;
+    NumberOfNodes = 1;
+    NodeArray = (HSAuint32*)&(dxg_runtime->default_node);
+  }
+
+  /* NodeArray[0] is read below; reject a null array with a non-zero count. */
+  if (!NodeArray)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("number of nodes %lu\n", NumberOfNodes);
+  wsl::thunk::GpuMemoryHandle mem_handle;
+  ret = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0],
+                          RegisterFlags.ui32.requiresVAddr,
+                          false, &mem_handle);
+  if (ret != HSAKMT_STATUS_SUCCESS) {
+    pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, "
+           "GraphicsResourceHandle: %lu, NodeId: %u\n",
+           GraphicsResourceHandle, NodeArray[0]);
+    return ret;
+  }
+  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle);
+  GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId();
+  GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize();
+  /* With a VA the GPU address is reported; otherwise the handle-aperture address. */
+  GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ?
+                                          reinterpret_cast<void *>(gpu_mem->GpuAddress()):
+                                          reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  return ret;
+}
+
+/*
+ * Export the allocation containing MemoryAddress as a dma-buf fd.
+ *
+ * The allocation's fd is exported once and cached in its allocation_map_
+ * entry; each call hands the caller a dup() of the cached fd, so the caller
+ * owns (and must close) the returned descriptor.
+ *
+ * DMABufFd  out: duplicated descriptor for the allocation.
+ * Offset    out: MemoryAddress minus the entry's gpu_addr.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER when an output pointer is null,
+ * HSAKMT_STATUS_ERROR when no tracked allocation contains MemoryAddress, when
+ * the export fails, or when the descriptor cannot be duplicated, and
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+                                                 HSAuint64 MemorySizeInBytes,
+                                                 int *DMABufFd,
+                                                 HSAuint64 *Offset) {
+  CHECK_DXG_OPEN();
+
+  if (!DMABufFd || !Offset)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+  /* Find the last entry whose key is <= MemoryAddress. */
+  auto it = allocation_map_->upper_bound(MemoryAddress);
+  if (it == allocation_map_->begin())
+    return HSAKMT_STATUS_ERROR;
+  --it;
+
+  /* The preceding entry only matches if the address lies inside it;
+   * otherwise an unrelated buffer would be exported with a bogus offset. */
+  const uint64_t addr = reinterpret_cast<uint64_t>(MemoryAddress);
+  const uint64_t base = reinterpret_cast<uint64_t>(it->first);
+  if (addr - base >= it->second.size)
+    return HSAKMT_STATUS_ERROR;
+
+  if (it->second.dmabuf_fd == -1) {
+    int exported_fd = -1;
+    auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    auto code = gpu_mem->ExportPhysicalHandle(&exported_fd);
+    if (code != ErrorCode::Success)
+      return HSAKMT_STATUS_ERROR;
+    it->second.dmabuf_fd = exported_fd;
+  }
+
+  /* Never report success with an invalid descriptor. */
+  const int caller_fd = dup(it->second.dmabuf_fd);
+  if (caller_fd < 0)
+    return HSAKMT_STATUS_ERROR;
+
+  *DMABufFd = caller_fd;
+  *Offset = addr - it->second.gpu_addr;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: always reports HSAKMT_STATUS_NOT_SUPPORTED once the CHECK_DXG_OPEN()
+ * guard has passed. All arguments are ignored.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes,
+                      uint64_t *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)SizeInBytes;
+  (void)SharedMemoryHandle;
+
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/*
+ * Import DMABufFd as a GpuMemory object on node NodeId and track it in
+ * allocation_map_.
+ *
+ * DMABufFd      descriptor to import.
+ * NodeId        node whose device performs the import.
+ * alloc_va      request a GPU virtual address; also selects whether the map
+ *               key is the GPU address or the handle-aperture address.
+ * is_ipc_memfd  the fd may be a system-memory IPC file; if its size looks
+ *               plausible (>= 4 KiB, 4 KiB-aligned, below the system heap
+ *               size) it is imported as system memory of that size.
+ * GpuMemHandle  out: handle of the imported (or already existing) object;
+ *               set to nullptr on failure.
+ *
+ * When the device reports that the buffer originates from this same process
+ * and device, the existing allocation is reused and its shared reference
+ * count is incremented instead of creating a new entry.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null GpuMemHandle,
+ * HSAKMT_STATUS_ERROR on any failure, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                                       uint32_t NodeId,
+                                       bool alloc_va,
+                                       bool is_ipc_memfd,
+                                       wsl::thunk::GpuMemoryHandle *GpuMemHandle) {
+  CHECK_DXG_OPEN();
+
+  if (!GpuMemHandle)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  *GpuMemHandle = nullptr;
+  wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId);
+  /* get_wddmdev() can fail for an unknown node; do not dereference null. */
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.dmabuf_fd = DMABufFd;
+  create_info.flags.alloc_va = alloc_va;
+
+  if (is_ipc_memfd) {
+    struct stat st;
+    /* Without this check a failed fstat() would leave st uninitialised. */
+    if (fstat(DMABufFd, &st) != 0) {
+      pr_err("fail to stat fd %d\n", DMABufFd);
+      return HSAKMT_STATUS_ERROR;
+    }
+    uint64_t sz = st.st_size;
+    if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
+      pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
+      create_info.flags.sysmem_ipc_sig_importer = 1;        // set to 1 when backend is system memory
+      create_info.size = st.st_size;
+    }
+  }
+
+  gpusize gpu_va = 0;
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va);
+  if (code == ErrorCode::SameProcessSameDevice) {
+    /* Unit_hipMemPoolExportToShareableHandle_SameProc */
+    pr_info("imported from same process, use the old one\n");
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    auto it = allocation_map_->find((void*)gpu_va);
+    if (it == allocation_map_->end()) {
+      /* Log the address that was actually looked up. */
+      pr_err("where's the conflict buffer? va %#lx\n", (unsigned long)gpu_va);
+      return HSAKMT_STATUS_ERROR;
+    }
+    wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    conflict_mem->IncSharedReference();
+    *GpuMemHandle = it->second.handle;
+    return HSAKMT_STATUS_SUCCESS;
+  } else if (code != ErrorCode::Success) {
+    pr_err("fail to import fd, ret %d\n", (int)code);
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* Same keying rule as the allocation path: GPU VA when one exists,
+   * handle-aperture address otherwise. */
+  void *MemoryAddress;
+  if (alloc_va)
+    MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+  else
+    MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
+
+  *GpuMemHandle = gpu_mem->GetGpuMemoryHandle();
+
+  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+  /*
+   * The value of gpu_mem->Flags() still needs converting back from
+   * GpuMemoryCreateFlags to HsaMemFlags; see hsaKmtAllocMemoryAlign.
+   */
+  (*allocation_map_)[MemoryAddress] = Allocation(
+    *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress,
+    gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(),
+    NodeId, gpu_mem->Flags());
+
+  return HSAKMT_STATUS_SUCCESS;
+
+}
+
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. All arguments are ignored.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes,
+                  HsaSharedMemoryHandle *SharedMemoryHandle) {
+  CHECK_DXG_OPEN();
+
+  (void)MemoryAddress;
+  (void)SizeInBytes;
+  (void)SharedMemoryHandle;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. The output arguments are
+ * never written.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
+                           void **MemoryAddress, HSAuint64 *SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  (void)SharedMemoryHandle;
+  (void)MemoryAddress;
+  (void)SizeInBytes;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. The output arguments are
+ * never written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
+    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
+    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  (void)SharedMemoryHandle;
+  (void)MemoryAddress;
+  (void)SizeInBytes;
+  (void)NumberOfNodes;
+  (void)NodeArray;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Deprecated entry point.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_NOT_IMPLEMENTED. No argument is read
+ * or written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
+                                            HsaMemoryRange *LocalMemoryArray,
+                                            HSAuint64 LocalMemoryArrayCount,
+                                            HsaMemoryRange *RemoteMemoryArray,
+                                            HSAuint64 RemoteMemoryArrayCount,
+                                            HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+
+  (void)Pid;
+  (void)LocalMemoryArray;
+  (void)LocalMemoryArrayCount;
+  (void)RemoteMemoryArray;
+  (void)RemoteMemoryArrayCount;
+  (void)SizeCopied;
+
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Deprecated entry point.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_NOT_IMPLEMENTED. No argument is read
+ * or written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
+                                             HsaMemoryRange *LocalMemoryArray,
+                                             HSAuint64 LocalMemoryArrayCount,
+                                             HsaMemoryRange *RemoteMemoryArray,
+                                             HSAuint64 RemoteMemoryArrayCount,
+                                             HSAuint64 *SizeCopied) {
+  CHECK_DXG_OPEN();
+
+  (void)Pid;
+  (void)LocalMemoryArray;
+  (void)LocalMemoryArrayCount;
+  (void)RemoteMemoryArray;
+  (void)RemoteMemoryArrayCount;
+  (void)SizeCopied;
+
+  pr_warn_once("has been deprecated\n");
+  assert(false);
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Undo a registration / import for MemoryAddress.
+ *
+ * - Unknown addresses are accepted silently (HSAKMT_STATUS_SUCCESS).
+ * - Imported IPC VRAM: the shared reference is dropped; when it reaches zero
+ *   the map entry is removed and the GpuMemory object destroyed.
+ * - Userptr mappings: both map entries created for the mapping (the one found
+ *   under MemoryAddress and the one keyed by the GPU address) are removed and
+ *   the GpuMemory object destroyed.
+ * - Anything else is left untouched.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address,
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+
+    auto it = allocation_map_->find(MemoryAddress);
+    if (it == allocation_map_->end()) {
+      return HSAKMT_STATUS_SUCCESS;
+    }
+
+    auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+    wsl::thunk::GpuMemoryDescFlags flags;
+    flags.reserved = gpu_mem->Flags();
+    // IPC mem(vram)
+    if (flags.is_imported_vram_ipc &&
+      gpu_mem->DecSharedReference() == 0) {
+      allocation_map_->erase(it);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+    if (it->second.userptr) {
+      /* Read the companion key BEFORE erasing: erase(it) invalidates the
+       * iterator, so it->second must not be touched afterwards. */
+      void *gpu_key = (void *)it->second.gpu_addr;
+      allocation_map_->erase(it);
+      allocation_map_->erase(gpu_key);
+      delete gpu_mem;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Map MemoryAddress for the default node only.
+ * Thin wrapper over hsaKmtMapMemoryToGPUNodes() with a one-element node list
+ * holding dxg_runtime->default_node and map flags whose Value is 0.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+                                             HSAuint64 MemorySizeInBytes,
+                                             HSAuint64 *AlternateVAGPU) {
+  HsaMemMapFlags no_flags;
+  no_flags.Value = 0;
+
+  HSAuint32 default_nodes[] = {dxg_runtime->default_node};
+  const HSAuint64 node_count = 1;
+
+  return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes,
+                                   AlternateVAGPU, no_flags, node_count,
+                                   default_nodes);
+}
+/*
+ * Make MemoryAddress usable by the GPU and, on most paths, report the GPU
+ * virtual address through *AlternateVAGPU.
+ *
+ * The range is first expanded to 4 KiB boundaries, then handled by the first
+ * matching case:
+ *   1. address inside a fragment-allocator block: success, nothing to do
+ *      (*AlternateVAGPU is not written);
+ *   2. tracked imported IPC VRAM: map its VA, make it resident and wait for
+ *      the paging fence (*AlternateVAGPU is not written);
+ *   3. tracked non-userptr allocation: success if it is large enough, the
+ *      GPU address equals MemoryAddress;
+ *   4. already-mapped userptr: return the existing GPU address plus offset;
+ *   5. otherwise: create a new userptr GpuMemory on NodeArray[0] and track
+ *      it under both the CPU address and the GPU address.
+ *
+ * MemMapFlags and NumberOfNodes are not consulted; only NodeArray[0] is used.
+ *
+ * Returns HSAKMT_STATUS_ERROR for null MemoryAddress / AlternateVAGPU, for a
+ * tracked allocation that is too small, and for any device failure;
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
+    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
+    HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
+  CHECK_DXG_OPEN();
+
+  if (!MemoryAddress || !AlternateVAGPU) {
+    pr_err("FIXME: mapping NULL pointer\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  // Expand [MemoryAddress, MemoryAddress + size) outward to 4 KiB boundaries.
+  uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096);
+  uint64_t end =
+      wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
+
+  void *aligned_ptr = (void *)start;
+  size_t aligned_size = end - start;
+
+  // Case 1: the address belongs to a fragment-allocator block.
+  {
+    if (nullptr != fragment_allocator_.block_base(aligned_ptr))
+      return HSAKMT_STATUS_SUCCESS;
+  }
+
+  {
+    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
+    // Lookup by the page-aligned address.
+    auto it = allocation_map_->find(aligned_ptr);
+    if (it != allocation_map_->end()) {
+      wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
+      wsl::thunk::GpuMemoryDescFlags flags;
+      flags.reserved = gpu_mem->Flags();
+      // Case 2: IPC mem
+      if (flags.is_imported_vram_ipc) {
+
+        auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size());
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        code = gpu_mem->MakeResident();
+        if (code != ErrorCode::Success)
+          return HSAKMT_STATUS_ERROR;
+
+        wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice();
+        if (!dev->WaitOnPagingFenceFromCpu())
+          return HSAKMT_STATUS_ERROR;
+
+        return HSAKMT_STATUS_SUCCESS;
+      }
+
+      if (!it->second.userptr) {
+      // Case 3: GTT/Local mem
+        if (it->second.size >= MemorySizeInBytes) {
+          *AlternateVAGPU = (uint64_t)MemoryAddress;
+          return HSAKMT_STATUS_SUCCESS;
+        } else {
+          return HSAKMT_STATUS_ERROR;
+        }
+      }
+    }
+
+    // Case 4: userptr mem - this lookup uses the caller's unaligned address,
+    // which is the key the userptr path below registers.
+    it = allocation_map_->find(MemoryAddress);
+    if (it != allocation_map_->end()) {
+      if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
+        *AlternateVAGPU =
+            (uintptr_t)it->second.gpu_addr +
+            ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
+        return HSAKMT_STATUS_SUCCESS;
+      }
+    }
+  }
+
+  // Case 5: map userptr. The map lock is not held while the device creates
+  // the memory object.
+  wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]);
+  if (!dev)
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *gpu_mem = nullptr;
+  wsl::thunk::GpuMemoryHandle handle = 0;
+  uint64_t addr;
+  wsl::thunk::GpuMemoryCreateInfo create_info{};
+  create_info.domain = thunk_proxy::kUserMemory;
+  create_info.size = aligned_size;
+  create_info.user_ptr = aligned_ptr;
+
+  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
+  if (code == ErrorCode::Success) {
+    addr = gpu_mem->GpuAddress();
+    handle = gpu_mem->GetGpuMemoryHandle();
+  } else {
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  {
+    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
+    // Two entries share one handle: one keyed by the caller's CPU address,
+    // one keyed by the GPU address.
+   (*allocation_map_)[MemoryAddress] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
+                   MemorySizeInBytes);
+    (*allocation_map_)[(void *)addr] =
+        Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
+                   MemorySizeInBytes);
+  }
+
+  // Re-apply the caller's offset within the first page.
+  *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Undo a GPU mapping for MemoryAddress.
+ *
+ * Only imported IPC VRAM that was not shared from this same process is
+ * actually unmapped and evicted; every other tracked allocation is reported
+ * as success without further work, as are null addresses and addresses that
+ * fall inside a fragment-allocator block.
+ *
+ * Returns HSAKMT_STATUS_ERROR when the address is not tracked, when the
+ * buffer is still referenced by a queue, or when the unmap fails;
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  if (MemoryAddress == nullptr) {
+    /* Workaround for runtime bug */
+    pr_err("FIXME: Unmapping NULL pointer\n");
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  pr_debug("address %p\n", MemoryAddress);
+
+  if (fragment_allocator_.block_base(MemoryAddress) != nullptr)
+    return HSAKMT_STATUS_SUCCESS;
+
+  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+
+  const auto entry = allocation_map_->find(MemoryAddress);
+  if (entry == allocation_map_->end())
+    return HSAKMT_STATUS_ERROR;
+
+  wsl::thunk::GpuMemory *mem =
+      wsl::thunk::GpuMemory::Convert(entry->second.handle);
+  if (mem->IsQueueReferenced())
+    return HSAKMT_STATUS_ERROR;
+
+  // IPC mem
+  wsl::thunk::GpuMemoryDescFlags desc_flags;
+  desc_flags.reserved = mem->Flags();
+  const bool foreign_ipc_import =
+      desc_flags.is_imported_vram_ipc && !mem->IsSharedFromSameProcess();
+  if (!foreign_ipc_import)
+    return HSAKMT_STATUS_SUCCESS;
+
+  const auto unmap_code =
+      mem->UnmapGpuVirtualAddress(mem->GpuAddress(), mem->Size());
+  if (unmap_code != ErrorCode::Success)
+    return HSAKMT_STATUS_ERROR;
+
+  mem->Evict();
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: logs through pr_warn_once() and returns
+ * HSAKMT_STATUS_NOT_IMPLEMENTED. All arguments are ignored.
+ *
+ * This API was only ever implemented in KFD for Kaveri and was never
+ * upstreamed. There are no open-source users of this interface. It has been
+ * superseded by RegisterGraphicsHandleToNodes.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
+                                               HSAuint64 GraphicDeviceHandle,
+                                               HSAuint64 GraphicResourceHandle,
+                                               HSAuint64 GraphicResourceOffset,
+                                               HSAuint64 GraphicResourceSize,
+                                               HSAuint64 *FlatMemoryAddress) {
+  CHECK_DXG_OPEN();
+
+  (void)NodeId;
+  (void)GraphicDeviceHandle;
+  (void)GraphicResourceHandle;
+  (void)GraphicResourceOffset;
+  (void)GraphicResourceSize;
+  (void)FlatMemoryAddress;
+
+  pr_warn_once("not implemented\n");
+
+  return HSAKMT_STATUS_NOT_IMPLEMENTED;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. All arguments are ignored.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
+                                                 HSAuint64 FlatMemoryAddress,
+                                                 HSAuint64 SizeInBytes) {
+  CHECK_DXG_OPEN();
+
+  (void)NodeId;
+  (void)FlatMemoryAddress;
+  (void)SizeInBytes;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: not implemented in this backend.
+ * Logs through pr_warn_once(), hits assert(false), and - when the assert is
+ * compiled out - returns HSAKMT_STATUS_SUCCESS. *config is never written.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId,
+                                            HsaGpuTileConfig *config) {
+  CHECK_DXG_OPEN();
+
+  (void)NodeId;
+  (void)config;
+
+  pr_warn_once("not implemented\n");
+  assert(false);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Describe the tracked allocation that contains Pointer.
+ *
+ * *PointerInfo is zeroed first. The allocation is the map entry with the
+ * greatest key <= Pointer, provided Pointer lies within size_requested bytes
+ * of that key. The entry is copied while the map lock is held and the result
+ * is filled in after the lock is released.
+ *
+ * Type is HSA_POINTER_REGISTERED_USER for userptr entries,
+ * HSA_POINTER_RESERVED_ADDR when the backing object reports IsVirtual()
+ * (SizeInBytes stays 0 in that case), and HSA_POINTER_ALLOCATED otherwise.
+ *
+ * Returns HSAKMT_STATUS_INVALID_PARAMETER for null arguments,
+ * HSAKMT_STATUS_ERROR (with Type = HSA_POINTER_UNKNOWN) when no allocation
+ * contains Pointer, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+                                               HsaPointerInfo *PointerInfo) {
+  CHECK_DXG_OPEN();
+
+  if (Pointer == nullptr || PointerInfo == nullptr)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pr_debug("pointer %p\n", Pointer);
+
+  memset(PointerInfo, 0, sizeof(HsaPointerInfo));
+
+  wsl::thunk::GpuMemory *backing = nullptr;
+  Allocation record;
+  bool matched = false;
+  {
+    std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+    auto candidate = allocation_map_->upper_bound(Pointer);
+    if (candidate != allocation_map_->begin()) {
+      --candidate;
+      const uint8_t *range_end =
+          reinterpret_cast<const uint8_t*>(candidate->first) +
+          candidate->second.size_requested;
+      if (Pointer >= candidate->first && Pointer < range_end) {
+        record = candidate->second;
+        backing = wsl::thunk::GpuMemory::Convert(candidate->second.handle);
+        matched = true;
+      }
+    }
+  }
+
+  if (!matched) {
+    pr_debug("can't found allocation for %p\n", Pointer);
+    PointerInfo->Type = HSA_POINTER_UNKNOWN;
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* Fields common to every pointer type. */
+  PointerInfo->Node = record.node_id;
+  PointerInfo->MemFlags.Value = record.mem_flags_value;
+  PointerInfo->CPUAddress = record.cpu_addr;
+  PointerInfo->GPUAddress = record.gpu_addr;
+  PointerInfo->UserData = record.rocr_userdata;
+
+  /* Classification and reported size. */
+  if (record.userptr) {
+    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
+    PointerInfo->SizeInBytes = record.size;
+  } else if (backing->IsVirtual()) {
+    PointerInfo->Type = HSA_POINTER_RESERVED_ADDR;
+  } else {
+    PointerInfo->Type = HSA_POINTER_ALLOCATED;
+    PointerInfo->SizeInBytes = record.size_requested;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Attach UserData to the allocation tracked under Pointer rounded down to a
+ * 4 KiB boundary (stored in the entry's rocr_userdata field).
+ *
+ * Returns HSAKMT_STATUS_ERROR when no entry exists for that exact key,
+ * HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+                                                void *UserData) {
+  CHECK_DXG_OPEN();
+
+  const uint64_t page_base = wsl::AlignDown((uint64_t)Pointer, 4096);
+
+  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
+  auto entry = allocation_map_->find((void *)page_base);
+  if (entry == allocation_map_->end())
+    return HSAKMT_STATUS_ERROR;
+
+  entry->second.rocr_userdata = UserData;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/*
+ * Stub: ASAN header-page replacement is not supported in this backend.
+ * Logs through pr_warn_once() and hits assert(false). When the assert is
+ * compiled out, the result is HSAKMT_STATUS_SUCCESS in SANITIZER_AMDGPU
+ * builds and HSAKMT_STATUS_NOT_SUPPORTED otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+
+  (void)addr;
+
+  pr_warn_once("not supported\n");
+  assert(false);
+
+  HSAKMT_STATUS status = HSAKMT_STATUS_NOT_SUPPORTED;
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  status = HSAKMT_STATUS_SUCCESS;
+#endif
+  return status;
+}
+
+/*
+ * Stub: returning an ASAN header page is not supported in this backend.
+ * Logs through pr_warn_once() and hits assert(false). When the assert is
+ * compiled out, the result is HSAKMT_STATUS_SUCCESS in SANITIZER_AMDGPU
+ * builds and HSAKMT_STATUS_NOT_SUPPORTED otherwise.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
+  CHECK_DXG_OPEN();
+
+  (void)addr;
+
+  pr_warn_once("not supported\n");
+  assert(false);
+
+  HSAKMT_STATUS status = HSAKMT_STATUS_NOT_SUPPORTED;
+#ifdef SANITIZER_AMDGPU
+  pr_debug("address %p\n", addr);
+  CHECK_DXG_OPEN();
+
+  status = HSAKMT_STATUS_SUCCESS;
+#endif
+  return status;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp
new file mode 100644
index 0000000000..eb22a13aae
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp
@@ -0,0 +1,626 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <cstring>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstdio>
+#include <strings.h>
+#include <cassert>
+
+
+hsakmtRuntime *dxg_runtime = new hsakmtRuntime(); /* global runtime state; clear_after_fork() deletes and re-creates it */
+
+void hsakmtRuntime::HeapInit() { /* reserve the address ranges first, then build their VA managers */
+    ReserveLocalHeapSpace();   /* bool result is ignored here */
+    ReserveSystemHeapSpace();  /* bool result is ignored here */
+    InitHandleApertureSpace(); /* bool result is ignored here */
+    InitLocalHeapMgr();        /* reads local_heap_space_start_/size_ set above */
+    InitSystemHeapMgr();       /* reads system_heap_space_start_/size_ set above */
+    InitHandleApertureMgr();   /* reads handle_aperture_start_/size_ set above */
+}
+
+void hsakmtRuntime::HeapFini() { /* releases the two reserved heap ranges; results are ignored */
+    FreeSystemHeapSpace();
+    FreeLocalHeapSpace();
+}
+
+bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) {
+    /* Reserve `size` bytes at one address shared by the cpu va space and the gpu
+     * va space of every adapter; `base` receives that address. On failure both
+     * `base` and `size` are zeroed and false is returned. */
+    constexpr int kMaxTries = 16;
+    uint64_t sys_va[kMaxTries] = {0};
+    uint64_t local_va = 0;
+    int match_index = -1;
+    const size_t num_adapters = get_num_wddmdev();
+    /* the cpu range is `align` bytes larger so the gpu va can be placed inside it */
+    const uint64_t sys_va_size = size + align;
+    base = 0;
+    /* retry up to kMaxTries times; with no adapter there is nothing to match */
+    for (int i = 0; num_adapters > 0 && i < kMaxTries; i++) {
+        local_va = 0;
+        void* ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+        if (ptr == MAP_FAILED) {
+            pr_err("fail to reserve cpu va in %d time!\n", i);
+            break;
+        }
+        sys_va[i] = (uint64_t)ptr;
+        size_t match_cnt = 0;
+        for (uint32_t j = 0; j < num_adapters; j++) {
+            wsl::thunk::WDDMDevice* device = get_wddmdev(j+1);
+            /* first adapter searches the cpu range, later ones must reuse its address */
+            uint64_t start = (base == 0) ? (uint64_t)ptr : base;
+            uint64_t end = start + ((base == 0) ? sys_va_size : size) + 1;
+            if (device == nullptr ||
+                wsl::thunk::d3dthunk::ReserveGpuVirtualAddress(
+                        device->GetAdapter(), size,
+                        start,
+                        end, &local_va) != ErrorCode::Success) {
+                pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n",
+                        __FUNCTION__, ptr, i);
+                break;
+            }
+            match_cnt++;
+            base = local_va;
+            pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n",
+                    local_va, ptr, i);
+        }
+
+        if (match_cnt == num_adapters) {
+            match_index = i;
+            break;
+        }
+        /* partial match: release this try's gpu reservations and start over */
+        for (uint32_t k = 0; k < match_cnt; k++)
+            wsl::thunk::d3dthunk::FreeGpuVirtualAddress(get_wddmdev(k+1)->GetAdapter(), base, size);
+        base = 0;
+    }
+
+    if (match_index >= 0) {
+        /* release cpu unused ranges*/
+        uint64_t left_size = base - sys_va[match_index];
+        uint64_t right_size = align - left_size;
+        if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size))
+            pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size);
+        if ((right_size > 0) && munmap((void*)(base + size), right_size))
+            pr_err("fail to unmap right %lx with size %lx\n", (base + size), right_size);
+    } else {
+        pr_err("fail to reserve Local Heap Space!\n");
+        base = 0;
+        size = 0;
+    }
+
+    /* free match fail address for cpu va */
+    const int unmatched = match_index >= 0 ? match_index : kMaxTries;
+    for (int j = 0; j < unmatched; j++) {
+        if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size))
+            pr_err("fail to unmap %d %lx\n", j, sys_va[j]);
+    }
+
+    return match_index >= 0;
+}
+
+/*
+ * Find a range that is available at the same address in both the
+ * CPU virtual address space and the GPU virtual address space.
+ * The CPU range (sys_va_size) is reserved 1G larger than the
+ * GPU range; otherwise ReserveGpuVirtualAddress will return
+ * an error.
+ */
+bool hsakmtRuntime::ReserveLocalHeapSpace() {
+    uint64_t align = 0x40000000; /* 1G */
+    const size_t num_adapters = get_num_wddmdev();
+    uint64_t total_local_size = 0;
+
+    /* size the shared range after the largest heap found on any adapter */
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        wsl::thunk::WDDMDevice* device = get_wddmdev(j+1);
+        if (device == nullptr)
+            return false; /* was `return -1`, which converts to true */
+        /*
+         * For APU, use non local memory(shared GPU memory) as GPU memory,
+         * because it has small local memory
+        */
+        if (device->IsDgpu())
+          total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size);
+        else
+          total_local_size = wsl::Max(device->LocalHeapSize(), device->NonLocalHeapSize(), total_local_size);
+    }
+
+    /* reserve four times the 1G-aligned maximum heap size */
+    total_local_size = wsl::AlignUp(total_local_size, align) * 4;
+    local_heap_space_start_ = 0;
+    local_heap_space_size_ = total_local_size;
+    return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align);
+}
+
+bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) {
+    /* release the gpu va reservation on every adapter, then unmap the cpu range */
+    const size_t num_adapters = get_num_wddmdev();
+    for (uint32_t j = 0; j < num_adapters; j++) {
+        wsl::thunk::WDDMDevice* device = get_wddmdev(j+1);
+        if (device == nullptr)
+            return false; /* was `return -1`, which converts to true */
+        wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size);
+    }
+
+    void *cpu = (void *)base;
+    auto r = (munmap(cpu, size) == 0);
+    base = 0; /* both in/out arguments are cleared whatever munmap returned */
+    size = 0;
+    return r;
+}
+
+bool hsakmtRuntime::FreeLocalHeapSpace() { /* drop the local heap reservation; FreeSvmSpace zeroes start/size */
+    return this->FreeSvmSpace(this->local_heap_space_start_, this->local_heap_space_size_);
+}
+
+void hsakmtRuntime::InitLocalHeapMgr() { /* VA manager built from the local heap start/size */
+  auto mgr = std::make_unique<wsl::thunk::VaMgr>(
+      local_heap_space_start_, local_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
+  local_heap_mgr_ = std::move(mgr);
+}
+
+bool hsakmtRuntime::ReserveSystemHeapSpace() {
+    struct sysinfo info;
+    const uint64_t max_ram = 0x10000000000;   /* 1T upper bound of the reservation */
+    uint64_t alignment = 0x100000000;         /* 4G */
+    if (sysinfo(&info) != 0) {
+        pr_err("sysinfo failed\n");
+        return false; /* never size the heap from an unfilled struct */
+    }
+    /* totalram is counted in mem_unit bytes; reserve 2x the 4G-aligned RAM, at most 1T */
+    uint64_t total_ram = (uint64_t)info.totalram * info.mem_unit;
+    system_heap_space_size_ = std::min(wsl::AlignUp(total_ram, alignment) * 2, max_ram);
+    return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment);
+}
+
+bool hsakmtRuntime::FreeSystemHeapSpace(void) { /* drop the system heap reservation; FreeSvmSpace zeroes start/size */
+    return this->FreeSvmSpace(this->system_heap_space_start_, this->system_heap_space_size_);
+}
+
+bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) {
+    /* map anonymous read/write/exec memory at exactly `addr`; `lock` adds MAP_LOCKED */
+    const int32_t prot = PROT_READ | PROT_WRITE | PROT_EXEC;
+    const int32_t flags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|
+        MAP_NORESERVE|MAP_UNINITIALIZED|(lock ? MAP_LOCKED : 0);
+    void* mapped = mmap(addr, size, prot, flags, -1, 0);
+    if (mapped == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, mapped);
+        return false;
+    }
+    assert(addr == mapped);
+
+    /*if (!Runtime::runtime_singleton_->PinWARequired())
+      return true;*/
+
+    /*
+     * Do not make the pages in this range available to the child
+     * after a fork(2).  This is useful to prevent copy-on-write
+     * semantics from changing the physical location of a page if
+     * the parent writes to it after a fork(2).  (Such page
+     * relocations cause problems for hardware that DMAs into the
+     * page.)
+     *
+     * https://man7.org/linux/man-pages/man2/madvise.2.html
+     */
+    const bool dontfork_failed = madvise(addr, size, MADV_DONTFORK) != 0;
+    if (dontfork_failed) /* logged only: the commit itself still succeeded */
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+
+    return true;
+}
+
+bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) {
+    /* replace the range with a PROT_NONE anonymous mapping at the same address */
+    void* mapped = mmap(addr, size, PROT_NONE,
+                        MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_NORESERVE|MAP_UNINITIALIZED,
+                        -1, 0);
+    if (mapped != MAP_FAILED) {
+        assert(addr == mapped);
+        return true;
+    }
+    pr_err("fail to decommit addr = %p, paddr = %p\n", addr, mapped);
+    return false;
+}
+
+void hsakmtRuntime::InitSystemHeapMgr() { /* VA manager built from the system heap start/size */
+  auto mgr = std::make_unique<wsl::thunk::VaMgr>(
+      system_heap_space_start_, system_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
+  system_heap_mgr_ = std::move(mgr);
+}
+
+ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize hit_base_addr, gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) {
+    /* kSystem: allocate va and commit cpu memory; otherwise va only. *out is 0 on error. */
+    gpusize gpu_addr = 0;
+    ErrorCode code = ErrorCode::Success;
+    uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment
+    if (size >= GPU_HUGE_PAGE_SIZE)
+        align = GPU_HUGE_PAGE_SIZE;
+
+    if (domain == thunk_proxy::kSystem) {
+        gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0) {
+            /* nothing was allocated: do not commit or free address 0 */
+            code = ErrorCode::OutOfMemory;
+        } else if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) {
+            system_heap_mgr_->Free(gpu_addr);
+            code = ErrorCode::SyscallFail;
+        }
+    } else {
+        gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr);
+        if (gpu_addr == 0)
+            code = ErrorCode::OutOfGpuMemory;
+    }
+
+    *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0;
+    return code;
+}
+
+ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
+        gpusize gpu_addr, gpusize size) {
+    /* always returns Success; the result of the decommit call is not checked */
+    if (domain != thunk_proxy::kSystem) {
+        local_heap_mgr_->Free(gpu_addr);
+        return ErrorCode::Success;
+    }
+
+    /* system domain: decommit the cpu range before handing the va back */
+    DecommitSystemHeapSpace((void *)gpu_addr, size);
+    system_heap_mgr_->Free(gpu_addr);
+    return ErrorCode::Success;
+}
+
+bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
+    /* Map a memfd MAP_SHARED at `addr`. With memfd == -1 a new memfd of `size` bytes
+     * is created and returned through `memfd` on success; else `memfd` is mapped. */
+    const bool created = (memfd == -1);
+    int fd = memfd;
+    if (created) {
+        fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
+        if (fd < 0) {
+            pr_err("memfd_create failed\n");
+            return false;
+        }
+        /* an unsized memfd would fault (SIGBUS) on first access, so fail early */
+        if (ftruncate(fd, size) != 0) {
+            pr_err("ftruncate failed\n");
+            close(fd);
+            return false;
+        }
+    }
+
+    int32_t protFlags = PROT_READ | PROT_WRITE;
+    int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
+        MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);
+    void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
+    if (paddr == MAP_FAILED) {
+        pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
+        if (created) /* only close a descriptor this call created */
+            close(fd);
+        return false;
+    }
+    assert(addr == paddr);
+    memfd = fd;
+    if (madvise(addr, size, MADV_DONTFORK))
+        pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
+    return true;
+}
+
+bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
+    /* Re-cover the range with PROT_NONE instead of munmap()ing it, so the reserved
+     * system heap keeps no hole that an unrelated mmap could be placed into. */
+    if (!DecommitSystemHeapSpace(addr, size))
+        return false; /* failure is logged by DecommitSystemHeapSpace */
+    close(memfd);
+    memfd = -1;
+    return true;
+}
+
+ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size,
+        gpusize *out_gpu_virt_addr, gpusize alignment,
+        int &memfd, bool lock) {
+    /* Allocate system heap va and map a memfd there (memfd == -1 makes the commit
+     * step create one). *out_gpu_virt_addr is written as 0 on every error path. */
+    *out_gpu_virt_addr = 0;
+    const gpusize gpu_addr = system_heap_mgr_->Alloc(size, alignment, 0);
+    if (gpu_addr == 0)
+        return ErrorCode::OutOfMemory;
+
+    if (!CommitSystemHeapSpaceIPC((void*)gpu_addr, size, memfd, lock)) {
+        system_heap_mgr_->Free(gpu_addr);
+        return ErrorCode::SyscallFail;
+    }
+    *out_gpu_virt_addr = gpu_addr;
+    return ErrorCode::Success;
+}
+
+ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
+    /* release the mapping and the memfd, then give the va back to the manager;
+     * the decommit result is not checked and Success is always returned */
+    DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd);
+    system_heap_mgr_->Free(gpu_addr);
+
+    return ErrorCode::Success;
+}
+
+bool hsakmtRuntime::InitHandleApertureSpace() {
+    /*
+     * Pick a 2^47-byte window in the non-canonical address range that overlaps
+     * neither the private nor the shared aperture of any adapter.
+     */
+    const size_t num_adapters = get_num_wddmdev();
+    const uint64_t step = 1ULL << 47;
+    handle_aperture_start_ = START_NON_CANONICAL_ADDR;
+    handle_aperture_size_ = step;
+
+    while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) {
+        bool overlaps = false;
+        /* every candidate window is checked against all adapters again */
+        for (uint32_t j = 0; j < num_adapters && !overlaps; j++) {
+            wsl::thunk::WDDMDevice* device = get_wddmdev(j+1);
+            if (device == nullptr)
+                return false; /* was `return -1`, which converts to true */
+            overlaps =
+                (device->PrivateApertureBase() &&
+                    IS_OVERLAPPING(device->PrivateApertureBase(),
+                        device->PrivateApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_)) ||
+                (device->SharedApertureBase() &&
+                    IS_OVERLAPPING(device->SharedApertureBase(),
+                        device->SharedApertureSize(),
+                        handle_aperture_start_,
+                        handle_aperture_size_));
+        }
+        if (!overlaps) {
+            pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_);
+            return true;
+        }
+        handle_aperture_start_ += step; /* try the next window, bounded by the while test */
+    }
+
+    handle_aperture_start_ = 0;
+    pr_err("fail\n");
+
+    return false;
+}
+
+void hsakmtRuntime::InitHandleApertureMgr() { /* VA manager built from the handle aperture start/size */
+  auto mgr = std::make_unique<wsl::thunk::VaMgr>(
+      handle_aperture_start_, handle_aperture_size_, DEFAULT_GPU_PAGE_SIZE);
+  handle_aperture_mgr_ = std::move(mgr);
+}
+
+ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
+    /* sizes of at least GPU_HUGE_PAGE_SIZE use it as alignment, others the default page */
+    uint64_t align =
+        (size >= GPU_HUGE_PAGE_SIZE) ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;
+    *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, align);
+
+    /* a zero address is treated as allocation failure */
+    const bool allocated = (*out_gpu_virt_addr != 0);
+    if (allocated)
+        return ErrorCode::Success;
+    return ErrorCode::OutOfHandleApeMemory;
+}
+
+void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) { /* hand the address back to the aperture manager */
+    this->handle_aperture_mgr_->Free(gpu_addr);
+}
+
+/* is_forked_child detects when the process has forked since the last
+ * time this function was called. We cannot rely on pthread_atfork
+ * because the process can fork without calling the fork function in
+ * libc (using clone or calling the system call directly).
+ */
+bool is_forked_child(void) {
+  // An already-set is_forked flag short-circuits the pid comparison.
+  if (!dxg_runtime->is_forked) {
+    const pid_t cur_pid = getpid();
+    if (dxg_runtime->parent_pid == cur_pid)
+      return false;
+
+    // The pid differs from the recorded one: set the flag and record the new pid.
+    dxg_runtime->is_forked = true;
+    dxg_runtime->parent_pid = cur_pid;
+  }
+  return true;
+}
+
+/* Callbacks from pthread_atfork: prepare locks hsakmt_mutex, parent unlocks it, child re-initializes it */
+static void prepare_fork_handler(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); }
+static void parent_fork_handler(void) { pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); }
+static void child_fork_handler(void) {
+  pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL); /* fresh mutex instead of unlocking the inherited one */
+  dxg_runtime->is_forked = true; /* makes is_forked_child() return true without comparing pids */
+}
+
+/* Call this from the child process after fork. This will clear all
+ * data that is duplicated from the parent process and is not valid
+ * in the child.
+ * The topology information duplicated from the parent remains valid
+ * in the child process, so it is not cleared.
+ */
+static void clear_after_fork(void) {
+  reset_suballocator();
+  clear_allocation_map();
+  /* close the inherited DXG descriptor, if any, before discarding the runtime */
+  if (dxg_runtime->dxg_fd >= 0) {
+    close(dxg_runtime->dxg_fd);
+    dxg_runtime->dxg_fd = -1;
+  }
+  delete dxg_runtime;
+  dxg_runtime = new hsakmtRuntime();
+  /* NOTE(review): hsaKmtOpenKFD calls this while holding the old object's hsakmt_mutex - confirm that is safe */
+}
+
+static inline void init_page_size(void) { /* caches the system page size and its shift in dxg_runtime */
+  dxg_runtime->page_size = sysconf(_SC_PAGESIZE);
+  dxg_runtime->page_shift = ffs(dxg_runtime->page_size) - 1; /* ffs() is 1-based: index of the lowest set bit */
+}
+
+static HSAKMT_STATUS init_vars_from_env(void) {
+  const char *envvar;
+  /* Copies optional settings from the environment into dxg_runtime; always returns SUCCESS. */
+  /* Normally libraries don't print messages. For debugging purpose, we'll
+   * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set.
+   */
+  envvar = getenv("HSAKMT_DEBUG_LEVEL");
+  if (envvar)
+    dxg_runtime->hsakmt_debug_level = atoi(envvar);
+
+  /* Check whether to support Zero frame buffer */
+  envvar = getenv("HSA_ZFB");
+  if (envvar)
+    dxg_runtime->zfb_support = atoi(envvar);
+
+  /* Check whether to handle vendor specific aql packet */
+  envvar = getenv("WSLKMT_VENDOR_PACKET");
+  if (envvar)
+    dxg_runtime->vendor_packet_process = atoi(envvar);
+
+  /* Decide whether to check available system memory before allocation */
+  envvar = getenv("WSL_CHECK_AVAIL_SYSRAM");
+  if (envvar)
+    dxg_runtime->check_avail_sysram = !strcmp(envvar, "1");
+
+  envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR");
+  if (envvar)
+    dxg_runtime->enable_thunk_sub_allocator = atoi(envvar);
+
+  envvar = getenv("ROCR_VISIBLE_DEVICES");
+  if (envvar) {
+    /* The first decimal number found is stored as default_node = number + 1.
+     * strtol never throws, unlike std::stoi on an out-of-range value. */
+    const char *digits = strpbrk(envvar, "0123456789");
+    const long index = digits ? strtol(digits, NULL, 10) : -1;
+    if (index >= 0 && index < 0x7fffffffL) /* ignore values that do not fit an int */
+      dxg_runtime->default_node = (int)index + 1;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) {
+  /* Opens the DXG device on the first call and reference-counts later calls,
+   * which return HSAKMT_STATUS_KERNEL_ALREADY_OPENED. The first call returns
+   * HSAKMT_STATUS_SUCCESS or the error that stopped initialization. */
+  HSAKMT_STATUS result;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  /* If the process has forked, the child process must re-initialize
+   * its connection to DXG. Any references tracked by dxg_open_count
+   * belong to the parent
+   */
+  if (is_forked_child())
+    clear_after_fork();
+  /* NOTE(review): clear_after_fork() replaces dxg_runtime, so the unlock below
+   * targets the new object's mutex rather than the one locked above - confirm. */
+  if (dxg_runtime->dxg_open_count == 0) {
+    static bool atfork_installed = false;
+
+    result = init_vars_from_env();
+    if (result != HSAKMT_STATUS_SUCCESS)
+      goto open_failed;
+
+    if (dxg_runtime->dxg_fd < 0) {
+      dxg_runtime->dxg_fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC);
+      if (dxg_runtime->dxg_fd < 0) {
+        result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+        goto open_failed;
+      }
+    }
+    if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) {
+        pr_err("Failed to load libdxcore.so\n");
+        result = HSAKMT_STATUS_ERROR;
+        goto dxcore_loader_failed;
+    }
+
+    hsakmt_hsa_loader_init();
+    init_page_size();
+    /* the trailing `&& false` forces the SVM API off whatever HSA_USE_SVM says */
+    char *useSvmStr = getenv("HSA_USE_SVM");
+    dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false;
+
+    dxg_runtime->dxg_open_count = 1;
+
+    if (!atfork_installed) {
+      /* Atfork handlers cannot be uninstalled and
+       * must be installed only once. Otherwise
+       * prepare will deadlock when trying to take
+       * the same lock multiple times.
+       */
+      pthread_atfork(prepare_fork_handler, parent_fork_handler,
+                     child_fork_handler);
+      atfork_installed = true;
+    }
+  } else {
+    dxg_runtime->dxg_open_count++;
+    result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
+  }
+
+  reset_suballocator();
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return result;
+dxcore_loader_failed:
+  /* close the descriptor and forget it, so a later open never reuses a closed fd */
+  close(dxg_runtime->dxg_fd);
+  dxg_runtime->dxg_fd = -1;
+open_failed:
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) {
+  /* Drops one open reference; the last one closes the fd and shuts dxcore down. */
+  HSAKMT_STATUS result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (dxg_runtime->dxg_open_count > 0) {
+    dxg_runtime->dxg_open_count--;
+    if (dxg_runtime->dxg_open_count == 0) {
+      close(dxg_runtime->dxg_fd);
+      dxg_runtime->dxg_fd = -1;
+      wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown();
+    }
+
+    result = HSAKMT_STATUS_SUCCESS;
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+
+  return result;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp
new file mode 100644
index 0000000000..6c6a9e2a04
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub that always returns HSAKMT_STATUS_NOT_SUPPORTED.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
+                                  HSAuint32 sample_info_sz, HSAuint32 *size) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub: no argument is read and *size is not written.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId,
+                                               HsaPcSamplingInfo *sample_info,
+                                               HsaPcSamplingTraceId *traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub: *traceId is not written.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId,
+                                                HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub that always returns HSAKMT_STATUS_NOT_SUPPORTED.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId,
+                                              HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub that always returns HSAKMT_STATUS_NOT_SUPPORTED.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId,
+                                             HsaPcSamplingTraceId traceId) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub that always returns HSAKMT_STATUS_NOT_SUPPORTED.
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp
new file mode 100644
index 0000000000..9189d2dafa
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
+    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: *CounterProperties is not written */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Registers a set of (HW) counters to be used for tracing/profiling (stub here: always NOT_SUPPORTED) */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
+                                               HSAuint32 NumberOfCounters,
+                                               HsaCounter *Counters,
+                                               HsaPmcTraceRoot *TraceRoot) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* *TraceRoot is not written */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Unregisters a set of (HW) counters used for tracing/profiling (stub here: always NOT_SUPPORTED) */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
+                                                 HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: both arguments are unused */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
+                                                    HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: both arguments are unused */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Starts tracing operation on a previously established set of performance
+ * counters (stub here: always NOT_SUPPORTED, TraceBuffer is not touched) */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
+                                            void *TraceBuffer,
+                                            HSAuint64 TraceBufferSizeBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Forces an update of all the counters that a previously started trace operation
+ * has registered (stub here: always NOT_SUPPORTED) */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+/* Stops tracing operation on a previously established set of performance
+ * counters (stub here: always NOT_SUPPORTED) */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp
new file mode 100644
index 0000000000..edaaea9d1a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <cinttypes>
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+#include "impl/hsa/amd_hsa_signal.h"
+
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) {
+  /* 0x60000 for the gfx versions listed below, 0x40000 for everything else. */
+  const uint32_t gfxv = HSA_GET_GFX_VERSION_FULL(id.ui32);
+  switch (gfxv) {
+  case 0x1100: case 0x1101:
+  case 0x1151:
+  case 0x1200: case 0x1201:
+    return 0x60000;
+  default:
+    return 0x40000;
+  }
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
+					  HSA_QUEUE_TYPE Type,
+					  HSAuint32 QueuePercentage,
+					  HSA_QUEUE_PRIORITY Priority,
+					  void *QueueAddress,
+					  HSAuint64 QueueSizeInBytes,
+					  HsaEvent *Event,
+					  HsaQueueResource *QueueResource)
+{
+	if (Type == HSA_QUEUE_SDMA_BY_ENG_ID) /* rejected: no engine id parameter here */
+		return HSAKMT_STATUS_ERROR;
+	/* Every other type is forwarded to hsaKmtCreateQueueExt with SdmaEngineId 0. */
+	return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0,
+				    QueueAddress, QueueSizeInBytes, Event,
+				    QueueResource);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
+					     HSA_QUEUE_TYPE Type,
+					     HSAuint32 QueuePercentage,
+					     HSA_QUEUE_PRIORITY Priority,
+					     HSAuint32 SdmaEngineId,
+					     void *QueueAddress,
+					     HSAuint64 QueueSizeInBytes,
+					     HsaEvent *Event,
+					     HsaQueueResource *QueueResource) {
+  /* Creates an AQL compute or SDMA queue on NodeId and fills QueueResource. */
+  CHECK_DXG_OPEN();
+  assert(Event == nullptr);
+
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  if (!device_ || !QueueResource) /* runtime check: asserts vanish under NDEBUG */
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (queue_acquire_buffer(QueueAddress) == false)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  switch (Type) {
+  case HSA_QUEUE_COMPUTE_AQL: {
+    assert(QueueResource->ErrorReason == nullptr);
+    uint64_t pkg_num = QueueSizeInBytes / 64;
+    uint32_t cmdbuf_size = device_->GetCmdbufSize();
+    uint32_t queue_engine = device_->GetComputeEngine();
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::ComputeQueue(
+        device_, QueueAddress, pkg_num,
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_write_ptr_aql),
+        reinterpret_cast<std::atomic<uint64_t> *>(
+            QueueResource->Queue_read_ptr_aql),
+        QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);
+
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    // for doorbell_signal.hardware_doorbell_ptr
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+  } break;
+  case HSA_QUEUE_SDMA:
+  case HSA_QUEUE_SDMA_BY_ENG_ID: {
+    pr_debug("create sdma queue in engine %d\n", SdmaEngineId);
+    uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId
+    bool use_hws = device_->IsHwsEnabled(queue_engine);
+    auto queue_ = new wsl::thunk::SDMAQueue(device_, QueueAddress,
+                                            QueueSizeInBytes, queue_engine, use_hws);
+    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
+    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
+    QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr();
+    QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr();
+  } break;
+  default:
+    /* Unsupported type: give the buffer back and fail instead of "success". */
+    QueueResource->QueueId = 0;
+    QueueResource->Queue_DoorBell = nullptr;
+    queue_release_buffer(QueueAddress);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
+    HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
+    void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) {
+  CHECK_DXG_OPEN();
+  /* Only Priority and QueueId are validated; no queue state is modified. */
+  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
+      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_) /* a zero QueueId is rejected */
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+
+  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Fetch the buffer address only after the null check, and before delete. */
+  void *QueueAddress = queue_->GetHsaQueueAddr();
+  delete queue_;
+  queue_release_buffer(QueueAddress);
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
+                                             HSAuint32 CUMaskCount,
+                                             HSAuint32 *QueueCUMask) {
+  CHECK_DXG_OPEN();
+  /* Validates the arguments only; the mask is not applied to the queue. */
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0))
+    return HSAKMT_STATUS_INVALID_PARAMETER; /* count must be a non-zero multiple of 32 */
+
+  pr_warn_once("not implemented\n");
+
+  return HSAKMT_STATUS_SUCCESS; /* NOTE(review): success although nothing was set */
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
+                                           HsaQueueInfo *QueueInfo) {
+  CHECK_DXG_OPEN();
+  /* Unimplemented: zeroes *QueueInfo and never looks at QueueId. */
+  if (QueueInfo == NULL)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  memset(QueueInfo, 0, sizeof(*QueueInfo));
+
+  assert(false); /* trips in builds where assert is active */
+  return HSAKMT_STATUS_SUCCESS; /* NOTE(review): reached only when assert is compiled out */
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
+                                             void *TrapHandlerBaseAddress,
+                                             HSAuint64 TrapHandlerSizeInBytes,
+                                             void *TrapBufferBaseAddress,
+                                             HSAuint64 TrapBufferSizeInBytes) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not implemented\n"); /* stub: none of the arguments are used */
+  return HSAKMT_STATUS_SUCCESS; /* NOTE(review): success although no handler is installed */
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
+                                            HSAuint32 *firstGWS) {
+  CHECK_DXG_OPEN();
+  /* Unimplemented: only rejects a zero QueueId; nGWS/firstGWS are unused. */
+  auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  assert(false); /* trips in builds where assert is active */
+  return HSAKMT_STATUS_SUCCESS; /* NOTE(review): *firstGWS is never written */
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
+  CHECK_DXG_OPEN();
+  /* QueueId is reinterpreted as the queue object pointer; zero is rejected. */
+  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
+  if (!queue_)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  queue_->RingDoorbell();
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp
new file mode 100644
index 0000000000..14b0faf1f8
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub here, PreferredNode is not used
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(
+    HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout,
+    HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub here, no output parameter is written
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) {
+  CHECK_DXG_OPEN();
+  // Used for profiling tools; stub here, PreferredNode is not used
+  pr_warn_once("not supported\n");
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp
new file mode 100644
index 0000000000..f2f8a10f68
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Helper functions for calling KFD SVM ioctl */
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: no attribute is applied */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size,
+                                         unsigned int nattr,
+                                         HSA_SVM_ATTRIBUTE *attrs) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: attrs is left untouched */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n"); /* stub: enable is not used */
+  return HSAKMT_STATUS_NOT_SUPPORTED;
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) {
+  CHECK_DXG_OPEN();
+  pr_warn_once("not supported\n");
+  *enable = false; /* always reports 0; NOTE(review): enable is not null-checked */
+  return HSAKMT_STATUS_SUCCESS; /* unlike the setter, the getter reports success */
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a
new file mode 100644
index 0000000000..3b21eb936d
Binary files /dev/null and b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a differ
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp
new file mode 100644
index 0000000000..a28bb29215
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <iostream>
+#include <ctime>
+#include <cstring>
+#include <cassert>
+#include "impl/wddm/device.h"
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
+                                               HsaClockCounters *Counters) {
+  /* Fills GPU/CPU counters from the device plus a nanosecond system counter. */
+  CHECK_DXG_OPEN();
+  if (!Counters)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  std::memset(Counters, 0, sizeof(*Counters));
+
+  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+  assert(device_);
+  device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter);
+
+  /* Integer ns math; `tv_sec * 1e9` went through double (exact only to 2^53). */
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0)
+    Counters->SystemClockCounter = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+  Counters->SystemClockFrequencyHz = 1000000000;
+  return HSAKMT_STATUS_SUCCESS;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp
new file mode 100644
index 0000000000..2db712e341
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp
@@ -0,0 +1,1463 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cctype>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <assert.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/sysinfo.h>
+
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+/* Number of memory banks added by thunk on top of topology
+ * This only includes static heaps like LDS, scratch and SVM,
+ * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported
+ * dynamically based on whether mmio aperture was mapped
+ * successfully on this node.
+ */
+#define NUM_OF_IGPU_HEAPS 3
+#define NUM_OF_DGPU_HEAPS 3
+
+typedef struct {
+  HsaNodeProperties node;                /* properties of the node itself */
+  std::vector<HsaMemoryProperties> mem;  /* node->NumBanks elements */
+  std::vector<HsaCacheProperties> cache; /* cache entries of this node */
+  std::vector<HsaIoLinkProperties> link; /* IO link entries of this node */
+} node_props_t;
+
+struct _topology_props {
+  HsaSystemProperties *g_system = nullptr;
+  std::vector<node_props_t> g_props;
+  std::vector<wsl::thunk::WDDMDevice *> wdevices_; /* GPU node N is wdevices_[N - 1] */
+  uint32_t wdevice_num_ = 0;
+  uint32_t num_sysfs_nodes = 0; /* adapter count + 1, see topology_sysfs_get_system_props */
+  int processor_vendor = -1;    /* SUPPORTED_PROCESSOR_VENDORS value; -1 until detected */
+  double freq_max_ = 0.0;       /* max CPU frequency (MHz) found by topology_parse_cpuinfo */
+};
+/* File-wide topology state shared by the helpers in this file. */
+static _topology_props* dxg_topology = new _topology_props();
+
+/* Supported System Vendors; values index supported_processor_vendor_name[] */
+enum SUPPORTED_PROCESSOR_VENDORS {
+  GENUINE_INTEL = 0, /* also the fallback when no vendor can be detected */
+  AUTHENTIC_AMD,
+  IBM_POWER
+};
+/* Vendor strings compared against the vendor_id value from /proc/cpuinfo */
+static const char *supported_processor_vendor_name[] = {
+  "GenuineIntel",
+  "AuthenticAMD",
+  "" // POWER requires a different search method
+};
+
+static HSAKMT_STATUS topology_take_snapshot(void);
+static void topology_drop_snapshot(void);
+
+/* Per-processor information taken from /proc/cpuinfo */
+struct proc_cpuinfo {
+  uint32_t proc_num;                     /* processor */
+  uint32_t apicid;                       /* apicid */
+  char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */
+};
+
+/* Per-CPU cache bookkeeping passed to get_cpu_cache_info(): identifies the
+ * CPU and carries the number of caches counted for it.
+ */
+typedef struct cpu_cacheinfo {
+  int32_t proc_num;    /* this cpu's processor number */
+  uint32_t num_caches; /* number of caches reported by this cpu */
+} cpu_cacheinfo_t;
+
+/* num_subdirs - find the number of sub-directories in the specified path
+ *	@dirpath - directory path to find sub-directories underneath
+ *	@prefix - only count sub-directory names starting with prefix.
+ *		Use blank string, "", to count all.
+ *	Return - number of sub-directories
+ */
+static int num_subdirs(char *dirpath, const char *prefix) {
+  const int wanted_len = strlen(prefix);
+  DIR *handle = opendir(dirpath);
+  if (!handle)
+    return 0; /* a directory that cannot be opened counts as empty */
+
+  int matches = 0;
+  for (struct dirent *entry = readdir(handle); entry != nullptr;
+       entry = readdir(handle)) {
+    const char *name = entry->d_name;
+    const bool is_dot = !strcmp(name, ".") || !strcmp(name, "..");
+    /* Skip the self/parent links and names that lack the requested prefix. */
+    const bool has_prefix =
+        wanted_len == 0 || strncmp(name, prefix, wanted_len) == 0;
+    if (!is_dot && has_prefix)
+      ++matches;
+  }
+  closedir(handle);
+  return matches;
+}
+
+/* fscanf_dec - read a file whose content is a decimal number
+ *      @file [IN ] file to read
+ *      @num [OUT] number in the file
+ */
+static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) {
+  FILE *stream = fopen(file, "r");
+  if (stream == nullptr) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  /* Exactly one unsigned decimal has to convert for the read to count as
+   * a success; anything else is logged and reported as a generic error.
+   * The stream is closed on both outcomes. */
+  const bool parsed = (fscanf(stream, "%u", num) == 1);
+  if (!parsed)
+    pr_err("Failed to parse %s as a decimal.\n", file);
+
+  fclose(stream);
+  return parsed ? HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_ERROR;
+}
+
+/* fscanf_str - read a file whose content is a string
+ *      @file [IN ] file to read
+ *      @str [OUT] string in the file
+ */
+static HSAKMT_STATUS fscanf_str(char *file, char *str) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  FILE *fd = fopen(file, "r");
+  if (!fd) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+  /* Bounded conversion: the callers visible in this file pass 256-byte
+   * buffers, so read at most 255 characters plus the terminator. */
+  if (fscanf(fd, "%255s", str) != 1) {
+    pr_err("Failed to parse %s as a string.\n", file);
+    ret = HSAKMT_STATUS_ERROR;
+  }
+
+  fclose(fd);
+  return ret;
+}
+
+/* fscanf_size - read a file whose content represents size as a string
+ *      @file [IN ] file to read
+ *      @bytes [OUT] sizes in bytes
+ */
+static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) {
+  FILE *fd;
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  char unit; /* character that directly follows the number, if any */
+  int n;     /* number of fields fscanf converted */
+
+  fd = fopen(file, "r");
+  if (!fd) {
+    pr_err("Failed to open %s\n", file);
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+  }
+
+  n = fscanf(fd, "%u%c", bytes, &unit);
+  if (n < 1) {
+    pr_err("Failed to parse %s\n", file);
+    ret = HSAKMT_STATUS_ERROR;
+  }
+  /* A unit suffix scales the value by a power of 1024. */
+  if (n == 2) {
+    switch (unit) {
+    case 'K':
+      *bytes <<= 10;
+      break;
+    case 'M':
+      *bytes <<= 20;
+      break;
+    case 'G':
+      *bytes <<= 30; /* NOTE(review): 32-bit shift, wraps for 4G and above */
+      break;
+    default: /* NOTE(review): also hit when the number is followed by '\n' */
+      ret = HSAKMT_STATUS_ERROR;
+      break;
+    }
+  }
+
+  fclose(fd);
+  return ret;
+}
+
+/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into
+ *		      SiblingMap in cache
+ *	@shared_cpu_map [IN ] shared_cpu_map string
+ *	@cpuinfo [IN ] cpuinfo to get apicid
+ *	@this_cache [OUT] CPU cache to fill in SiblingMap
+ */
+static void cpumap_to_cpu_ci(char *shared_cpu_map,
+                             const std::vector<struct proc_cpuinfo>& cpuinfo,
+                             HsaCacheProperties *this_cache) {
+  /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x
+   * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually
+   * procs), it's presented in X1. The next 32 is in X2, and so on.
+   */
+  int num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */
+  char *ch_ptr = strtok(shared_cpu_map, ",");
+
+  /* Stop early when the map holds fewer groups than its length suggests. */
+  while (num_hexs-- > 0 && ch_ptr) {
+    uint32_t mask = strtoul(ch_ptr, NULL, 16); /* each X, full 32-bit range */
+    for (uint32_t bit = 0; bit < 32; bit++) {
+      if (!((1u << bit) & mask)) /* unsigned shift keeps bit 31 well defined */
+        continue;
+      uint32_t proc = num_hexs * 32 + bit;
+      if (proc >= cpuinfo.size()) /* no cpuinfo entry for this processor */
+        continue;
+      uint32_t apicid = cpuinfo[proc].apicid;
+      if (apicid >= HSA_CPU_SIBLINGS) {
+        pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS);
+        continue;
+      }
+      this_cache->SiblingMap[apicid] = 1;
+    }
+    ch_ptr = strtok(NULL, ",");
+  }
+}
+
+/* get_cpu_cache_info - get specified CPU's cache information from sysfs
+ *     @prefix [IN] sysfs path for target cpu cache,
+ *                  /sys/devices/system/node/nodeX/cpuY/cache
+ *     @cpuinfo [IN] /proc/cpuinfo data to get apicid
+ *     @cpu_ci: CPU specified. This parameter is an input and also an output.
+ *             [IN] cpu_ci->num_caches: number of index dirs
+ *             [OUT] cpu_ci->cache_info: to store cache info collected
+ *             [OUT] cpu_ci->num_caches: reduces when shared with other cpu(s)
+ * Return: number of cache reported from this cpu
+ */
+static int get_cpu_cache_info(const char *prefix,
+                              const std::vector<struct proc_cpuinfo>& cpuinfo,
+                              std::vector<HsaCacheProperties>& cache,
+                              cpu_cacheinfo_t& cpu_ci) {
+  char path[256], str[256];
+  /* POWER9 is recognised from the model name of the first cpuinfo entry. */
+  const bool is_power9 = dxg_topology->processor_vendor == IBM_POWER &&
+                         !cpuinfo.empty() &&
+                         strcmp(cpuinfo[0].model_name, "POWER9") == 0;
+
+  HsaCacheProperties this_cache;
+  int num_idx = cpu_ci.num_caches;
+  for (int idx = 0; idx < num_idx; idx++) {
+    memset(&this_cache, 0, sizeof(this_cache));
+    /* If this cache is shared by multiple CPUs, we only need
+     * to list it in the first CPU.
+     */
+    if (is_power9) {
+      // POWER9 has SMT4
+      if (cpu_ci.proc_num & 0x3) {
+        /* proc is not 0,4,8,etc.  Skip and reduce the cache count. */
+        --cpu_ci.num_caches;
+        continue;
+      }
+    } else {
+      snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx);
+      /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4...
+       * For both cases, this cache is listed to proc n1 only.
+       * A failed read or an out-of-range proc is skipped as well, so an
+       * indeterminate n1 is never compared or used as an index.
+       */
+      uint32_t first_proc = 0;
+      if (fscanf_dec(path, &first_proc) != HSAKMT_STATUS_SUCCESS ||
+          cpu_ci.proc_num < 0 || first_proc >= cpuinfo.size() ||
+          static_cast<uint32_t>(cpu_ci.proc_num) != first_proc) {
+        /* proc is not n1. Skip and reduce the cache count. */
+        --cpu_ci.num_caches;
+        continue;
+      }
+      this_cache.ProcessorIdLow = cpuinfo[first_proc].apicid;
+    }
+
+    /* CacheLevel */
+    snprintf(path, 256, "%s/index%d/level", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLevel);
+    /* CacheType */
+    snprintf(path, 256, "%s/index%d/type", prefix, idx);
+    memset(str, 0, sizeof(str));
+    fscanf_str(path, str);
+    if (!strcmp(str, "Data"))
+      this_cache.CacheType.ui32.Data = 1;
+    if (!strcmp(str, "Instruction"))
+      this_cache.CacheType.ui32.Instruction = 1;
+    if (!strcmp(str, "Unified")) {
+      this_cache.CacheType.ui32.Data = 1;
+      this_cache.CacheType.ui32.Instruction = 1;
+    }
+    this_cache.CacheType.ui32.CPU = 1;
+    /* CacheSize */
+    snprintf(path, 256, "%s/index%d/size", prefix, idx);
+    fscanf_size(path, &this_cache.CacheSize);
+    /* CacheLineSize */
+    snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLineSize);
+    /* CacheAssociativity */
+    snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheAssociativity);
+    /* CacheLinesPerTag */
+    snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx);
+    fscanf_dec(path, &this_cache.CacheLinesPerTag);
+    /* CacheSiblings (str is cleared so a failed read leaves no stale text) */
+    snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx);
+    memset(str, 0, sizeof(str));
+    fscanf_str(path, str);
+    cpumap_to_cpu_ci(str, cpuinfo, &this_cache);
+
+    cache.push_back(this_cache);
+  }
+
+  return cpu_ci.num_caches;
+}
+
+static HSAKMT_STATUS topology_map_node_id(uint32_t node_id,
+                                          wsl::thunk::WDDMDevice *&device) {
+  /* Node 0 is the CPU; GPU node N maps to wdevices_[N - 1]. The index is
+   * checked against the vector itself as well as the advertised node count. */
+  device = nullptr;
+  if (!node_id || node_id >= dxg_topology->num_sysfs_nodes ||
+      node_id > dxg_topology->wdevices_.size())
+    return HSAKMT_STATUS_ERROR;
+  device = dxg_topology->wdevices_[node_id - 1];
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props) {
+  std::memset(&props, 0, sizeof(props));
+
+  /* Tear down the heap and every previously created device, then
+   * enumerate the WDDM adapters again from scratch. */
+  dxg_runtime->HeapFini();
+  auto& devices = dxg_topology->wdevices_;
+  for (wsl::thunk::WDDMDevice *stale : devices)
+    delete stale;
+  devices.clear();
+  WDDMCreateDevices(devices);
+
+  const int adapter_count = devices.size();
+  if (adapter_count == 0) {
+    pr_err("No WDDM adapters found.\n");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  /* One node per adapter plus node 0. */
+  dxg_topology->num_sysfs_nodes = adapter_count + 1;
+  dxg_runtime->HeapInit();
+  props.NumNodes = dxg_topology->num_sysfs_nodes;
+  /* Clamp the default node to the highest adapter-backed node id. */
+  if (dxg_runtime->default_node > adapter_count)
+    dxg_runtime->default_node = adapter_count;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+void topology_setup_is_dgpu_param(HsaNodeProperties *props) {
+  /* If we find a dGPU node, treat the whole system as dGPU. */
+  /* Note that some APUs are also treated as dGPU in the runtime. */
+  if (!props->NumCPUCores && props->NumFComputeCores) /* GPU cores, no CPU cores */
+    dxg_runtime->hsakmt_is_dgpu = true; /* only ever set here, never cleared */
+}
+
+static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties& props,
+                                                 const std::vector<proc_cpuinfo>& cpuinfo) {
+  /* Uses the model name of the first CPU whose apicid equals CComputeIdLo. */
+  for (const proc_cpuinfo& cpu : cpuinfo) {
+    if (props.CComputeIdLo != cpu.apicid)
+      continue;
+    if (!props.DeviceId) { /* CPU-only node */
+      strncpy((char *)props.AMDName, cpu.model_name, sizeof(props.AMDName) - 1);
+      props.AMDName[sizeof(props.AMDName) - 1] = '\0'; /* strncpy may not terminate */
+    }
+    /* Convert from UTF8 to UTF16 */
+    int j;
+    for (j = 0; j < HSA_PUBLIC_NAME_SIZE - 1 && cpu.model_name[j] != '\0'; j++)
+      props.MarketingName[j] = cpu.model_name[j];
+    props.MarketingName[j] = '\0';
+    return HSAKMT_STATUS_SUCCESS;
+  }
+  return HSAKMT_STATUS_ERROR; /* no cpuinfo entry matched */
+}
+
+static int topology_search_processor_vendor(const std::string& processor_name) {
+  if (processor_name == "POWER9, altivec supported") /* not in the name table */
+    return IBM_POWER;
+  const unsigned int table_len = ARRAY_LEN(supported_processor_vendor_name);
+  for (unsigned int slot = 0; slot != table_len; ++slot)
+    if (processor_name == supported_processor_vendor_name[slot])
+      return slot;
+  return -1; /* unknown vendor */
+}
+
+/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required
+ *			topology information
+ * cpuinfo [OUT]: output buffer to hold cpu information; its size() is the
+ *		number of processors it can hold
+ */
+static HSAKMT_STATUS topology_parse_cpuinfo(std::vector<proc_cpuinfo>& cpuinfo) {
+  const uint32_t num_procs = cpuinfo.size();
+  /* Text after the "key : " prefix, or "" when absent. substr() would throw
+   * for a line that ends at the colon; stoi/stod would throw on the rest. */
+  auto value_of = [](const std::string& text) {
+    const size_t colon = text.find(':');
+    const bool has_value = colon != std::string::npos && colon + 2 <= text.size();
+    return has_value ? text.substr(colon + 2) : std::string();
+  };
+
+  std::ifstream cpuinfo_max_freq(
+      "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
+  if (cpuinfo_max_freq) {
+    std::string line;
+    std::getline(cpuinfo_max_freq, line);
+    dxg_topology->freq_max_ =
+        static_cast<uint32_t>(std::strtod(line.c_str(), nullptr) / 1000);
+  }
+
+  std::ifstream cpuinfo_file("/proc/cpuinfo");
+  if (!cpuinfo_file) {
+    pr_err("Failed to open /proc/cpuinfo. Unable to get CPU information");
+    return HSAKMT_STATUS_ERROR;
+  }
+
+  std::string line;
+  uint32_t proc = 0;
+  while (std::getline(cpuinfo_file, line)) {
+    if (line.substr(0, 9) == "processor") {
+      proc = std::strtoul(value_of(line).c_str(), nullptr, 10);
+      if (proc >= num_procs) {
+        pr_err("cpuinfo contains processor %d larger than %u\n", proc, num_procs);
+        return HSAKMT_STATUS_NO_MEMORY;
+      }
+      continue;
+    }
+    if (line.substr(0, 9) == "vendor_id" && dxg_topology->processor_vendor == -1) {
+      dxg_topology->processor_vendor = topology_search_processor_vendor(value_of(line));
+      continue;
+    }
+    if (line.substr(0, 10) == "model name" && proc < num_procs) {
+      /* Keep at most SIZE-1 characters so strncpy pads in a terminator. */
+      std::string model_name = value_of(line);
+      if (model_name.size() > HSA_PUBLIC_NAME_SIZE - 1)
+        model_name.resize(HSA_PUBLIC_NAME_SIZE - 1);
+      std::strncpy(cpuinfo[proc].model_name, model_name.c_str(), HSA_PUBLIC_NAME_SIZE);
+      continue;
+    }
+    if (line.substr(0, 6) == "apicid" && proc < num_procs) {
+      cpuinfo[proc].apicid = std::strtoul(value_of(line).c_str(), nullptr, 10);
+      continue;
+    }
+    /* Fall back to the highest "cpu MHz" when the sysfs read did not work. */
+    if (!cpuinfo_max_freq && line.substr(0, 7) == "cpu MHz") {
+      const double freq = std::strtod(value_of(line).c_str(), nullptr);
+      if (freq > dxg_topology->freq_max_)
+        dxg_topology->freq_max_ = freq;
+    }
+  }
+
+  if (dxg_topology->processor_vendor < 0) {
+    pr_err("Failed to get Processor Vendor. Setting to %s", supported_processor_vendor_name[GENUINE_INTEL]);
+    dxg_topology->processor_vendor = GENUINE_INTEL;
+  }
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* topology_sysfs_get_node_props - Fill in the properties of one topology node
+ *	@node_id [IN] node index; 0 is treated as the CPU node, any other value
+ *		is resolved to a WDDM device through topology_map_node_id()
+ *	@props [OUT] zeroed first, then populated for the node
+ *	@p2p_links [OUT] always set to false by this implementation
+ *	@num_p2pLinks [OUT] always set to 0 by this implementation
+ * Return: HSAKMT_STATUS_SUCCESS on success, the topology_map_node_id() status
+ *	when the node cannot be mapped, or HSAKMT_STATUS_ERROR when
+ *	HSA_OVERRIDE_GFX_VERSION is set but malformed
+ */
+static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
+                                                   HsaNodeProperties& props,
+                                                   bool& p2p_links,
+                                                   uint32_t& num_p2pLinks) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  memset(&props, 0, sizeof(props));
+  p2p_links = false;
+  num_p2pLinks = 0;
+
+  props.MaxEngineClockMhzCCompute = dxg_topology->freq_max_;
+
+  if (node_id == 0) {
+    /* CPU node */
+    props.NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN);
+    props.NumMemoryBanks = 1;
+    props.KFDGpuID = 0;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  /* gpu node */
+  wsl::thunk::WDDMDevice *device;
+  ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  /* These two values are used as a divisor / log2 argument below. A zero
+   * reported by the device must not become a division by zero or an
+   * out-of-range floating-point to integer conversion, so the derived
+   * fields fall back to 0 in that case.
+   */
+  const auto simd_per_cu = device->SimdPerCu();
+  const auto watch_points = device->WatchPointsNum();
+
+  props.NumCPUCores = 0;
+  props.NumFComputeCores = simd_per_cu * device->ComputeUnitCount();
+  props.NumMemoryBanks = 1;
+  props.NumCaches = 3;
+  props.NumIOLinks = 1;
+  props.CComputeIdLo = 0;
+  props.FComputeIdLo = 0;
+  props.Capability.ui32.ASICRevision = device->AsicRevision();
+  props.Capability.ui32.WatchPointsTotalBits =
+      watch_points > 0 ? std::log2(watch_points) : 0;
+  props.MaxWavesPerSIMD = simd_per_cu ? device->WavePerCu() / simd_per_cu : 0;
+  props.LDSSizeInKB = device->LdsSize() / 1024;
+  props.GDSSizeInKB = 0;
+  props.WaveFrontSize = device->WavefrontSize();
+  props.NumShaderBanks = device->NumShaderEngine();
+  props.NumArrays = device->ShaderArrayPerShaderEngine();
+  props.NumCUPerArray =
+      props.NumArrays ? device->ComputeUnitCount() / props.NumArrays : 0;
+  props.NumSIMDPerCU = simd_per_cu;
+  props.MaxSlotsScratchCU = device->MaxScratchSlotsPerCu();
+  props.VendorId = 0x1002;
+  props.DeviceId = device->DeviceId();
+  props.LocationId = device->PciBusAddr();
+  props.LocalMemSize = 0;
+  props.MaxEngineClockMhzFCompute = device->MaxEngineClockMhz();
+  props.DrmRenderMinor = node_id;
+
+  {
+    /* Element-wise copy of the product name, always NUL-terminated and
+     * truncated to HSA_PUBLIC_NAME_SIZE - 1 characters.
+     */
+    int i;
+    const char *name = device->ProductName();
+    for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++)
+      props.MarketingName[i] = name[i];
+    props.MarketingName[i] = '\0';
+  }
+  props.uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion();
+  props.DebugProperties.Value = 0;
+  props.HiveID = 0;
+  props.NumSdmaEngines = device->NumSdmaEngine();
+  props.NumSdmaXgmiEngines = 0;
+  props.NumSdmaQueuesPerEngine = 6; // TODO
+  props.NumCpQueues = device->GetNumCpQueues();
+  props.NumGws = 0;
+  /*
+   * In native Linux this value is 1 for an APU and 0 for a dGPU. clr uses
+   * it to set hostUnifiedMemory_, but WSL does not support that feature
+   * yet, so force the value to 0 for now.
+   */
+  props.Integrated = 0;
+  props.Domain = device->Domain();
+  props.UniqueID = device->Uuid();
+  props.NumXcc = 1;
+  props.KFDGpuID = device->DeviceId(); // TODO
+  props.FamilyID = device->GfxFamily();
+
+  props.EngineId.ui32.uCode = device->GetMecFwVersion();
+  char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION");
+  if (envvar) {
+    char dummy = '\0';
+    uint32_t major = 0, minor = 0, step = 0;
+    /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping
+     * The trailing %c makes sscanf return 4 when anything follows the
+     * stepping, so only an exact "a.b.c" string is accepted.
+     */
+    if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) ||
+        (major > 63 || minor > 255 || step > 255)) {
+      pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar);
+      return HSAKMT_STATUS_ERROR;
+    }
+    props.OverrideEngineId.ui32.Major = major & 0x3f;
+    props.OverrideEngineId.ui32.Minor = minor & 0xff;
+    props.OverrideEngineId.ui32.Stepping = step & 0xff;
+  }
+  props.EngineId.ui32.Major = device->Major();
+  props.EngineId.ui32.Minor = device->Minor();
+  props.EngineId.ui32.Stepping = device->Stepping();
+
+  snprintf((char *)props.AMDName, sizeof(props.AMDName) - 1, "GFX%06x",
+           HSA_GET_GFX_VERSION_FULL(props.EngineId.ui32));
+
+  if (!dxg_runtime->is_svm_api_supported)
+    props.Capability.ui32.SVMAPISupported = 0;
+  props.Capability.ui32.DoorbellType = 2;
+
+  /* Get VGPR/SGPR size in byte per CU */
+  props.SGPRSizePerCU = SGPR_SIZE_PER_CU;
+  props.VGPRSizePerCU = get_vgpr_size_per_cu(props.EngineId);
+
+  if (props.NumFComputeCores)
+    assert(props.EngineId.ui32.Major &&
+           "HSA_OVERRIDE_GFX_VERSION may be needed");
+
+  return ret;
+}
+
+/* topology_sysfs_get_mem_props - Describe the memory bank of one node
+ *	@node_id [IN] node index; 0 is the CPU node, any other value is
+ *		resolved to a WDDM device through topology_map_node_id()
+ *	@mem_id [IN] bank index (not used by this implementation)
+ *	@props [OUT] zeroed first, then populated
+ * For the CPU node this also updates dxg_runtime->max_single_alloc_size.
+ * Return: HSAKMT_STATUS_SUCCESS on success, HSAKMT_STATUS_ERROR when the
+ *	system memory size cannot be queried, or the topology_map_node_id()
+ *	status when the GPU node cannot be mapped
+ */
+static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id,
+                                                  uint32_t mem_id,
+                                                  HsaMemoryProperties& props) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  std::memset(&props, 0, sizeof(props));
+  if (node_id == 0) {
+    /* CPU node */
+    props.HeapType = HSA_HEAPTYPE_SYSTEM;
+
+    struct sysinfo info;
+    if (sysinfo(&info) != 0) {
+      pr_err("Failed to query system memory size\n");
+      return HSAKMT_STATUS_ERROR;
+    }
+
+    /* sysinfo reports totalram in units of mem_unit bytes, so scale it to
+     * get the size in bytes.
+     */
+    const uint64_t total_ram =
+        static_cast<uint64_t>(info.totalram) * info.mem_unit;
+    props.SizeInBytes = total_ram;
+
+    /* props.SizeInBytes is the actual physical system
+     * memory size. Reserve 1/16th for WSL system usage.
+     */
+    dxg_runtime->max_single_alloc_size = total_ram - (total_ram >> 4);
+
+    props.Flags.MemoryProperty = 0;
+    /* TODO: sudo dmidecode --type memory doesn't work on wsl */
+    props.Width = 64;
+    props.MemoryClockMax = 2133;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  wsl::thunk::WDDMDevice *device;
+  ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  props.HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE;
+
+  /* A dGPU reports its local heap, anything else its non-local heap */
+  if (device->IsDgpu())
+    props.SizeInBytes = device->LocalHeapSize();
+  else
+    props.SizeInBytes = device->NonLocalHeapSize();
+
+  props.Width = device->MemoryBusWidth();
+  props.MemoryClockMax = device->MaxMemoryClockMhz();
+
+  return ret;
+}
+
+/* topology_get_cpu_cache_props - Read CPU cache information from sysfs
+ *	@node [IN] CPU node number
+ *	@cpuinfo [IN] /proc/cpuinfo data
+ *	@tbl [OUT] the node table to fill up (tbl.cache and tbl.node.NumCaches)
+ * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure.
+ *	Note that an over-long cache path is logged and stops the scan without
+ *	changing the return value.
+ */
+static HSAKMT_STATUS topology_get_cpu_cache_props(int node,
+                                                  const std::vector<proc_cpuinfo>& cpuinfo,
+                                                  node_props_t& tbl) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+
+  /* Get max path size from /sys/devices/system/node/node%d/%s/cache
+   * below, which will max out according to the largest filename,
+   * which can be present twice in the string above. 29 is for the prefix
+   * and the +6 is for the cache suffix
+   */
+#ifndef MAXNAMLEN
+/* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but
+ * not musl */
+#define MAXNAMLEN NAME_MAX
+#endif
+  constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6);
+  char path[MAXPATHSIZE], node_dir[MAXPATHSIZE];
+  int max_cpus;
+  int cache_cnt = 0;
+  DIR *dirp = NULL;
+  struct dirent *dir;
+
+  /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */
+  int node_real = node;
+  if (dxg_topology->processor_vendor == IBM_POWER) {
+    if (!strcmp(cpuinfo[0].model_name, "POWER9")) {
+      node_real = node * 8;
+    }
+  }
+  snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real);
+  /* Other than cpuY folders, this dir also has cpulist and cpumap */
+  max_cpus = num_subdirs(node_dir, "cpu");
+  if (max_cpus <= 0) {
+    /* If CONFIG_NUMA is not enabled in the kernel,
+     * /sys/devices/system/node doesn't exist.
+     */
+    if (node) { /* CPU node must be 0 or something is wrong */
+      pr_err("Fail to get cpu* dirs under %s.\n", node_dir);
+      ret = HSAKMT_STATUS_ERROR;
+      goto exit;
+    }
+    /* Fall back to use /sys/devices/system/cpu */
+    snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu");
+    max_cpus = num_subdirs(node_dir, "cpu");
+    if (max_cpus <= 0) {
+      pr_err("Fail to get cpu* dirs under %s\n", node_dir);
+      ret = HSAKMT_STATUS_ERROR;
+      goto exit;
+    }
+  }
+
+  dirp = opendir(node_dir);
+  if (!dirp) {
+    /* readdir() on a NULL stream would crash; report the failure instead */
+    pr_err("Fail to open %s\n", node_dir);
+    ret = HSAKMT_STATUS_ERROR;
+    goto exit;
+  }
+
+  while ((dir = readdir(dirp)) != 0) {
+    if (strncmp(dir->d_name, "cpu", 3))
+      continue;
+    /* ignore files like cpulist; the cast keeps isdigit() defined for
+     * characters with the high bit set
+     */
+    if (!isdigit(static_cast<unsigned char>(dir->d_name[3])))
+      continue;
+    if (strlen(node_dir) + strlen(dir->d_name) + strlen("/cache") + 2 < MAXPATHSIZE) {
+      std::string path_str = std::string(node_dir) + "/" + dir->d_name + "/cache";
+      strncpy(path, path_str.c_str(), MAXPATHSIZE);
+      path[MAXPATHSIZE - 1] = '\0';
+    } else {
+      pr_err("Path is too long and was truncated.\n");
+      goto exit;
+    }
+
+    cpu_cacheinfo_t cpu_ci;
+    cpu_ci.num_caches = num_subdirs(path, "index");
+    /* the digits after "cpu" are the processor number */
+    cpu_ci.proc_num = atoi(dir->d_name + 3);
+
+    cache_cnt += get_cpu_cache_info(path, cpuinfo, tbl.cache, cpu_ci);
+  }
+  assert(cache_cnt >= 0 && static_cast<size_t>(cache_cnt) == tbl.cache.size());
+  tbl.node.NumCaches = cache_cnt;
+
+exit:
+  if (dirp)
+    closedir(dirp);
+  return ret;
+}
+
+/* Fill in the IO link description for the GPU node @node_id.
+ *
+ * This implementation does not parse sysfs: every link is reported as a
+ * PCI Express link from @node_id to node 0 with weight 20, and the atomics
+ * flags are derived from the device's SupportPlatformAtomic().
+ *	@node_id [IN] node the link starts from; must map to a WDDM device
+ *	@iolink_id [IN] not used by this implementation
+ *	@props [OUT] zeroed first, then populated on success
+ *	@p2pLink [IN] not used by this implementation
+ * Return: HSAKMT_STATUS_SUCCESS on success, or the topology_map_node_id()
+ *	status when @node_id cannot be mapped to a device.
+ */
+static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
+                                                     uint32_t iolink_id,
+                                                     HsaIoLinkProperties& props,
+                                                     bool p2pLink) {
+  std::memset(&props, 0, sizeof(props));
+
+  /* The device pointer is dereferenced below, so a failed mapping must be
+   * reported instead of being ignored.
+   */
+  wsl::thunk::WDDMDevice *device = nullptr;
+  const HSAKMT_STATUS ret = topology_map_node_id(node_id, device);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  props.IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS;
+  props.VersionMajor = props.VersionMinor = 0;
+  props.NodeFrom = node_id;
+  props.NodeTo = 0;
+  props.Weight = 20;
+  props.Flags.ui32.Override = 1;
+  props.Flags.ui32.NonCoherent = 1;
+  props.Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic());
+  props.Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic());
+  props.RecSdmaEngIdMask = 0;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* topology_get_free_io_link_slot_for_node - For the given node_id, find the
+ * next available free slot to add an io_link
+ *	@node_id [IN] node whose link table is searched
+ *	@sys_props [IN] system properties; NumNodes bounds both @node_id and
+ *		the number of links a node may hold (NumNodes - 1)
+ *	@node_props [IN] per-node tables; the slot returned points into
+ *		node_props[node_id].link
+ * Return: pointer to the slot at index NumIOLinks, or NULL when the node is
+ *	invalid, has no link storage, or has no free slot left
+ */
+static HsaIoLinkProperties *
+topology_get_free_io_link_slot_for_node(uint32_t node_id,
+                                        const HsaSystemProperties& sys_props,
+                                        std::vector<node_props_t>& node_props) {
+  /* Validate the index before it is used: binding a reference to
+   * node_props[node_id] first would already be out of bounds for an
+   * invalid node.
+   */
+  if (node_id >= sys_props.NumNodes || node_id >= node_props.size()) {
+    pr_err("Invalid node [%d]\n", node_id);
+    return NULL;
+  }
+
+  node_props_t& entry = node_props[node_id];
+
+  if (entry.link.empty()) {
+    pr_err("No io_link reported for Node [%d]\n", node_id);
+    return NULL;
+  }
+
+  /* Also bound by the storage actually allocated for this node */
+  if (entry.node.NumIOLinks >= sys_props.NumNodes - 1 ||
+      entry.node.NumIOLinks >= entry.link.size()) {
+    pr_err("No more space for io_link for Node [%d]\n", node_id);
+    return NULL;
+  }
+
+  return &entry.link[entry.node.NumIOLinks];
+}
+
+/* topology_add_io_link_for_node - Record one more io_link on @node_from.
+ * The link is written into the next free slot of that node's link table and
+ * the node's NumIOLinks counter is bumped.
+ * Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_NO_MEMORY when no slot is
+ *	available.
+ * TODO: Add other members of HsaIoLinkProperties
+ */
+static HSAKMT_STATUS topology_add_io_link_for_node(
+    uint32_t node_from, const HsaSystemProperties& sys_props,
+    std::vector<node_props_t>& node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to,
+    uint32_t Weight) {
+  HsaIoLinkProperties *const slot =
+      topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props);
+  if (slot == NULL)
+    return HSAKMT_STATUS_NO_MEMORY;
+
+  slot->Weight = Weight;
+  slot->NodeTo = node_to;
+  slot->NodeFrom = node_from;
+  slot->IoLinkType = IoLinkType;
+
+  /* The slot just filled is now part of the node's valid links */
+  ++node_props[node_from].node.NumIOLinks;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Look up the CPU node that the GPU node @gpu_node is directly attached to.
+ * Returns the NodeTo of the first PCI Express link whose weight is at most
+ * 20, or -1 when @gpu_node is not a GPU, has no links, or no link qualifies.
+ */
+static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node,
+                                       const std::vector<node_props_t>& node_props) {
+  const node_props_t& gpu = node_props[gpu_node];
+  const uint32_t link_count = gpu.node.NumIOLinks;
+
+  if (gpu.node.KFDGpuID == 0 || gpu.link.empty() || link_count == 0)
+    return -1;
+
+  for (uint32_t idx = 0; idx < link_count; ++idx) {
+    const HsaIoLinkProperties& link = gpu.link[idx];
+
+    if (link.IoLinkType != HSA_IOLINKTYPE_PCIEXPRESS)
+      continue;
+    if (link.Weight > 20) /* >20 is GPU->CPU->GPU */
+      continue;
+    return link.NodeTo;
+  }
+
+  return -1;
+}
+
+/* Report the weight and type of the direct node1->node2 IO link, i.e. a link
+ * already present in node1's link table.
+ *	@weight [OUT] optional; receives the link weight when non-NULL
+ *	@type [OUT] optional; receives the link type when non-NULL
+ * Return: HSAKMT_STATUS_SUCCESS when the link exists,
+ *	HSAKMT_STATUS_INVALID_NODE_UNIT when node1 has no link table, or
+ *	HSAKMT_STATUS_INVALID_PARAMETER when no link to node2 is found.
+ */
+static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2,
+                                            const std::vector<node_props_t>& node_props,
+                                            HSAuint32 *weight,
+                                            HSA_IOLINKTYPE *type) {
+  const node_props_t& from = node_props[node1];
+
+  if (from.link.empty())
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  for (uint32_t idx = 0; idx < from.node.NumIOLinks; ++idx) {
+    const HsaIoLinkProperties& link = from.link[idx];
+
+    if (link.NodeTo != node2)
+      continue;
+
+    if (weight != NULL)
+      *weight = link.Weight;
+    if (type != NULL)
+      *type = link.IoLinkType;
+    return HSAKMT_STATUS_SUCCESS;
+  }
+
+  return HSAKMT_STATUS_INVALID_PARAMETER;
+}
+
+/* Compute the weight and type of an indirect node1->node2 route that passes
+ * through the CPU(s) the GPU endpoint(s) are directly linked to.
+ *	@node1 [IN] source node index into @node_props
+ *	@node2 [IN] destination node index into @node_props
+ *	@node_props [IN] per-node properties holding the direct links
+ *	@weight [OUT] sum of the per-hop weights on success; it is set to 0 up
+ *		front and only written again on success
+ *	@type [OUT] set to HSA_IOLINKTYPE_UNDEFINED up front, then overwritten
+ *		by whichever hop lookup below is passed @type
+ * Return: HSAKMT_STATUS_SUCCESS when a route was found, otherwise an error
+ *	status (see the individual checks below)
+ */
+static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2,
+                                              const std::vector<node_props_t>& node_props,
+                                              HSAuint32 *weight,
+                                              HSA_IOLINKTYPE *type) {
+  int32_t dir_cpu1 = -1, dir_cpu2 = -1;
+  HSAKMT_STATUS ret;
+  uint32_t i;
+
+  *weight = 0;
+  *type = HSA_IOLINKTYPE_UNDEFINED;
+
+  if (node1 == node2)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* CPU->CPU is not an indirect link */
+  if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  /* Two nodes sharing the same non-zero HiveID are rejected here */
+  if (node_props[node1].node.HiveID && node_props[node2].node.HiveID &&
+      node_props[node1].node.HiveID == node_props[node2].node.HiveID)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Only GPU endpoints have a "direct CPU"; for a CPU endpoint the
+   * corresponding dir_cpu stays -1.
+   */
+  if (node_props[node1].node.KFDGpuID)
+    dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props);
+  if (node_props[node2].node.KFDGpuID)
+    dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props);
+
+  if (dir_cpu1 < 0 && dir_cpu2 < 0)
+    return HSAKMT_STATUS_ERROR;
+
+  /* if the node2(dst) is GPU , it need to be large bar for host access*/
+  if (node_props[node2].node.KFDGpuID) {
+    for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i)
+      if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)
+        break;
+    /* Loop ran to completion: no public frame buffer heap was found */
+    if (i >= node_props[node2].node.NumMemoryBanks)
+      return HSAKMT_STATUS_ERROR;
+  }
+  /* Possible topology:
+   *   GPU --(weight1) -- CPU -- (weight2) -- GPU
+   *   GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- GPU
+   *   GPU --(weight1) -- CPU -- (weight2) -- CPU
+   *   CPU -- (weight2) -- CPU -- (weight3) -- GPU
+   */
+  HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0;
+  if (dir_cpu1 >= 0) { /* GPU->CPU ... */
+    if (dir_cpu2 >= 0) {
+      if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ {
+        ret =
+            get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        ret =
+            get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type);
+      } else /* GPU->CPU->CPU->GPU*/ {
+        ret =
+            get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2,
+                                     type);
+        if (ret != HSAKMT_STATUS_SUCCESS)
+          return ret;
+        /* On QPI interconnection, GPUs can't access
+         * each other if they are attached to different
+         * CPU sockets. CPU<->CPU weight larger than 20
+         * means the two CPUs are in different sockets.
+         */
+        if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20)
+          return HSAKMT_STATUS_NOT_SUPPORTED;
+        ret =
+            get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL);
+      }
+    } else /* GPU->CPU->CPU */ {
+      ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL);
+      if (ret != HSAKMT_STATUS_SUCCESS)
+        return ret;
+      ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type);
+    }
+  } else { /* CPU->CPU->GPU */
+    ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type);
+    if (ret != HSAKMT_STATUS_SUCCESS)
+      return ret;
+    ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL);
+  }
+
+  /* Status of the last hop lookup of whichever branch ran above */
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    return ret;
+
+  /* Hops that do not exist on the chosen route contribute 0 */
+  *weight = weight1 + weight2 + weight3;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* For every unordered pair of nodes, try to add an indirect IO link in each
+ * direction. get_indirect_iolink_info() leaves the weight at 0 when no
+ * indirect route exists, in which case that direction is skipped. Failures
+ * to store a link are logged and otherwise ignored.
+ */
+static void
+topology_create_indirect_gpu_links(const HsaSystemProperties& sys_props,
+                                   std::vector<node_props_t>& node_props) {
+  const uint32_t num_nodes = sys_props.NumNodes;
+  HSAuint32 weight;
+  HSA_IOLINKTYPE type;
+
+  /* "i + 1 < num_nodes" rather than "i < num_nodes - 1": the latter wraps
+   * around for an unsigned num_nodes of 0.
+   */
+  for (uint32_t i = 0; i + 1 < num_nodes; i++) {
+    for (uint32_t j = i + 1; j < num_nodes; j++) {
+      /* i -> j */
+      get_indirect_iolink_info(i, j, node_props, &weight, &type);
+      if (weight &&
+          topology_add_io_link_for_node(i, sys_props, node_props, type, j,
+                                        weight) != HSAKMT_STATUS_SUCCESS)
+        pr_err("Fail to add IO link %d->%d\n", i, j);
+
+      /* j -> i, attempted regardless of the outcome above */
+      get_indirect_iolink_info(j, i, node_props, &weight, &type);
+      if (weight &&
+          topology_add_io_link_for_node(j, sys_props, node_props, type, i,
+                                        weight) != HSAKMT_STATUS_SUCCESS)
+        pr_err("Fail to add IO link %d->%d\n", j, i);
+    }
+  }
+}
+
+/* topology_take_snapshot - Build the topology snapshot
+ * Populates dxg_topology->g_props (one entry per node: node, memory, cache
+ * and io_link properties) and, on success, dxg_topology->g_system.
+ * Return: HSAKMT_STATUS_SUCCESS on success; on failure the status of the step
+ *	that failed, in which case g_system is not updated.
+ */
+HSAKMT_STATUS topology_take_snapshot(void) {
+  uint32_t i, mem_id;
+  HsaSystemProperties sys_props;
+  std::vector<node_props_t>& temp_props = dxg_topology->g_props;
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN);
+  std::vector<proc_cpuinfo> cpuinfo(num_procs);
+  uint32_t num_ioLinks;
+  bool p2p_links = false;
+  uint32_t num_p2pLinks = 0;
+
+  topology_parse_cpuinfo(cpuinfo);
+
+  ret = topology_sysfs_get_system_props(sys_props);
+  if (ret != HSAKMT_STATUS_SUCCESS)
+    goto err;
+  if (sys_props.NumNodes > 0) {
+    temp_props.resize(sys_props.NumNodes);
+
+    for (i = 0; i < sys_props.NumNodes; i++) {
+      /* Only needed for the GPU cache sizes below. Start from nullptr and
+       * keep the mapping status so that a node without a device is never
+       * dereferenced.
+       */
+      wsl::thunk::WDDMDevice *device_ = nullptr;
+      const HSAKMT_STATUS map_ret = topology_map_node_id(i, device_);
+
+      ret = topology_sysfs_get_node_props(i, temp_props[i].node, p2p_links,
+                                          num_p2pLinks);
+      if (ret != HSAKMT_STATUS_SUCCESS) {
+        goto err;
+      }
+
+      topology_setup_is_dgpu_param(&temp_props[i].node);
+
+      if (temp_props[i].node.NumCPUCores)
+        topology_get_cpu_model_name(temp_props[i].node, cpuinfo);
+
+      if (temp_props[i].node.NumMemoryBanks) {
+        temp_props[i].mem.resize(temp_props[i].node.NumMemoryBanks);
+
+        for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) {
+          ret = topology_sysfs_get_mem_props(i, mem_id,
+                                             temp_props[i].mem[mem_id]);
+          if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+        }
+      }
+
+      if (temp_props[i].node.NumCaches) {
+        /* Node reporting caches up front: the sizes come from the device */
+        if (map_ret != HSAKMT_STATUS_SUCCESS || !device_) {
+          ret = (map_ret != HSAKMT_STATUS_SUCCESS) ? map_ret
+                                                   : HSAKMT_STATUS_ERROR;
+          goto err;
+        }
+        temp_props[i].cache.resize(temp_props[i].node.NumCaches);
+        /* NOTE(review): the fixed bound of 3 relies on NumCaches being at
+         * least 3 for such nodes - confirm if that ever changes.
+         */
+        for (int j = 0; j < 3; j++) {
+          temp_props[i].cache[j].CacheType.ui32.Data = 1;
+          temp_props[i].cache[j].CacheType.ui32.HSACU = 1;
+          temp_props[i].cache[j].CacheLevel = j + 1;
+        }
+        temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024;
+        temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024;
+        temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024;
+      } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */
+        ret = topology_get_cpu_cache_props(i, cpuinfo, temp_props[i]);
+        if (ret != HSAKMT_STATUS_SUCCESS) {
+          goto err;
+        }
+      }
+
+      /* To simplify, allocate maximum needed memory for io_links for each node.
+       * This removes the need for realloc when indirect and QPI links are added
+       * later
+       */
+      temp_props[i].link.resize(sys_props.NumNodes - 1);
+      num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks;
+      uint32_t link_id = 0;
+
+      if (num_ioLinks) {
+        uint32_t sys_link_id = 0;
+
+        /* Parse all the reported io links. Skip the ones where the
+         * remote node (node_to) is not accessible
+         */
+        while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) {
+          ret = topology_sysfs_get_iolink_props(
+              i, sys_link_id++, temp_props[i].link[link_id], false);
+          if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
+            ret = HSAKMT_STATUS_SUCCESS;
+            continue;
+          } else if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+          link_id++;
+        }
+        /* Limit the number of io links to the valid ones */
+        temp_props[i].node.NumIOLinks = link_id;
+      }
+
+      if (num_p2pLinks) {
+        uint32_t sys_link_id = 0;
+
+        /* Parse all the reported p2p links.
+         */
+        while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) {
+          ret = topology_sysfs_get_iolink_props(
+              i, sys_link_id++, temp_props[i].link[link_id], true);
+          if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
+            ret = HSAKMT_STATUS_SUCCESS;
+            continue;
+          } else if (ret != HSAKMT_STATUS_SUCCESS) {
+            goto err;
+          }
+          link_id++;
+        }
+        temp_props[i].node.NumIOLinks = link_id;
+      }
+    }
+  }
+
+  /* With fewer than two nodes there is no pair to connect, so the pass is
+   * skipped entirely in that case.
+   */
+  if (!p2p_links && sys_props.NumNodes > 1) {
+    /* All direct IO links are created above. Here we need to
+     * connect GPU<->GPU or GPU<->CPU indirect IO links.
+     */
+    topology_create_indirect_gpu_links(sys_props, temp_props);
+  }
+
+  if (!dxg_topology->g_system) {
+    dxg_topology->g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties));
+    if (!dxg_topology->g_system) {
+      ret = HSAKMT_STATUS_NO_MEMORY;
+      goto err;
+    }
+  }
+
+  *dxg_topology->g_system = sys_props;
+err:
+  return ret;
+}
+
+/* Drop the snapshot of the HSA topology information: the per-node table, the
+ * system properties and the WDDM device objects. Assume lock is held.
+ */
+void topology_drop_snapshot(void) {
+  const bool have_system = dxg_topology->g_system != NULL;
+  const bool have_props = !dxg_topology->g_props.empty();
+
+  /* Both halves of the snapshot are expected to exist together */
+  if (have_system != have_props)
+    pr_warn("Probably inconsistency?\n");
+
+  dxg_topology->g_props.clear();
+
+  free(dxg_topology->g_system);
+  dxg_topology->g_system = NULL;
+
+  trim_suballocator();
+
+  for (auto *wdev : dxg_topology->wdevices_) {
+    delete wdev;
+  }
+  dxg_topology->wdevices_.clear();
+}
+
+/* Check that @nodeid refers to a node of the current snapshot.
+ *	@gpu_id [OUT] optional; receives the node's KFDGpuID when non-NULL
+ * Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_INVALID_NODE_UNIT when no
+ *	snapshot exists or @nodeid is out of range.
+ */
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) {
+  const bool have_snapshot =
+      !dxg_topology->g_props.empty() && dxg_topology->g_system != NULL;
+
+  if (!have_snapshot || nodeid >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  if (gpu_id != NULL)
+    *gpu_id = dxg_topology->g_props[nodeid].node.KFDGpuID;
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Translate a gpu_id into the index of the node that carries it.
+ *	@gpu_id [IN] KFDGpuID to look for
+ *	@node_id [OUT] receives the index of the first matching node
+ * Return: HSAKMT_STATUS_SUCCESS when found,
+ *	HSAKMT_STATUS_INVALID_PARAMETER when @node_id is NULL, or
+ *	HSAKMT_STATUS_INVALID_NODE_UNIT when there is no snapshot or no node
+ *	has that gpu_id.
+ */
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) {
+  if (!node_id)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Same snapshot guard as the other lookups in this file: without it
+   * g_system would be dereferenced while NULL.
+   */
+  if (dxg_topology->g_props.empty() || !dxg_topology->g_system)
+    return HSAKMT_STATUS_INVALID_NODE_UNIT;
+
+  for (uint64_t node_idx = 0;
+       node_idx < dxg_topology->g_system->NumNodes &&
+       node_idx < dxg_topology->g_props.size();
+       node_idx++) {
+    if (dxg_topology->g_props[node_idx].node.KFDGpuID == gpu_id) {
+      *node_id = node_idx;
+      return HSAKMT_STATUS_SUCCESS;
+    }
+  }
+
+  return HSAKMT_STATUS_INVALID_NODE_UNIT;
+}
+
+/* Public API: return the system properties, taking a topology snapshot first
+ * if none exists yet.
+ *	@SystemProperties [OUT] receives a copy of the snapshot's system
+ *		properties on success
+ * Return: HSAKMT_STATUS_INVALID_PARAMETER when @SystemProperties is NULL,
+ *	the topology_take_snapshot() status when the snapshot fails, otherwise
+ *	HSAKMT_STATUS_SUCCESS. Runs under dxg_runtime->hsakmt_mutex.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) {
+  HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
+
+  CHECK_DXG_OPEN();
+
+  if (!SystemProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  /* We already have a valid snapshot. Avoid double initialization that
+   * would leak memory.
+   */
+  if (dxg_topology->g_system) {
+    *SystemProperties = *dxg_topology->g_system;
+    goto out;
+  }
+
+  err = topology_take_snapshot();
+  if (err != HSAKMT_STATUS_SUCCESS)
+    goto out;
+
+  assert(dxg_topology->g_system);
+
+  /* The two initialization steps below are currently disabled, so err is
+   * still HSAKMT_STATUS_SUCCESS here and neither failure label is reached.
+   * The checks and labels are kept as the unwind path for when the calls
+   * are enabled.
+   */
+  // err = fmm_init_process_apertures(dxg_topology->g_system->NumNodes);
+  if (err != HSAKMT_STATUS_SUCCESS)
+    goto init_process_apertures_failed;
+
+  // err = init_process_doorbells(dxg_topology->g_system->NumNodes);
+  if (err != HSAKMT_STATUS_SUCCESS)
+    goto init_doorbells_failed;
+
+  *SystemProperties = *dxg_topology->g_system;
+
+  goto out;
+
+init_doorbells_failed:
+  // fmm_destroy_process_apertures();
+init_process_apertures_failed:
+  topology_drop_snapshot();
+
+out:
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Public API: drop the topology snapshot while holding
+ * dxg_runtime->hsakmt_mutex. Always returns HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) {
+  auto *const lock = &dxg_runtime->hsakmt_mutex;
+
+  pthread_mutex_lock(lock);
+  topology_drop_snapshot();
+  pthread_mutex_unlock(lock);
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Copy the snapshot's node properties of @NodeId into @NodeProperties.
+ * Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR when no snapshot
+ *	exists or @NodeId is out of range.
+ */
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+                                      HsaNodeProperties *NodeProperties) {
+  const bool snapshot_ready =
+      dxg_topology->g_system != NULL && !dxg_topology->g_props.empty();
+
+  if (!snapshot_ready || NodeId >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_ERROR;
+
+  const node_props_t& entry = dxg_topology->g_props[NodeId];
+  *NodeProperties = entry.node;
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Public API: return the properties of node @NodeId.
+ * For a GPU node the reported NumMemoryBanks is increased by the number of
+ * extra heaps (NUM_OF_IGPU_HEAPS when Integrated is set, NUM_OF_DGPU_HEAPS
+ * otherwise); only the caller's copy is adjusted, not the snapshot.
+ * Return: HSAKMT_STATUS_INVALID_PARAMETER when @NodeProperties is NULL,
+ *	otherwise the status of the node validation / lookup.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) {
+  HSAKMT_STATUS err;
+  uint32_t gpu_id;
+
+  if (!NodeProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  err = validate_nodeid(NodeId, &gpu_id);
+  if (err == HSAKMT_STATUS_SUCCESS)
+    err = topology_get_node_props(NodeId, NodeProperties);
+
+  /* For CPU only node don't add any additional GPU memory banks. */
+  if (err == HSAKMT_STATUS_SUCCESS && gpu_id) {
+    uint64_t base, limit;
+    if (NodeProperties->Integrated)
+      NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
+    else
+      NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
+    // TODO: for apu
+    /*if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base,
+                    &limit) == HSAKMT_STATUS_SUCCESS)
+            NodeProperties->NumMemoryBanks += 1;*/
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Public API: return up to @NumBanks memory bank descriptions of @NodeId.
+ * The output array is zeroed first. The banks recorded in the snapshot come
+ * first; for a node backed by a WDDM device an LDS and then a SCRATCH
+ * entry are appended while room remains.
+ * Return: HSAKMT_STATUS_INVALID_PARAMETER when @MemoryProperties is NULL,
+ *	HSAKMT_STATUS_INVALID_NODE_UNIT when no snapshot exists or @NodeId is
+ *	out of range, otherwise HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks,
+                              HsaMemoryProperties *MemoryProperties) {
+  HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
+
+  if (!MemoryProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties));
+
+  /* Snapshot protocol violation or bad node: g_props must not be indexed */
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes ||
+      NodeId >= dxg_topology->g_props.size()) {
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+  } else {
+    const node_props_t& entry = dxg_topology->g_props[NodeId];
+    uint32_t i = 0;
+
+    /* Never read past the banks actually stored for this node */
+    for (; i < wsl::Min(entry.node.NumMemoryBanks, NumBanks) &&
+           i < entry.mem.size();
+         i++)
+      MemoryProperties[i] = entry.mem[i];
+
+    /* The following memory banks does not apply to CPU only node */
+    wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
+    if (device_ != nullptr) {
+      /*Add LDS*/
+      if (i < NumBanks) {
+        MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS;
+        MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase();
+        MemoryProperties[i].SizeInBytes = entry.node.LDSSizeInKB * 1024;
+        i++;
+      }
+
+      /* Add SCRATCH */
+      if (i < NumBanks) {
+        MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH;
+        MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase();
+        MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize();
+      }
+    }
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Public API: copy the first @NumCaches cache descriptions of @NodeId into
+ * @CacheProperties. @ProcessorId is not used by this implementation.
+ * Return: HSAKMT_STATUS_INVALID_PARAMETER when @CacheProperties is NULL or
+ *	@NumCaches exceeds the node's cache count,
+ *	HSAKMT_STATUS_INVALID_NODE_UNIT when no snapshot exists or @NodeId is
+ *	out of range, otherwise HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(
+    HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches,
+    HsaCacheProperties *CacheProperties) {
+  HSAKMT_STATUS err;
+
+  if (!CacheProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) {
+    /* KFD ADD page 18, snapshot protocol violation */
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+  } else if (NumCaches > dxg_topology->g_props[NodeId].node.NumCaches) {
+    err = HSAKMT_STATUS_INVALID_PARAMETER;
+  } else {
+    const auto count =
+        wsl::Min(dxg_topology->g_props[NodeId].node.NumCaches, NumCaches);
+
+    for (uint32_t idx = 0; idx < count; ++idx) {
+      assert(dxg_topology->g_props[NodeId].cache.size());
+      CacheProperties[idx] = dxg_topology->g_props[NodeId].cache[idx];
+    }
+    err = HSAKMT_STATUS_SUCCESS;
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Copy the first @NumIoLinks io_link entries of @NodeId into
+ * @IoLinkProperties.
+ * Return: HSAKMT_STATUS_ERROR when no snapshot exists or @NodeId is out of
+ *	range, HSAKMT_STATUS_INVALID_PARAMETER when @IoLinkProperties is NULL
+ *	or more entries are requested than the node's link table holds,
+ *	otherwise HSAKMT_STATUS_SUCCESS.
+ */
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks,
+                                        HsaIoLinkProperties *IoLinkProperties) {
+  if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes)
+    return HSAKMT_STATUS_ERROR;
+
+  const std::vector<HsaIoLinkProperties>& links = dxg_topology->g_props[NodeId].link;
+
+  /* Copying more than the table holds would read past its storage */
+  if (!IoLinkProperties || NumIoLinks > links.size())
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Nothing to copy; also avoids handing memcpy the data() pointer of an
+   * empty vector.
+   */
+  if (NumIoLinks == 0)
+    return HSAKMT_STATUS_SUCCESS;
+
+  memcpy(IoLinkProperties, links.data(),
+         NumIoLinks * sizeof(*IoLinkProperties));
+
+  return HSAKMT_STATUS_SUCCESS;
+}
+
+/* Public API: copy the first @NumIoLinks io_link descriptions of @NodeId
+ * into @IoLinkProperties.
+ * Return: HSAKMT_STATUS_INVALID_PARAMETER when @IoLinkProperties is NULL or
+ *	@NumIoLinks exceeds the node's link count,
+ *	HSAKMT_STATUS_INVALID_NODE_UNIT when no snapshot exists or @NodeId is
+ *	out of range, otherwise the topology_get_iolink_props() status.
+ */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks,
+                              HsaIoLinkProperties *IoLinkProperties) {
+  HSAKMT_STATUS err;
+
+  if (!IoLinkProperties)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  CHECK_DXG_OPEN();
+
+  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
+
+  if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) {
+    /* KFD ADD page 18, snapshot protocol violation */
+    err = HSAKMT_STATUS_INVALID_NODE_UNIT;
+  } else if (NumIoLinks > dxg_topology->g_props[NodeId].node.NumIOLinks) {
+    err = HSAKMT_STATUS_INVALID_PARAMETER;
+  } else {
+    assert(dxg_topology->g_props[NodeId].link.size());
+    err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties);
+  }
+
+  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
+  return err;
+}
+
+/* Return the DeviceId recorded for @node_id, or 0 when no snapshot exists or
+ * @node_id is out of range.
+ */
+uint16_t get_device_id_by_node_id(HSAuint32 node_id) {
+  const bool have_snapshot =
+      !dxg_topology->g_props.empty() && dxg_topology->g_system != NULL;
+
+  if (have_snapshot && node_id < dxg_topology->g_system->NumNodes)
+    return dxg_topology->g_props[node_id].node.DeviceId;
+
+  return 0;
+}
+
+/* Return true when node @node_id reports HSAMMUPresent and has both CPU
+ * cores and GPU compute cores. Returns false for a @node_id that is not in
+ * the current node table instead of indexing past it.
+ */
+bool prefer_ats(HSAuint32 node_id) {
+  if (node_id >= dxg_topology->g_props.size())
+    return false;
+
+  const HsaNodeProperties& node = dxg_topology->g_props[node_id].node;
+
+  return node.Capability.ui32.HSAMMUPresent &&
+         node.NumCPUCores &&
+         node.NumFComputeCores;
+}
+
+/* Return the DeviceId of the first node whose KFDGpuID equals @gpu_id, or 0
+ * when no snapshot exists or no node matches.
+ */
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) {
+  if (dxg_topology->g_props.empty() || !dxg_topology->g_system)
+    return 0;
+
+  const unsigned int node_count = dxg_topology->g_system->NumNodes;
+  for (unsigned int idx = 0; idx < node_count; ++idx) {
+    const HsaNodeProperties& node = dxg_topology->g_props[idx].node;
+
+    if (node.KFDGpuID != gpu_id)
+      continue;
+    return node.DeviceId;
+  }
+
+  return 0;
+}
+
+/* Return the CPU node directly linked to @gpu_node, provided the memory banks
+ * of that CPU node add up to a non-zero size; INVALID_NODEID otherwise.
+ */
+uint32_t get_direct_link_cpu(uint32_t gpu_node) {
+  const int32_t cpu = gpu_get_direct_link_cpu(gpu_node, dxg_topology->g_props);
+  if (cpu == -1)
+    return INVALID_NODEID;
+
+  assert(dxg_topology->g_props[cpu].mem.size());
+
+  /* Sum the sizes of all memory banks of the CPU node */
+  HSAuint64 total = 0;
+  const HSAuint32 bank_count = dxg_topology->g_props[cpu].node.NumMemoryBanks;
+  for (HSAuint32 bank = 0; bank < bank_count; ++bank)
+    total += dxg_topology->g_props[cpu].mem[bank].SizeInBytes;
+
+  if (!total)
+    return INVALID_NODEID;
+  return (uint32_t)cpu;
+}
+
+/* Validate an array of node ids and translate it into a newly allocated
+ * array of gpu_ids.
+ *	@gpu_id_array [OUT] on success points to a malloc'ed array of
+ *		@NumberOfNodes gpu_ids that the caller must free(); set to
+ *		NULL when allocation or validation fails
+ *	@NumberOfNodes [IN] number of entries in @NodeArray; must be non-zero
+ *	@NodeArray [IN] node ids to validate
+ * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER for bad
+ *	arguments, HSAKMT_STATUS_NO_MEMORY, or the validate_nodeid() status of
+ *	the first invalid node.
+ */
+HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
+                                    uint32_t NumberOfNodes,
+                                    uint32_t *NodeArray) {
+  HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+  unsigned int i;
+
+  if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array)
+    return HSAKMT_STATUS_INVALID_PARAMETER;
+
+  /* Translate Node IDs to gpu_ids */
+  *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t));
+  if (!(*gpu_id_array))
+    return HSAKMT_STATUS_NO_MEMORY;
+  for (i = 0; i < NumberOfNodes; i++) {
+    ret = validate_nodeid(NodeArray[i], *gpu_id_array + i);
+    if (ret != HSAKMT_STATUS_SUCCESS) {
+      free(*gpu_id_array);
+      /* Do not hand a dangling pointer back to the caller, who might
+       * otherwise use or free it again.
+       */
+      *gpu_id_array = NULL;
+      break;
+    }
+  }
+
+  return ret;
+}
+
+/* Return dxg_topology->num_sysfs_nodes. */
+uint32_t get_num_sysfs_nodes(void) {
+  return dxg_topology->num_sysfs_nodes;
+}
+
+/* Return the WDDM device behind node @node_id, or nullptr when there is none.
+ * Node 0 has no device; node N (N >= 1) is stored at wdevices_[N - 1].
+ */
+wsl::thunk::WDDMDevice *get_wddmdev(uint32_t node_id) {
+  const auto device_count = dxg_topology->wdevices_.size();
+
+  /* The last test bounds the index by the vector itself (and covers the
+   * empty case), so a num_sysfs_nodes that is out of step with wdevices_
+   * cannot cause an out-of-range access.
+   */
+  if (node_id == 0 || node_id >= dxg_topology->num_sysfs_nodes ||
+      node_id > device_count)
+    return nullptr;
+
+  return dxg_topology->wdevices_[node_id - 1];
+}
+
+/* Return the number of entries in dxg_topology->wdevices_. */
+uint32_t get_num_wddmdev() {
+  const auto device_count = dxg_topology->wdevices_.size();
+  return device_count;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h
new file mode 100644
index 0000000000..4b7f8b0362
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h
@@ -0,0 +1,519 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+  Helpers to use native types with C++11 atomic operations.
+  Fixes GCC builtin functionality for x86 with respect to WC and non-temporal
+  stores.
+*/
+#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
+
+#include <atomic>
+#include "utils.h"
+
+//ALWAYS_CONSERVATIVE will very likely overfence your code.
+//For use as a debugging aid only.
+#define ALWAYS_CONSERVATIVE 0
+
+#if !ALWAYS_CONSERVATIVE
+#if defined(__x86_64__) || defined(_M_X64)
+#define X64_ORDER_WC 1
+#endif
+#if X64_ORDER_WC
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace wsl {
+namespace atomic {
+
+/// @brief: Translate a C++11 memory order into the matching __ATOMIC_* flag.
+/// When ALWAYS_CONSERVATIVE or X64_ORDER_WC is enabled the builtin is always
+/// issued relaxed; ordering is then provided by PreFence/PostFence instead.
+/// @param: order(Input), C++11 memory order to translate.
+/// @return: int, builtin memory order flag; unknown orders map to seq_cst.
+static constexpr int c11ToBuiltInFlags(std::memory_order order)
+{
+#if ALWAYS_CONSERVATIVE || X64_ORDER_WC
+  return __ATOMIC_RELAXED;
+#else
+  return (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST
+       : (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL
+       : (order == std::memory_order_release) ? __ATOMIC_RELEASE
+       : (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE
+       : (order == std::memory_order_consume) ? __ATOMIC_CONSUME
+       : (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED
+       : __ATOMIC_SEQ_CST;
+#endif
+}
+
+/// @brief: Fence issued before an atomic operation. Release, seq_cst and
+/// acq_rel orders get a fence; every other order is a no-op. Without
+/// ALWAYS_CONSERVATIVE or X64_ORDER_WC the function does nothing.
+/// @param: order(Input), memory order requested for the operation.
+static __forceinline void PreFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  if (order == std::memory_order_release || order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel) {
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+  }
+#elif X64_ORDER_WC
+  if (order == std::memory_order_release || order == std::memory_order_seq_cst ||
+      order == std::memory_order_acq_rel) {
+    _mm_sfence();
+  }
+#endif
+}
+
+/// @brief: Fence issued after an atomic operation. Acquire, acq_rel and
+/// seq_cst orders get a fence; every other order is a no-op. Without
+/// ALWAYS_CONSERVATIVE or X64_ORDER_WC the function does nothing.
+/// @param: order(Input), memory order requested for the operation.
+static __forceinline void PostFence(std::memory_order order) {
+#if ALWAYS_CONSERVATIVE
+  if (order == std::memory_order_seq_cst || order == std::memory_order_acq_rel ||
+      order == std::memory_order_acquire) {
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+  }
+#elif X64_ORDER_WC
+  if (order == std::memory_order_seq_cst) {
+    _mm_mfence();
+  } else if (order == std::memory_order_acq_rel || order == std::memory_order_acquire) {
+    _mm_lfence();
+  }
+#endif
+}
+
+/// @brief: Stand-alone memory fence for the requested order.
+/// ALWAYS_CONSERVATIVE always issues a seq_cst thread fence. X64_ORDER_WC
+/// picks mfence (seq_cst, acq_rel), lfence (acquire) or sfence (release) and
+/// does nothing for other orders. Otherwise std::atomic_thread_fence is used.
+/// @param: order(Input), memory order of the fence, seq_cst by default.
+static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) {
+#if ALWAYS_CONSERVATIVE
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif X64_ORDER_WC
+  if (order == std::memory_order_seq_cst || order == std::memory_order_acq_rel) {
+    _mm_mfence();
+  } else if (order == std::memory_order_acquire) {
+    _mm_lfence();
+  } else if (order == std::memory_order_release) {
+    _mm_sfence();
+  }
+#else
+  std::atomic_thread_fence(order);
+#endif
+}
+
+/// @brief: Compile-time check that objects of sizeof(T) are always lock free.
+/// @param: ptr(Input), unused; only supplies the type T.
+template <class T>
+static __forceinline void BasicCheck(const T* ptr) {
+  static_assert(__atomic_always_lock_free(sizeof(T), 0),
+                "Atomic type may not be compatible with peripheral atomics.");
+}
+
+/// @brief: Volatile-pointer overload of the compile-time lock-free check for
+/// objects of sizeof(T).
+/// @param: ptr(Input), unused; only supplies the type T.
+template <class T>
+static __forceinline void BasicCheck(const volatile T* ptr) {
+  static_assert(__atomic_always_lock_free(sizeof(T), 0),
+                "Atomic type may not be compatible with peripheral atomics.");
+}
+
+/// @brief: Atomically read an object of type T using the given memory order.
+/// @param: ptr(Input), address of the object to read.
+/// @param: order(Input), memory order for the load, relaxed by default.
+/// @return: T, the value that was read.
+template <class T>
+static __forceinline T
+    Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T loaded;
+  PreFence(order);
+  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return loaded;
+}
+
+/// @brief: Volatile overload; atomically read an object of type T using the
+/// given memory order.
+/// @param: ptr(Input), address of the volatile object to read.
+/// @param: order(Input), memory order for the load, relaxed by default.
+/// @return: T, the value that was read.
+template <class T>
+static __forceinline T
+    Load(const volatile T* ptr,
+         std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T loaded;
+  PreFence(order);
+  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return loaded;
+}
+
+/// @brief: Atomically write a value of type T using the given memory order.
+/// @param: ptr(Input), address of the object to overwrite.
+/// @param: val(Input), value to write.
+/// @param: order(Input), memory order for the store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void
+    Store(T* ptr, T val,
+          std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+
+  // Fences bracket the builtin store.
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Volatile overload; atomically write a value of type T using the
+/// given memory order.
+/// @param: ptr(Input), address of the volatile object to overwrite.
+/// @param: val(Input), value to write.
+/// @param: order(Input), memory order for the store, relaxed by default.
+/// @return: void.
+template <class T>
+static __forceinline void
+    Store(volatile T* ptr, T val,
+          std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+
+  // Fences bracket the builtin store.
+  PreFence(order);
+  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
+  PostFence(order);
+}
+
+/// @brief: Strong compare-and-swap: writes val to *ptr only if *ptr equals
+/// expected. The failure memory order is always relaxed.
+/// @param: ptr(Input), address of the object operated on.
+/// @param: val(Input), value to write when the comparison succeeds.
+/// @param: expected(Input), value *ptr is compared against.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed in *ptr (equals expected on success).
+template <class T>
+static __forceinline T
+    Cas(T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+
+  PreFence(order);
+  // On failure the builtin writes the observed value back into expected.
+  __atomic_compare_exchange(ptr, &expected, &val, /*weak=*/false,
+                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+
+  return expected;
+}
+
+/// @brief: Volatile overload of the strong compare-and-swap: writes val to
+/// *ptr only if *ptr equals expected. The failure memory order is always
+/// relaxed.
+/// @param: ptr(Input), address of the volatile object operated on.
+/// @param: val(Input), value to write when the comparison succeeds.
+/// @param: expected(Input), value *ptr is compared against.
+/// @param: order(Input), memory order on success, relaxed by default.
+/// @return: T, the value observed in *ptr (equals expected on success).
+template <class T>
+static __forceinline T
+    Cas(volatile T* ptr, T val, T expected,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+
+  PreFence(order);
+  // On failure the builtin writes the observed value back into expected.
+  __atomic_compare_exchange(ptr, &expected, &val, /*weak=*/false,
+                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
+  PostFence(order);
+
+  return expected;
+}
+
+/// @brief: Atomically replace the value at ptr and hand back the old one.
+/// @param: ptr(Input), address of the object operated on.
+/// @param: val(Input), value to write.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value held before the exchange.
+template <class T>
+static __forceinline T
+    Exchange(T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T previous;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload; atomically replace the value at ptr and hand
+/// back the old one.
+/// @param: ptr(Input), address of the volatile object operated on.
+/// @param: val(Input), value to write.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value held before the exchange.
+template <class T>
+static __forceinline T
+    Exchange(volatile T* ptr, T val,
+             std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  T previous;
+  PreFence(order);
+  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-add using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: val(Input), amount to add.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the addition.
+template <class T>
+static __forceinline T
+    Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-subtract using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: val(Input), amount to subtract.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the subtraction.
+template <class T>
+static __forceinline T
+    Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-AND using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: val(Input), mask that is ANDed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-OR using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: val(Input), mask that is ORed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomic fetch-and-XOR using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: val(Input), mask that is XORed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically add one to the variable using the given memory order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the increment.
+template <class T>
+static __forceinline T
+    Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Atomically subtract one from the variable using the given memory
+/// order.
+/// @param: ptr(Input), address of the variable operated on.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the decrement.
+template <class T>
+static __forceinline T
+    Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload of the atomic fetch-and-add.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: val(Input), amount to add.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the addition.
+template <class T>
+static __forceinline T
+    Add(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload of the atomic fetch-and-subtract.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: val(Input), amount to subtract.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the subtraction.
+template <class T>
+static __forceinline T
+    Sub(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload of the atomic fetch-and-AND.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: val(Input), mask that is ANDed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    And(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload of the atomic fetch-and-OR.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: val(Input), mask that is ORed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    Or(volatile T* ptr, T val,
+       std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload of the atomic fetch-and-XOR.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: val(Input), mask that is XORed into the variable.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the operation.
+template <class T>
+static __forceinline T
+    Xor(volatile T* ptr, T val,
+        std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload; atomically add one to the variable using the
+/// given memory order.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the increment.
+template <class T>
+static __forceinline T
+    Increment(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+
+/// @brief: Volatile overload; atomically subtract one from the variable using
+/// the given memory order.
+/// @param: ptr(Input), address of the volatile variable operated on.
+/// @param: order(Input), memory order, relaxed by default.
+/// @return: T, the value of the variable before the decrement.
+template <class T>
+static __forceinline T
+    Decrement(volatile T* ptr,
+              std::memory_order order = std::memory_order_relaxed) {
+  BasicCheck<T>(ptr);
+  PreFence(order);
+  const T previous = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
+  PostFence(order);
+  return previous;
+}
+}   //  namespace atomic
+}   //  namespace wsl
+
+#ifdef X64_ORDER_WC
+#undef X64_ORDER_WC
+#endif
+
+#ifdef ALWAYS_CONSERVATIVE
+#undef ALWAYS_CONSERVATIVE
+#endif
+
+#endif  // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h
new file mode 100644
index 0000000000..b5817af40d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h
@@ -0,0 +1,155 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+
+#include <memory>
+#include <utility>
+#include <functional>
+
+#include "core/util/locks.h"
+#include "core/util/utils.h"
+
+namespace wsl {
+
+/*
+ * Wrapper for a std::unique_ptr that initializes its object at first use.
+ *
+ * The object is produced by a stored factory function the first time the
+ * pointer is dereferenced (or touched). Once the factory has run it is
+ * cleared, so a null factory means "construction has been attempted".
+ */
+template <typename T> class lazy_ptr {
+ public:
+  // Creates an empty pointer with no factory.
+  lazy_ptr() {}
+
+  // Creates a pointer whose object will be produced by Constructor on first use.
+  explicit lazy_ptr(std::function<T*()> Constructor) { reset(Constructor); }
+
+  // Takes over the object and factory of rhs. The lock is not transferred.
+  lazy_ptr(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+  }
+
+  // Takes over the object and factory of rhs. The lock is not transferred.
+  lazy_ptr& operator=(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+    // Falling off the end of a non-void function is undefined behaviour.
+    return *this;
+  }
+
+  lazy_ptr(lazy_ptr&) = delete;
+  lazy_ptr& operator=(lazy_ptr&) = delete;
+
+  // Drops the current object (if any) and installs a new factory.
+  void reset(std::function<T*()> Constructor = nullptr) {
+    obj.reset();
+    func = Constructor;
+  }
+
+  // Adopts ptr as the object and clears the factory.
+  void reset(T* ptr) {
+    obj.reset(ptr);
+    func = nullptr;
+  }
+
+  // Comparisons look at the current object only; they never trigger construction.
+  bool operator==(T* rhs) const { return obj.get() == rhs; }
+  bool operator!=(T* rhs) const { return obj.get() != rhs; }
+
+  // Constructs the object if needed (blocking) and asserts that it exists.
+  const std::unique_ptr<T>& operator->() const {
+    make(true);
+    assert(obj != nullptr && "Null dereference through lazy_ptr.");
+    return obj;
+  }
+
+  // Constructs the object if needed (blocking) and returns the owning pointer,
+  // which may still be null if the factory returned nullptr.
+  std::unique_ptr<T>& operator*() {
+    make(true);
+    return obj;
+  }
+
+  const std::unique_ptr<T>& operator*() const {
+    make(true);
+    return obj;
+  }
+
+  /*
+   * Ensures that the object is created or is being created.
+   * This is useful when early construction of the object is required.
+   */
+  void touch() const { make(false); }
+
+  // Tells if the lazy object has been constructed or not.
+  // Construction may fail silently (return nullptr).
+  bool created() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return func == nullptr;
+  }
+
+  // Tells if the lazy object exists or not.
+  bool empty() const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return obj == nullptr;
+  }
+
+ private:
+  // All members are mutable so that const accessors can construct on demand.
+  mutable std::unique_ptr<T> obj;
+  mutable std::function<T*(void)> func;
+  mutable KernelMutex lock;
+
+  // Separated from make to improve inlining.
+  // With block == false the call gives up when the lock is already held.
+  void make_body(bool block) const {
+    if (block) {
+      lock.Acquire();
+    } else if (!lock.Try()) {
+      return;
+    }
+    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    // Re-check under the lock: the factory may have run in the meantime.
+    if (func == nullptr) return;
+    T* ptr = func();
+    obj.reset(ptr);
+    // Publish the object before the cleared factory marks it as created.
+    std::atomic_thread_fence(std::memory_order_release);
+    func = nullptr;
+  }
+
+  __forceinline void make(bool block) const {
+    if (!created()) {
+      make_body(block);
+    }
+  }
+
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp
new file mode 100644
index 0000000000..020ca10b28
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp
@@ -0,0 +1,769 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __linux__
+#include "core/util/os.h"
+#include "core/util/utils.h"
+
+#include <link.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sched.h>
+#include <sys/sysinfo.h>
+#include <sys/time.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <errno.h>
+#include <cstring>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+#include <semaphore.h>
+#include "core/inc/runtime.h"
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
+namespace wsl {
+namespace os {
+
+// Bundle handed to ThreadTrampoline through pthread_create's void* argument.
+struct ThreadArgs {
+  // Opaque argument passed to entry_function.
+  void* entry_args;
+  // Function the new thread runs.
+  ThreadEntry entry_function;
+};
+
+// pthread start routine: unpacks the ThreadArgs bundle passed in arg, deletes
+// it, then calls the entry function with its argument. Always returns nullptr.
+void* __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* bundle = static_cast<ThreadArgs*>(arg);
+
+  // Copy both fields out so the bundle can be released before the entry runs.
+  void* const payload = bundle->entry_args;
+  const ThreadEntry entry = bundle->entry_function;
+  delete bundle;
+
+  entry(payload);
+  return nullptr;
+}
+
+// Thread container allows multiple waits and separate close (destroy).
+//
+// The constructor creates the mutex and the pthread. On any failure the object
+// is left with thread == 0 (or lock == nullptr) so that Valid() reports false.
+// The destructor detaches a thread that was never successfully joined.
+class os_thread {
+ public:
+  // function:       entry point run on the new thread (via ThreadTrampoline).
+  // threadArgument: opaque pointer handed to function.
+  // stackSize:      requested stack size in bytes; 0 keeps the default. The
+  //                 value is raised to at least PTHREAD_STACK_MIN and aligned
+  //                 up to 4096 before use.
+  explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize)
+      : thread(0), lock(nullptr), state(RUNNING) {
+    int err;
+    std::unique_ptr<ThreadArgs> args(new ThreadArgs);
+    lock = CreateMutex();
+    if (lock == nullptr) return;
+
+    args->entry_args = threadArgument;
+    args->entry_function = function;
+
+    pthread_attr_t attrib;
+    err = pthread_attr_init(&attrib);
+    if (err != 0) {
+      pr_err("pthread_attr_init failed: %s\n", strerror(err));
+      return;
+    }
+
+    // From here on attrib is initialized: every exit path must destroy it
+    // exactly once and must not touch it afterwards.
+    auto destroy_attrib = [&attrib]() {
+      int rc = pthread_attr_destroy(&attrib);
+      if (rc != 0) {
+        pr_err("pthread_attr_destroy failed: %s\n", strerror(rc));
+      }
+    };
+
+    if (stackSize != 0) {
+      stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize);
+      stackSize = AlignUp(stackSize, 4096);
+      err = pthread_attr_setstacksize(&attrib, stackSize);
+      if (err != 0) {
+        pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+        // Bail out unconditionally; continuing would reuse a destroyed attrib.
+        destroy_attrib();
+        return;
+      }
+    }
+
+    int cores = 0;
+    cpu_set_t* cpuset = nullptr;
+
+    if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) {
+      // Allow the new thread to run on every configured processor.
+      cores = get_nprocs_conf();
+      cpuset = CPU_ALLOC(cores);
+      if (cpuset == nullptr) {
+        pr_err("CPU_ALLOC failed: %s\n", strerror(errno));
+        destroy_attrib();
+        return;
+      }
+      CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset);
+      for (int i = 0; i < cores; i++) {
+        CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset);
+      }
+      err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset);
+      CPU_FREE(cpuset);
+      if (err != 0) {
+        pr_err("pthread_setaffinity_np failed: %s\n", strerror(err));
+        destroy_attrib();
+        return;
+      }
+    }
+
+    err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
+
+    // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN
+    // Attempt to grow the stack within reason.
+    if ((err == EINVAL) && stackSize != 0) {
+      while (stackSize < 20 * 1024 * 1024) {
+        stackSize *= 2;
+        err = pthread_attr_setstacksize(&attrib, stackSize);
+        if (err != 0) {
+          pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
+          // No thread was created; make sure Valid() reports failure.
+          thread = 0;
+          destroy_attrib();
+          return;
+        }
+        err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
+        if (err != EINVAL) break;
+        pr_debug("pthread_create returned EINVAL, doubling stack size\n");
+      }
+    }
+
+    // On success the new thread owns (and deletes) the argument bundle.
+    if (err == 0)
+      args.release();
+    else
+      thread = 0;
+
+    destroy_attrib();
+  }
+
+  // Transfers the thread handle and mutex; rhs is left without either.
+  os_thread(os_thread&& rhs) {
+    thread = rhs.thread;
+    lock = rhs.lock;
+    state = int(rhs.state);
+    rhs.thread = 0;
+    rhs.lock = nullptr;
+  }
+
+  os_thread(os_thread&) = delete;
+
+  ~os_thread() {
+    if (lock != nullptr) DestroyMutex(lock);
+    // A thread that was never joined is detached so its resources are reclaimed.
+    if ((state == RUNNING) && (thread != 0)) {
+      int err = pthread_detach(thread);
+      if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err));
+    }
+  }
+
+  // True when both the mutex and the thread were created.
+  bool Valid() { return (lock != nullptr) && (thread != 0); }
+
+  // Joins the thread; may be called repeatedly. Returns true once the thread
+  // has been joined, false if pthread_join failed.
+  bool Wait() {
+    if (state == FINISHED) return true;
+    AcquireMutex(lock);
+    // Re-check under the lock: another waiter may have joined already.
+    if (state == FINISHED) {
+      ReleaseMutex(lock);
+      return true;
+    }
+    int err = pthread_join(thread, NULL);
+    bool success = (err == 0);
+    if (success) state = FINISHED;
+    ReleaseMutex(lock);
+    return success;
+  }
+
+ private:
+  pthread_t thread;
+  Mutex lock;
+  std::atomic<int> state;
+  enum { FINISHED = 0, RUNNING = 1 };
+};
+
+// Compile-time checks that each opaque OS abstraction handle has the same size
+// as the native pointer type it is compared against below.
+static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch");
+static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch");
+
+// Opens the shared library named by filename with dlopen(RTLD_LAZY) and returns
+// the resulting handle reinterpreted as a LibHandle. A failure is logged and
+// yields the null handle.
+LibHandle LoadLib(std::string filename) {
+  void* handle = dlopen(filename.c_str(), RTLD_LAZY);
+  if (handle == nullptr) {
+    pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror());
+  }
+  return *reinterpret_cast<LibHandle*>(&handle);
+}
+
+// Resolves export_name in lib and returns its address, or nullptr when the
+// symbol is missing, when dlinfo/dladdr fail, or when the symbol was found in
+// a different object than lib itself.
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  void* const native_lib = *reinterpret_cast<void**>(&lib);
+
+  void* symbol = dlsym(native_lib, export_name.c_str());
+  if (symbol == nullptr) return nullptr;
+
+  // dlsym also searches the load dependencies of the library. The checks below
+  // accept the symbol only if it lives in the library that was asked for,
+  // which matches the lookup pattern on Windows.
+  link_map* owner = nullptr;
+  if (dlinfo(native_lib, RTLD_DI_LINKMAP, &owner) == -1) {
+    pr_err("dlinfo failed: %s\n", dlerror());
+    return nullptr;
+  }
+
+  Dl_info located;
+  if (dladdr(symbol, &located) == 0) {
+    pr_err("dladdr failed.\n");
+    return nullptr;
+  }
+
+  // Same file name means the symbol really comes from the requested library.
+  const bool same_object = (strcmp(located.dli_fname, owner->l_name) == 0);
+  return same_object ? symbol : nullptr;
+}
+
+// Releases a library handle through dlclose(); the result is not inspected.
+void CloseLib(LibHandle lib) {
+  void* handle = *(void**)&lib;
+  dlclose(handle);
+}
+
+/*
+ * @brief Callback for dl_iterate_phdr(): scans the dynamic string table of one
+ * loaded shared object for the string "HSA_AMD_TOOL_PRIORITY" and, if found,
+ * stores the name of the library.
+ *
+ * @param[in]: info A dl_phdr_info struct pointer, which contains information
+ * about library's load address, header, and name.
+ *
+ * @param[in]: size integer size of dl_phdr_info struct (not used here)
+ *
+ * @param[in,out]: data the data argument given to dl_iterate_phdr(); it is
+ * interpreted as a std::vector<std::string>* that receives the names of
+ * matching libraries
+ *
+ * @retval:: Always 0, so the iteration continues with the next object. If a
+ * callback returns a non-zero value, dl_iterate_phdr() will stop processing,
+ * even if there are unprocessed shared objects.
+ */
+
+static int callback(struct dl_phdr_info* info, size_t size, void* data) {
+  std::vector<std::string>* loadedToolsLib = (std::vector<std::string>*)data;
+  assert(loadedToolsLib != nullptr);
+  /*
+   * Check if lib name is not empty and it's not a "vdso.so" lib,
+   * The vDSO is a special shared object file that is built into the Linux kernel.
+   * It is not a regular shared library and thus does not have all the properties
+   * of regular shared libraries. The way the vDSO is loaded and organized in memory
+   * is different from regular shared libraries and it's not guaranteed that it
+   * will have a specific segment or section. Hence it's skipped.
+   */
+
+  if ((info) && (info->dlpi_name[0] != '\0')) {
+    if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0;
+
+    /*
+     * Iterate through the program headers of the loaded lib and check for PT_DYNAMIC program
+     * header. If the PT_DYNAMIC program header is found, use dlpi_addr and dlpi_phdr members
+     * of dl_phdr_info struct to get the address of the dynamic section of the loaded
+     * library in memory
+     */
+
+    for (int i = 0; i < info->dlpi_phnum; i++) {
+      if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) {
+        Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
+
+        char* strings = nullptr;
+        Elf64_Xword limit = 0;
+
+        /*
+         * The dynamic section is searched for DT_STRTAB (address of string table),
+         * and DT_STRSZ (size of string table)
+         * DT_NULL - Marks the end of the _DYNAMIC array
+         *
+         * NOTE(review): the DT_STRTAB value is used directly as a pointer, without
+         * adding dlpi_addr - presumably the loader has already relocated this
+         * entry; confirm for the supported C libraries.
+         */
+
+        for (int j = 0;; j++) {
+          if (dyn_section[j].d_tag == DT_NULL) break;
+
+          if (dyn_section[j].d_tag == DT_STRTAB) strings = (char*)(dyn_section[j].d_un.d_ptr);
+
+          if (dyn_section[j].d_tag == DT_STRSZ) limit = dyn_section[j].d_un.d_val;
+        }
+
+        if (strings == nullptr) pr_debug("String table not found\n");
+
+        /*
+         * Hacky lookup: if the string table is found, iterate through the
+         * NUL-terminated strings in it (up to DT_STRSZ bytes) and check if
+         * any string matches "HSA_AMD_TOOL_PRIORITY".
+         * If yes, then add the name of the library to the vector of
+         * lib names and stop scanning this library.
+         */
+        if (strings != nullptr) {
+          char* end = strings + limit;
+          while (strings < end) {
+            if (strcmp(strings, "HSA_AMD_TOOL_PRIORITY") == 0) {
+              loadedToolsLib->push_back(info->dlpi_name);
+              return 0;
+            }
+            strings += (strlen(strings) + 1);
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// Collects the names of all loaded shared objects that contain the
+// "HSA_AMD_TOOL_PRIORITY" string (see callback) and opens a handle to each of
+// them with LoadLib(). One handle is returned per collected name, in the
+// order the names were collected.
+std::vector<LibHandle> GetLoadedToolsLib() {
+  /* Iterate through all of the loaded shared libraries in the process */
+  std::vector<std::string> tool_lib_names;
+  dl_iterate_phdr(callback, &tool_lib_names);
+
+  std::vector<LibHandle> handles;
+  for (auto& lib_name : tool_lib_names) {
+    handles.push_back(LoadLib(lib_name));
+  }
+  return handles;
+}
+
+// Returns the file name recorded in the link map of `lib`, or an empty string
+// when dlinfo() cannot provide the link map.
+std::string GetLibraryName(LibHandle lib) {
+  link_map* map;
+  const int err = dlinfo(lib, RTLD_DI_LINKMAP, &map);
+  if (err != 0) {
+    return std::string("");
+  }
+  return std::string(map->l_name);
+}
+
+// Allocates an unnamed, process-private semaphore with an initial count of 0.
+//
+// Returns the semaphore handle, or null when sem_init() fails (the failure is
+// logged and the allocation is released), so callers never receive a handle
+// to an uninitialized sem_t.
+Semaphore CreateSemaphore() {
+  sem_t* sem = new sem_t;
+  if (sem_init(sem, 0, 0) != 0) {
+    pr_err("sem_init failed: %s\n", strerror(errno));
+    delete sem;
+    return nullptr;
+  }
+  return *(Semaphore*)&sem;
+}
+
+// Blocks until the semaphore can be decremented. Waits interrupted by a signal
+// (EINTR) are restarted; any other sem_wait() failure returns false.
+bool WaitSemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  for (;;) {
+    if (sem_wait(native) == 0) return true;
+    if (errno != EINTR) return false;
+  }
+}
+
+// Increments the semaphore. A sem_post() failure trips an assertion; with
+// assertions compiled out it is ignored.
+void PostSemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  if (sem_post(native) != 0) {
+    assert(false && "Failed to post semaphore");
+  }
+}
+
+// Destroys the semaphore and frees the sem_t allocated by CreateSemaphore().
+void DestroySemaphore(Semaphore sem) {
+  sem_t* native = *(sem_t**)&sem;
+  sem_destroy(native);
+  delete native;
+}
+
+// Allocates a pthread mutex with default attributes.
+//
+// Returns the mutex handle, or null when pthread_mutex_init() fails (the
+// failure is logged and the allocation is released), so callers never receive
+// a handle to an uninitialized mutex.
+Mutex CreateMutex() {
+  pthread_mutex_t* mutex = new pthread_mutex_t;
+  int err = pthread_mutex_init(mutex, NULL);
+  if (err != 0) {
+    pr_err("pthread_mutex_init failed: %s\n", strerror(err));
+    delete mutex;
+    return nullptr;
+  }
+  return *(Mutex*)&mutex;
+}
+
+// Attempts to lock the mutex without blocking; true when the lock was taken.
+bool TryAcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  const int err = pthread_mutex_trylock(native);
+  return err == 0;
+}
+
+// Locks the mutex, blocking if necessary; true when pthread_mutex_lock()
+// reports success.
+bool AcquireMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  const int err = pthread_mutex_lock(native);
+  return err == 0;
+}
+
+// Unlocks the mutex; the result of pthread_mutex_unlock() is not inspected.
+void ReleaseMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_unlock(native);
+}
+
+// Destroys the mutex and frees the pthread_mutex_t allocated by CreateMutex().
+void DestroyMutex(Mutex lock) {
+  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
+  pthread_mutex_destroy(native);
+  delete native;
+}
+
+// Suspends the calling thread for `delay_in_millisec` milliseconds.
+//
+// The delay is split into seconds and nanoseconds for nanosleep() instead of
+// being multiplied into microseconds: `delay_in_millisec * 1000` overflows int
+// for long delays, and POSIX only specifies usleep() for arguments below
+// 1,000,000. Zero or negative delays return immediately. As before, a sleep
+// interrupted by a signal is not resumed.
+void Sleep(int delay_in_millisec) {
+  if (delay_in_millisec <= 0) return;
+
+  struct timespec ts;
+  ts.tv_sec = delay_in_millisec / 1000;
+  ts.tv_nsec = static_cast<long>(delay_in_millisec % 1000) * 1000000L;
+  nanosleep(&ts, nullptr);
+}
+
+// Suspends the calling thread for `delayInUs` microseconds via usleep().
+void uSleep(int delayInUs) {
+  usleep(delayInUs);
+}
+
+// Gives up the processor for the calling thread via sched_yield().
+void YieldThread() {
+  sched_yield();
+}
+
+// Starts a new thread running `function(threadArgument)` with the requested
+// stack size. Returns the thread handle, or null when the os_thread object
+// reports itself as not valid (it is deleted in that case).
+Thread CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) {
+  os_thread* created = new os_thread(function, threadArgument, stackSize);
+  if (created->Valid()) {
+    return reinterpret_cast<Thread>(created);
+  }
+
+  delete created;
+  return nullptr;
+}
+
+// Deletes the os_thread object behind the handle.
+void CloseThread(Thread thread) {
+  os_thread* native = reinterpret_cast<os_thread*>(thread);
+  delete native;
+}
+
+// Waits for the thread behind the handle to finish; returns the result of
+// os_thread::Wait().
+bool WaitForThread(Thread thread) {
+  os_thread* native = reinterpret_cast<os_thread*>(thread);
+  return native->Wait();
+}
+
+// Waits for every thread in `threads[0 .. threadCount)`.
+//
+// All threads are waited on even if an earlier wait fails. Returns true only
+// when every WaitForThread() call succeeded, so a failed join is reported to
+// the caller instead of being silently dropped.
+bool WaitForAllThreads(Thread* threads, uint threadCount) {
+  bool all_joined = true;
+  for (uint i = 0; i < threadCount; i++) {
+    if (!WaitForThread(threads[i])) all_joined = false;
+  }
+  return all_joined;
+}
+
+// True when the environment contains `env_var_name` (getenv() returned a
+// non-null pointer), regardless of its value.
+bool IsEnvVarSet(std::string env_var_name) {
+  const char* value = getenv(env_var_name.c_str());
+  return value != NULL;
+}
+
+// Sets `env_var_name` to `env_var_value`, replacing any existing value
+// (overwrite flag 1). The result of setenv() is not inspected.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  const char* name = env_var_name.c_str();
+  const char* value = env_var_value.c_str();
+  setenv(name, value, 1);
+}
+
+// Returns the id of the calling process as reported by getpid().
+int GetProcessId() {
+  const int pid = ::getpid();
+  return pid;
+}
+
+// Returns the value of `env_var_name`, or an empty string when the variable
+// is not present in the environment.
+std::string GetEnvVar(std::string env_var_name) {
+  const char* value = getenv(env_var_name.c_str());
+  if (value == nullptr) {
+    return std::string();
+  }
+  return std::string(value);
+}
+
+// Returns the size of the user-mode virtual address range: 2^47 bytes on
+// LP64 builds, 0xffffffff bytes otherwise.
+size_t GetUserModeVirtualMemorySize() {
+#ifdef _LP64
+  // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt :
+  // user space is 0000000000000000 - 00007fffffffffff (=47 bits)
+  const size_t user_space_bytes = (size_t)(0x800000000000);
+#else
+  const size_t user_space_bytes = (size_t)(0xffffffff);  // ~4GB
+#endif
+  return user_space_bytes;
+}
+
+// Returns the total RAM reported by sysinfo() (totalram * mem_unit), capped at
+// the user-mode virtual address space size. Returns 0 when sysinfo() fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  struct sysinfo info = {0};
+  const bool query_ok = (sysinfo(&info) == 0);
+  if (!query_ok) return 0;
+
+  const size_t physical_size =
+      static_cast<size_t>(info.totalram * info.mem_unit);
+  const size_t virtual_size = GetUserModeVirtualMemorySize();
+  return std::min(virtual_size, physical_size);
+}
+
+// Returns the base of the user-mode virtual address range, which is 0 here.
+uintptr_t GetUserModeVirtualMemoryBase() {
+  return (uintptr_t)0;
+}
+
+// Os event implementation
+// State behind an EventHandle: a boolean flag guarded by a mutex, plus a
+// condition variable that waiters block on.
+typedef struct EventDescriptor_ {
+  pthread_cond_t event;   // signaled by SetOsEvent(), waited on by WaitForOsEvent()
+  pthread_mutex_t mutex;  // guards `state`
+  bool state;             // true while the event is signaled
+  bool auto_reset;        // when true, a successful wait clears `state`
+} EventDescriptor;
+
+// Creates an event object.
+//
+// @param auto_reset When true, a successful wait clears the event again.
+// @param init_state Initial signaled state of the event.
+// @return The event handle, or NULL when the descriptor cannot be allocated
+//         or its mutex / condition variable cannot be initialized. Partially
+//         constructed descriptors are released before returning.
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  EventDescriptor* eventDescrp =
+      static_cast<EventDescriptor*>(malloc(sizeof(EventDescriptor)));
+  if (eventDescrp == NULL) {
+    return NULL;
+  }
+
+  if (pthread_mutex_init(&eventDescrp->mutex, NULL) != 0) {
+    free(eventDescrp);
+    return NULL;
+  }
+  if (pthread_cond_init(&eventDescrp->event, NULL) != 0) {
+    pthread_mutex_destroy(&eventDescrp->mutex);
+    free(eventDescrp);
+    return NULL;
+  }
+
+  eventDescrp->auto_reset = auto_reset;
+  eventDescrp->state = init_state;
+
+  return reinterpret_cast<EventHandle>(eventDescrp);
+}
+
+// Destroys the condition variable and mutex of an event and frees its
+// descriptor. Returns -1 for a NULL handle, otherwise the bitwise OR of the
+// two pthread destroy results (0 when both succeeded).
+int DestroyOsEvent(EventHandle event) {
+  if (event == NULL) return -1;
+
+  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
+  const int cond_err = pthread_cond_destroy(&descriptor->event);
+  const int mutex_err = pthread_mutex_destroy(&descriptor->mutex);
+  free(descriptor);
+  return cond_err | mutex_err;
+}
+
+// Waits until the event is signaled or the timeout expires.
+//
+// @param event         Handle returned by CreateOsEvent().
+// @param milli_seconds Maximum wait in milliseconds; 0 polls without blocking.
+// @return -1 for a NULL handle; 0 when the event was signaled (an auto-reset
+//         event is cleared before returning); 1 when polling found the event
+//         unsignaled or its mutex busy; 0x14003 when the timed wait expired;
+//         any other pthread_cond_timedwait() error code unchanged.
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+
+  // A zero timeout must not block on the mutex either. Remember whether the
+  // trylock already took the mutex: locking it a second time from the same
+  // thread would deadlock on a default-attribute mutex.
+  bool locked = false;
+  if (milli_seconds == 0) {
+    int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex);
+    if (tmp_ret == EBUSY) {
+      // Timeout
+      return 1;
+    }
+    locked = (tmp_ret == 0);
+  }
+  if (!locked) pthread_mutex_lock(&eventDescrp->mutex);
+
+  int ret_code = 0;
+  if (!eventDescrp->state) {
+    if (milli_seconds == 0) {
+      ret_code = 1;
+    } else {
+      // Build the absolute deadline: now + milli_seconds.
+      struct timespec ts;
+      struct timeval tp;
+
+      gettimeofday(&tp, NULL);
+      ts.tv_sec = tp.tv_sec;
+      ts.tv_nsec = tp.tv_usec * 1000;
+
+      unsigned int sec = milli_seconds / 1000;
+      unsigned int mSec = milli_seconds % 1000;
+
+      ts.tv_sec += sec;
+      ts.tv_nsec += mSec * 1000000;
+
+      // tv_nsec must stay below one second; carry the overflow into tv_sec.
+      if (ts.tv_nsec >= 1000000000) {
+        ts.tv_sec += 1;
+        ts.tv_nsec = ts.tv_nsec - 1000000000;
+      }
+
+      // Condition waits may wake up spuriously, so keep waiting against the
+      // same deadline until the event is really signaled or the wait fails.
+      while (!eventDescrp->state && ret_code == 0) {
+        ret_code =
+            pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts);
+      }
+
+      // Time out
+      if (ret_code == ETIMEDOUT) {
+        ret_code = 0x14003;  // value reported to callers for a timed-out wait
+      }
+
+      if (ret_code == 0 && eventDescrp->auto_reset) {
+        eventDescrp->state = false;
+      }
+    }
+  } else if (eventDescrp->auto_reset) {
+    eventDescrp->state = false;
+  }
+  pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Puts the event into the signaled state and wakes waiters.
+//
+// An auto-reset event wakes a single waiter (the first successful wait clears
+// the state again). A manual-reset event stays signaled, so every waiter is
+// woken; waking only one would leave the others blocked until their timeout.
+//
+// @return -1 for a NULL handle, otherwise the bitwise OR of the pthread
+//         lock / unlock / wake results (0 when all succeeded).
+int SetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  int ret_code = pthread_mutex_lock(&eventDescrp->mutex);
+  eventDescrp->state = true;
+  const bool wake_all = !eventDescrp->auto_reset;
+  ret_code |= pthread_mutex_unlock(&eventDescrp->mutex);
+  if (wake_all) {
+    ret_code |= pthread_cond_broadcast(&eventDescrp->event);
+  } else {
+    ret_code |= pthread_cond_signal(&eventDescrp->event);
+  }
+
+  return ret_code;
+}
+
+// Puts the event into the non-signaled state.
+//
+// @return -1 for a NULL handle, otherwise the bitwise OR of the pthread
+//         lock / unlock results (0 when both succeeded), so a failed lock is
+//         no longer hidden by a successful unlock.
+int ResetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  int ret_code = pthread_mutex_lock(&eventDescrp->mutex);
+  eventDescrp->state = false;
+  ret_code |= pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Reciprocal of the clock resolution in nanoseconds; 0.0 until
+// AccurateClockFrequency() has computed it.
+static double invPeriod = 0.0;
+
+// Reads CLOCK_MONOTONIC_RAW and returns the time in clock ticks, i.e. the
+// nanosecond reading scaled by invPeriod. Aborts when the clock cannot be read.
+uint64_t ReadAccurateClock() {
+  if (invPeriod == 0.0) AccurateClockFrequency();
+
+  timespec now;
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
+    pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno));
+    abort();
+  }
+
+  const uint64_t nanoseconds = uint64_t(now.tv_sec) * 1000000000ull + uint64_t(now.tv_nsec);
+  return nanoseconds * invPeriod;
+}
+
+// Returns the frequency (ticks per second) of the accurate clock, derived
+// from the clock resolution, and caches the reciprocal of the resolution in
+// invPeriod for ReadAccurateClock().
+//
+// The resolution is queried on CLOCK_MONOTONIC_RAW for kernels 4.4 and newer
+// and on CLOCK_MONOTONIC otherwise (or when the release string cannot be
+// parsed). Aborts when the resolution cannot be read or is one second or more.
+uint64_t AccurateClockFrequency() {
+  static clockid_t clock = CLOCK_MONOTONIC;
+  static std::atomic<bool> first(true);
+  // Check kernel version - not a concurrency concern.
+  // use non-RAW for getres due to bug in older 2.6.x kernels
+  if (first.load(std::memory_order_acquire)) {
+    utsname kernelInfo;
+    if (uname(&kernelInfo) == 0) {
+      try {
+        std::string ver = kernelInfo.release;
+        size_t idx;
+        int major = std::stoi(ver, &idx);
+        int minor = std::stoi(ver.substr(idx + 1));
+        // Compare as a (major, minor) pair: testing both components with >=
+        // independently would reject newer kernels such as 5.0 or 6.1.
+        if ((major > 4) || ((major == 4) && (minor >= 4))) {
+          clock = CLOCK_MONOTONIC_RAW;
+        }
+      } catch (...) {
+        // Kernel version string doesn't conform to the standard pattern.
+        // Keep using the "safe" (non-RAW) clock.
+      }
+    }
+    first.store(false, std::memory_order_release);
+  }
+  timespec time;
+  int err = clock_getres(clock, &time);
+  if (err != 0) {
+    pr_err("clock_getres failed %s\n", strerror(errno));
+    abort();
+  }
+  if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) {
+    pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n");
+    abort();
+  }
+  if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
+  return 1000000000ull / uint64_t(time.tv_nsec);
+}
+
+// Creates a reader/writer lock configured to prefer (non-recursive) writers.
+//
+// Returns the lock handle, or null when any pthread call fails; the failure
+// is logged. The attribute object is destroyed on every path after it has
+// been initialized, and the lock allocation is released when
+// pthread_rwlock_init() fails.
+SharedMutex CreateSharedMutex() {
+  pthread_rwlockattr_t attrib;
+  int err = pthread_rwlockattr_init(&attrib);
+  if (err != 0) {
+    pr_err("rw lock attribute init failed: %s\n", strerror(err));
+    return nullptr;
+  }
+
+#ifdef __GLIBC__
+  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#else
+  err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#endif
+  if (err != 0) {
+    pr_err("Set rw lock attribute failure: %s\n", strerror(err));
+    pthread_rwlockattr_destroy(&attrib);
+    return nullptr;
+  }
+
+  pthread_rwlock_t* lock = new pthread_rwlock_t;
+  err = pthread_rwlock_init(lock, &attrib);
+  // The attribute object is no longer needed once init has been attempted.
+  pthread_rwlockattr_destroy(&attrib);
+  if (err != 0) {
+    pr_err("rw lock init failed: %s\n", strerror(err));
+    delete lock;
+    return nullptr;
+  }
+
+  return lock;
+}
+
+// Attempts to take the lock in exclusive (write) mode without blocking.
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_trywrlock(native) == 0;
+}
+
+// Takes the lock in exclusive (write) mode, blocking if necessary; true when
+// pthread_rwlock_wrlock() reports success.
+bool AcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_wrlock(native) == 0;
+}
+
+// Releases a lock held in exclusive mode. An unlock failure is logged and
+// aborts the process.
+void ReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  const int err = pthread_rwlock_unlock(native);
+  if (err == 0) return;
+
+  pr_err("SharedMutex unlock failed: %s\n", strerror(err));
+  abort();
+}
+
+// Attempts to take the lock in shared (read) mode without blocking.
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_tryrdlock(native) == 0;
+}
+
+// Takes the lock in shared (read) mode, blocking if necessary; true when
+// pthread_rwlock_rdlock() reports success.
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  return pthread_rwlock_rdlock(native) == 0;
+}
+
+// Releases a lock held in shared mode. An unlock failure is logged and aborts
+// the process.
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  const int err = pthread_rwlock_unlock(native);
+  if (err == 0) return;
+
+  pr_err("SharedMutex unlock failed: %s\n", strerror(err));
+  abort();
+}
+
+// Destroys the reader/writer lock and frees the pthread_rwlock_t allocated by
+// CreateSharedMutex().
+void DestroySharedMutex(SharedMutex lock) {
+  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
+  pthread_rwlock_destroy(native);
+  delete native;
+}
+
+// Resolution of CLOCK_BOOTTIME in nanoseconds as computed by
+// SystemClockFrequency(); 0 until that function has been called.
+static uint64_t sys_clock_period_ = 0;
+
+// Reads CLOCK_BOOTTIME and returns the time in units of sys_clock_period_
+// nanoseconds.
+//
+// A period of 0 (SystemClockFrequency() not called yet) or 1 leaves the
+// nanosecond reading unscaled; dividing only for periods above 1 avoids a
+// division by zero when this function runs first.
+uint64_t ReadSystemClock() {
+  struct timespec ts;
+  clock_gettime(CLOCK_BOOTTIME, &ts);
+  uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+  if (sys_clock_period_ > 1) {
+    return time / sys_clock_period_;
+  }
+  return time;
+}
+
+// Queries the resolution of CLOCK_BOOTTIME, stores it (in nanoseconds) in
+// sys_clock_period_ and returns the matching frequency in ticks per second.
+//
+// When the resolution cannot be read, or is reported as 0, a period of 1 ns
+// is assumed so that neither this function nor ReadSystemClock() divides by
+// zero or uses an uninitialized timespec.
+uint64_t SystemClockFrequency() {
+  struct timespec ts;
+  uint64_t period = 0;
+  if (clock_getres(CLOCK_BOOTTIME, &ts) == 0) {
+    period = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+  } else {
+    pr_err("clock_getres(CLOCK_BOOTTIME,...) failed %s\n", strerror(errno));
+  }
+  if (period == 0) period = 1;
+
+  sys_clock_period_ = period;
+  return 1000000000 / period;
+}
+
+// Fills `cpuinfo` from the CPUID instruction.
+//
+// On x86 / x86-64 builds the structure is zeroed, the manufacturer ID string
+// is read from leaf 0 and, for "AuthenticAMD" parts, the mwaitx flag is taken
+// from bit 29 of ECX of leaf 0x80000001. Returns false when a CPUID query
+// reports the leaf as unsupported, true otherwise. On other architectures
+// nothing is written and false is returned.
+bool ParseCpuID(cpuid_t* cpuinfo) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t eax, ebx, ecx, edx, max_eax = 0;
+  memset(cpuinfo, 0, sizeof(*cpuinfo));
+
+  /* Make sure current CPU supports at least EAX 4 */
+  // NOTE(review): GCC's <cpuid.h> documents the first argument of
+  // __get_cpuid_max() as either 0 or 0x80000000; passing 0x80000004 looks
+  // unintended - confirm what this check is meant to guarantee.
+  if (!__get_cpuid_max(0x80000004, NULL)) return false;
+
+  // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX.
+  // The registers are written straight into the string buffer: EBX at offset 0,
+  // ECX at offset 8 and EDX at offset 4.
+  if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0],
+                   (uint32_t*)&cpuinfo->ManufacturerID[8],
+                   (uint32_t*)&cpuinfo->ManufacturerID[4])) {
+    return false;
+  }
+
+  // The strcmp relies on the memset above having left a terminating zero
+  // after the twelve ID characters (assumes the buffer is larger than twelve
+  // bytes - TODO confirm against the cpuid_t definition).
+  if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) {
+    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
+      cpuinfo->mwaitx = !!((ecx >> 29) & 0x1);
+    }
+  }
+  return true;
+#else
+  return false;
+#endif
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
new file mode 100644
index 0000000000..a17fa09593
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
@@ -0,0 +1,290 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Library of synchronization primitives - to be added to as needed.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+
+#include "utils.h"
+#include "os.h"
+
+namespace wsl {
+
+/// @brief: a mutex that spins for a bounded number of attempts before
+/// blocking on a semaphore.
+/// A contended Acquire() first retries with a CPU pause hint, then retries
+/// while yielding the thread, and finally waits on the semaphore that
+/// Release() posts.
+class HybridMutex {
+ public:
+  HybridMutex() : lock_(0) {
+    sem_ = os::CreateSemaphore();
+  }
+
+  ~HybridMutex() {
+    os::DestroySemaphore(sem_);
+  }
+
+  /// @brief: Attempts to take the lock once, without waiting.
+  /// @return: bool, true if the lock was acquired.
+  bool Try() {
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+
+  /// @brief: Takes the lock, waiting as long as necessary.
+  /// @return: bool, always true.
+  bool Acquire() {
+    // Number of failed attempts since the last semaphore wait. The first
+    // maxSpinIterPause failures pause, the next maxSpinIterYield failures
+    // yield, and after that the thread blocks. (A single down-counter compared
+    // against both limits never reached the yield phase.)
+    uint32_t spins = 0;
+
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1)) {
+      if (spins < maxSpinIterPause) {
+        _mm_pause();
+        ++spins;
+      } else if (spins < maxSpinIterPause + maxSpinIterYield) {
+        os::YieldThread();
+        ++spins;
+      } else {
+        os::WaitSemaphore(sem_);
+        spins = 0;
+      }
+      // compare_exchange_strong stored the observed value in `old`.
+      old = 0;
+    }
+    return true;
+  }
+
+  /// @brief: Releases the lock and posts the semaphore so that a blocked
+  /// waiter retries. Nothing is posted if the lock was not held.
+  void Release() {
+    int old = 1;
+    if (lock_.compare_exchange_strong(old, 0))
+      os::PostSemaphore(sem_);
+  }
+
+ private:
+  std::atomic<int> lock_;  // 0 = free, 1 = held
+  os::Semaphore sem_;
+  const uint32_t maxSpinIterPause = 55;
+  const uint32_t maxSpinIterYield = 55;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(HybridMutex);
+};
+
+
+/// @brief: a mutex backed by an os::Mutex handle.
+/// Waiting threads are kept off the CPU by the kernel's scheduler until the
+/// lock is released (best for long waits, though anything using a kernel
+/// object is a long wait).
+class KernelMutex {
+ public:
+  KernelMutex() : lock_(os::CreateMutex()) {}
+
+  ~KernelMutex() {
+    os::DestroyMutex(lock_);
+  }
+
+  /// @brief: Attempts to take the lock without waiting.
+  bool Try() {
+    return os::TryAcquireMutex(lock_);
+  }
+
+  /// @brief: Takes the lock, waiting if it is held elsewhere.
+  bool Acquire() {
+    return os::AcquireMutex(lock_);
+  }
+
+  /// @brief: Releases the lock.
+  void Release() {
+    os::ReleaseMutex(lock_);
+  }
+
+ private:
+  os::Mutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
+};
+
+/// @brief: a spin lock built on an atomic flag (0 = free, 1 = held).
+/// Intended for very short hold durations, on the order of the thread
+/// scheduling quanta or less.
+class SpinMutex {
+ public:
+  SpinMutex() : lock_(0) {}
+
+  /// @brief: Attempts to take the lock once, without waiting.
+  bool Try() {
+    int expected = 0;
+    return lock_.compare_exchange_strong(expected, 1);
+  }
+
+  /// @brief: Takes the lock, yielding the thread after every failed attempt.
+  /// @return: bool, always true.
+  bool Acquire() {
+    for (;;) {
+      int expected = 0;
+      if (lock_.compare_exchange_strong(expected, 1)) return true;
+      os::YieldThread();
+    }
+  }
+
+  /// @brief: Releases the lock by storing 0.
+  void Release() {
+    lock_.store(0);
+  }
+
+ private:
+  std::atomic<int> lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
+};
+
+/// @brief: wraps an os event handle.
+/// The event is created with auto_reset = true and init_state = true.
+class KernelEvent {
+ public:
+  KernelEvent() : evt_(os::CreateOsEvent(true, true)) {}
+
+  ~KernelEvent() {
+    os::DestroyOsEvent(evt_);
+  }
+
+  /// @brief: Waits on the event with a timeout of 0.
+  /// @return: bool, true if that wait reported success.
+  bool IsSet() {
+    const int status = os::WaitForOsEvent(evt_, 0);
+    return status == 0;
+  }
+
+  /// @brief: Waits on the event with a timeout of 0xFFFFFFFF milliseconds.
+  /// @return: bool, true if that wait reported success.
+  bool WaitForSet() {
+    const int status = os::WaitForOsEvent(evt_, 0xFFFFFFFF);
+    return status == 0;
+  }
+
+  /// @brief: Signals the event.
+  void Set() {
+    os::SetOsEvent(evt_);
+  }
+
+  /// @brief: Clears the event.
+  void Reset() {
+    os::ResetOsEvent(evt_);
+  }
+
+ private:
+  os::EventHandle evt_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
+};
+
+/// @brief: a read/write (shared) mutex backed by an os::SharedMutex handle.
+/// Try/Acquire/Release operate in exclusive mode; the *Shared variants and
+/// the Shared adapter operate in shared mode.
+class KernelSharedMutex {
+ public:
+  /// @brief: Adapter exposing the shared-mode operations under the
+  /// Try/Acquire/Release names, so ScopedAcquire can hold the lock shared.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+
+    bool Try() {
+      return lock_->TryShared();
+    }
+    bool Acquire() {
+      return lock_->AcquireShared();
+    }
+    void Release() {
+      lock_->ReleaseShared();
+    }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() : lock_(os::CreateSharedMutex()) {}
+
+  ~KernelSharedMutex() {
+    os::DestroySharedMutex(lock_);
+  }
+
+  // Exclusive mode operations
+  bool Try() {
+    return os::TryAcquireSharedMutex(lock_);
+  }
+  bool Acquire() {
+    return os::AcquireSharedMutex(lock_);
+  }
+  void Release() {
+    os::ReleaseSharedMutex(lock_);
+  }
+
+  // Shared mode operations
+  bool TryShared() {
+    return os::TrySharedAcquireSharedMutex(lock_);
+  }
+  bool AcquireShared() {
+    return os::SharedAcquireSharedMutex(lock_);
+  }
+  void ReleaseShared() {
+    os::SharedReleaseSharedMutex(lock_);
+  }
+
+  // Return shared operations interface
+  Shared shared() {
+    return Shared(this);
+  }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+/// @brief: Type trait to identify mutex types.
+/// isMutex<T>::value is false for every type except the explicit
+/// specializations below. ScopedAcquire uses it to decide whether it stores a
+/// pointer to the lock or a copy of the lock object.
+template <class T> class isMutex {
+ public:
+  enum { value = false };
+};
+/// @brief: HybridMutex is a mutex type.
+template <> class isMutex<HybridMutex> {
+ public:
+  enum { value = true };
+};
+/// @brief: KernelMutex is a mutex type.
+template <> class isMutex<KernelMutex> {
+ public:
+  enum { value = true };
+};
+/// @brief: SpinMutex is a mutex type.
+template <> class isMutex<SpinMutex> {
+ public:
+  enum { value = true };
+};
+/// @brief: KernelSharedMutex is a mutex type.
+template <> class isMutex<KernelSharedMutex> {
+ public:
+  enum { value = true };
+};
+
+/// @brief: A class that behaves as a lock held for the duration of a scope.
+/// To enter a critical section, create an object of this class. When control
+/// leaves the scope, the lock is released automatically.
+/// LockType is either a mutex type (isMutex<LockType>::value is true), passed
+/// by pointer, or a copyable object offering Acquire()/Release(), passed by
+/// value.
+template <class LockType> class ScopedAcquire {
+ public:
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), pointer to an existing lock.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), a non-mutex lock object; a copy of it is stored.
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: when destructing, release the lock unless Release() was already
+  /// called.
+  ~ScopedAcquire() {
+    if (doRelease) lock_.Release();
+  }
+
+  /// @brief: Release the lock early.  Avoid using when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
+  /// This primary template stores a pointer to the lock.
+  template <class T, bool B> class container {
+   public:
+    container(T* lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_->Acquire(); }
+    __forceinline void Release() { return lock_->Release(); }
+
+   private:
+    T* lock_;
+  };
+
+  /// @brief: Specialization for mutex pointer types; stores the lock object
+  /// by value.
+  template <class T> class container<T, false> {
+   public:
+    container(T lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_.Acquire(); }
+    __forceinline void Release() { return lock_.Release(); }
+
+   private:
+    T lock_;
+  };
+
+  container<LockType, isMutex<LockType>::value> lock_;
+  // False once Release() has been called, so the destructor does not release
+  // the lock a second time.
+  bool doRelease;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
new file mode 100644
index 0000000000..2f40cd1581
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Minimal operating system abstraction interfaces.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
+#define HSA_RUNTIME_CORE_UTIL_OS_H_
+
+#include <string>
+#include <vector>
+#include "utils.h"
+
+namespace wsl {
+namespace os {
+// Opaque, pointer-sized handle types handed out by the functions declared
+// below.
+typedef void* LibHandle;
+typedef void* Semaphore;
+typedef void* Mutex;
+typedef void* SharedMutex;
+typedef void* Thread;
+typedef void* EventHandle;
+
+/// @brief: Operating systems known to the abstraction layer; COUNT is the
+/// number of real entries.
+enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
+
+/// @brief: Converts an os_t enumerator to its underlying integer value.
+/// @param: val(Input), the enumerator to convert.
+/// @return: the enumerator's value in the enum's underlying type.
+static __forceinline std::underlying_type<os_t>::type os_index(os_t val) {
+  typedef std::underlying_type<os_t>::type index_t;
+  return static_cast<index_t>(val);
+}
+
+// The operating system this build targets, chosen from the compiler's
+// predefined macros; compilation fails when neither macro is defined.
+#ifdef _WIN32
+static const os_t current_os = os_t::OS_WIN;
+#elif __linux__
+static const os_t current_os = os_t::OS_LINUX;
+#else
+static_assert(false, "Operating System not detected!");
+#endif
+
+/// @brief: Loads dynamic library based on file name. Return value will be NULL
+/// if failed.
+/// @param: filename(Input), file name of the library.
+/// @return: LibHandle.
+LibHandle LoadLib(std::string filename);
+
+/// @brief: Gets the address of exported symbol. Returns NULL if failed.
+/// @param: lib(Input), library handle which exporting from.
+/// @param: export_name(Input), the name of the exported symbol.
+/// @return: void*.
+void* GetExportAddress(LibHandle lib, std::string export_name);
+
+/// @brief: Unloads the dynamic library.
+/// @param: lib(Input), library handle which will be unloaded.
+void CloseLib(LibHandle lib);
+
+/// @brief: Lists loaded tool libraries that contain
+/// symbol HSA_AMD_TOOL_PRIORITY
+/// @return: List of library handles
+std::vector<LibHandle> GetLoadedToolsLib();
+
+/// @brief: Returns the library's path name.
+/// @param: lib(Input), library handle
+/// @return: Path name of library
+std::string GetLibraryName(LibHandle lib);
+
+/// @brief: Creates a Semaphore, will return NULL if failed.
+/// @param: void.
+/// @return: Semaphore.
+Semaphore CreateSemaphore();
+
+/// @brief: Waits for the semaphore. This is a blocking wait.
+/// If the Semaphore is signalled, this function will return.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: bool, true once the semaphore has been acquired, false if the wait failed.
+bool WaitSemaphore(Semaphore sem);
+
+/// @brief: Post/Signal/Wake-up the semaphore
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void PostSemaphore(Semaphore sem);
+
+/// @brief: Destroys the semaphore.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void DestroySemaphore(Semaphore sem);
+
+/// @brief: Creates a mutex, will return NULL if failed.
+/// @param: void.
+/// @return: Mutex.
+Mutex CreateMutex();
+
+/// @brief: Tries to acquire the mutex once; returns true if it succeeds.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TryAcquireMutex(Mutex lock);
+
+/// @brief: Acquires the mutex; if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireMutex(Mutex lock);
+
+/// @brief: Releases the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseMutex(Mutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroyMutex(Mutex lock);
+
+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once; returns true if it succeeds.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode; if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once; returns true if it succeeds.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode; if the mutex is held in exclusive mode, it will wait
+/// until it is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInMs(Input), time in millisecond for sleeping.
+/// @return: void.
+void Sleep(int delayInMs);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInUs(Input), time in microseconds for sleeping.
+/// @return: void.
+void uSleep(int delayInUs);
+
+/// @brief: Yields current thread.
+/// @param: void.
+/// @return: void.
+void YieldThread();
+
+typedef void (*ThreadEntry)(void*);  // Thread entry point: receives the entry_argument pointer.
+
+/// @brief: Creates a thread; returns NULL on failure.
+/// @param: entry_function(Input), a pointer to the function which the thread
+/// starts from.
+/// @param: entry_argument(Input), a pointer to the argument of the thread
+/// function.
+/// @param: stack_size(Input), size of the thread's stack, 0 by default.
+/// @return: Thread, a handle to thread created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size = 0);
+
+/// @brief: Destroys the thread.
+/// @param: thread(Input), handle of the thread to be destroyed.
+/// @return: void.
+void CloseThread(Thread thread);
+
+/// @brief: Waits for a specific thread to finish; returns true on success.
+/// @param: thread(Input), handle of the thread to wait for.
+/// @return: bool.
+bool WaitForThread(Thread thread);
+
+/// @brief: Waits for multiple threads to finish; returns true on success.
+/// @param: threads(Input), a pointer to a list of thread handles.
+/// @param: thread_count(Input), number of threads to be waited on.
+/// @return: bool.
+bool WaitForAllThreads(Thread* threads, uint thread_count);
+
+/// @brief: Determines if an environment variable is set.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @return: bool, true if any value is bound to the environment variable,
+/// including an empty string. False otherwise.
+bool IsEnvVarSet(std::string env_var_name);
+
+/// @brief: Sets the value of an environment variable.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @param: env_var_value(Input), value to assign to the environment variable.
+/// @return: void.
+void SetEnvVar(std::string env_var_name, std::string env_var_value);
+
+/// @brief: Gets the value of an environment variable.
+/// @param: env_var_name(Input), name of the environment variable.
+/// @return: std::string, value of the environment variable, returned as string.
+std::string GetEnvVar(std::string env_var_name);
+
+/// @brief: Gets the process ID.
+/// @param: void.
+/// @return: int, process ID returned as int.
+int GetProcessId();
+
+/// @brief: Gets the max virtual memory size accessible to the application.
+/// @param: void.
+/// @return: size_t, size of the memory accessible to the application.
+size_t GetUserModeVirtualMemorySize();
+
+/// @brief: Gets the max physical host system memory size.
+/// @param: void.
+/// @return: size_t, size of the physical host system memory.
+size_t GetUsablePhysicalHostMemorySize();
+
+/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
+/// @param: void.
+/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase();
+
+/// @brief os event api, create an event
+/// @param: auto_reset whether the event resets its state automatically
+/// @param: init_state initial state of the event
+/// @return: event handle
+EventHandle CreateOsEvent(bool auto_reset, bool init_state);
+
+/// @brief os event api, destroy an event
+/// @param: event Event handle
+/// @return: whether the destroy succeeded
+int DestroyOsEvent(EventHandle event);
+
+/// @brief os event api, wait on event
+/// @param: event Event handle
+/// @param: milli_seconds wait time in milliseconds
+/// @return: Indicate success or timeout
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
+
+/// @brief os event api, set event state
+/// @param: event Event handle
+/// @return: Whether the set succeeded
+int SetOsEvent(EventHandle event);
+
+/// @brief os event api, reset event state
+/// @param: event Event handle
+/// @return: Whether the reset succeeded
+int ResetOsEvent(EventHandle event);
+
+/// @brief reads a clock which is deemed to be accurate for elapsed time
+/// measurements, though not necessarily fast to query
+/// @return clock counter value
+uint64_t ReadAccurateClock();
+
+/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
+/// It does not necessarily reflect the resolution of the clock, but is the
+/// value needed to convert a difference in the clock's counter value to elapsed
+/// seconds.  This frequency does not change at runtime.
+/// @return returns the frequency
+uint64_t AccurateClockFrequency();
+
+/// @brief read the system clock which serves as the HSA system clock
+/// counter in KFD.
+uint64_t ReadSystemClock();
+
+/// @brief read the system clock frequency
+uint64_t SystemClockFrequency();
+
+typedef struct cpuid_s {  // CPU identification data filled in by ParseCpuID()
+  char ManufacturerID[13];  // 12 char, NULL terminated
+  bool mwaitx;  // presumably whether the CPU supports MWAITX - TODO confirm in ParseCpuID
+} cpuid_t;
+
+/// @brief parse CPUID; the bool result presumably reports success - TODO confirm in implementation
+/// @param: cpuinfo struct to be filled
+bool ParseCpuID(cpuid_t* cpuinfo);
+
+}   //  namespace os
+} // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_OS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
new file mode 100644
index 0000000000..1fb992eb63
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
@@ -0,0 +1,394 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple best fit memory allocator with eager compaction.  Manages block sub-allocation.
+// For use when memory efficiency is more important than allocation speed.
+// O(log n) time.
+
+#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+
+#include <map>
+#include <deque>
+#include <utility>
+
+
+namespace wsl {
+
+template <typename Allocator> class SimpleHeap {  // Best-fit sub-allocator over Allocator's blocks.
+ private:
+  struct Fragment_T {  // One contiguous region of a block, either allocated or free.
+    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
+    ptr_t free_list_entry_;  // Position in free_list_, or free_list_.end() when not listed.
+    struct {
+      size_t size : 62;  // Fragment length in bytes.
+      bool discard : 1;  // Set by discardBlock(); such fragments are never handed out again.
+      bool free : 1;     // True while the fragment is not allocated.
+    };
+
+    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
+        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
+    Fragment_T() = default;
+  };
+
+  struct Block {  // A whole block as obtained from the block allocator.
+    uintptr_t base_ptr_;
+    size_t length_;
+
+    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
+    Block() = default;
+  };
+
+  Allocator block_allocator_;  // Must provide alloc(bytes, size), free(ptr, len), block_size().
+
+  std::multimap<size_t, uintptr_t> free_list_;  // Allocatable free fragments: size -> address.
+  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;  // Block base -> fragment map.
+  std::deque<Block> block_cache_;  // Entirely free blocks kept for reuse, oldest at the front.
+
+  // Size of blocks that are at least partially in use.
+  size_t in_use_size_;
+  // Total size of block cache
+  size_t cache_size_;
+
+  __forceinline bool isFree(const Fragment_T& node) { return node.free; }
+  __forceinline void setUsed(Fragment_T& node) {  // Does not erase the free_list_ entry itself.
+    node.free = false;
+    node.free_list_entry_ = free_list_.end();
+  }
+  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
+    node.free_list_entry_ = Iterator;  // May be free_list_.end() for an unlisted free fragment.
+    node.free = true;
+  }
+  __forceinline Fragment_T makeFragment(size_t Len) {  // Allocated fragment of Len bytes.
+    return Fragment_T(free_list_.end(), Len, false);
+  }
+  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
+    return Fragment_T(Iterator, Len, true);  // Free fragment listed at Iterator.
+  }
+  __forceinline void removeFreeListEntry(Fragment_T& node) {  // No-op for unlisted fragments.
+    if (node.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(node.free_list_entry_);
+      node.free_list_entry_ = free_list_.end();
+    }
+  }
+  __forceinline void discard(Fragment_T& node) {  // Unlists node and flags it as discarded.
+    removeFreeListEntry(node);
+    node.discard = true;
+  }
+
+ public:
+  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
+      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}
+  ~SimpleHeap() {  // Releases only the cached blocks; blocks still in use are left alone.
+    trim();
+    // Leak here may be due to the user.  Check is for debugging only.
+    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
+  }
+
+  SimpleHeap(const SimpleHeap& rhs) = delete;
+  SimpleHeap(SimpleHeap&& rhs) = delete;
+  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
+  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;
+
+  // Allocates bytes from the best-fitting free fragment, else from a cached or new block.
+  // Fragment sub-allocations are aligned to GPU_HUGE_PAGE_SIZE when bytes is at least that large,
+  // otherwise to DEFAULT_GPU_PAGE_SIZE.  Returns nullptr if the block allocator returns nullptr.
+  void* alloc(size_t bytes) {
+    // Find best fit.
+    uintptr_t base;
+    size_t size;
+    // Requests of at least GPU_HUGE_PAGE_SIZE get one retry with a larger hole for alignment.
+    size_t align_bytes = bytes;
+    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;
+    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;
+
+    for (int i = 0; i <= retry; i++) {
+      auto free_fragment = free_list_.lower_bound(align_bytes);
+      if (free_fragment == free_list_.end()) break;
+
+      uintptr_t addr = free_fragment->second;
+      size = free_fragment->first;
+
+      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");
+
+      // Find the containing block and fragment
+      auto it = block_list_.upper_bound(addr);
+      it--;
+      auto& frag_map = it->second;
+      auto fragment = frag_map.find(addr);
+
+      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
+      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");
+
+      size_t delta = addr & (align - 1);
+      if (!delta) {
+        // Fragment start is already aligned.
+        base = addr;
+        free_list_.erase(free_fragment);
+        // Sub-allocate from fragment.
+        fragment->second.size = bytes;
+        setUsed(fragment->second);
+        // Record remaining free space.
+        if (size > bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
+        }
+      } else {
+        // If this is the first request and the requested size is not enough for alignment,
+        // then request for a bigger hole and do trim.
+        if (i == 0 && size < bytes + align - delta) {
+          align_bytes += align;
+          continue;
+        }
+
+        uintptr_t aligned_base = addr + align - delta;
+        base = aligned_base;
+        // Erase the old free list entry; the fragment is split into up to three pieces below.
+        free_list_.erase(free_fragment);
+        // fragment 1 - free (unaligned head in front of the allocation)
+        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
+        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);
+        // fragment 2 - used (the aligned allocation itself)
+        frag_map[base] = makeFragment(bytes);
+
+        // fragment 3 - free (tail behind the allocation, if any)
+        if (size > aligned_base - addr + bytes) {
+          free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
+          frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
+        }
+      }
+      return reinterpret_cast<void*>(base);
+    }
+
+    // No usable fragment, check block cache
+    if (bytes < default_block_size() && !block_cache_.empty()) {
+      const auto& block = block_cache_.back();
+      base = block.base_ptr_;
+      size = block.length_;
+      block_cache_.pop_back();
+      cache_size_ -= size;
+    } else {  // Alloc new block - new block may be larger than default.
+      void* ptr = block_allocator_.alloc(bytes, size);
+      if (ptr == nullptr) {
+        fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n");
+        return nullptr;
+      }
+      base = reinterpret_cast<uintptr_t>(ptr);
+    }
+
+    in_use_size_ += size;
+    assert(size >= bytes && "Alloc exceeds block size.");
+    // Sub alloc and insert free region.
+    if (size > bytes) {
+      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
+    }
+    // Track used region
+    block_list_[base][base] = makeFragment(bytes);
+
+    // Disallow multiple suballocation from large blocks.
+    // Prevents a small allocation from retaining a large block.
+    if (bytes > default_block_size()) {
+      bool discarded = discardBlock(reinterpret_cast<void*>(base));
+      assert(discarded && "Large block discard failed.");
+    }
+
+    return reinterpret_cast<void*>(base);
+  }
+
+  /* Returns the base of the block containing ptr when ptr is the start of a live allocation
+   * made by this heap; returns nullptr for nullptr, unknown addresses and free fragments. */
+  void* block_base(void* ptr) {
+    if (ptr == nullptr)
+      return nullptr;
+
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin())
+      return nullptr;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second))
+      return nullptr;
+
+    return reinterpret_cast<void*>(frag_map_it->first);
+  }
+
+  void reset() {  // Forgets all blocks and fragments without returning memory to the allocator.
+    free_list_.clear();
+    block_list_.clear();
+    block_cache_.clear();
+    in_use_size_ = 0;
+    cache_size_ = 0;
+  }
+
+  // Releases the allocation starting at ptr and merges it with free neighbours.  A block that
+  // becomes entirely free is cached, or returned to the allocator if it was discarded.
+  // Returns false if ptr is not the start of a live allocation; nullptr is accepted.
+  bool free(void* ptr) {
+    if (ptr == nullptr) return true;
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second)) return false;
+
+    bool discard = fragment->second.discard;
+
+    // Merge lower
+    if (fragment != frag_map.begin()) {
+      auto lower = fragment;
+      lower--;
+      if (isFree(lower->second)) {
+        removeFreeListEntry(lower->second);
+        lower->second.size += fragment->second.size;
+        frag_map.erase(fragment);
+        fragment = lower;
+      }
+    }
+
+    // Merge upper
+    {
+      auto upper = fragment;
+      upper++;
+      if ((upper != frag_map.end()) && isFree(upper->second)) {
+        removeFreeListEntry(upper->second);
+        fragment->second.size += upper->second.size;
+        frag_map.erase(upper);
+      }
+    }
+
+    // Release whole free blocks.
+    if (frag_map.size() == 1) {
+      Block block(fragment->first, fragment->second.size);
+      block_list_.erase(frag_map_it);
+
+      // Discard or add to the block cache.
+      if (discard) {
+        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+      } else {
+        block_cache_.push_back(block);
+        cache_size_ += block.length_;
+        in_use_size_ -= block.length_;
+      }
+      balance();
+      // Don't publish free space since block was moved to the cache.
+      return true;
+    }
+
+    // Fragments of a discarded block stay off the free list (they must not be reallocated) but
+    // are still marked free, otherwise later frees could not merge with them and release the block.
+    auto freeEntry = free_list_.end();
+    if (!discard) {
+      freeEntry = free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
+    }
+    setFree(fragment->second, freeEntry);
+
+    return true;
+  }
+
+  void balance() {
+    // Drop oldest cached blocks while >1 is cached and cache_size_ exceeds 2 * in_use_size_.
+    while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
+      const auto& block = block_cache_.front();
+      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+      cache_size_ -= block.length_;
+      block_cache_.pop_front();
+    }
+  }
+
+  void trim() {  // Returns every cached block to the allocator.
+    for (const auto& block : block_cache_)
+      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+    block_cache_.clear();
+    cache_size_ = 0;
+  }
+
+  size_t cache_size() const { return cache_size_; }
+
+  size_t default_block_size() const { return block_allocator_.block_size(); }
+
+  // Prevent reuse of the block containing ptr.  No further fragments will be allocated from the
+  // block and the block will not be added to the block cache when it is free.
+  bool discardBlock(void* ptr) {
+    if (ptr == nullptr) return true;
+
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find the block and check that ptr lies inside it; returns false otherwise.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    if ((base < frag_map.begin()->first) ||
+        (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base))
+      return false;
+
+    // Is block already discarded?
+    if (frag_map.begin()->second.discard) return true;
+
+    // Mark all fragments for discard and compute block size.  Removes freelist records for all
+    // fragments in the block.
+    size_t size = 0;
+    for (auto& frag : frag_map) {
+      discard(frag.second);
+      size += frag.second.size;
+    }
+
+    // Remove discarded block from in-use tracking and rebalance the block cache.
+    in_use_size_ -= size;
+    balance();
+
+    return true;
+  }
+};
+
+} // namespace wsl
+
+#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp
new file mode 100644
index 0000000000..bcaef5dd87
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp
@@ -0,0 +1,185 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "small_heap.h"
+
+namespace wsl {
+
+// Splices node into the freelist directly behind place.
+// Relies on the guard nodes: place always has a successor to relink.
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) {
+  assert(place->first < node->first && "Order violation");
+  assert(isfree(place->second) && "Freelist operation error.");
+  const iterator_t successor = place->second.next;
+  node->second.prior = place;
+  node->second.next = successor;
+  successor->second.prior = node;
+  place->second.next = node;
+}
+
+// Unlinks a non-guard node from the freelist (both neighbours exist) and marks it as used.
+void SmallHeap::remove(SmallHeap::iterator_t node) {
+  assert(isfree(node->second) && "Freelist operation error.");
+  Node& entry = node->second;
+  entry.prior->second.next = entry.next;
+  entry.next->second.prior = entry.prior;
+  setused(entry);
+}
+
+// Coalesces high into low when high starts exactly where low ends.
+// Returns the merged node (low) on success, or high if the two nodes are not adjacent.
+SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low,
+                                               SmallHeap::memory_t::iterator high) {
+  assert(isfree(low->second) && "Merge with allocated block");
+  assert(isfree(high->second) && "Merge with allocated block");
+  const bool adjacent = ((char*)low->first + low->second.len == (char*)high->first);
+  if (!adjacent) return high;
+  assert(!islastfree(high->second) && "Illegal merge.");
+
+  const iterator_t successor = high->second.next;
+  low->second.len += high->second.len;
+  low->second.next = successor;
+  successor->second.prior = low;
+  memory.erase(high);
+  return low;
+}
+
+// Returns ptr's allocation to the heap and coalesces it with adjacent free nodes.
+void SmallHeap::free(void* ptr) {
+  if (ptr == nullptr) return;
+
+  auto iterator = memory.find(ptr);
+  // Check for illegal free: unknown address, or an already-free node (double free / guard node).
+  if (iterator == memory.end() || isfree(iterator->second)) {
+    assert(false && "Illegal free.");
+    return;
+  }
+
+  // Return memory to total and link node into free list
+  total_free += iterator->second.len;
+
+  // Could also traverse the free list which might be faster in some cases.
+  auto before = iterator;
+  before--;
+  while (!isfree(before->second)) before--;
+  assert(before->second.next->first > iterator->first && "Inconsistency in small heap.");
+  insertafter(before, iterator);
+
+  // Attempt compaction
+  iterator = merge(before, iterator);
+  merge(iterator, iterator->second.next);
+
+  // Update the low/high boundary
+  high.erase(ptr);
+}
+
+// First-fit allocation from the low end of the pool.
+// Returns nullptr for empty requests or when no single free node can hold bytes.
+void* SmallHeap::alloc(size_t bytes) {
+  // Is enough memory available?
+  if ((bytes == 0) || (bytes > total_free)) return nullptr;
+
+  // Scan the free list front to back and take the first node that is large enough.
+  iterator_t cursor = firstfree();
+  for (; !islastfree(cursor->second); cursor = cursor->second.next) {
+    Node& candidate = cursor->second;
+    if (candidate.len < bytes) continue;
+
+    // Decrement from total
+    total_free -= bytes;
+
+    // Oversized node: keep the first bytes and hand the tail back as a new free node.
+    if (candidate.len != bytes) {
+      void* tail_addr = static_cast<char*>(cursor->first) + bytes;
+      Node& tail = memory[tail_addr];
+      tail.len = candidate.len - bytes;
+      candidate.len = bytes;
+      insertafter(cursor, memory.find(tail_addr));
+    }
+
+    remove(cursor);
+    return cursor->first;
+  }
+  assert(cursor->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+// First-fit allocation from the high end of the pool; the block is carved from the top of the
+// chosen free node and recorded in the high set.  Returns nullptr if the request cannot be met.
+void* SmallHeap::alloc_high(size_t bytes) {
+  // Is enough memory available?
+  if ((bytes == 0) || (bytes > total_free)) return nullptr;
+
+  // Scan the free list back to front and take the first node that is large enough.
+  iterator_t cursor = lastfree();
+  for (; !isfirstfree(cursor->second); cursor = cursor->second.prior) {
+    Node& candidate = cursor->second;
+    if (candidate.len < bytes) continue;
+
+    // Decrement from total
+    total_free -= bytes;
+
+    void* result = cursor->first;
+    if (candidate.len == bytes) {
+      // Exact fit: the whole node becomes the allocation.
+      remove(cursor);
+    } else {
+      // Oversized node: shrink it in place and create a used node for its top bytes.
+      result = static_cast<char*>(cursor->first) + candidate.len - bytes;
+      candidate.len -= bytes;
+      Node& carved = memory[result];
+      carved.len = bytes;
+      setused(carved);
+    }
+
+    high.insert(result);
+    return result;
+  }
+  assert(cursor->second.len == 0 && "Freelist corruption.");
+
+  // Can't service the request due to fragmentation
+  return nullptr;
+}
+
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h
new file mode 100644
index 0000000000..f6e060cb09
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h
@@ -0,0 +1,131 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple first fit memory allocator with eager compaction.  For use with few
+// items (where list iteration is faster than trees).
+// Not thread safe!
+
+#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+
+#include <map>
+#include <set>
+
+#include "utils.h"
+
+namespace wsl {
+
+class SmallHeap {  // First-fit allocator over one pool, tracked in an address-ordered map.
+ private:
+  struct Node;
+  typedef std::map<void*, Node> memory_t;  // Start address -> node, for every region and guard.
+  typedef memory_t::iterator iterator_t;
+
+  struct Node {
+    size_t len;        // Size of the region in bytes (0 for the two guard nodes).
+    iterator_t next;   // Next free node; memory.begin() marks "used", memory.end() the last guard.
+    iterator_t prior;  // Previous free node; memory.end() marks the first guard.
+  };
+
+  SmallHeap(const SmallHeap& rhs) = delete;
+  SmallHeap& operator=(const SmallHeap& rhs) = delete;
+
+  void* const pool;     // Base address passed to the constructor.
+  const size_t length;  // Pool size passed to the constructor.
+
+  size_t total_free;     // Bytes currently free; may be spread over several nodes.
+  memory_t memory;
+  std::set<void*> high;  // Addresses handed out by alloc_high(), plus a max-address sentinel.
+
+  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
+  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
+  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
+  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
+  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
+  __forceinline void setused(Node& node) { node.next = memory.begin(); }
+
+  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
+  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }
+  void insertafter(iterator_t place, iterator_t node);
+  void remove(iterator_t node);
+  iterator_t merge(iterator_t low, iterator_t high);
+
+ public:
+  SmallHeap() : pool(nullptr), length(0), total_free(0) {}  // Empty heap, no guard nodes created.
+  SmallHeap(void* base, size_t length)
+      : pool(base), length(length), total_free(length) {
+    assert(pool != nullptr && "Invalid base address.");
+    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
+    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");
+
+    Node& start = memory[0];  // Low guard node at address 0.
+    Node& node = memory[pool];  // Single initial free node covering the whole pool.
+    Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull];  // High guard node at the maximum address.
+
+    start.len = 0;
+    start.next = memory.find(pool);
+    setfirstfree(start);
+
+    node.len = length;
+    node.prior = memory.begin();
+    node.next = --memory.end();  // Iterator to the high guard, the last map entry.
+
+    end.len = 0;
+    end.prior = start.next;
+    setlastfree(end);
+
+    high.insert((void*)0xFFFFFFFFFFFFFFFFull);  // Sentinel so high is never empty after this ctor.
+  }
+
+  void* alloc(size_t bytes);
+  void* alloc_high(size_t bytes);
+  void free(void* ptr);
+
+  void* base() const { return pool; }
+  size_t size() const { return length; }
+  size_t remaining() const { return total_free; }
+  void* high_split() const { return *high.begin(); }  // NOTE(review): high is empty after SmallHeap().
+};
+
+} // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
new file mode 100644
index 0000000000..c5a2b57c64
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
@@ -0,0 +1,111 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/timer.h"
+
+namespace wsl {
+namespace timer {
+
+accurate_clock::init::init() {  // runs when the static accurate_clock_init object is constructed
+  freq = os::AccurateClockFrequency();  // raw ticks per second, as the period computation below implies
+  accurate_clock::period_ns = 1e9 / double(freq);  // nanoseconds per raw tick, used by now()
+}
+
+// Calibrates the fast clock using the accurate clock: counts raw fast-clock ticks across a timed busy-wait.
+fast_clock::init::init() {
+  typedef accurate_clock clock;
+  clock::duration delay(std::chrono::milliseconds(1));  // busy-wait window; doubled after every round
+
+  // calibrate clock: min holds the raw tick delta of the best sample, elapsed its accurate-clock span
+  fast_clock::raw_rep min = 0;
+  clock::duration elapsed;
+
+  do {
+    elapsed = clock::duration::max();  // reset so the first acceptable sample of this round is taken
+
+    for (int t = 0; t < 10; t++) {  // 10 attempts per round; the shortest acceptable one wins
+      fast_clock::raw_rep r1, r2;
+      clock::time_point t0, t1, t2, t3;
+
+      t0 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);  // compiler-level fence between the clock reads
+      r1 = fast_clock::raw_now();  // start tick, bracketed by t0 and t1
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t1 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+
+      do {  // spin until at least 'delay' has passed on the accurate clock
+        t2 = clock::now();
+      } while (t2 - t1 < delay);
+
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r2 = fast_clock::raw_now();  // end tick, bracketed by t2 and t3
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t3 = clock::now();
+
+      // If elapsed time is shorter than last recorded time and both the start
+      // and end times are confirmed correlated then record the clock readings.
+      // This protects against inaccuracy due to thread switching
+      if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
+          ((t3 - t2) * 10 < (t2 - t1))) {
+        elapsed = t3 - t1;
+        min = r2 - r1;
+      }
+    }
+    delay += delay;  // double the window for the next round, if there is one
+  } while (min < 1000);  // repeat until a recorded sample spans at least 1000 raw ticks
+
+  fast_clock::freq = double(min) / duration_in_seconds(elapsed);  // raw ticks per second
+  fast_clock::period_ps = 1e12 / fast_clock::freq;  // picoseconds per raw tick, used by now()
+  // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
+  // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
+}
+
+double accurate_clock::period_ns;  // nanoseconds per raw tick; assigned by accurate_clock::init::init()
+accurate_clock::raw_frequency accurate_clock::freq;  // raw ticks per second; assigned by accurate_clock::init::init()
+accurate_clock::init accurate_clock::accurate_clock_init;  // its constructor fills in the two values above
+// Kept after accurate_clock_init: fast_clock::init's constructor calls accurate_clock::now() to calibrate.
+double fast_clock::period_ps;  // picoseconds per raw tick; assigned by fast_clock::init::init()
+fast_clock::raw_frequency fast_clock::freq;  // raw ticks per second; assigned by fast_clock::init::init()
+fast_clock::init fast_clock::fast_clock_init;  // its constructor runs the calibration loop
+}   //  namespace timer
+}   // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h
new file mode 100644
index 0000000000..3012685113
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h
@@ -0,0 +1,173 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_
+#define HSA_RUNTIME_CORE_UTIL_TIMER_H_
+
+#include "core/util/utils.h"
+#include "core/util/os.h"
+#include <chrono>
+#include <time.h>
+#include <type_traits>
+
+namespace wsl {
+namespace timer {
+
+// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of
+// VS 2013.
+template <bool isFloat, bool isSigned>
+struct wide_type {  // primary template (selected whenever isFloat is true): widen to double
+  typedef double type;
+};
+template <>
+struct wide_type<false, false> {  // unsigned integer rep: widen to uintmax_t
+  typedef uintmax_t type;
+};
+template <>
+struct wide_type<false, true> {  // signed integer rep: widen to intmax_t
+  typedef intmax_t type;
+};
+
+template <typename To, typename Rep, typename Period>
+static __forceinline To
+    duration_cast(const std::chrono::duration<Rep, Period>& d) {  // stand-in for std::chrono::duration_cast<To>
+  typedef typename wide_type<std::is_floating_point<Rep>::value,
+                             std::is_signed<Rep>::value>::type wide;
+  typedef std::chrono::duration<wide, typename To::period> unit_convert_t;
+  // Convert the period first using the widest rep of the source's kind, then narrow the count to To::rep.
+  unit_convert_t temp = std::chrono::duration_cast<unit_convert_t>(d);
+  return To(static_cast<typename To::rep>(temp.count()));
+}
+// End patch
+
+template <typename Rep, typename Period>
+static __forceinline double duration_in_seconds(
+    std::chrono::duration<Rep, Period> delta) {  // returns delta expressed as a double number of seconds
+  typedef std::chrono::duration<double, std::ratio<1, 1>> seconds;
+  return seconds(delta).count();  // implicit duration conversion is permitted because the target rep is floating point
+}
+
+template <typename rep>  // NOTE: despite its name, 'rep' must be a std::chrono::duration type (it is the duration_cast target)
+static __forceinline rep duration_from_seconds(double delta) {  // converts a count of seconds into duration type 'rep'
+  typedef std::chrono::duration<double, std::ratio<1, 1>> seconds;
+  return std::chrono::duration_cast<rep>(seconds(delta));
+}
+
+// Provides a C++11 standard clock interface to the os::AccurateClock functions
+class accurate_clock {
+ public:
+  typedef double rep;
+  typedef std::nano period;
+  typedef std::chrono::duration<rep, period> duration;  // nanoseconds held in a double
+  typedef std::chrono::time_point<accurate_clock> time_point;
+
+  static const bool is_steady = true;
+
+  static __forceinline time_point now() {
+    return time_point(duration(raw_now() * period_ns));  // raw ticks * nanoseconds-per-tick
+  }
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  typedef uint64_t raw_rep;
+  typedef uint64_t raw_frequency;
+
+  static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); }  // unscaled OS counter value
+  static __forceinline raw_frequency raw_freq() { return freq; }  // cached os::AccurateClockFrequency() result
+
+ private:
+  static double period_ns;  // nanoseconds per raw tick
+  static raw_frequency freq;  // raw ticks per second
+
+  class init {  // its constructor (defined in timer.cpp) assigns freq and period_ns
+   public:
+    init();
+  };
+  static init accurate_clock_init;  // static instance whose construction triggers that set-up
+};
+
+// Provides a C++11 standard clock interface to the lowest latency approximate
+// clock
+class fast_clock {
+ public:
+  typedef double rep;
+  typedef std::pico period;
+  typedef std::chrono::duration<rep, period> duration;  // picoseconds held in a double
+  typedef std::chrono::time_point<fast_clock> time_point;
+
+  static const bool is_steady = true;
+
+  static __forceinline time_point now() {
+    return time_point(duration(raw_now() * period_ps));  // raw ticks * picoseconds-per-tick
+  }
+
+  // These two extra APIs and types let us use clocks without conversion to the
+  // arbitrary period unit
+  typedef uint64_t raw_rep;
+  typedef double raw_frequency;
+  // raw_freq() is the number of raw_now() ticks per second on either code path below.
+#if defined(__x86_64__) || defined(_M_X64)
+  static __forceinline raw_rep raw_now() { return __rdtsc(); }
+  static __forceinline raw_frequency raw_freq() { return freq; }  // calibrated by init's constructor
+#else
+  static __forceinline raw_rep raw_now() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+    return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec));  // nanoseconds since the clock's epoch
+  }
+  static __forceinline raw_frequency raw_freq() { return 1.e9; }  // nanosecond counter => 1e9 ticks per second
+#endif
+
+ private:
+  static double period_ps;  // picoseconds per raw tick
+  static raw_frequency freq;  // raw ticks per second as measured during calibration
+
+  class init {  // its constructor (defined in timer.cpp) calibrates freq and period_ps
+   public:
+    init();
+  };
+  static init fast_clock_init;  // static instance whose construction triggers the calibration
+};
+}   //  namespace timer
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h
new file mode 100644
index 0000000000..15d61a87e1
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h
@@ -0,0 +1,389 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Generally useful utility functions
+
+#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_
+#define HSA_RUNTIME_CORE_UTIL_UTILS_H_
+
+#include "stdint.h"
+#include "stddef.h"
+#include "stdlib.h"
+#include "stdarg.h"
+#include "unistd.h"
+#include <assert.h>
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <sstream>
+#include <thread>
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <x86intrin.h>  // system header: must be included at global scope, before namespace wsl opens
+#endif
+
+namespace wsl {
+extern FILE* log_file;  // defined in another translation unit
+extern uint8_t log_flags[8];  // flag bytes tested by the LogPrint macro below
+
+typedef unsigned int uint;
+typedef uint64_t uint64;
+#if defined(__GNUC__)
+
+// 2MB huge page size
+#define GPU_HUGE_PAGE_SIZE    (2 << 20)
+
+// 4KB page size
+#define DEFAULT_GPU_PAGE_SIZE (1 << 12)
+
+#define __forceinline __inline__ __attribute__((always_inline))
+#define __declspec(x) __attribute__((x))
+#undef __stdcall
+#define __stdcall  // __attribute__((__stdcall__))
+#define __ALIGNED__(x) __attribute__((aligned(x)))
+
+void log_printf(const char* file, int line, const char* format, ...);
+
+static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {  // GCC-side stand-in using MSVC's (size, alignment) order
+#ifdef _ISOC11_SOURCE
+  return aligned_alloc(alignment, size);  // note the swapped argument order; C11 expects size to be a multiple of alignment
+#else
+  void *mem = NULL;
+  if (0 != posix_memalign(&mem, alignment, size)) return NULL;  // POSIX: alignment must be a power-of-two multiple of sizeof(void*)
+  return mem;
+#endif
+}
+static __forceinline void _aligned_free(void* ptr) { return free(ptr); }  // both allocation paths above are released with plain free()
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include "intrin.h"
+#define __ALIGNED__(x) __declspec(align(x))
+#if (_MSC_VER < 1800)  // < VS 2013
+static __forceinline unsigned long long int strtoull(const char* str,
+                                                     char** endptr, int base) {
+  return static_cast<unsigned long long>(_strtoui64(str, endptr, base));
+}
+#endif
+#if (_MSC_VER < 1900)  // < VS 2015
+#define thread_local __declspec(thread)
+#endif
+#else
+#error "Compiler and/or processor not identified."
+#endif
+
+#define STRING2(x) #x
+#define STRING(x) STRING2(x)
+
+#define PASTE2(x, y) x##y
+#define PASTE(x, y) PASTE2(x, y)
+
+#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+// LogPrint(flag, format, ...): calls wsl::log_printf with file/line when flag is set in wsl::log_flags; expansion ends in ';'.
+#define LogPrint(flag, format, ...)                                                                \
+  do {                                                                                             \
+    if (hsa_flag_isset64(wsl::log_flags, flag))                                                    \
+      wsl::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__);                              \
+  } while (false);
+
+// A macro to disallow the copy and move constructor and operator= functions
+#define DISALLOW_COPY_AND_ASSIGN(TypeName)                                                         \
+  TypeName(const TypeName&) = delete;                                                              \
+  TypeName(TypeName&&) = delete;                                                                   \
+  void operator=(const TypeName&) = delete;                                                        \
+  void operator=(TypeName&&) = delete;
+
+template <typename lambda>
+class ScopeGuard {  // Invokes the stored callable when destroyed, unless dismissed or transferred away.
+ public:
+  explicit __forceinline ScopeGuard(const lambda& release)
+      : release_(release), dismiss_(false) {}
+  // Transfer: copy-construct the callable (closures are not default-constructible/assignable), then disarm rhs.
+  ScopeGuard(ScopeGuard& rhs) : release_(rhs.release_), dismiss_(rhs.dismiss_) { rhs.dismiss_ = true; }
+
+  __forceinline ~ScopeGuard() {
+    if (!dismiss_) release_();
+  }
+  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {  // needs an assignable callable; the callable held before is dropped, not run
+    dismiss_ = rhs.dismiss_;
+    release_ = rhs.release_;
+    rhs.dismiss_ = true;  // rhs no longer fires; responsibility now rests with *this
+    return *this;
+  }
+  __forceinline void Dismiss() { dismiss_ = true; }  // cancel: the destructor will not call release_
+
+ private:
+  lambda release_;  // callable run by the destructor
+  bool dismiss_;  // true once this guard must no longer run release_
+};
+
+template <typename lambda>
+static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
+  return ScopeGuard<lambda>(rel);
+}
+
+#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
+  auto lname = __VA_ARGS__;                        \
+  ScopeGuard<decltype(lname)> sname(lname);
+#define MAKE_SCOPE_GUARD(...)                                   \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
+                          PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
+#define MAKE_NAMED_SCOPE_GUARD(name, ...)                             \
+  MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
+                          __VA_ARGS__)
+
+/// @brief: Finds out the min one of two inputs, input must support ">"
+/// operator.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T, a copy of the smaller input; a is returned when a > b is false.
+template <class T>
+static __forceinline T Min(const T& a, const T& b) {
+  return (a > b) ? b : a;
+}
+/// Variadic overload: applies the two-argument Min across three or more inputs, innermost pair first.
+template <class T, class... Arg>
+static __forceinline T Min(const T& a, const T& b, Arg... args) {
+  return Min(a, Min(b, args...));
+}
+
+/// @brief: Find out the max one of two inputs, input must support ">" operator.
+/// @param: a(Input), a reference to type T.
+/// @param: b(Input), a reference to type T.
+/// @return: T, a copy of the larger input; a is returned when b > a is false.
+template <class T>
+static __forceinline T Max(const T& a, const T& b) {
+  return (b > a) ? b : a;
+}
+/// Variadic overload: applies the two-argument Max across three or more inputs, innermost pair first.
+template <class T, class... Arg>
+static __forceinline T Max(const T& a, const T& b, Arg... args) {
+  return Max(a, Max(b, args...));
+}
+
+/// @brief: Free the memory space which is newed previously.
+/// @param: ptr(Input), a pointer to memory space. Can't be NULL.
+/// @return: void.
+struct DeleteObject {
+  template <typename T>
+  void operator()(const T* ptr) const {
+    delete ptr;
+  }
+};
+
+/// @brief: Checks if a value is power of two, if it is, return true. Note
+/// that 0 also yields true, because (0 & (0 - 1)) == 0.
+/// @param: val(Input), the data to be checked.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsPowerOfTwo(T val) {
+  return (val & (val - 1)) == 0;  // clearing the lowest set bit leaves nothing when at most one bit is set
+}
+
+/// @brief: Calculates the floor value aligned based on parameter of alignment.
+/// If value is at the boundary of alignment, it is unchanged.
+/// @param: value(Input), value to be calculated.
+/// @param: alignment(Input), alignment value; used as a divisor, so it must be non-zero.
+/// @return: T.
+template <typename T>
+static __forceinline T AlignDown(T value, size_t alignment) {
+  return (T)((value / alignment) * alignment);  // integer division discards the remainder
+}
+
+/// @brief: Same as previous one, but first parameter becomes pointer, for more
+/// info, see the previous description.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignDown(T* value, size_t alignment) {
+  return (T*)AlignDown((intptr_t)value, alignment);  // rounds the address as an integer, then casts back
+}
+
+/// @brief: Calculates the ceiling value aligned based on parameter of
+/// alignment.
+/// If value is at the boundary of alignment, it is unchanged.
+/// @param: value(Input), value to be calculated.
+/// @param: alignment(Input), alignment value.
+/// @return: T.
+template <typename T>
+static __forceinline T AlignUp(T value, size_t alignment) {
+  return AlignDown((T)(value + alignment - 1), alignment);  // bump past the next boundary, then round down
+}
+
+/// @brief: Same as previous one, but first parameter becomes pointer, for more
+/// info, see the previous description.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: T*, pointer to type T.
+template <typename T>
+static __forceinline T* AlignUp(T* value, size_t alignment) {
+  return (T*)AlignDown((intptr_t)((uint8_t*)value + alignment - 1), alignment);  // offset is applied in bytes
+}
+
+/// @brief: Checks if the input value is at the boundary of alignment, if it is,
+/// returns true.
+/// @param: value(Input), value to be checked.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T value, size_t alignment) {
+  return (AlignUp(value, alignment) == value);  // rounding up leaves an aligned value unchanged
+}
+
+/// @brief: Same as previous one, but first parameter becomes pointer, for more
+/// info, see the previous description.
+/// @param: value(Input), pointer to type T.
+/// @param: alignment(Input), alignment value.
+/// @return: bool.
+template <typename T>
+static __forceinline bool IsMultipleOf(T* value, size_t alignment) {
+  return (AlignUp(value, alignment) == value);
+}
+
+static __forceinline uint32_t NextPow2(uint32_t value) {  // smallest power of two >= value; 1 for value == 0
+  if (value == 0) return 1;
+  uint32_t v = value - 1;  // subtract one so exact powers of two map to themselves
+  v |= v >> 1;  // smear the highest set bit into every lower bit position...
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return v + 1;  // ...then add one; wraps to 0 when value > 2^31
+}
+
+static __forceinline uint64_t NextPow2(uint64_t value) {  // 64-bit variant of the function above
+  if (value == 0) return 1;
+  uint64_t v = value - 1;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v |= v >> 32;  // one extra step to cover the upper 32 bits
+  return v + 1;  // wraps to 0 when value > 2^63
+}
+
+static __forceinline bool strIsEmpty(const char* str) noexcept { return str[0] == '\0'; }  // str must be non-null: it is dereferenced unconditionally
+
+static __forceinline std::string& ltrim(std::string& s) {  // strips leading whitespace in place and returns s
+  auto it = std::find_if(s.begin(), s.end(),
+                         [](char c) { return !std::isspace<char>(c, std::locale::classic()); });
+  s.erase(s.begin(), it);  // 'it' is the first non-space character, or end() if there is none
+  return s;
+}
+
+static __forceinline std::string& rtrim(std::string& s) {  // strips trailing whitespace in place and returns s
+  auto it = std::find_if(s.rbegin(), s.rend(),
+                         [](char c) { return !std::isspace<char>(c, std::locale::classic()); });
+  s.erase(it.base(), s.end());  // it.base() is one past the last non-space character
+  return s;
+}
+
+static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); }  // trailing whitespace first, then leading
+
+}  // namespace wsl
+
+template <uint32_t lowBit, uint32_t highBit, typename T>
+static __forceinline uint32_t BitSelect(T p) {  // returns bits [lowBit, highBit] of p shifted down to bit 0, truncated to 32 bits
+  static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range.");
+  static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range.");
+  // The top-bit case is handled separately so that 1ull is never shifted by the full word width.
+  uintptr_t ptr = p;
+  if (highBit != (sizeof(uintptr_t) * 8 - 1))
+    return (uint32_t)((ptr & ((1ull << (highBit + 1)) - 1)) >> lowBit);  // mask off everything above highBit first
+  else
+    return (uint32_t)(ptr >> lowBit);  // nothing above highBit to mask
+}
+
+inline uint32_t PtrLow16Shift8(const void* p) {  // address bits 8..15
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  return (uint32_t)((ptr & 0xFFFFULL) >> 8);
+}
+
+inline uint32_t PtrHigh64Shift16(const void* p) {  // address bits 16 and up, shifted down; the cast keeps the low 32 of them
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  return (uint32_t)((ptr & 0xFFFFFFFFFFFF0000ULL) >> 16);
+}
+
+inline uint32_t PtrLow40Shift8(const void* p) {  // address bits 8..39
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8);
+}
+
+inline uint32_t PtrHigh64Shift40(const void* p) {  // address bits 40..63
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  return (uint32_t)((ptr & 0xFFFFFF0000000000ULL) >> 40);
+}
+
+static inline uint8_t Ptr48High8(const void* p) {  // address bits 40..47
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  return (uint8_t)((ptr & 0xFF0000000000ULL) >> 40);
+}
+
+static inline uint32_t Ptr48Low32(const void* p) {  // address bits 8..39
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(p);
+  assert((ptr & 0xFFFFFFFFFF00ULL) == ptr);  // address must fit in 48 bits and have its low 8 bits clear
+  return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8);
+}
+
+inline uint32_t PtrLow32(const void* p) {  // low 32 bits of the address
+  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
+}
+
+inline uint32_t PtrHigh32(const void* p) {  // address bits 32..63, or 0 when HSA_LARGE_MODEL is not defined
+  uint32_t ptr = 0;
+#ifdef HSA_LARGE_MODEL
+  ptr = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
+#endif
+  return ptr;
+}
+
+inline uint32_t HighPart(uint64_t value) {  // upper 32 bits of value
+  return (value & 0xFFFFFFFF00000000) >> 32;
+}
+
+inline uint32_t LowPart(uint64_t value) {  // lower 32 bits of value
+  return (value & 0x00000000FFFFFFFF);
+}
+
+#include "atomic_helpers.h"
+
+#endif  // HSA_RUNTIME_CORE_UTIL_UTILS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp
new file mode 100644
index 0000000000..b7f2285623
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// 
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// 
+// Developed by:
+// 
+//                 AMD Research and AMD HSA Software Development
+// 
+//                 Advanced Micro Devices, Inc.
+// 
+//                 www.amd.com
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// 
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32  // Are we compiling for windows?
+#define NOMINMAX
+
+#include "core/util/os.h"
+
+#include <algorithm>
+#include <process.h>
+#include <string>
+#include <windows.h>
+
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+#undef Yield
+#undef CreateMutex
+
+namespace wsl {
+namespace os {
+
+static_assert(sizeof(LibHandle) == sizeof(HMODULE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(LibHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Semaphore) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(EventHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+
+LibHandle LoadLib(std::string filename) {  // wraps LoadLibrary; the HMODULE is reinterpreted as a LibHandle
+  HMODULE ret = LoadLibrary(filename.c_str());  // NOTE(review): passes a char string, so this presumably expects a non-UNICODE build - confirm
+  return *(LibHandle*)&ret;
+}
+
+void* GetExportAddress(LibHandle lib, std::string export_name) {  // wraps GetProcAddress for the given library handle
+  return GetProcAddress(*(HMODULE*)&lib, export_name.c_str());
+}
+
+void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); }  // the FreeLibrary result is ignored
+
+std::vector<LibHandle> GetLoadedLibs() {  // Not implemented yet. TODO: use EnumProcessModulesEx
+  assert(false && "Not implemented.");  // fail at run time like the other stubs in this file
+  abort();
+}
+std::string GetLibraryName(LibHandle lib) {  // Not implemented yet.
+  assert(false && "Not implemented.");
+  abort();
+}
+
+Semaphore CreateSemaphore() {
+  // '::' selects the Win32 API; an unqualified call would resolve to this function itself.
+  ::HANDLE sem = ::CreateSemaphore(NULL, 0, LONG_MAX, NULL);  // unnamed, initial count 0
+  assert(sem != NULL && "CreateSemaphore failed");
+  return *(Semaphore*)&sem;  // same handle-punning convention as the rest of this file
+}
+
+bool WaitSemaphore(Semaphore sem) {  // blocks until the semaphore is signaled; true on success
+  return WaitForSingleObject(*(::HANDLE*)&sem, INFINITE) == WAIT_OBJECT_0;
+}
+
+void PostSemaphore(Semaphore sem) {  // raises the semaphore count by one; the API result is ignored
+  ::ReleaseSemaphore(*(::HANDLE*)&sem, 1, NULL);
+}
+
+void DestroySemaphore(Semaphore sem) {  // closes the underlying handle
+  if (!CloseHandle(*(::HANDLE*)&sem)) {
+    assert(false && "CloseHandle() failed");  // a bare string literal is always true and could never fire
+  }
+  // sem is passed by value, so there is no caller-visible handle to reset here.
+}
+
+Mutex CreateMutex() { return CreateEvent(NULL, false, true, NULL); }  // mutex modelled as an auto-reset event that starts signaled (unlocked)
+
+bool TryAcquireMutex(Mutex lock) {  // zero timeout: succeeds only if the event is signaled right now
+  return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0;
+}
+
+bool AcquireMutex(Mutex lock) {  // blocks; a successful wait consumes the signal, i.e. takes the lock
+  return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0;
+}
+
+void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); }  // signals the event again, i.e. unlocks
+
+void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); }  // the CloseHandle result is ignored
+
+void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); }
+
+void uSleep(int delayInUs) { ::Sleep(delayInUs / 1000); }  // integer division: requests under 1000 us become ::Sleep(0)
+
+void YieldThread() { ::Sleep(0); }  // zero-length sleep used as a yield
+
+struct ThreadArgs {  // heap-allocated bundle handed from CreateThread to ThreadTrampoline
+  void* entry_args;  // argument forwarded to entry_function
+  ThreadEntry entry_function;  // user entry point to run on the new thread
+};
+
+unsigned __stdcall ThreadTrampoline(void* arg) {  // start routine passed to _beginthreadex
+  ThreadArgs* thread_args = (ThreadArgs*)arg;
+  ThreadEntry entry = thread_args->entry_function;
+  void* data = thread_args->entry_args;
+  delete thread_args;  // copies were taken above, so the bundle is freed before user code runs
+  entry(data);
+  _endthreadex(0);
+  return 0;
+}
+
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size) {
+  ThreadArgs* thread_args = new ThreadArgs();  // freed by ThreadTrampoline on the new thread
+  thread_args->entry_args = entry_argument;
+  thread_args->entry_function = entry_function;
+  uintptr_t ret = _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
+  if (ret == 0) delete thread_args;  // thread never started, so the trampoline will not free it
+  return *(Thread*)&ret;  // a zero (null) handle signals failure to the caller
+}
+
+void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); }  // closes the thread handle; the result is ignored
+
+bool WaitForThread(Thread thread) {  // blocks until the thread handle is signaled; true on success
+  return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0;
+}
+
+bool WaitForAllThreads(Thread* threads, uint thread_count) {  // TRUE => wait until every handle in the array is signaled
+  return WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE) ==
+         WAIT_OBJECT_0;
+}
+
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {  // sets the variable for this process; the API result is ignored
+  SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str());
+}
+
+std::string GetEnvVar(std::string env_var_name) {
+  // With a NULL buffer the call returns the required size (including the NUL), or 0 if unset.
+  const DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0);
+  if (char_count == 0) return "";
+  std::string ret(char_count, '\0');
+  // On success the call returns the number of characters copied, excluding the terminating NUL.
+  const DWORD copied = GetEnvironmentVariable(env_var_name.c_str(), &ret[0], char_count);
+  ret.resize(copied < char_count ? copied : 0);  // empty if the variable vanished or grew in between
+  return ret;
+}
+
+size_t GetUserModeVirtualMemorySize() {  // highest application address reported by GetSystemInfo, plus one
+  SYSTEM_INFO system_info = {0};
+  GetSystemInfo(&system_info);
+  return ((size_t)system_info.lpMaximumApplicationAddress + 1);
+}
+
+size_t GetUsablePhysicalHostMemorySize() {  // min(physical RAM, user-mode address space); 0 if the query fails
+  MEMORYSTATUSEX memory_status = {0};
+  memory_status.dwLength = sizeof(memory_status);  // must be set before calling GlobalMemoryStatusEx
+  if (GlobalMemoryStatusEx(&memory_status) == 0) {
+    return 0;
+  }
+
+  const size_t physical_size = static_cast<size_t>(memory_status.ullTotalPhys);
+  return std::min(GetUserModeVirtualMemorySize(), physical_size);
+}
+
+uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; }  // always reports a base of 0
+
+// Os event wrappers
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {  // unnamed event; manual-reset is requested when auto_reset is false
+  EventHandle evt = reinterpret_cast<EventHandle>(
+      CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL));
+  return evt;
+}
+
+int DestroyOsEvent(EventHandle event) {  // -1 for a null handle, otherwise the BOOL result of CloseHandle
+  if (event == NULL) {
+    return -1;
+  }
+  return CloseHandle(reinterpret_cast<::HANDLE>(event));
+}
+
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {  // -1 for a null handle, otherwise the (remapped) wait result
+  if (event == NULL) {
+    return -1;
+  }
+
+  int ret_code =
+      WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds);
+  if (ret_code == WAIT_TIMEOUT) {
+    ret_code = 0x14003;  // 0x14003 indicates timeout
+  }
+  return ret_code;
+}
+
+int SetOsEvent(EventHandle event) {  // -1 for a null handle, otherwise the BOOL result of SetEvent
+  if (event == NULL) {
+    return -1;
+  }
+  return SetEvent(reinterpret_cast<::HANDLE>(event));
+}
+
+int ResetOsEvent(EventHandle event) {  // -1 for a null handle, otherwise the BOOL result of ResetEvent
+  if (event == NULL) {
+    return -1;
+  }
+  return ResetEvent(reinterpret_cast<::HANDLE>(event));
+}
+
+uint64_t ReadAccurateClock() {  // current QueryPerformanceCounter value; the API result is not checked
+  uint64_t ret;
+  QueryPerformanceCounter((LARGE_INTEGER*)&ret);  // the uint64_t is written through a LARGE_INTEGER pointer
+  return ret;
+}
+
+uint64_t AccurateClockFrequency() {  // QueryPerformanceFrequency value; the API result is not checked
+  uint64_t ret;
+  QueryPerformanceFrequency((LARGE_INTEGER*)&ret);
+  return ret;
+}
+
+SharedMutex CreateSharedMutex() {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return nullptr;  // unreachable after abort()
+}
+
+bool TryAcquireSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool AcquireSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void ReleaseSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+}
+
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool SharedAcquireSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void SharedReleaseSharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+}
+
+void DestroySharedMutex(SharedMutex lock) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+}
+
+uint64_t ReadSystemClock() {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+uint64_t SystemClockFrequency() {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return 0;
+}
+
+bool ParseCpuID(cpuid_t* cpuinfo) {  // Not implemented here: asserts (when enabled), then aborts
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+}   //  namespace os
+}   //  namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp
new file mode 100644
index 0000000000..80dc67d44f
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+// Build-id string; __attribute__((used)) asks the compiler to emit it even if it appears unreferenced.
+const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION);
+
+// Reports kernel interface version 1.17 through VersionInfo.
+// Returns HSAKMT_STATUS_INVALID_PARAMETER for a null VersionInfo instead of dereferencing it.
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) {
+  CHECK_DXG_OPEN();
+  if (!VersionInfo) return HSAKMT_STATUS_INVALID_PARAMETER;
+  VersionInfo->KernelInterfaceMajorVersion = 1;
+  VersionInfo->KernelInterfaceMinorVersion = 17;
+  return HSAKMT_STATUS_SUCCESS; }
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp
new file mode 100644
index 0000000000..d650651e31
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp
@@ -0,0 +1,320 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+
+#include "impl/wddm/cmd_util.h"
+
+namespace wsl {
+namespace thunk {
+
+/*
+ * Builds a COPY_DATA packet targeting pDstAddr and copies it into pBuffer.
+ * Returns the packet size in bytes. Fields not set here stay zero.
+ */
+size_t CmdUtil::BuildCopyData(
+  uint64_t  *pDstAddr,
+  void      *pBuffer,
+  uint32_t  dstSel,
+  uint32_t  dstCachePolicy,
+  uint32_t  srcSel,
+  uint32_t  srcCachePolicy,
+  uint32_t  countSel,
+  uint32_t  wrConfirm) {
+  PM4MEC_COPY_DATA pkt = {0};
+  GenerateCmdHeader(&pkt, IT_COPY_DATA);
+  pkt.bitfields2.src_sel = srcSel;
+  pkt.bitfields2.src_cache_policy = srcCachePolicy;
+  pkt.bitfields2.dst_sel = dstSel;
+  pkt.bitfields2.dst_cache_policy = dstCachePolicy;
+  pkt.bitfields2.count_sel = countSel;
+  pkt.bitfields2.wr_confirm = wrConfirm;
+  // The low destination dword is stored shifted right by 3.
+  pkt.bitfields5c.dst_64b_addr_lo = PtrLow32(pDstAddr) >> 3;
+  pkt.dst_addr_hi = PtrHigh32(pDstAddr);
+  memcpy(pBuffer, &pkt, sizeof(pkt));
+  return sizeof(pkt);
+}
+
+/*
+ * Builds an EVENT_WRITE packet with the given event index and type, and
+ * copies it into pBuffer. Returns the packet size in bytes.
+ * Applications can use this barrier command to ensure their command is
+ * executed only after all other commands have completed their execution.
+ */
+size_t CmdUtil::BuildBarrier(
+  void      *pBuffer,
+  uint32_t  eventIndex,
+  uint32_t  eventType) {
+  BarrierTemplate packet = {0};
+
+  GenerateCmdHeader(&packet.event_write, IT_EVENT_WRITE);
+  packet.event_write.bitfields2.event_type = eventType;
+  packet.event_write.bitfields2.event_index = eventIndex;
+  memcpy(pBuffer, &packet, sizeof(packet));
+
+  return sizeof(packet);
+}
+
+/**
+ * Builds a WRITE_DATA packet that writes the 64-bit "write_value" (two
+ * DWORDs) to the GPU memory address "write_addr", and copies the packet
+ * into pBuffer. Returns the packet size in bytes.
+ */
+size_t CmdUtil::BuildWriteData64Command(
+  void*     pBuffer,
+  uint64_t* write_addr,
+  uint64_t  write_value) {
+  // The destination must be 4 byte aligned (only checked when asserts are on).
+  uint64_t addr = uintptr_t(write_addr);
+  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");
+
+  WriteDataTemplate packet = {0};
+  GenerateCmdHeader(&packet.write_data, IT_WRITE_DATA);
+
+  // Destination is memory; the address is incremented when writing more than one DWORD.
+  packet.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory;
+  packet.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
+
+  // Wait for write confirmation and bypass the cache.
+  packet.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
+  packet.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass;
+
+  // The low address dword is stored shifted right by 2, the high dword as is.
+  packet.write_data.bitfields3c.dst_mem_addr_lo = PtrLow32(write_addr) >> 2;
+  packet.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);
+
+  // Value to write.
+  packet.write_data.write_data_value = write_value;
+
+  memcpy(pBuffer, &packet, sizeof(packet));
+  return sizeof(packet);
+}
+
+/*
+ * Builds an ACQUIRE_MEM packet that invalidates GPU caches and copies it
+ * into pBuffer. major == 9 uses the gfx9 layout, major >= 10 the gfx10 one.
+ * Returns the packet size in bytes, or 0 (pBuffer untouched) for major < 9.
+ */
+size_t CmdUtil::BuildAcquireMem(
+  uint8_t major,
+  void    *pBuffer) {
+  size_t ret = 0;  // stays 0 if neither branch runs (was left uninitialized)
+  if (major == 9) {
+    gfx9::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    // Specify the size of memory to invalidate. Size is
+    // specified in terms of 256 byte chunks. A coher_size
+    // of 0xFFFFFFFF actually specifies 0xFFFFFFFF00 (40 bits)
+    // of memory. The field coher_size_hi specifies memory from
+    // bits 40-64 for a total of 256 TB.
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    // Specify the address of memory to invalidate. The
+    // address must be 256 byte aligned.
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    // Specify the poll interval for determining if operation is complete
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields2.coher_cntl =
+      (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK
+      (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK
+      (1 << 28);  // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  } else if (major >= 10) {
+    gfx10::AcquireMemTemplate acq = {0};
+    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);
+    acq.acquire_mem.coher_size = 0xFFFFFFFF;
+    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
+    acq.acquire_mem.coher_base_lo = 0;
+    acq.acquire_mem.bitfields6.coher_base_hi = 0;
+    acq.acquire_mem.bitfields7.poll_interval = 4;
+    acq.acquire_mem.bitfields8.gcr_cntl =
+      (1 << 16) | // SEQ = FORWARD
+      (1 << 15) | // GL2_WB
+      (1 << 14) | // GL2_INV
+      (1 << 9) |  // GL1_INV
+      (1 << 8) |  // GLV_INV
+      (1 << 7) |  // GLK_INV
+      (1 << 6) |  // GLK_WB
+      (1 << 5) |  // GLM_INV
+      (1 << 4) |  // GLM_WB
+      (1 << 0);   // GLI_INV = ALL
+    memcpy(pBuffer, &acq, sizeof(acq));
+    ret = sizeof(acq);
+  }
+
+  return ret;
+}
+
+/*
+ * Builds the scratch packet: a register-write header starting at
+ * mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO followed by the low 32 and high 8 bits
+ * of the 48-bit pScratchBase. Copies it into pBuffer; returns its size.
+ */
+size_t CmdUtil::BuildScratch(
+  void  *pScratchBase,
+  void  *pBuffer) {
+  SetScratchTemplate regs = {0};
+  GenerateSetShRegHeader(&regs, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
+  regs.scratch_hi = Ptr48High8(pScratchBase);
+  regs.scratch_lo = Ptr48Low32(pScratchBase);
+  memcpy(pBuffer, &regs, sizeof(regs));
+  return sizeof(regs);
+}
+
+/**
+ * Builds the COMPUTE_PGM_RSRC3 register packet (gfx11 and above) with only
+ * bit 31 set, copies it into pBuffer and returns its size in bytes.
+ */
+size_t CmdUtil::BuildComputeShaderParams(void  *pBuffer) {
+  struct DispatchProgramResourceRegs compute_shader_params = {0};
+
+  GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
+  // IMAGE_OP (bit 31): indicates the compute program contains an image op
+  // instruction and should be stalled by its WAIT_SYNC fence. The literal is
+  // unsigned so the shift never moves a 1 into the sign bit of a signed int.
+  compute_shader_params.compute_pgm_rsrc3 = (1u << 31);
+  memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
+  return sizeof(compute_shader_params);
+}
+
+
+/*
+ * Builds the register writes and the DISPATCH_DIRECT packet for the kernel
+ * dispatch described by pInfo, and copies the result into pBuffer.
+ * Returns sizeof(DispatchTemplate).
+ *
+ * Side effect on pInfo: whenever a user-data slot receives scratch state
+ * (private segment buffer or flat scratch init), the byte offset of that
+ * slot inside DispatchTemplate is appended to pInfo->scratchBaseOffset
+ * and pInfo->offsetCnt is incremented.
+ */
+size_t CmdUtil::BuildDispatch(
+  struct DispatchInfo *pInfo,
+  void                *pBuffer) {
+  DispatchTemplate pkt = {0};
+  const auto aql = pInfo->pPacket;
+  const auto kernel = pInfo->pKernelObject;
+  const auto amd_queue = pInfo->pAmdQueue;
+  // Workgroup dimensions.
+  GenerateSetShRegHeader(&pkt.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
+  pkt.dimension_regs.compute_num_thread_x = aql->workgroup_size_x;
+  pkt.dimension_regs.compute_num_thread_y = aql->workgroup_size_y;
+  pkt.dimension_regs.compute_num_thread_z = aql->workgroup_size_z;
+
+  // Kernel entry point.
+  // TODO: Add AQL packet index for debugger
+  // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
+  GenerateSetShRegHeader(&pkt.program_regs, mmCOMPUTE_PGM_LO);
+  pkt.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
+  pkt.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
+
+  // Program resource registers; major == 11 additionally sets the PRIV field,
+  // and the LDS block count is OR-ed into RSRC2 starting at bit 15.
+  GenerateSetShRegHeader(&pkt.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
+  pkt.program_resource_regs.compute_pgm_rsrc1 = kernel->compute_pgm_rsrc1;
+  if (pInfo->major == 11) {
+    AMD_HSA_BITS_SET(pkt.program_resource_regs.compute_pgm_rsrc1,
+        AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1);
+  }
+  pkt.program_resource_regs.compute_pgm_rsrc2 =
+    (pInfo->ldsBlks << 15) | kernel->compute_pgm_rsrc2;
+
+  // Resource limits and static thread management masks (all bits set).
+  GenerateSetShRegHeader(&pkt.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
+  pkt.resource_regs.compute_resource_limits = 0x3ff;
+  pkt.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+  pkt.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+  pkt.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+  pkt.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+  pkt.resource_regs.compute_tmpring_size = amd_queue->compute_tmpring_size;
+
+  GenerateSetShRegHeader(&pkt.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
+  // User-data SGPRs are packed in the fixed order below; sgpr_no is the
+  // index of the next free slot.
+  uint32_t sgpr_no = 0;
+  const auto push_sgpr = [&](auto value) {
+    pkt.compute_user_data_regs.compute_user_data[sgpr_no++] = value;
+  };
+  // A pointer occupies two slots: low dword first, then high dword.
+  const auto push_ptr = [&](auto ptr) {
+    push_sgpr(PtrLow32(ptr));
+    push_sgpr(PtrHigh32(ptr));
+  };
+  // Records the byte offset of the next free slot in pInfo->scratchBaseOffset.
+  const auto mark_scratch_slot = [&]() {
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
+      sgpr_no * sizeof(uint32_t);
+  };
+  const auto props = kernel->kernel_code_properties;
+
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+    assert(pInfo->major < 11);
+    mark_scratch_slot();
+    push_sgpr(amd_queue->scratch_resource_descriptor[0]);
+    push_sgpr(amd_queue->scratch_resource_descriptor[1]);
+    push_sgpr(amd_queue->scratch_resource_descriptor[2]);
+    push_sgpr(pInfo->srd);
+  }
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR))
+    push_ptr(aql);
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR))
+    push_ptr(amd_queue);
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR))
+    push_ptr(aql->kernarg_address);
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
+    // This feature may be enabled as a side effect of indirect calls.
+    // However, the compiler team confirmed that the dispatch id itself is not
+    // used, so it is safe to send 0 for each dispatch.
+    push_sgpr(0);
+    push_sgpr(0);
+  }
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
+    assert(pInfo->major < 11);
+    mark_scratch_slot();
+    push_ptr(pInfo->pScratchBase);
+  }
+  if (AMD_HSA_BITS_GET(props, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE))
+    push_sgpr(pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64));
+
+  // DISPATCH_DIRECT with the grid size taken from the AQL packet.
+  GenerateCmdHeader(&pkt.dispatch_direct, IT_DISPATCH_DIRECT);
+  pkt.dispatch_direct.dispatch_initiator =
+    (1 << 0) | // COMPUTE_SHADER_EN
+    (1 << 2) | // FORCE_START_AT_000
+    (1 << 5) | // USE_THREAD_DIMENSIONS
+    (pInfo->wave32 ? (1 << 15) : 0); // CS_W32_EN
+  pkt.dispatch_direct.dim_x = aql->grid_size_x;
+  pkt.dispatch_direct.dim_y = aql->grid_size_y;
+  pkt.dispatch_direct.dim_z = aql->grid_size_z;
+  memcpy(pBuffer, &pkt, sizeof(pkt));
+  return sizeof(pkt);
+}
+
+/*
+ * Builds an ATOMIC_MEM packet that applies the operation "atomic" with
+ * operand srcData at address pAddr, and copies the packet into pBuffer.
+ * Returns the packet size in bytes.
+ */
+size_t CmdUtil::BuildAtomicMem(
+  uint64_t  *pAddr,
+  uint32_t  atomic,
+  void      *pBuffer,
+  uint32_t  cachePolicy,
+  uint64_t  srcData) {
+  AtomicTemplate packet = {0};
+  GenerateCmdHeader(&packet.atomic, IT_ATOMIC_MEM);
+
+  packet.atomic.bitfields2.atomic = atomic;
+  packet.atomic.bitfields2.cache_policy = cachePolicy;
+  packet.atomic.addr_lo = PtrLow32(pAddr);
+  packet.atomic.addr_hi = PtrHigh32(pAddr);
+  packet.atomic.src_data_lo = LowPart(srcData);
+  packet.atomic.src_data_hi = HighPart(srcData);
+
+  memcpy(pBuffer, &packet, sizeof(packet));
+  return sizeof(packet);
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
new file mode 100644
index 0000000000..f51af85404
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
@@ -0,0 +1,780 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cinttypes>
+#include <bitset>
+
+#include <sys/mman.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+
+namespace wsl {
+namespace thunk {
+
+const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000;  // frame count; multiplied by the per-frame size in InitCmdbufInfo()
+
+WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id)
+  : adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) {
+  memset(&device_info_, 0, sizeof(device_info_));
+  // Bring-up steps run in this fixed order; their bool results are not checked here.
+  ParseDeviceInfo();            // fills device_info_ from adapter_
+  CreateDevice();               // sets device_
+  SetPowerOptimization(false);  // driver escape with restore == false; uses device_
+  CreatePagingQueue();          // paging queue, sync object and fence address on device_
+  InitCmdbufInfo();             // command buffer sizes derived from device_info_.major
+  QuerySegmentInfo();           // caches per-segment info in segment_infos_
+}
+
+WDDMDevice::~WDDMDevice() {
+  DestroyPagingQueue();        // paging queue goes first, before device_ is destroyed
+  SetPowerOptimization(true);  // same escape as in the constructor, with restore == true
+  DestroyDevice();
+  // Finally release the adapter info buffer held in device_info_.
+  DestroyDeviceInfo();
+}
+
+// Thin wrapper over D3DKMTQueryAdapterInfo: queries "type" on "adapter" with
+// "data"/"size" as the private driver data buffer and returns the call status.
+static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type,
+				 void *data, int size)
+{
+  D3DKMT_QUERYADAPTERINFO query = {0};
+  query.pPrivateDriverData = data;
+  query.PrivateDriverDataSize = size;
+  query.Type = type;
+  query.hAdapter = adapter;
+  return DXCORE_CALL(D3DKMTQueryAdapterInfo(&query));
+}
+
+// Rebuilds segment_infos_ with one entry per memory segment of the adapter.
+// Returns false (after logging) as soon as a statistics query fails; entries
+// collected before the failure stay in segment_infos_.
+bool WDDMDevice::QuerySegmentInfo()
+{
+  segment_infos_.clear();
+
+  // First ask the adapter how many segments it has.
+  D3DKMT_QUERYSTATISTICS adapter_query = {};
+  adapter_query.Type = D3DKMT_QUERYSTATISTICS_ADAPTER;
+  adapter_query.AdapterLuid = adapter_luid_;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryStatistics(&adapter_query));
+  if (status != STATUS_SUCCESS) {
+    pr_err("Failed to query adapter info\n");
+    return false;
+  }
+  const uint32_t segment_count = adapter_query.QueryResult.AdapterInformation.NbSegments;
+  pr_debug("Total Segments: %u\n", segment_count);
+
+  // Then query every segment individually.
+  for (uint32_t id = 0; id < segment_count; ++id) {
+    D3DKMT_QUERYSTATISTICS segment_query = {};
+    segment_query.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    segment_query.AdapterLuid = adapter_luid_;
+    segment_query.QuerySegment.SegmentId = id;
+
+    status = DXCORE_CALL(D3DKMTQueryStatistics(&segment_query));
+    if (status != STATUS_SUCCESS) {
+      pr_err("Failed to query segment %u info\n", id);
+      return false;
+    }
+
+    const auto &stats = segment_query.QueryResult.SegmentInformation;
+    SegmentInfo entry;
+    entry.segment_id = id;
+    entry.segment_type = stats.SegmentProperties.SegmentType;
+    entry.system_memory = stats.SegmentProperties.SystemMemory;
+    entry.aperture = stats.Aperture;
+    entry.commit_limit = stats.CommitLimit;
+
+    segment_infos_.push_back(entry);
+  }
+
+  return true;
+}
+
+// Stores the id of the first cached segment of segment_type in segment_id; logs and returns false if none.
+bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type,
+                              uint32_t &segment_id)
+{
+  for (auto it = segment_infos_.begin(); it != segment_infos_.end(); ++it) {
+    if (it->segment_type != segment_type) continue;
+    segment_id = it->segment_id;
+    return true;
+  }
+  pr_err("Failed to get segment id for type %u\n", segment_type);
+  return false;
+}
+
+/* Returns the number of bytes still available in GPU memory.
+ *
+ * Local heap (dedicated GPU memory) consists of a visible and an invisible
+ * heap. Non-local heap refers to shared GPU memory, which is system memory.
+ *   dGPU: LocalHeapSize() minus the bytes resident in the local segments.
+ *   APU:  NonLocalHeapSize() minus the bytes resident in the sysmem segment.
+ *
+ * Returns 0 if the paging-fence wait or the segment lookup fails; earlier
+ * code returned HSA_STATUS_ERROR here, which callers would have read as a
+ * byte count. A failed statistics query counts as 0 bytes resident, and the
+ * result is clamped at 0 rather than wrapping around.
+ */
+uint64_t WDDMDevice::VramAvail(void) {
+  // Wait for the paging fence to reach the last recorded value.
+  uint64_t fence_value = page_fence_value_.load();
+  if (!CpuWait(&page_syncobj_, &fence_value, 1, false))
+    return 0;
+
+  // Bytes resident in one segment, or 0 when the query fails.
+  const auto resident_bytes = [this](uint32_t id) -> uint64_t {
+    D3DKMT_QUERYSTATISTICS stats;
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = id;
+    const NTSTATUS ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret != 0)
+      return 0;
+    return stats.QueryResult.SegmentInformation.BytesResident;
+  };
+
+  uint32_t segmentId = 0;
+  uint64_t total = 0;
+  uint64_t used = 0;
+
+  if (IsDgpu()) {
+    // local cpu-visible memory
+    if (!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId))
+      return 0;
+    used = resident_bytes(segmentId);
+
+    // local invisible memory: queried as the segment right after the visible
+    // one. (The id used to be incremented but then ignored in favour of a
+    // hard-coded 1; both agree whenever the visible segment has id 0.)
+    if (device_info_.local_invisible_heap_size)
+      used += resident_bytes(segmentId + 1);
+
+    total = LocalHeapSize();
+  } else {
+    // APU - NonLocal memory
+    if (!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId))
+      return 0;
+    used = resident_bytes(segmentId);
+
+    total = NonLocalHeapSize();
+  }
+
+  return total > used ? total - used : 0;
+}
+
+// Creates the kernel-mode device on adapter_ and stores its handle in device_.
+bool WDDMDevice::CreateDevice(void) {
+  D3DKMT_CREATEDEVICE create_args = {0};
+  create_args.hAdapter = adapter_;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTCreateDevice(&create_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+  device_ = create_args.hDevice;
+  return true;
+}
+
+// Destroys device_; logs the status and returns false if the call fails.
+bool WDDMDevice::DestroyDevice(void) {
+  D3DKMT_DESTROYDEVICE destroy_args = {0};
+  destroy_args.hDevice = device_;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyDevice(&destroy_args));
+  const bool ok = (status == STATUS_SUCCESS);
+  if (!ok)
+    pr_err("fail %x\n", status);
+  return ok;
+}
+
+// Creates a normal-priority paging queue on device_ and records its handle,
+// sync object and CPU fence address; the tracked fence value restarts at 0.
+bool WDDMDevice::CreatePagingQueue(void) {
+  D3DKMT_CREATEPAGINGQUEUE queue_args = {0};
+  queue_args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL;
+  queue_args.hDevice = device_;
+  const NTSTATUS status = DXCORE_CALL(D3DKMTCreatePagingQueue(&queue_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+  page_queue_ = queue_args.hPagingQueue;
+  page_syncobj_ = queue_args.hSyncObject;
+  page_fence_addr_ = (uint64_t *)queue_args.FenceValueCPUVirtualAddress;
+  page_fence_value_ = 0;
+  return true;
+}
+
+// Destroys page_queue_; logs the status and returns false if the call fails.
+bool WDDMDevice::DestroyPagingQueue(void) {
+  D3DDDI_DESTROYPAGINGQUEUE destroy_args = {0};
+  destroy_args.hPagingQueue = page_queue_;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyPagingQueue(&destroy_args));
+  const bool ok = (status == STATUS_SUCCESS);
+  if (!ok)
+    pr_err("fail %x\n", status);
+  return ok;
+}
+
+// Sends the power-optimization driver escape; "restore" is forwarded to
+// thunk_proxy::FillinPowerOptPrivData(). Logs and skips the escape if the
+// private data buffer cannot be allocated; the escape status is only logged.
+void WDDMDevice::SetPowerOptimization(bool restore) {
+  const int priv_size = thunk_proxy::GetPowerOptPrivDataSize();
+  void *priv_data = calloc(1, priv_size);  // zero-filled
+  if (!priv_data) {  // checked explicitly: assert() vanishes under NDEBUG
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return;
+  }
+  thunk_proxy::FillinPowerOptPrivData(priv_data, restore);
+  D3DKMT_ESCAPE d3dkmt_escape;
+  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
+  d3dkmt_escape.hAdapter              = adapter_;
+  d3dkmt_escape.hDevice               = device_;
+  d3dkmt_escape.hContext              = 0; // KMD only uses the device to identify the process
+  d3dkmt_escape.Type                  = D3DKMT_ESCAPE_DRIVERPRIVATE;
+  d3dkmt_escape.pPrivateDriverData    = priv_data;
+  d3dkmt_escape.PrivateDriverDataSize = priv_size;
+  d3dkmt_escape.Flags.HardwareAccess  = true;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape));
+  pr_debug("status %d, restore %d\n", status, restore);
+  free(priv_data);
+}
+
+// Raises page_fence_value_ to fence_value unless it is already at least that
+// large; a failed compare-exchange refreshes "seen" and the check is repeated.
+void WDDMDevice::UpdatePageFence(uint64_t fence_value) {
+  uint64_t seen = page_fence_value_.load();
+
+  while (seen < fence_value &&
+         !page_fence_value_.compare_exchange_weak(seen, fence_value)) {
+  }
+}
+
+// Creates a GpuMemory for this device: imported via ImportPhysicalHandle()
+// when create_info.dmabuf_fd > 0, otherwise set up with Init(). *gpu_mem gets
+// the object on success and stays nullptr (object deleted) on failure.
+ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info,
+                                        GpuMemory **gpu_mem, gpusize *gpu_va) {
+  *gpu_mem = nullptr;
+  GpuMemory *memory = new GpuMemory(this);
+  const ErrorCode code = (create_info.dmabuf_fd > 0)
+    ? memory->ImportPhysicalHandle(create_info, gpu_va)
+    : memory->Init(create_info);
+  if (code != ErrorCode::Success) {
+    delete memory;
+    return code;
+  }
+  *gpu_mem = memory;
+  return code;
+}
+
+// Locks allocation "handle" on device_; returns the pData pointer, or NULL (logged) on failure.
+void *WDDMDevice::Lock(D3DKMT_HANDLE handle) {
+  D3DKMT_LOCK2 lock_args = {0};
+  lock_args.hAllocation = handle;
+  lock_args.hDevice = device_;
+  const NTSTATUS status = DXCORE_CALL(D3DKMTLock2(&lock_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return NULL;
+  }
+  return lock_args.pData;
+}
+
+// Unlocks allocation "handle" on device_ via D3DKMTUnlock2; false (logged) on failure.
+bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
+  D3DKMT_UNLOCK2 unlock_args = {0};
+  unlock_args.hAllocation = handle;
+  unlock_args.hDevice = device_;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTUnlock2(&unlock_args));
+  const bool ok = (status == STATUS_SUCCESS);
+  if (!ok)
+    pr_err("fail %x\n", status);
+  return ok;
+}
+
+// Creates a virtual context for "engine" on device_ and returns it in *handle.
+// Returns false if EngineOrdinal() is negative, if the private driver data
+// cannot be allocated (logged), or if the create call fails (logged).
+bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
+  const int ordinal = EngineOrdinal(engine, &device_info_);
+  if (ordinal < 0)
+    return false;
+
+  const int priv_size = thunk_proxy::GetContextPrivDataSize();
+  void *priv_data = calloc(1, priv_size);  // zero-filled
+  if (!priv_data) {  // checked explicitly: assert() vanishes under NDEBUG
+    pr_err("failed to allocate %d bytes of private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw());
+
+  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
+  args.hDevice = device_;
+  args.EngineAffinity = 1 << 0;
+  args.NodeOrdinal = ordinal;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
+
+  if (IsHwsEnabled(engine))
+    args.Flags.HwQueueSupported = 1;
+  else
+    args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_);
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args));
+  free(priv_data);  // released on both outcomes, as before
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  *handle = args.hContext;
+  return true;
+}
+
+// Destroys the context "handle"; logs the status and returns false on failure.
+bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYCONTEXT destroy_args = {0};
+  destroy_args.hContext = handle;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroyContext(&destroy_args));
+  const bool ok = (status == STATUS_SUCCESS);
+  if (!ok)
+    pr_err("fail %x\n", status);
+  return ok;
+}
+
+// Submits a GPU-side wait on queue->context for "count" sync objects and their fence values.
+bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+			 uint64_t *values, int count) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU wait_args = {0};
+  wait_args.hContext = queue->context;
+  wait_args.ObjectHandleArray = syncobjs;
+  wait_args.MonitoredFenceValueArray = values;
+  wait_args.ObjectCount = count;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&wait_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+  return true;
+}
+
+// Submits a GPU-side signal on "context" for "count" sync objects and their fence values.
+bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+			   uint64_t *value, int count) {
+  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU signal_args = {0};
+  signal_args.hContext = context;
+  signal_args.ObjectHandleArray = syncobjs;
+  signal_args.MonitoredFenceValueArray = value;
+  signal_args.ObjectCount = count;
+  const NTSTATUS status = DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&signal_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+  return true;
+}
+
+// Waits from the CPU on "count" sync objects of device_; wait_any is passed through as Flags.WaitAny.
+bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+			 int count, bool wait_any) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU wait_args = {0};
+  wait_args.hDevice = device_;
+  wait_args.ObjectHandleArray = syncobjs;
+  wait_args.FenceValueArray = value;
+  wait_args.ObjectCount = count;
+  wait_args.Flags.WaitAny = wait_any;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&wait_args));
+  const bool ok = (status == STATUS_SUCCESS);
+  if (!ok)
+    pr_err("fail %x\n", status);
+  return ok;
+}
+
+// Waits from the CPU on the paging sync object for the fence value currently
+// recorded in page_fence_value_. Returns CpuWait()'s result.
+bool WDDMDevice::WaitOnPagingFenceFromCpu() {
+  // CpuWait takes a pointer to the value, so snapshot the atomic first.
+  uint64_t target = page_fence_value_.load();
+  const bool waited = CpuWait(&page_syncobj_, &target, 1, false);
+
+  return waited;
+}
+
+// Creates a monitored-fence sync object on device_. On success stores the
+// handle in *handle and the fence's CPU virtual address in *addr.
+bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
+  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 create_args = {0};
+  create_args.hDevice = device_;
+  create_args.Info.Type = D3DDDI_MONITORED_FENCE;
+  create_args.Info.MonitoredFence.EngineAffinity = 1 << 0;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&create_args));
+  if (status != STATUS_SUCCESS) {
+    pr_err("fail %x\n", status);
+    return false;
+  }
+  *handle = create_args.hSyncObject;
+  *addr = (uint64_t *)create_args.Info.MonitoredFence.FenceValueCPUVirtualAddress;
+  pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
+           create_args.Info.MonitoredFence.FenceValueCPUVirtualAddress,
+           create_args.Info.MonitoredFence.FenceValueGPUVirtualAddress);
+  return true;
+}
+
+// Destroys the sync object "handle"; a failure is only logged.
+void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT destroy_args = {0};
+  destroy_args.hSyncObject = handle;
+  const NTSTATUS status = DXCORE_CALL(D3DKMTDestroySynchronizationObject(&destroy_args));
+  if (status == STATUS_SUCCESS) return;
+  pr_err("fail %x\n", status);
+}
+
+// Computes cmdbuf_aql_frame_size_ from the packet templates one AQL frame may
+// contain (plus a 128-byte margin, passed through AlignUp with 0x10), then
+// cmdbuf_size_ for cmdbuf_aql_frame_num_ frames (AlignUp with 0x1000).
+// NOTE(review): for major < 9 no base size is assigned before the "+=" below.
+void WDDMDevice::InitCmdbufInfo(void) {
+  const auto major = device_info_.major;
+
+  // Two ACQUIRE_MEM templates, in the layout matching the GFX major version.
+  if (major >= 10)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
+  else if (major == 9)
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
+
+  // major >= 11 adds the scratch and BuildComputeShaderParams templates.
+  if (major >= 11)
+    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate) + sizeof(DispatchProgramResourceRegs);
+
+  cmdbuf_aql_frame_size_ += 2 * sizeof(PM4MEC_COPY_DATA) + 2 * sizeof(BarrierTemplate) +
+                            sizeof(DispatchTemplate) + 2 * sizeof(AtomicTemplate);
+  // Safety margin to account for alignment and future additions.
+  cmdbuf_aql_frame_size_ += 128;
+
+  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);
+  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
+}
+
+// Number of 512-byte blocks needed for the packet's group_segment_size, rounded up.
+uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
+  static const uint32_t kBlockBytes = 512;
+  const uint32_t bytes = pkt->group_segment_size;
+  return (bytes + kBlockBytes - 1) / kBlockBytes;
+}
+
+// Enumerates the WDDM adapters and appends a new WDDMDevice to "devices" for
+// every AMD adapter (vendor id 0x1002) accepted by
+// thunk_proxy::QueryAdapterSupported(). Node ids continue from
+// devices.size() + 1.
+//
+// Returns STATUS_SUCCESS (also when no adapter exists), or the NTSTATUS of the
+// call that failed. If a per-adapter query fails, every device in "devices"
+// is deleted and the vector is cleared, so the caller is never left holding
+// dangling pointers (previously the deleted pointers stayed in the vector).
+NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices)
+{
+  // First call with no buffer: only the adapter count is read back.
+  D3DKMT_ENUMADAPTERS2 args = {0};
+  NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  if (!args.NumAdapters)
+    return STATUS_SUCCESS;
+
+  // Second call fetches the adapter list. The vector releases the buffer on
+  // every return path; it reports allocation failure by throwing, exactly
+  // like the plain new[] it replaces (whose null check could never fire).
+  std::vector<D3DKMT_ADAPTERINFO> info(args.NumAdapters);
+  args.pAdapters = info.data();
+  ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  for (size_t i = 0; i < args.NumAdapters; i++) {
+    D3DKMT_QUERY_DEVICE_IDS query = {0};
+
+    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS,
+			   &query, sizeof(query));
+    if (ret != STATUS_SUCCESS) {
+      for (auto *device : devices)
+        delete device;
+      devices.clear();
+      return ret;
+    }
+
+    // Skip non-AMD adapters and AMD adapters the proxy does not support.
+    if (query.DeviceIds.VendorID != 0x1002)
+      continue;
+    if (!thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID))
+      continue;
+
+    devices.push_back(new WDDMDevice(
+      info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1));
+  }
+
+  return STATUS_SUCCESS;
+}
+
+// Zeroes device_info_ and asks thunk_proxy::ParseAdapterInfo() to fill it in
+// for this adapter. Returns true when parsing succeeded, false otherwise.
+bool WDDMDevice::ParseDeviceInfo() {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  const bool parsed = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_);
+  return parsed;
+}
+
+// Releases the adapter_info buffer held by device_info_. The pointer is
+// cleared afterwards so that calling this more than once is harmless.
+void WDDMDevice::DestroyDeviceInfo() {
+  free(device_info_.adapter_info);
+  device_info_.adapter_info = nullptr;
+}
+
+// Queries D3DKMTQueryClockCalibration for the compute engine's node and
+// stores the returned GPU and CPU clock counters through `gpu` / `cpu`.
+// Either pointer may be null, in which case that counter is not stored.
+// If the query fails the status is logged and neither output is written.
+void WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
+  const uint32_t compute_engine = GetComputeEngine();
+  const int node = EngineOrdinal(compute_engine, &device_info_);
+
+  /* LDA (Linked Display Adapter)
+   * In the LDA design multiple physical GPUs are linked together and are
+   * controlled as a single object from the point of view of the power manager,
+   * GPU scheduler and GPU memory manager. The physical GPUs are represented by
+   * a single logical adapter object: there is a single DXGADAPTER object and a
+   * single KMD adapter object.
+   *
+   * PhysicalAdapterIndex is set to 0 by default, i.e. non-LDA mode.
+   */
+  D3DKMT_QUERYCLOCKCALIBRATION query = {0};
+  query.hAdapter = adapter_;
+  query.NodeOrdinal = node;
+  query.PhysicalAdapterIndex = 0;
+
+  const NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&query));
+  if (status) {
+    pr_debug("status %d \n", status);
+    return;
+  }
+
+  if (gpu)
+    *gpu = query.ClockData.GpuClockCounter;
+
+  if (cpu)
+    *cpu = query.ClockData.CpuClockCounter;
+}
+
+// Creates the context for `queue`, allocates a system-domain command buffer
+// when the caller has not supplied one (queue->cmdbuf_addr == 0), and runs
+// queue->Init().
+//
+// Returns true on success. On failure the context is destroyed, a command
+// buffer allocated here is freed, and queue->cmdbuf / queue->cmdbuf_addr are
+// restored to the values the caller passed in, so the queue never keeps a
+// handle to memory that has already been deleted.
+bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
+  if (!CreateContext(queue->queue_engine, &queue->context))
+    return false;
+
+  // Remember the caller's command-buffer fields for the failure path.
+  const auto prev_cmdbuf = queue->cmdbuf;
+  const auto prev_cmdbuf_addr = queue->cmdbuf_addr;
+
+  GpuMemory *owned_cmdbuf = nullptr;
+  if (queue->cmdbuf_addr == 0) {
+    GpuMemoryCreateInfo create_info{};
+    create_info.size = queue->cmdbuf_size;
+    create_info.domain = thunk_proxy::kSystem;
+
+    if (CreateGpuMemory(create_info, &owned_cmdbuf) != ErrorCode::Success) {
+      DestroyContext(queue->context);
+      return false;
+    }
+
+    queue->cmdbuf = owned_cmdbuf->GetGpuMemoryHandle();
+    queue->cmdbuf_addr = owned_cmdbuf->GpuAddress();
+  }
+
+  // A non-zero result from Init() is treated as failure.
+  if (queue->Init()) {
+    if (owned_cmdbuf) {
+      delete owned_cmdbuf;
+      queue->cmdbuf = prev_cmdbuf;
+      queue->cmdbuf_addr = prev_cmdbuf_addr;
+    }
+    DestroyContext(queue->context);
+    return false;
+  }
+
+  return true;
+}
+
+// Tears a queue down in three steps: finalises the queue itself, deletes the
+// GpuMemory object behind its command-buffer handle, then destroys its
+// context.
+void WDDMDevice::DestroyQueue(WDDMQueue *queue) {
+  queue->Fini();
+
+  // The handle is converted back to the GpuMemory object before deletion.
+  delete GpuMemory::Convert(queue->cmdbuf);
+
+  DestroyContext(queue->context);
+}
+
+// Submits `command_size` bytes of commands at `command_addr` on the queue's
+// context with D3DKMTSubmitCommand, then signals the queue's sync object with
+// `fence_value` through GpuSignal().
+//
+// Returns true only when both the submission and the signal succeed; returns
+// false as well when the private-data buffer cannot be allocated.
+bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+
+  // calloc hands back a zero-filled buffer. Allocation failure is reported to
+  // the caller instead of being guarded by assert(), which is compiled out in
+  // NDEBUG builds.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of submit private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false);
+
+  D3DKMT_SUBMITCOMMAND args = {0};
+  args.Commands = command_addr;
+  args.CommandLength = command_size;
+  args.BroadcastContextCount = 1;
+  args.BroadcastContext[0] = queue->context;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  const NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args));
+  // The buffer is released on both outcomes, so free it once right here.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  if (!GpuSignal(queue->context, &queue->syncobj, &fence_value, 1))
+    return false;
+
+  return true;
+}
+
+// Creates a hardware queue on the queue's context with D3DKMTCreateHwQueue
+// and stores the resulting queue handle, progress-fence handle and the
+// progress fence's CPU virtual address in `queue`.
+//
+// Returns true on success; false when the private-data buffer cannot be
+// allocated or the kernel call fails (the status is logged).
+bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) {
+  const int priv_size = thunk_proxy::GetHwQueuePrivDataSize();
+
+  // calloc hands back a zero-filled buffer. Allocation failure is reported to
+  // the caller instead of being guarded by assert(), which is compiled out in
+  // NDEBUG builds.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of hw queue private data\n", priv_size);
+    return false;
+  }
+  const bool FwManagedGfxState = SupportStateShadowingByCpFw();
+  thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio);
+
+  D3DKMT_CREATEHWQUEUE createHwQueue = {0};
+  createHwQueue.hHwContext = queue->context;
+  createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_);
+  createHwQueue.pPrivateDriverData = priv_data;
+  createHwQueue.PrivateDriverDataSize = priv_size;
+
+  const NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue));
+  // The buffer is released on both outcomes, so free it once right here.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  queue->queue = createHwQueue.hHwQueue;
+  queue->syncobj = createHwQueue.hHwQueueProgressFence;
+  queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress;
+
+  return true;
+}
+
+// Destroys the hardware queue referenced by queue->queue with
+// D3DKMTDestroyHwQueue. Returns true on success; on failure the status is
+// logged and false is returned.
+bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) {
+  // Zero-initialise the argument block, then set the one field in use.
+  D3DKMT_DESTROYHWQUEUE args = {};
+  args.hHwQueue = queue->queue;
+
+  const NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyHwQueue(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Submits `command_size` bytes of commands at `command_addr` to the queue's
+// hardware queue with D3DKMTSubmitCommandToHwQueue, passing `fence_value` as
+// the progress fence id of the submission.
+//
+// Returns true on success; false when the private-data buffer cannot be
+// allocated or the kernel call fails (the status is logged).
+bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                uint64_t command_size, uint64_t fence_value) {
+  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();
+
+  // calloc hands back a zero-filled buffer. Allocation failure is reported to
+  // the caller instead of being guarded by assert(), which is compiled out in
+  // NDEBUG builds.
+  void *priv_data = calloc(1, priv_size);
+  if (!priv_data) {
+    pr_err("failed to allocate %d bytes of submit private data\n", priv_size);
+    return false;
+  }
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true);
+
+  D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+  args.HwQueueProgressFenceId = fence_value;
+  args.CommandBuffer = command_addr;
+  args.CommandLength = command_size;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  const NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args));
+  // The buffer is released on both outcomes, so free it once right here.
+  free(priv_data);
+
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
new file mode 100644
index 0000000000..e374be8867
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
@@ -0,0 +1,594 @@
+#include <sys/stat.h>
+#include <cinttypes>
+#include <cassert>
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Returns how many chunks of WDDMDevice::GpuMemoryChunkSize bytes are needed
+// to cover `size` bytes; a partially filled chunk counts as a whole one.
+size_t GpuMemory::CalcChunkNumbers(gpusize size) {
+  const auto chunk_bytes = WDDMDevice::GpuMemoryChunkSize;
+  const auto rounded_up = size + chunk_bytes - 1;
+  return rounded_up / chunk_bytes;
+}
+
+// Rounds a requested size up to the alignment this allocation must satisfy.
+//
+// Outside the big-page case (big-page alignment disabled on the device, or
+// the allocation is not in the local domain) the size is aligned to 4096
+// bytes. Otherwise the device's big-page alignment values are used as
+// described in the comments below; an alignment of 0 leaves the size as is.
+gpusize GpuMemory::AdjustSize(gpusize size) const {
+  const auto &info = device_->DeviceInfo();
+
+  const bool use_big_page =
+      info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal;
+  if (!use_big_page) {
+    const size_t page_size = 4096;
+    size = AlignUp(size, page_size);
+    return size;
+  }
+
+  uint32_t alignment = info.big_page_alignment_size;
+
+  // BigPage is only supported for allocations > bigPageMinAlignment.
+  // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD.
+  // We do either LargePage or BigPage alignment, whichever has a higher value.
+  const auto min_big_page = info.hw_big_page_min_alignment_size;
+  if ((min_big_page > 0) && (size > min_big_page)) {
+    alignment = std::max(alignment, min_big_page);
+    if (size > info.hw_big_page_alignment_size)
+      alignment = std::max(alignment, info.hw_big_page_alignment_size);
+  }
+
+  if (alignment > 0)
+    size = AlignUp(size, alignment);
+
+  return size;
+}
+
+// Builds an empty GpuMemory bound to `device`: no allocation handles, no
+// resource and no memory file descriptor yet.
+GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
+  // Allocation bookkeeping starts out empty.
+  alloc_handle_ = 0;
+  alloc_handles_ptr_ = nullptr;
+  num_allocations_ = 0;
+
+  resource_ = 0;
+
+  // -1 means that no memory file descriptor is attached.
+  mem_fd_ = -1;
+}
+
+// Releases, in this order, the GPU virtual address range, the physical
+// allocations and, when one was assigned, the handle-aperture address.
+GpuMemory::~GpuMemory() {
+  FreeGpuVirtualAddress(GpuAddress(), Size());
+  FreePhysicalMemory();
+
+  const bool has_aperture_addr = desc_.handle_ape_addr > 0;
+  if (has_aperture_addr) {
+    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
+  }
+}
+
+// Initialises this object from `create_info`.
+//
+// The creation parameters are copied into desc_, the size is adjusted with
+// AdjustSize(), and the allocation-handle table is prepared. After that:
+//  - physical-only memory: the physical allocation is created and a handle
+//    aperture address is allocated; no GPU virtual address is reserved;
+//  - otherwise a GPU virtual address range is reserved; for virtual
+//    allocations the function stops there;
+//  - for everything else physical memory is created, mapped at the reserved
+//    address, made resident, and the paging fence is waited on from the CPU.
+//
+// Returns ErrorCode::Success or the error of the first step that failed.
+ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
+  // Copy the creation parameters into the descriptor.
+  desc_.domain = create_info.domain;
+  desc_.adapter_luid = device_->GetLuid();
+  desc_.client_size = create_info.size;
+  desc_.alignment = create_info.alignment;
+  desc_.mem_flags = create_info.mem_flags;
+  desc_.engine_flag = create_info.engine_flag;
+  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+  desc_.flags.is_physical_only = create_info.flags.physical_only;
+  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
+  desc_.flags.is_va_required = create_info.flags.alloc_va;
+  desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object;
+
+  /* we can't tell the allocation is regular vmm or ipc mem at creation stage,
+     they share same creation parameters, so forcing all vram allocations to
+     sharable to support IPC mem */
+  if (create_info.flags.interprocess ||
+      desc_.domain == thunk_proxy::AllocDomain::kLocal)
+    desc_.flags.is_shared = true;
+
+  desc_.flags.is_locked = create_info.flags.locked;
+  // desc_.size is the client size rounded up by AdjustSize().
+  desc_.size = AdjustSize(desc_.client_size);
+
+  // User and system memory are backed by the pointer supplied by the caller.
+  if (IsUserMemory() || IsSystem())
+    desc_.cpu_addr = create_info.user_ptr;
+
+  // A single chunk uses the inline handle slot; several chunks get a heap
+  // array with one handle per chunk.
+  num_allocations_ = CalcChunkNumbers(Size());
+  if (num_allocations_ == 1)
+    alloc_handles_ptr_ = &alloc_handle_;
+  else
+    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+  auto code = ErrorCode::Success;
+
+  // Physical-only memory gets a handle aperture address instead of a GPU
+  // virtual address.
+  if (IsPhysicalOnly()) {
+    code = CreatePhysicalMemory();
+    if (code == ErrorCode::Success)
+      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    return code;
+  }
+
+  // Virtual allocations only need the address range reserved.
+  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+  if (IsVirtual() || (code != ErrorCode::Success))
+      return code;
+
+  bool physical_created = false;
+
+  // Runs when the function returns: if `code` holds an error at that point,
+  // the physical memory (when already created) and the reserved address range
+  // are released again.
+  // NOTE(review): the rollback does not appear to clear the stored GPU
+  // address, so the destructor presumably calls FreeGpuVirtualAddress() for
+  // the same range a second time - confirm against GpuAddress() and the
+  // caller's handling of a failed Init().
+  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+    if (code != ErrorCode::Success) {
+
+      if (physical_created) {
+        FreePhysicalMemory();
+      }
+      FreeGpuVirtualAddress(GpuAddress(), Size());
+    }
+  });
+  (void)guard;
+
+  code = CreatePhysicalMemory();
+  if (code != ErrorCode::Success)
+    return code;
+
+  physical_created = true;
+
+  code = MapGpuVirtualAddress(GpuAddress(), Size());
+  if (code != ErrorCode::Success)
+    return code;
+
+  code = MakeResident();
+  if (code != ErrorCode::Success)
+    return code;
+
+  // A failed wait on the paging fence is reported as ErrorCode::Unknown.
+  if (!GetDevice()->WaitOnPagingFenceFromCpu())
+    code = ErrorCode::Unknown;
+
+  return code;
+}
+
+// Marks the GPU virtual address range [addr, addr + size) as no-access, one
+// chunk-sized block at a time.
+//
+// `offset` is the byte offset into this memory object at which the range
+// starts; it selects the first allocation chunk and limits the first block to
+// the remainder of that chunk, so each block is paired with the handle of the
+// chunk it belongs to.
+//
+// A block that reports ErrorCode::NotReady has been queued: its paging fence
+// value is recorded with UpdatePageFence() and the block counts as success,
+// matching MapGpuVirtualAddress() and MakeResident(). Any other error stops
+// the loop and is returned.
+ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+  const auto chunk_size = WDDMDevice::GpuMemoryChunkSize;
+
+  auto code = ErrorCode::Success;
+  auto map_addr = addr;
+  auto map_size = size;
+
+  // Split the offset into a chunk index and an offset inside that chunk.
+  size_t i = offset / chunk_size;
+  offset %= chunk_size;
+
+  while (map_size > 0) {
+    // The first block must not run past the end of its chunk.
+    const auto block_size = std::min(map_size, chunk_size - offset);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.SizeInPages = block_size / 0x1000;
+    args.Protection.NoAccess = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code == ErrorCode::NotReady) {
+      // Queued operation: remember the fence and treat the block as done.
+      device_->UpdatePageFence(args.PagingFenceValue);
+      code = ErrorCode::Success;
+    } else if (code != ErrorCode::Success) {
+      break;
+    }
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0;   // every following block starts at the beginning of its chunk
+    i += 1;
+  }
+
+  return code;
+}
+
+// Maps this memory object, starting `offset` bytes into it, to the GPU
+// virtual address range [addr, addr + size) with write access, one
+// chunk-sized block at a time.
+//
+// `offset` selects the first allocation chunk and the page offset inside it.
+// The first block is limited to the remainder of that chunk so that a mapping
+// never extends past the end of the allocation it refers to; every following
+// block starts at offset 0 of the next chunk.
+//
+// A block that reports ErrorCode::NotReady has been queued: its paging fence
+// value is recorded with UpdatePageFence() and the block counts as success.
+// On any other error the blocks mapped so far are set back to no-access and
+// the error is returned.
+ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+  const size_t _4K = 0x1000;
+  const auto chunk_size = WDDMDevice::GpuMemoryChunkSize;
+
+  // Split the offset into a chunk index and an offset inside that chunk.
+  const size_t first_chunk = offset / chunk_size;
+  const gpusize first_chunk_offset = offset % chunk_size;
+
+  auto code = ErrorCode::Success;
+  size_t i = first_chunk;
+  auto map_addr = addr;
+  auto map_size = size;
+  offset = first_chunk_offset;
+
+  /* Found two limitation for local vram:
+   * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail
+   * 2. visible vram can not be cpu mapped when command submission or after gpu mapped
+   */
+  while (map_size > 0) {
+    // The first block must not run past the end of its chunk.
+    const auto block_size = std::min(map_size, chunk_size - offset);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.OffsetInPages = offset / _4K;
+    args.SizeInPages = block_size / _4K;
+    args.Protection.Write = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code == ErrorCode::NotReady) {
+      // Queued operation: remember the fence and treat the block as mapped.
+      device_->UpdatePageFence(args.PagingFenceValue);
+      code = ErrorCode::Success;
+    } else if (code != ErrorCode::Success) {
+      break;
+    }
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0;  // every following block starts at the beginning of its chunk
+    i++;
+  }
+
+  if (code != ErrorCode::Success) {
+    // Map failed: walk the blocks [first_chunk, i) again, using the same
+    // partitioning as above, and set each one back to no-access.
+    offset = first_chunk_offset;
+    map_addr = addr;
+    map_size = size;
+    for (size_t j = first_chunk; j < i; j++) {
+      const auto block_size = std::min(map_size, chunk_size - offset);
+
+      D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+      args.hPagingQueue = device_->PagingQueue();
+      args.BaseAddress = map_addr;
+      args.hAllocation = 0;
+      args.OffsetInPages = offset / _4K;
+      args.SizeInPages = block_size / _4K;
+      args.Protection.NoAccess = 1;
+
+      const auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
+      if (unmap_code == ErrorCode::NotReady)
+        device_->UpdatePageFence(args.PagingFenceValue);
+
+      map_addr += block_size;
+      map_size -= block_size;
+      offset = 0;
+    }
+  }
+
+  return code;
+}
+
+// Reserves a GPU virtual address range for this memory object and records it
+// in desc_.gpu_addr.
+//
+// System-domain memory flagged as sysmem exporter or as imported sysmem memfd
+// is reserved through dxg_runtime->ReserveIPCSysMem(), using Size() and
+// desc_.alignment; the descriptor passed to that call is stored in mem_fd_ on
+// success (presumably it may be updated by the call - confirm against the
+// runtime). Everything else goes through
+// dxg_runtime->ReserveGpuVirtualAddress() with the `base_virt_addr`, `size`
+// and `alignment` arguments.
+//
+// For system memory the reserved address is also stored as the CPU address.
+// Returns the status of the reservation call.
+ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
+  gpusize reserved_addr = 0;
+  ErrorCode status;
+
+  const bool ipc_sysmem =
+      (desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd) &&
+      desc_.domain == thunk_proxy::AllocDomain::kSystem;
+
+  if (!ipc_sysmem) {
+    status = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &reserved_addr, alignment,
+        desc_.flags.is_locked);
+  } else {
+    int fd = (mem_fd_ > -1) ? mem_fd_ : -1;
+    status = dxg_runtime->ReserveIPCSysMem(Size(), &reserved_addr, desc_.alignment, fd, desc_.flags.is_locked);
+    if (status == ErrorCode::Success)
+      mem_fd_ = fd;
+  }
+
+  if (status != ErrorCode::Success)
+    return status;
+
+  desc_.gpu_addr = reserved_addr;
+  if (IsSystem())
+    desc_.cpu_addr = reinterpret_cast<void *>(desc_.gpu_addr);
+
+  return status;
+}
+
+// Releases a GPU virtual address range.
+//
+// When a memory file descriptor is attached (mem_fd_ > -1) the object's own
+// GpuAddress()/Size() range is released through FreeIPCSysMem() and the
+// arguments are not used. Otherwise [base_addr, base_addr + size) is handed
+// to dxg_runtime->FreeGpuVirtualAddress(); a zero `base_addr` is a no-op that
+// reports success.
+ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
+  const bool has_mem_fd = mem_fd_ > -1;
+  if (has_mem_fd)
+    return dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
+
+  if (base_addr == 0)
+    return ErrorCode::Success;
+
+  return dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size);
+}
+
+// Creates the physical allocations backing this memory object, one per chunk,
+// with a single d3dthunk::CreateAllocation() call.
+//
+// One zero-filled scratch buffer holds, back to back: the driver private
+// data, the per-allocation private data of every chunk, and the
+// D3DDDI_ALLOCATIONINFO2 array. For shared memory a SharedHandleInfo record
+// describing this object is attached as private runtime data and the
+// allocation is created as a shared resource.
+//
+// On success the allocation handles and the resource handle are stored in
+// this object. Returns ErrorCode::OutOfMemory when the scratch buffer cannot
+// be allocated, otherwise the status of CreateAllocation().
+ErrorCode GpuMemory::CreatePhysicalMemory() {
+
+  assert(!IsVirtual() && NumChunks() > 0);
+
+  const auto num_allocations = NumChunks();
+  int priv_drv_data_size = 0;
+  int priv_alloc_data_size = 0;
+
+  thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size);
+
+  // Sizes are computed in size_t so that a large chunk count cannot overflow
+  // or be truncated to int.
+  const size_t alloc_data_bytes =
+    static_cast<size_t>(priv_alloc_data_size) * num_allocations;
+  const size_t total_size = static_cast<size_t>(priv_drv_data_size) +
+    alloc_data_bytes +
+    num_allocations * sizeof(D3DDDI_ALLOCATIONINFO2);
+
+  void *priv_drv_data = calloc(1, total_size);
+  if (!priv_drv_data)
+    return ErrorCode::OutOfMemory;
+
+  thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size);
+
+  // Carve the scratch buffer into its three consecutive regions.
+  unsigned char *priv_alloc_data =
+    static_cast<unsigned char*>(priv_drv_data) + priv_drv_data_size;
+  auto alloc_info = reinterpret_cast<D3DDDI_ALLOCATIONINFO2*>(
+       priv_alloc_data + alloc_data_bytes);
+
+  size_t size = desc_.size;
+  uint64_t addr = desc_.gpu_addr;
+  char *cpu_addr = static_cast<char *>(desc_.cpu_addr);
+  const auto &device_info = GetDevice()->DeviceInfo();
+
+  for (size_t i = 0; i < num_allocations; i++) {
+
+    void* priv_data = static_cast<void*>(
+      priv_alloc_data + static_cast<size_t>(priv_alloc_data_size) * i);
+    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);
+
+    if (IsUserMemory() || IsSystem()) {
+      // User/system memory is described by its CPU pointer; the address
+      // argument of SetAllocationInfo is passed as 0.
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
+      alloc_info[i].pSystemMem = static_cast<void *>(cpu_addr);
+      cpu_addr += block_size;
+    } else {
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
+    }
+
+    size -= block_size;
+    addr += block_size;
+
+    alloc_info[i].pPrivateDriverData = priv_data;
+    alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size;
+    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
+  }
+
+  D3DKMT_CREATEALLOCATION args = {};
+  args.hDevice = device_->DeviceHandle();
+  args.pPrivateDriverData = priv_drv_data;
+  args.PrivateDriverDataSize = priv_drv_data_size;
+  args.NumAllocations = num_allocations;
+  args.pAllocationInfo2 = alloc_info;
+
+  /* The PhysicallyContiguous flag causes allocation failure
+   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
+   */
+
+  // Value-initialised so that no indeterminate bytes are handed over as
+  // private runtime data.
+  SharedHandleInfo shared_info{};
+  if (IsShared()) {
+    shared_info.size = desc_.size;
+    shared_info.client_size = desc_.client_size;
+    shared_info.domain = desc_.domain;
+    shared_info.adapter_luid = desc_.adapter_luid;
+    shared_info.flags = static_cast<uint32_t>(desc_.flags.reserved);
+    shared_info.mem_flags = desc_.mem_flags;
+    shared_info.pid = dxg_runtime->parent_pid;
+    shared_info.gpu_addr = desc_.gpu_addr;
+    args.pPrivateRuntimeData = &shared_info;
+    args.PrivateRuntimeDataSize = sizeof(shared_info);
+    args.Flags.NtSecuritySharing = 1;
+    args.Flags.CreateShared = 1;
+    args.Flags.CreateResource = 1;
+  }
+
+  auto status = d3dthunk::CreateAllocation(&args);
+  if (status == ErrorCode::Success) {
+    for (size_t i = 0; i < num_allocations; i++)
+      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;
+
+    resource_ = args.hResource;
+  }
+  free(priv_drv_data);
+  return status;
+}
+
+// Destroys the physical allocations of this memory object and releases the
+// allocation-handle table.
+//
+// The table is either the inline alloc_handle_ slot or a heap array. Whether
+// to delete[] it is decided by where the pointer points rather than by the
+// chunk count, because the import path allocates a heap array even for a
+// single allocation.
+// NOTE(review): this assumes alloc_handles_ptr_ is only ever set to
+// &alloc_handle_ or to a new[] array - confirm for code outside this file.
+//
+// Returns ErrorCode::Success when there is nothing to destroy, otherwise the
+// status of d3dthunk::DestroyAllocation().
+ErrorCode GpuMemory::FreePhysicalMemory() {
+  auto code = ErrorCode::Success;
+
+  if (alloc_handles_ptr_ == nullptr)
+    return code;
+
+  const bool inline_table = (alloc_handles_ptr_ == &alloc_handle_);
+  // A single, still-zero handle means no allocation was ever created.
+  const bool never_created = (NumChunks() == 1 && *alloc_handles_ptr_ == 0);
+
+  // Nothing created and nothing on the heap: leave the object untouched.
+  if (never_created && inline_table)
+    return code;
+
+  if (!never_created) {
+    code = d3dthunk::DestroyAllocation(device_->DeviceHandle(),
+                                    resource_,
+                                    NumChunks(),
+                                    alloc_handles_ptr_);
+  }
+
+  if (!inline_table)
+    delete[] alloc_handles_ptr_;
+
+  alloc_handles_ptr_ = nullptr;
+  return code;
+}
+
+// Asks the device's paging queue to make every allocation of this memory
+// object resident.
+//
+// ErrorCode::NotReady is reported as success after the returned paging fence
+// value has been recorded with UpdatePageFence(). Any other status is
+// returned unchanged.
+ErrorCode GpuMemory::MakeResident() {
+  D3DDDI_MAKERESIDENT request = {};
+  request.hPagingQueue = device_->PagingQueue();
+  request.NumAllocations = NumChunks();
+  request.AllocationList = alloc_handles_ptr_;
+  request.Flags.CantTrimFurther = 1;
+
+  const auto status = d3dthunk::MakeResident(&request);
+  if (status != ErrorCode::NotReady)
+    return status;
+
+  device_->UpdatePageFence(request.PagingFenceValue);
+  return ErrorCode::Success;
+}
+
+// Evicts every allocation of this memory object via d3dthunk::Evict() and
+// returns that call's status.
+ErrorCode GpuMemory::Evict() {
+  D3DKMT_EVICT request = {};
+  request.hDevice = device_->DeviceHandle();
+  request.AllocationList = alloc_handles_ptr_;
+  request.NumAllocations = NumChunks();
+
+  const auto status = d3dthunk::Evict(&request);
+  return status;
+}
+
+// Produces a file descriptor through which this memory can be shared.
+//
+// When a memory file descriptor is attached (mem_fd_ > -1) that descriptor
+// itself is written to `dmabuf_fd`. Otherwise, shared memory is exported with
+// d3dthunk::ShareObjects() using `flags`; memory that is not shared yields
+// ErrorCode::UnSupported.
+ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
+  const bool has_mem_fd = mem_fd_ > -1;
+  if (has_mem_fd) {
+    *dmabuf_fd = mem_fd_;
+    return ErrorCode::Success;
+  }
+
+  if (!IsShared())
+    return ErrorCode::UnSupported;
+
+  return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd);
+}
+
+
+// Initialises this object from a file descriptor exported elsewhere.
+//
+// Two kinds of descriptor are handled:
+//  - create_info.flags.sysmem_ipc_sig_importer set: the descriptor is a
+//    system-memory fd. It is duplicated into mem_fd_, an address range is
+//    reserved, and physical memory is created, mapped and made resident.
+//  - otherwise the descriptor is treated as an NT handle of a shared
+//    resource. The resource is queried and opened and its SharedHandleInfo
+//    is copied into desc_. Depending on create_info.flags.alloc_va either a
+//    GPU virtual address range is reserved or a handle aperture address is
+//    allocated.
+//
+// If the resource was exported by this same process on the same adapter, a
+// virtual address is requested and the exporter recorded one, nothing is
+// imported: the exporter's address is written to `gpu_addr` (when non-null)
+// and ErrorCode::SameProcessSameDevice is returned.
+//
+// Returns ErrorCode::Success or the error describing the step that failed.
+ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) {
+  int dmabuf_fd = create_info.dmabuf_fd;
+
+  if (dmabuf_fd <= 0)
+    return ErrorCode::InvalidateParams;
+
+  if(create_info.flags.sysmem_ipc_sig_importer) {
+    // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
+    mem_fd_ = dup(dmabuf_fd);
+    if (mem_fd_ < 0) {
+      // Without a descriptor of our own there is nothing to import; do not
+      // carry on with mem_fd_ == -1.
+      pr_err("dup of sys mem fd %d failed\n", dmabuf_fd);
+      return ErrorCode::Unknown;
+    }
+
+    desc_.client_size = create_info.size;
+    desc_.size = AdjustSize(desc_.client_size);
+    desc_.domain = thunk_proxy::AllocDomain::kSystem;
+    desc_.adapter_luid = device_->GetLuid();
+    desc_.alignment = 0x1000;
+    desc_.mem_flags = create_info.mem_flags;
+    desc_.engine_flag = create_info.engine_flag;
+    desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+    desc_.flags.is_physical_only = create_info.flags.physical_only;
+    desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+    desc_.flags.is_locked = create_info.flags.locked;
+
+    auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+    if (code != ErrorCode::Success)
+      return code;
+
+    // Runs when this branch returns: on error the physical memory (when
+    // already created) and the reserved address range are released again.
+    bool physical_created = false;
+    auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+          if (code != ErrorCode::Success) {
+            if (physical_created)
+              FreePhysicalMemory();
+            FreeGpuVirtualAddress(GpuAddress(), Size());
+          }
+        });
+    (void)guard;
+
+    // A single chunk uses the inline handle slot, several use a heap array.
+    num_allocations_ = CalcChunkNumbers(Size());
+    if (num_allocations_ == 1)
+      alloc_handles_ptr_ = &alloc_handle_;
+    else
+      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+    memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+    code = CreatePhysicalMemory();
+    if (code != ErrorCode::Success)
+      return code;
+
+    physical_created = true;
+
+    code = MapGpuVirtualAddress(GpuAddress(), Size());
+    if (code != ErrorCode::Success)
+      return code;
+
+    code = MakeResident();
+    if (code != ErrorCode::Success)
+      return code;
+
+    if (!GetDevice()->WaitOnPagingFenceFromCpu())
+      code = ErrorCode::Unknown;
+
+    return code;
+  } else {
+    // vmem importer / ipc vram importer
+    D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args;
+    memset(&query_args, 0, sizeof(query_args));
+    query_args.hDevice = device_->DeviceHandle();
+    query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
+    auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args);
+    if (ret != ErrorCode::Success) {
+      pr_err("query resource info from nt handle failed %d\n", static_cast<int>(ret));
+      return ErrorCode::InvalidateParams;
+    }
+    pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d,"
+             " runtime data size %#x total driver data size %#x resource data size=%#x\n",
+             dmabuf_fd,
+             query_args.NumAllocations,
+             query_args.PrivateRuntimeDataSize,
+             query_args.TotalPrivateDriverDataSize,
+             query_args.ResourcePrivateDriverDataSize);
+
+    // The exporter's runtime data must have exactly our SharedHandleInfo
+    // layout, otherwise it cannot be interpreted.
+    SharedHandleInfo shared_info;
+    if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) {
+      pr_err("shared hanle info size mismatch:%u vs %zu\n",
+             query_args.PrivateRuntimeDataSize, sizeof(shared_info));
+      return ErrorCode::UnSupported;
+    }
+
+    // One zero-filled buffer holds the open-allocation array followed by the
+    // total private driver data and the resource private driver data. The
+    // size is kept in size_t so that it is not truncated.
+    const size_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) +
+      query_args.TotalPrivateDriverDataSize +
+      query_args.ResourcePrivateDriverDataSize;
+    D3DDDI_OPENALLOCATIONINFO2 *open_info =
+      reinterpret_cast<D3DDDI_OPENALLOCATIONINFO2*> (calloc(1, total_size));
+    if (!open_info) {
+      pr_err("alloc open_info failed, NumAllocations:%d\n",
+             query_args.NumAllocations);
+      return ErrorCode::OutOfMemory;
+    }
+
+    auto guard = MakeScopeGuard([&open_info]() { free(open_info); });
+
+    // The import path always uses a heap array for the handle table, even
+    // for a single allocation.
+    alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations];
+
+    D3DKMT_OPENRESOURCEFROMNTHANDLE open_args;
+    memset(&open_args, 0, sizeof(open_args));
+    open_args.hDevice = query_args.hDevice;
+    open_args.hNtHandle = query_args.hNtHandle;
+    open_args.NumAllocations = query_args.NumAllocations;
+    open_args.pOpenAllocationInfo2 = open_info;
+    open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize;
+    open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast<void*>
+      (open_args.pOpenAllocationInfo2 + open_args.NumAllocations);
+    open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize;
+    open_args.pResourcePrivateDriverData = reinterpret_cast<void*>
+      (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) +
+       open_args.TotalPrivateDriverDataBufferSize);
+    open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize;
+    open_args.pPrivateRuntimeData = reinterpret_cast<void*> (&shared_info);
+
+    ret = d3dthunk::OpenResourceFromNtHandle(&open_args);
+    if (ret != ErrorCode::Success) {
+      // Log the real error before it is mapped to InvalidateParams.
+      pr_err("open resource failed %d\n", static_cast<int>(ret));
+      return ErrorCode::InvalidateParams;
+    }
+    if (shared_info.pid == dxg_runtime->parent_pid &&
+      create_info.flags.alloc_va &&
+      IsSameAdapter(shared_info.adapter_luid) &&
+      shared_info.gpu_addr) {
+      pr_info("import from same device and samve process, va is required. "
+               "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n");
+      // NOTE(review): resource_ and num_allocations_ are not set on this
+      // path, so the resource opened above does not look like it is released
+      // when this object is destroyed - confirm.
+      if (gpu_addr)
+        *gpu_addr = shared_info.gpu_addr;
+      return ErrorCode::SameProcessSameDevice;
+    }
+
+    // Adopt the exporter's description of the buffer.
+    desc_.size = shared_info.size;
+    desc_.client_size = shared_info.client_size;
+    desc_.domain = shared_info.domain;
+    desc_.flags.reserved = shared_info.flags;
+    desc_.mem_flags = shared_info.mem_flags;
+    desc_.adapter_luid = shared_info.adapter_luid;
+    resource_ = open_args.hResource;
+    num_allocations_ = open_args.NumAllocations;
+    for (size_t i = 0; i < num_allocations_; i++)
+      alloc_handles_ptr_[i] = open_info[i].hAllocation;
+
+    desc_.flags.is_va_required = create_info.flags.alloc_va;
+    if (desc_.flags.is_va_required) {
+      desc_.flags.is_imported_vram_ipc = 1;
+      ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment);
+      if (ret != ErrorCode::Success)
+        pr_err("failed to allocate svm range, error:%d\n", static_cast<int>(ret));
+
+      return ret;
+    } else {
+      desc_.flags.is_imported_vram_vmem = 1;
+      return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    }
+  }
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp
new file mode 100644
index 0000000000..44658819cb
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp
@@ -0,0 +1,1210 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cstring>
+#include <cinttypes>
+#include <cstddef>
+
+#include "impl/wddm/queue.h"
+#include "impl/registers.h"
+
+#include "impl/hsa/hsa.h"
+#include "impl/hsa/hsa_ven_amd_loader.h"
+// Forward declarations of the hsakmt_-prefixed HSA entry points called from
+// this file: relaxed signal load/wait, release signal store, and the loader
+// query that maps a device address to a host address.
+extern hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal);
+extern hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
+    hsa_signal_t signal, hsa_signal_condition_t condition,
+    hsa_signal_value_t compare_value, uint64_t timeout_hint,
+    hsa_wait_state_t wait_state_hint);
+extern void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
+                                      hsa_signal_value_t value);
+extern hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
+    const void *device_address, const void **host_address);
+
+namespace wsl {
+namespace thunk {
+
+// Sets up the software-scheduled path: creates the sync object and, when the
+// device reports that user queue memory is allocated from the UMD, allocates
+// that memory and records its handles in queue_mem / queue.
+// Returns HSA_STATUS_ERROR if either step fails (the sync object is destroyed
+// again when the memory allocation fails).
+hsa_status_t WDDMQueue::SwsInit(void) {
+  if (!device->CreateSyncobj(&syncobj, &sync_addr))
+    return HSA_STATUS_ERROR;
+
+  // Nothing more to do when the UMD does not own the user queue memory.
+  if (!device->AllocUserQueueMemFromUMD())
+    return HSA_STATUS_SUCCESS;
+
+  GpuMemoryCreateInfo queue_mem_info{};
+  queue_mem_info.domain = thunk_proxy::kUserQueue;
+  queue_mem_info.size = device->GetSwsQueueSize();
+  queue_mem_info.engine_flag = thunk_proxy::QueueEngine2EngineFlag(queue_engine);
+
+  GpuMemory *queue_backing = nullptr;
+  if (device->CreateGpuMemory(queue_mem_info, &queue_backing) != ErrorCode::Success) {
+    // Roll back the sync object so a failed init leaves nothing behind.
+    device->DestroySyncobj(syncobj);
+    return HSA_STATUS_ERROR;
+  }
+
+  queue_mem = queue_backing->GetGpuMemoryHandle();
+  queue = queue_backing->GetAllocationHandle(0);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Tears down the software-scheduled path by destroying the sync object.
+// Always reports success.
+hsa_status_t WDDMQueue::SwsFini(void) {
+  device->DestroySyncobj(syncobj);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Forwards a command buffer (address, size, fence value) to the device's
+// software queue submission and maps its result to an HSA status.
+hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr,
+                                  uint64_t command_size,
+                                  uint64_t fence_value) {
+  return device->SubmitToSwQueue(this, command_addr, command_size, fence_value)
+             ? HSA_STATUS_SUCCESS
+             : HSA_STATUS_ERROR;
+}
+
+// Creates the hardware queue for this object on the device and maps the
+// result to an HSA status.
+hsa_status_t WDDMQueue::HwsInit(void) {
+  return device->CreateHwQueue(this) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Destroys the hardware queue for this object on the device and maps the
+// result to an HSA status.
+hsa_status_t WDDMQueue::HwsFini(void) {
+  return device->DestroyHwQueue(this) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
+
+// Forwards a command buffer (address, size, fence value) to the device's
+// hardware queue submission and maps its result to an HSA status.
+hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr,
+                                  uint64_t command_size,
+                                  uint64_t fence_value) {
+  return device->SubmitToHwQueue(this, command_addr, command_size, fence_value)
+             ? HSA_STATUS_SUCCESS
+             : HSA_STATUS_ERROR;
+}
+
+// Changes the scheduling level of a hardware-scheduled queue.
+// Software-scheduled queues and requests for the level already in effect are
+// no-ops that return success. Otherwise the hardware queue is destroyed and
+// re-created through HwsInit() with the new level; the HwsInit() status is
+// returned.
+hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) {
+  if (!use_hws)
+    return HSA_STATUS_SUCCESS;
+
+  const thunk_proxy::SchedLevel requested_level = ConvertSchedLevel(priority);
+  const bool already_set = (prio == requested_level);
+  if (already_set)
+    return HSA_STATUS_SUCCESS;
+
+  pr_debug("set prio %d -> %d\n", prio, requested_level);
+
+  // The level is applied by tearing the queue down and building it again.
+  device->DestroyHwQueue(this);
+  prio = requested_level;
+
+  return HwsInit();
+}
+
+void ComputeQueue::HandleError(hsa_status_t status) {
+  hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal;
+  hsa_signal_value_t val = -1;
+
+  struct queue_error_t {
+    uint32_t code;
+    hsa_status_t status;
+  };
+  static const queue_error_t QueueErrors[] = {
+    {2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS},
+    {4, HSA_STATUS_ERROR_INVALID_ALLOCATION},
+    {8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT},
+    //{16, HSA_STATUS_ERROR_INVALID_ARGUMENT},
+    {32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT},
+    {64, HSA_STATUS_ERROR_INVALID_ARGUMENT},
+    //{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS},
+    //{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION},
+    //{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION},
+    {0x80000000, HSA_STATUS_ERROR_EXCEPTION},
+  };
+  for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) {
+    if (QueueErrors[i].status == status) {
+      val = QueueErrors[i].code;
+      pr_err("error %d, sig_val %ld\n", status, val);
+      break;
+    }
+  }
+
+  if (sig.handle) {
+    hsakmt_hsa_signal_store_screlease(sig, val);
+  }
+  if (error_code_) {
+    error_code_->store(val, std::memory_order_release);
+  }
+}
+
+// Worker thread body (started by the ComputeQueue constructor) that drives
+// queue->Process() for as long as the queue has packets to handle.
+//
+// Each iteration either processes the current packet or, while the current
+// packet is still invalid, tracks how long the AQL write index has been
+// unchanged. The thread keeps polling while wptr > rptr; it blocks on
+// thread_cond_ once the ring is drained, or once an invalid packet has stalled
+// for longer than kMaxElapsed.
+//
+// The loop exits when Process() fails (after reporting through HandleError)
+// or when thread_stop_ is observed at the point where the thread would wait.
+void ComputeQueue::AqlToPm4Thread(ComputeQueue *queue) {
+
+  // This timing system is used for sleeping this Thread
+  // when one packet is invalid for about 2 seconds.
+  std::chrono::steady_clock::time_point start_time, time;
+  // Set the polling timeout value for 2 seconds
+  const std::chrono::milliseconds kMaxElapsed(2000);
+  // AQL write index seen when the stall timer was last (re)started.
+  uint64_t current_position = queue->GetAqlWriteIndex();
+  bool sleep = false;
+  start_time = std::chrono::steady_clock::now();
+
+  while (true) {
+    if (!queue->IsInvalidPacket()) {
+      hsa_status_t status = queue->Process();
+      if (status != HSA_STATUS_SUCCESS) {
+        pr_err("process compute queue fail status = %08x\n", status);
+        queue->HandleError(status);
+        // A processing failure terminates the worker thread.
+        break;
+      }
+      sleep = false;
+    } else {
+      if (current_position == queue->GetAqlWriteIndex()) {
+        // No new packets were written; request a sleep once the stall has
+        // lasted longer than kMaxElapsed.
+        time = std::chrono::steady_clock::now();
+        if (time - start_time > kMaxElapsed)
+          sleep = true;
+      } else {
+        // The write index moved: restart the stall timer from here.
+        start_time = std::chrono::steady_clock::now();
+        current_position = queue->GetAqlWriteIndex();
+        sleep = false;
+      }
+    }
+
+    // Keep polling without taking the lock while work is outstanding and no
+    // sleep has been requested.
+    if ((queue->GetRingWptr()->load() > queue->GetRingRptr()->load()) && !sleep)
+      continue;
+
+    std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
+    // CPU wait for valid packet
+    // The condition is re-evaluated under the lock before waiting.
+    if (queue->GetRingWptr()->load() <= queue->GetRingRptr()->load() ||
+        (sleep && queue->IsInvalidPacket())) {
+      // thread_stop_ is only checked here, i.e. when the thread is about to
+      // block.
+      if (queue->thread_stop_)
+        break;
+      pr_debug("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+               queue->ring, queue->GetRingWptr()->load(), queue->GetRingRptr()->load());
+      queue->thread_cond_.wait(lock);
+    }
+  }
+
+  pr_debug("aql to pm4 thread %p exit\n", queue->ring);
+}
+
+// Constructs a compute queue on top of a user-visible AQL ring.
+//
+// Parameters:
+//   device      - owning device; used to create the queue and its memory.
+//   ring        - base address of the AQL ring.
+//   ring_size   - size of the ring (stored as-is).
+//   ring_wptr   - write pointer shared with the producer.
+//   ring_rptr   - read pointer; also used to locate the enclosing
+//                 amd_queue_v2_t (it is that struct's read_dispatch_id).
+//   error_addr  - optional location that HandleError() stores error codes to.
+//   cmdbuf_size, engine, use_hws - forwarded to WDDMQueue.
+//
+// The constructor creates the device queue, allocates one page of system
+// memory for the private amd_queue_v2_t copy (amd_queue_), picks the scratch
+// alignment for the GPU generation and finally starts the AQL-to-PM4 worker
+// thread. Failures of CreateQueue / CreateGpuMemory are only caught by
+// assert(), as in the original code.
+ComputeQueue::ComputeQueue(WDDMDevice *device,
+               void *ring,
+               uint64_t ring_size,
+               std::atomic<uint64_t> *ring_wptr,
+               std::atomic<uint64_t> *ring_rptr,
+               volatile int64_t *error_addr,
+               uint32_t cmdbuf_size,
+               uint32_t engine,
+               bool use_hws) :
+               WDDMQueue(device, 0, cmdbuf_size, engine, use_hws),
+               ring(ring),
+               ring_size(ring_size),
+               ring_wptr(ring_wptr),
+               ring_rptr(ring_rptr),
+               error_code_(reinterpret_cast<volatile std::atomic<long int>*>(error_addr)),
+               ib_start_addr(0),
+               ib_size(0),
+               sync_point(0),
+               cmdbuf_aql_frame_write_index(0),
+               cmdbuf_aql_frame_size(0),
+               needs_barrier(true),
+               ready_to_submit(false),
+               platform_atomic_support_(false),
+               signal_addr_(nullptr),
+               thread_stop_(false),
+               max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()),
+               dispatch_waves_(0),
+               scratch_size_per_wave_(0),
+               scratch_size_(0),
+               total_scratch_size_(0),
+               scratch_base_(nullptr) {
+  bool ret = device->CreateQueue(this);
+  assert(ret);
+
+  // One page of system memory backs the amd_queue_v2_t used by this queue.
+  GpuMemoryCreateInfo create_info{};
+  create_info.size = dxg_runtime->page_size;
+  create_info.domain = thunk_proxy::kSystem;
+  GpuMemory *gpu_mem = nullptr;
+  auto code = device->CreateGpuMemory(create_info, &gpu_mem);
+  assert(code == ErrorCode::Success);
+  amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle();
+  amd_queue_ = reinterpret_cast<amd_queue_v2_t*>(gpu_mem->GpuAddress());
+
+  // ring_rptr points at read_dispatch_id inside the caller's amd_queue_v2_t;
+  // step back by that member's offset to recover the struct itself.
+  amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id));
+
+  if (device->Major() >= 11)
+    scratch_mem_alignment_size_ = 256;
+  else
+    scratch_mem_alignment_size_ = 1024;
+
+  // Start the worker last: it receives `this`, so every member it may read
+  // (including scratch_mem_alignment_size_) has to be assigned before the
+  // thread exists.
+  aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this);
+}
+
+// Stops and joins the worker thread, destroys the device queue, then frees
+// the scratch buffer (if one was allocated) and the amd_queue_ backing memory.
+ComputeQueue::~ComputeQueue() {
+  {
+    // Set the stop flag under the lock the worker holds around its wait.
+    std::lock_guard<std::mutex> stop_guard(thread_cond_lock_);
+    thread_stop_ = true;
+  }
+  thread_cond_.notify_one();
+  aql_to_pm4_thread_.join();
+
+  //doorbell_signal_->Release();
+
+  device->DestroyQueue(this);
+
+  if (scratch_base_)
+    delete GpuMemory::Convert(scratch_mem_);
+
+  delete GpuMemory::Convert(amd_queue_mem_);
+}
+
+// Rebuilds the scratch state stored in amd_queue_ for the current scratch
+// buffer (scratch_base_, scratch_size_, scratch_size_per_wave_):
+//   - the four-dword scratch buffer resource descriptor,
+//   - scratch_backing_memory_location,
+//   - scratch_wave64_lane_byte_size,
+//   - compute_tmpring_size.
+// The register layouts depend on the GPU major version reported by the
+// device. Called from UpdateScratch() after a (re)allocation, so
+// scratch_size_per_wave_ is non-zero here.
+//
+// All register unions are zero-initialised so that any bit not explicitly
+// assigned below is written as 0 rather than left indeterminate.
+void ComputeQueue::InitScratchSRD() {
+  // Populate scratch resource descriptor
+  SQ_BUF_RSRC_WORD0 srd0 = {};
+
+  uintptr_t scratch_base = uintptr_t(scratch_base_);
+  srd0.bits.BASE_ADDRESS = scratch_base;
+
+  uint32_t srd1_u32;
+
+  if (device->Major() < 11) {
+    SQ_BUF_RSRC_WORD1 srd1 = {};
+
+    srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
+    srd1.bits.STRIDE = 0;
+    srd1.bits.CACHE_SWIZZLE = 0;
+    srd1.bits.SWIZZLE_ENABLE = 1;
+
+    srd1_u32 = srd1.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD1_GFX11 srd1 = {};
+
+    srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
+    srd1.bits.STRIDE = 0;
+    srd1.bits.SWIZZLE_ENABLE = 1;
+
+    srd1_u32 = srd1.u32All;
+  }
+
+  SQ_BUF_RSRC_WORD2 srd2 = {};
+
+  srd2.bits.NUM_RECORDS = scratch_size_;
+
+  uint32_t srd3_u32;
+
+  if (device->Major() < 10) {
+    SQ_BUF_RSRC_WORD3 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
+    srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+    srd3.bits.ELEMENT_SIZE = 1;  // 4
+    srd3.bits.INDEX_STRIDE = 3;  // 64
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.ATC__CI__VI = 0;
+    srd3.bits.HASH_ENABLE = 0;
+    srd3.bits.HEAP = 0;
+    srd3.bits.MTYPE__CI__VI = 0;
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else if (device->Major() == 10) {
+    SQ_BUF_RSRC_WORD3_GFX10 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESOURCE_LEVEL = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else if (device->Major() == 11) {
+    SQ_BUF_RSRC_WORD3_GFX11 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD3_GFX12 srd3 = {};
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.WRITE_COMPRESS_ENABLE = 0;
+    srd3.bits.COMPRESSION_EN = 0;
+    srd3.bits.COMPRESSION_ACCESS_MODE = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  }
+
+  // Update Queue's Scratch descriptor's property
+  amd_queue_->scratch_resource_descriptor[0] = srd0.u32All;
+  amd_queue_->scratch_resource_descriptor[1] = srd1_u32;
+  amd_queue_->scratch_resource_descriptor[2] = srd2.u32All;
+  amd_queue_->scratch_resource_descriptor[3] = srd3_u32;
+
+  // Populate flat scratch parameters in amd_queue_.
+  amd_queue_->scratch_backing_memory_location = scratch_base;
+
+  // For backwards compatibility this field records the per-lane scratch
+  // for a 64 lane wavefront. If scratch was allocated for 32 lane waves
+  // then the effective size for a 64 lane wave is halved.
+  amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64;
+
+  uint64_t num_waves;
+  if (device->Major() < 11) {
+    // Zero-initialised like the GFX12 branch: only WAVES and WAVESIZE are
+    // assigned, the remaining register bits must be 0.
+    COMPUTE_TMPRING_SIZE tmpring_size = {};
+    // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    num_waves = scratch_size_ / scratch_size_per_wave_;
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  } else if (device->Major() == 11) {
+    COMPUTE_TMPRING_SIZE_GFX11 tmpring_size = {};
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX11 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  } else {
+    COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {};
+    tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_;
+    // For GFX12 we specify number of waves per engine instead of total
+    num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine();
+    tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_);
+
+    amd_queue_->compute_tmpring_size = tmpring_size.u32All;
+  }
+}
+
+// Estimates how many work-groups of a dispatch have to be provisioned for,
+// expressed as (max groups on any shader engine) * (number of engines).
+// The grid is divided into work-groups per dimension (rounding up), the
+// groups are spread over the engines while accounting for compute units that
+// do not divide evenly across engines, and on gfx10+ a minimum per-engine
+// figure is enforced for small dispatches.
+uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet)
+{
+  // Rounded-up division used for each grid dimension.
+  const auto ceil_div = [](uint64_t total, uint64_t step) {
+    return (total + step - 1) / step;
+  };
+
+  const uint64_t lanes_per_group =
+      (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z;
+
+  const uint64_t groups_x = ceil_div(packet->grid_size_x, packet->workgroup_size_x);
+  const uint64_t groups_y = ceil_div(packet->grid_size_y, packet->workgroup_size_y);
+  const uint64_t groups_z = ceil_div(packet->grid_size_z, packet->workgroup_size_z);
+  const uint64_t total_groups = groups_x * groups_y * groups_z;
+
+  const uint32_t cu_count = device->ComputeUnitCount();
+  const uint32_t engines = device->NumShaderEngine();
+
+  // Compute units that are left over after an even split across engines.
+  const uint32_t even_cus = AlignDown(cu_count, engines);
+  const uint32_t leftover_cus_per_round = cu_count - even_cus;
+  const uint64_t full_rounds = total_groups / cu_count;
+  const uint64_t leftover_groups = full_rounds * leftover_cus_per_round;
+  const uint64_t even_groups = total_groups - leftover_groups;
+
+  uint64_t per_engine_peak = (even_groups + engines - 1) / engines;
+  if (leftover_cus_per_round)
+    per_engine_peak += full_rounds;
+
+  // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
+  // engine.
+  const bool small_dispatch =
+      per_engine_peak < 16 && lanes_per_group * per_engine_peak < 256;
+  if (device->Major() >= 10 && small_dispatch) {
+    const uint64_t groups_for_256_lanes = (256 + lanes_per_group - 1) / lanes_per_group;
+    per_engine_peak = std::min(groups_for_256_lanes, uint64_t(16ul));
+  }
+
+  return per_engine_peak * engines;
+}
+
+// Returns the number of wavefronts needed for one work-group of the packet:
+// the work-group's lane count divided by the wave width (32 when wave32 is
+// set, otherwise 64), rounded up.
+uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet,
+                                                  bool wave32)
+{
+  uint32_t wave_width = 64;
+  if (wave32)
+    wave_width = 32;
+
+  const uint64_t group_lanes =
+      (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z;
+
+  const uint64_t rounded_up = group_lanes + wave_width - 1;
+  return rounded_up / wave_width;
+}
+
+// Makes sure the queue's scratch buffer is large enough for `packet`.
+//
+// The per-wave scratch size and the number of waves are tracked as running
+// maxima across calls, and the required size is capped at
+// scratch_size_per_wave_ * max_scratch_waves_. When the buffer already
+// allocated (total_scratch_size_) is not large enough, a new buffer is
+// allocated in local memory, the previous one is released, and the scratch
+// descriptors are rebuilt through InitScratchSRD().
+//
+// Returns true when the existing buffer suffices or the reallocation
+// succeeded, false when the allocation failed.
+bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) {
+  const uint32_t lanes_per_wave = wave32 ? 32 : 64;
+  const uint64_t size_per_thread = AlignUp(packet->private_segment_size,
+                                  scratch_mem_alignment_size_ / lanes_per_wave);
+
+  uint64_t groups = CalcDispatchGroups(packet);
+  uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32);
+
+  // For packet batching, the maximum value must be used to fit all packets.
+  scratch_size_per_wave_ = std::max(size_per_thread * lanes_per_wave, scratch_size_per_wave_);
+  dispatch_waves_ = std::max(groups * waves_per_group, dispatch_waves_);
+
+  const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_;
+  const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_;
+
+  scratch_size_ = std::min(dispatch_size, max_scratch_size);
+
+  if (total_scratch_size_ >= scratch_size_)
+    return true;
+
+  // The sizes are computed from 64-bit values; cast explicitly so the
+  // arguments always match the 64-bit format specifier.
+  pr_debug("need realloc scratch buffer, size %" PRIx64 " -> %" PRIx64 "\n",
+           static_cast<uint64_t>(total_scratch_size_),
+           static_cast<uint64_t>(scratch_size_));
+
+  GpuMemoryCreateInfo create_info{};
+  create_info.size = scratch_size_;
+  create_info.domain = thunk_proxy::kLocal;
+  GpuMemory *gpu_mem = nullptr;
+  auto code = device->CreateGpuMemory(create_info, &gpu_mem);
+  if (code != ErrorCode::Success)
+    return false;
+
+  // NOTE(review): the previous buffer is released immediately; confirm that
+  // no in-flight dispatch can still reference it at this point.
+  if (scratch_base_) {
+    auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_);
+    delete scratch_gpu_mem;
+  }
+
+  total_scratch_size_ = scratch_size_;
+  scratch_base_ = reinterpret_cast<void *>(gpu_mem->GpuAddress());
+  scratch_mem_ = gpu_mem->GetGpuMemoryHandle();
+
+  InitScratchSRD();
+  return true;
+}
+
+// Patches the current scratch base address into every location recorded in
+// scratch_base_offset_array_ (offsets relative to `addr`), then clears the
+// list. On gfx11+ the two dwords receive the low 32 bits and the 48-bit high
+// part; on older parts the second dword keeps its upper 16 bits and only its
+// lower half is replaced. Always returns true.
+bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) {
+  const bool is_gfx11_plus = device->Major() >= 11;
+
+  for (const auto patch_offset : scratch_base_offset_array_) {
+    uint32_t *user_data = reinterpret_cast<uint32_t *>(addr + patch_offset);
+
+    if (is_gfx11_plus) {
+      user_data[0] = Ptr48Low32(scratch_base_);
+      user_data[1] = Ptr48High8(scratch_base_);
+      continue;
+    }
+
+    user_data[0] = PtrLow32(scratch_base_);
+    const uint32_t preserved_upper = user_data[1] & 0xffff0000;
+    user_data[1] = preserved_upper | PtrHigh32(scratch_base_);
+  }
+
+  // Clearing an already empty list is a no-op, so no separate empty check.
+  scratch_base_offset_array_.clear();
+  return true;
+}
+
+// Returns `srd` (dword 3 of the scratch resource descriptor) with its
+// INDEX_STRIDE field set to 2 for wave32 kernels or 3 otherwise, using the
+// register layout of the device generation. Generations other than gfx10,
+// gfx11 and gfx12 get `srd` back unchanged.
+uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) {
+
+  assert(device->Major() < 13);
+
+  const uint32_t stride_code = wave32 ? 2 : 3;
+
+  switch (device->Major()) {
+    case 10: {
+      SQ_BUF_RSRC_WORD3_GFX10 word3;
+      word3.u32All = srd;
+      word3.bits.INDEX_STRIDE = stride_code;
+      return word3.u32All;
+    }
+    case 11: {
+      SQ_BUF_RSRC_WORD3_GFX11 word3;
+      word3.u32All = srd;
+      word3.bits.INDEX_STRIDE = stride_code;
+      return word3.u32All;
+    }
+    case 12: {
+      SQ_BUF_RSRC_WORD3_GFX12 word3;
+      word3.u32All = srd;
+      word3.bits.INDEX_STRIDE = stride_code;
+      return word3.u32All;
+    }
+    default:
+      break;
+  }
+
+  return srd;
+}
+
+// Resolves the CPU-readable address of a kernel object given its device
+// address. Memory that get_gpu_mem() reports as a blit kernel object is
+// resolved to its GpuAddress(); everything else is looked up through the
+// loader's host-address query. Returns 0 (after logging) when the loader
+// query fails.
+uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const {
+  /* convert dev_addr to host_addr */
+  auto backing = get_gpu_mem((void*)addr);
+  const bool is_blit_kernel = backing && backing->IsBlitKernelObject();
+  if (is_blit_kernel)
+    return backing->GpuAddress();
+
+  uint64_t host_addr = 0;
+  const void *device_ptr = reinterpret_cast<const void *>(addr);
+  const void **host_out = reinterpret_cast<const void **>(&host_addr);
+  auto ret = hsakmt_hsa_ven_amd_loader_query_host_address(device_ptr, host_out);
+  if (ret != HSA_STATUS_SUCCESS) {
+    pr_err("failed to query host address for kernel object %p, ret=%d\n", (void*)addr, ret);
+    return 0;
+  }
+
+  return host_addr;
+}
+
+// Wakes the AQL-to-PM4 worker thread. thread_cond_lock_ is acquired and
+// released (with nothing done under it) before notifying the condition
+// variable the worker waits on.
+void ComputeQueue::RingDoorbell() {
+  {
+    // Intentionally empty critical section.
+    std::lock_guard<std::mutex> handshake(thread_cond_lock_);
+  }
+  pr_debug("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+           ring, GetRingWptr()->load(), GetRingRptr()->load());
+  thread_cond_.notify_one();
+}
+
+// Initialises the scheduling backend (hardware or software, per use_hws) and
+// then caches the command buffer start address, the AQL frame size and the
+// platform-atomic capability from the device. Returns the backend's status
+// when it fails.
+hsa_status_t ComputeQueue::Init(void) {
+  hsa_status_t backend_status;
+  if (use_hws)
+    backend_status = HwsInit();
+  else
+    backend_status = SwsInit();
+
+  if (backend_status != HSA_STATUS_SUCCESS)
+    return backend_status;
+
+  ib_start_addr = cmdbuf_addr;
+  cmdbuf_aql_frame_size = device->GetAqlFrameSize();
+  platform_atomic_support_ = device->SupportPlatformAtomic();
+
+  return backend_status;
+}
+
+// Shuts down whichever scheduling backend Init() selected.
+hsa_status_t ComputeQueue::Fini(void) {
+  if (use_hws)
+    return HwsFini();
+
+  return SwsFini();
+}
+
+// Work done right before a submission: waits on the device paging fence for
+// this queue, then patches the scratch base into the command buffer that
+// starts at ib_start_addr. Fails only when the paging-fence wait fails.
+hsa_status_t ComputeQueue::PreSubmit(void) {
+  const bool paging_done = device->WaitPagingFence(this);
+  if (paging_done) {
+    RelocateCmdbufScratchBase(ib_start_addr);
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR;
+}
+
+// Bookkeeping after a submission: remembers the frame index that was just
+// submitted, moves ib_start_addr to the frame slot the next packet will use
+// (the frame index wraps at WDDMDevice::GetAqlFrameNum()) and resets the
+// accumulated IB size. Always reports success.
+hsa_status_t ComputeQueue::EndSubmit(void) {
+  // record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry
+  sync_point = cmdbuf_aql_frame_write_index;
+
+  const auto next_slot = cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum();
+  ib_start_addr = cmdbuf_addr + next_slot * cmdbuf_aql_frame_size;
+  ib_size = 0;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Submits the command buffer accumulated so far: PreSubmit(), then the
+// hardware or software submit (per use_hws) with the current frame write
+// index as fence value, then EndSubmit(). Any failing stage aborts the
+// sequence with HSA_STATUS_ERROR.
+hsa_status_t ComputeQueue::Submit(void) {
+  if (PreSubmit() != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  hsa_status_t submit_status;
+  if (use_hws)
+    submit_status = HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index);
+  else
+    submit_status = SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index);
+
+  if (submit_status != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  if (EndSubmit() != HSA_STATUS_SUCCESS)
+    return HSA_STATUS_ERROR;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Translates one AQL kernel dispatch packet into PM4 commands.
+//
+// Parameters:
+//   cpu    - CPU-visible base of the command buffer being filled; commands
+//            are appended starting at byte offset ib_size.
+//   packet - the AQL packet; its header is set to HSA_PACKET_TYPE_INVALID
+//            once the packet has been consumed.
+//
+// Returns HSA_STATUS_SUCCESS after advancing ib_size and the frame write
+// index, or an error status when the packet is rejected:
+//   INVALID_ARGUMENT        - a workgroup dimension above 1024 or too many
+//                             LDS blocks,
+//   INVALID_CODE_OBJECT     - the kernel object address cannot be resolved,
+//   INCOMPATIBLE_ARGUMENTS  - setup (dimension count) outside 1..3,
+//   INVALID_ALLOCATION      - group segment larger than the device LDS,
+//   OUT_OF_RESOURCES        - scratch memory could not be provided, or the
+//                             generated commands exceed one AQL frame.
+hsa_status_t
+ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) {
+  pr_debug("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x "
+           "gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n",
+           ring, packet->header,
+           packet->setup, packet->workgroup_size_x, packet->workgroup_size_y,
+           packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y,
+           packet->grid_size_z, packet->private_segment_size,
+           packet->group_segment_size, packet->kernel_object, packet->kernarg_address,
+           packet->completion_signal.handle);
+
+  if (packet->workgroup_size_x > 1024 ||
+      packet->workgroup_size_y > 1024 ||
+      packet->workgroup_size_z > 1024)
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  int major = device->Major();
+  int i = ib_size;
+
+  const amd_kernel_code_t* kernel_object =
+    (const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object);
+  if (kernel_object == NULL) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  // The entry point is expressed relative to the device address of the
+  // kernel object, not the CPU-readable copy used to read its header.
+  void* entry = (void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset);
+  assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0);
+
+  pr_debug("kernel object property=%x entry=%p lds=%x+%x\n",
+           kernel_object->kernel_code_properties, entry,
+           kernel_object->workgroup_group_segment_byte_size,
+           packet->group_segment_size);
+
+  if (packet->setup == 0 || packet->setup > 3)
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  if (packet->group_segment_size > device->LdsSize())
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+
+  uint32_t lds_blks = device->LdsBlocks(packet);
+  if (lds_blks > 128)
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  const bool wave32 =
+    AMD_HSA_BITS_GET(kernel_object->kernel_code_properties,
+                     AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32);
+
+  assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size);
+
+  // A kernel that needs private (scratch) memory must not be dispatched when
+  // the scratch buffer could not be provided: fail the packet instead of
+  // building commands against a missing or undersized buffer.
+  if (packet->private_segment_size != 0 && !UpdateScratch(packet, wave32)) {
+    pr_err("failed to update scratch memory for kernel dispatch\n");
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+
+  // Record start timestamp when enabling profiling
+  if (signal && EnableProfiling())
+    i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+  // Build a barrier packet if it is requested
+  const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1;
+  if (is_barrier_packet && needs_barrier)
+    i += cmd_util.BuildBarrier(cpu + i);
+
+  // flush cache
+  i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+  if (major >= 11) {
+    // Remember where the scratch base lives in the command stream so that
+    // RelocateCmdbufScratchBase() can patch it before submission.
+    AppendCmdbufSratchBaseOffset(
+      i + offsetof(struct SetScratchTemplate, scratch_lo));
+
+    i += cmd_util.BuildScratch(ScratchBase(), cpu + i);
+    i += cmd_util.BuildComputeShaderParams(cpu + i);
+  }
+
+  struct DispatchInfo info;
+  info.major = major;
+  info.pPacket = packet;
+  info.pEntry = entry;
+  info.pKernelObject = kernel_object;
+  info.ldsBlks = lds_blks;
+  info.pAmdQueue = amd_queue_;
+  info.wave32 = wave32;
+  info.srd = UpdateIndexStride(
+    info.pAmdQueue->scratch_resource_descriptor[3], wave32);
+  info.pScratchBase = ScratchBase();
+  info.scratchSizePerWave = ScratchSizePerWave();
+  memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset));
+  info.offsetCnt = 0;
+
+  size_t size;
+  size = cmd_util.BuildDispatch(&info, cpu + i);
+  // Offsets reported by BuildDispatch are relative to the dispatch commands;
+  // rebase them onto the command buffer before recording them.
+  for (int j = 0; j < info.offsetCnt; j++)
+    AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]);
+  i += size;
+
+  // A packet with a completion signal ends with its own barrier (below), so
+  // the next packet only needs one when this packet has no signal.
+  needs_barrier = (packet->completion_signal.handle == 0);
+
+  if (signal) {
+    // wait cs done
+    i += cmd_util.BuildBarrier(cpu + i);
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+    // flush cache
+    i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+    assert(signal->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *signal_addr = (uint64_t *)&signal->value;
+    pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+    // With platform atomics the GPU updates the signal itself; otherwise the
+    // address is only recorded in signal_addr_.
+    if (platform_atomic_support_)
+      i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = signal_addr;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in KernelDispatch: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Handles an AQL barrier-AND / barrier-OR packet.
+//
+// The dependency signals are resolved on the CPU: for barrier-OR the thread
+// polls (every 20 microseconds) until any non-null dependency reads 0, for
+// barrier-AND it waits for each non-null dependency to reach 0 in turn.
+// Afterwards the PM4 commands that complete the packet are appended to the
+// command buffer at `cpu` + ib_size: completion-signal handling when the
+// packet has a completion signal, followed by the read-pointer update.
+//
+// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES when the generated commands
+// exceed one AQL frame, otherwise HSA_STATUS_SUCCESS after advancing ib_size
+// and the frame write index and invalidating the packet header.
+hsa_status_t
+ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) {
+  pr_debug("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64
+           " %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n",
+           ring, is_or ? "or" : "and",
+           packet->header, packet->dep_signal[0].handle,
+           packet->dep_signal[1].handle, packet->dep_signal[2].handle,
+           packet->dep_signal[3].handle, packet->dep_signal[4].handle,
+           packet->completion_signal.handle);
+
+  // fix me: can we use gpu packet?
+  if (!is_or) {
+    // Barrier-AND: every populated dependency has to reach 0.
+    for (int dep = 0; dep < 5; dep++) {
+      if (packet->dep_signal[dep].handle) {
+        hsa_signal_value_t value =
+          hsakmt_hsa_signal_wait_relaxed(packet->dep_signal[dep], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+        assert(value == 0);
+      }
+    }
+  } else {
+    // Barrier-OR: collect the populated dependencies, then poll them until
+    // one of them reads 0. A packet without dependencies does not wait.
+    hsa_signal_t pending[5];
+    int pending_count = 0;
+    for (int dep = 0; dep < 5; dep++) {
+      if (packet->dep_signal[dep].handle)
+        pending[pending_count++] = packet->dep_signal[dep];
+    }
+
+    bool released = (pending_count == 0);
+    while (!released) {
+      for (int k = 0; k < pending_count; k++) {
+        if (!hsakmt_hsa_signal_load_relaxed(pending[k])) {
+          released = true;
+          break;
+        }
+      }
+
+      if (!released)
+        std::this_thread::sleep_for(std::chrono::microseconds(20));
+    }
+  }
+
+  int major = device->Major();
+  int cursor = ib_size;
+
+  if (packet->completion_signal.handle != 0) {
+    amd_signal_t *completion = (amd_signal_t *)packet->completion_signal.handle;
+    assert(completion->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *completion_value = (uint64_t *)&completion->value;
+    pr_debug("signal value=%" PRIx64 "\n", completion->value);
+
+    // Record start timestamp when enabling profiling
+    if (EnableProfiling())
+      cursor += cmd_util.BuildCopyData(&completion->start_ts, cpu + cursor);
+
+    if (needs_barrier)
+      cursor += cmd_util.BuildBarrier(cpu + cursor);
+
+    needs_barrier = false;
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      cursor += cmd_util.BuildCopyData(&completion->end_ts, cpu + cursor);
+
+    // flush cache
+    cursor += cmd_util.BuildAcquireMem(major, cpu + cursor);
+
+    if (platform_atomic_support_)
+      cursor += cmd_util.BuildAtomicMem(completion_value, TC_OP_ATOMIC_ADD_RTN_64, cpu + cursor, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = completion_value;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    cursor += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + cursor);
+  else
+    cursor += cmd_util.BuildWriteData64Command(cpu + cursor, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  if ((cursor - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in BarrierGeneric: used %d bytes, limit %d bytes\n", cursor - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  ib_size = cursor;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) {
+  // Translate a vendor-specific AQL packet carrying a PM4 indirect-buffer jump into the PM4
+  // stream at `cpu`: copy the referenced dwords, then append signal and read-pointer updates.
+  // On HSA_STATUS_ERROR_OUT_OF_RESOURCES, ib_size and the frame write index are not advanced.
+  constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1;
+  assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB);
+  uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff;
+  assert(op == IT_INDIRECT_BUFFER);
+  uint32_t* pm4_addr = reinterpret_cast<uint32_t*>((static_cast<uint64_t>(packet->ib_jump_cmd[2]) << 32) | (static_cast<uint64_t>(packet->ib_jump_cmd[1]) & ~3ull));
+  uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff;
+  pr_debug("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %#x cs=%" PRIx64"\n",
+           ring, dxg_runtime->vendor_packet_process ? "process" : "skip", pm4_addr, pm4_size,
+           packet->completion_signal.handle);
+  for (uint32_t dw = 0; dw < pm4_size; dw++) {
+    pr_debug("pm4_addr[%u]=%#x\n", dw, pm4_addr[dw]);
+  }
+
+  int i = ib_size;
+
+  if (dxg_runtime->vendor_packet_process) {
+    // pm4_size comes from the packet, so bound it before memcpy; the frame check at the end of
+    // this function only runs after the payload has already been written.
+    if (pm4_size * sizeof(uint32_t) > static_cast<size_t>(cmdbuf_aql_frame_size)) {
+      pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", static_cast<int>(pm4_size * sizeof(uint32_t)), cmdbuf_aql_frame_size);
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    int major = device->Major();
+    memcpy(cpu+i, pm4_addr, pm4_size * sizeof(uint32_t));
+    i += pm4_size * sizeof(uint32_t);
+
+    if (packet->completion_signal.handle != 0) {
+      amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+      assert(signal->kind == AMD_SIGNAL_KIND_USER);
+      uint64_t *signal_addr = (uint64_t *)&signal->value;
+      pr_debug("signal value=%" PRIx64 "\n", signal->value);
+      // Record start timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+      // The barrier is emitted unconditionally here (not gated on needs_barrier).
+      i += cmd_util.BuildBarrier(cpu + i);
+      // Record end timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+      // flush cache
+      i += cmd_util.BuildAcquireMem(major, cpu + i);
+      if (platform_atomic_support_)
+        i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+      else
+        signal_addr_ = signal_addr;
+    }
+  } else {  // packet is skipped: its completion signal is set to 0 from the CPU
+    if (packet->completion_signal.handle != 0) {
+      hsakmt_hsa_signal_store_screlease(packet->completion_signal, 0);
+    }
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ComputeQueue::SwitchAql2PM4(void) {
+  // Decode the AQL packet at cmdbuf_aql_frame_write_index and run the matching AQL->PM4 translator.
+  // Sets ready_to_submit when the built PM4 should be submitted; translator errors are returned.
+  uint16_t *packet = (uint16_t *) ((char *)ring +
+    (cmdbuf_aql_frame_write_index % ring_size) * 64);
+  uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE);
+  header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+  hsa_kernel_dispatch_packet_t *aql_packet =
+    (hsa_kernel_dispatch_packet_t *)packet;
+  hsa_status_t ret;
+  switch (header) {
+  case HSA_PACKET_TYPE_KERNEL_DISPATCH:
+    ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    // Keep merging packets (no submit yet) unless one of these holds:
+    // 1) The kernel has a completion signal;
+    // 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf;
+    // 3) The queue is empty now, so the batch is submitted right away.
+    if (!(aql_packet->completion_signal.handle) &&
+        (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) &&
+        (*sync_addr != sync_point))
+      return HSA_STATUS_SUCCESS;
+    break;
+  case HSA_PACKET_TYPE_BARRIER_AND:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet);
+    break;
+  case HSA_PACKET_TYPE_BARRIER_OR:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true);
+    break;
+  case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
+    ret = VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet);
+    break;
+  case HSA_PACKET_TYPE_INVALID:
+    // When packets are submitted out of order, the format field of current AQL packet
+    // may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and
+    // do not process AQL packets before the packet format field is updated.
+    assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer");
+    return HSA_STATUS_SUCCESS;
+  default:
+    return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+  }
+
+  // Barrier and vendor translation can fail (e.g. frame overflow); never submit after a failure.
+  if (ret != HSA_STATUS_SUCCESS)
+    return ret;
+  ready_to_submit = true;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ComputeQueue::Process(void) {
+  // Translate and submit AQL packets until the frame index reaches ring_wptr or an invalid packet is seen.
+  while (cmdbuf_aql_frame_write_index < ring_wptr->load() &&
+         !IsInvalidPacket()) {
+    pr_debug("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+
+    hsa_status_t ret;
+
+    // Wait for the next few cmdbuf slots to be free.
+    // If wptr catches up with rptr in the cmdbuf, wait for rptr to advance and free a slot.
+    // Here wptr is cmdbuf_aql_frame_write_index, while rptr is read from *sync_addr.
+    if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) {
+      uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1;
+      if (!device->CpuWait(&syncobj, &value, 1, false))
+        return HSA_STATUS_ERROR;
+    }
+
+    ret = SwitchAql2PM4();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    if (!ready_to_submit)  // the translator did not request a submit yet; keep translating
+      continue;
+
+    ret = Submit();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // Without platform atomics the translator stores the signal address in signal_addr_ instead of emitting a GPU atomic.
+    if (!platform_atomic_support_ && signal_addr_) {
+      // CPU waits for the GPU fence to reach the current frame index
+      if (!device->CpuWait(&syncobj, &cmdbuf_aql_frame_write_index, 1, false))
+        return HSA_STATUS_ERROR;
+      // CPU updates the completion signal on the GPU's behalf
+      atomic::Decrement(signal_addr_);
+      signal_addr_ = NULL;
+    }
+
+    ready_to_submit = false;
+
+    pr_debug("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+void SDMAQueue::SdmaThread(SDMAQueue *queue) {
+  // Worker loop: consumes the (start, end) pairs queued in wptr_queue_ until thread_stop_ is set.
+  while (true) {
+    decltype(queue->wptr_queue_) pendings;
+    {
+      std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
+      while (queue->wptr_queue_.empty() && !queue->thread_stop_)
+        queue->thread_cond_.wait(lock);
+
+      if (queue->thread_stop_)  // stop requested: entries still queued are not processed
+        break;
+
+      pendings.swap(queue->wptr_queue_);  // take the whole batch so it is processed outside the lock
+    }
+
+    for (const auto [start, end] : pendings) {
+      pr_debug("wptr %lx %lx\n", start, end);
+      // Resolve leading poll packets by waiting on their signals from this thread before submitting.
+      SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start));
+      SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1;
+      while (queue->IsPollPacket(poll_pkt)) {
+        uint64_t poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 |
+                             (uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+        uint64_t poll_val = poll_pkt->VALUE_UNION.value;
+        uint32_t skip = 1;
+        // Two consecutive polls on adjacent dwords (next packet = lower address) form one 64-bit wait.
+        if (queue->IsPollPacket(poll_next_pkt)) {
+          uint64_t poll_next_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 |
+                             (uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+          if (poll_next_addr + sizeof(uint32_t) == poll_addr) {
+            poll_addr = poll_next_addr;
+            poll_val = poll_next_pkt->VALUE_UNION.value |
+                            (uint64_t)poll_pkt->VALUE_UNION.value << 32;
+            skip = 2;
+          }
+        }
+        // The polled address is treated as the `value` field of an amd_signal_t; step back to the signal.
+        amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value));
+        uint64_t signal_handle = reinterpret_cast<uint64_t>(signal);
+        pr_debug("poll signal %#lx addr %#lx val %ld\n", signal_handle, poll_addr, poll_val);
+        hsa_signal_t hsa_signal = {signal_handle};
+        hsa_signal_value_t value =
+          hsakmt_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+        assert(value == poll_val);  // NOTE(review): `value` is only read by this assert
+        // Zero the consumed poll packet(s) in place and move on to the next candidate.
+        memset(poll_pkt, 0, skip * sizeof(*poll_pkt));
+        poll_pkt += skip;
+        poll_next_pkt += skip;
+      }
+      queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start);
+      std::atomic_thread_fence(std::memory_order_release);
+      queue->Submit();  // NOTE(review): the returned status is ignored
+    }
+  }
+  pr_debug("sdma thread exit\n");
+}
+
+SDMAQueue::SDMAQueue(WDDMDevice *device,  // Creates the device queue and starts the SDMA worker thread.
+          void *ring,
+          uint64_t cmdbuf_size,
+          uint32_t engine,
+          bool use_hws) :
+          WDDMQueue(device, reinterpret_cast<uint64_t>(ring), cmdbuf_size, engine, use_hws),
+          wptr_next_(0),
+          wptr_pre_(0),
+          rptr_next(0),
+          thread_stop_(false),
+          ib_size(0),
+          ib_start_addr(0) {
+  bool ret = device->CreateQueue(this);
+  assert(ret);  // NOTE(review): compiled out under NDEBUG, so a CreateQueue failure would go unnoticed
+  // The worker runs SdmaThread(this); the destructor stops and joins it.
+  thread_ = std::thread(SdmaThread, this);
+}
+
+SDMAQueue::~SDMAQueue() {  // Stops and joins the worker thread, then destroys the device queue.
+  {
+    std::lock_guard<std::mutex> stop_guard(thread_cond_lock_);
+    thread_stop_ = true;
+  }
+  thread_cond_.notify_one();
+  thread_.join();
+  device->DestroyQueue(this);
+}
+
+void SDMAQueue::RingDoorbell() {
+  // Queue the (wptr_pre_, wptr_next_) pair for the worker thread and wake it up.
+  pr_debug("ringdoorbell %#lx %#lx\n", wptr_pre_, wptr_next_);
+  {
+    std::lock_guard<std::mutex> guard(thread_cond_lock_);
+    wptr_queue_.emplace_back(wptr_pre_, wptr_next_);
+    thread_cond_.notify_one();
+  }
+  wptr_pre_ = wptr_next_;
+}
+
+hsa_status_t SDMAQueue::Init(void) {
+  // Initialise through HwsInit() or SwsInit() depending on use_hws.
+  const hsa_status_t status = use_hws ? HwsInit() : SwsInit();
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+  // Only a successfully initialised queue gets its command buffer zeroed.
+  std::memset((char *)cmdbuf_addr, 0, cmdbuf_size);
+  return status;
+}
+
+hsa_status_t SDMAQueue::Fini(void) {  // Finalizes through HwsFini() when use_hws is set, otherwise SwsFini().
+  return use_hws ? HwsFini() : SwsFini();
+}
+
+int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) {
+  // Record the window to submit (cmdbuf_addr + offset, size) and advance rptr_next by its size.
+  ib_size = size;
+  ib_start_addr = cmdbuf_addr + offset;
+  rptr_next += ib_size;
+  return STATUS_SUCCESS;
+}
+
+hsa_status_t SDMAQueue::Submit(void) {
+  // Submit the window recorded by PreparePacket() once the paging fence wait succeeds.
+  if (!device->WaitPagingFence(this))
+    return HSA_STATUS_ERROR;
+
+  int rc;
+  if (use_hws)
+    rc = HwsSubmit(ib_start_addr, ib_size, rptr_next);
+  else
+    rc = SwsSubmit(ib_start_addr, ib_size, rptr_next);
+  return rc ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
new file mode 100644
index 0000000000..4ea93c70f2
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
@@ -0,0 +1,165 @@
+#include <cassert>
+#include <map>
+#include <algorithm>
+#include "impl/wddm/va_mgr.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align)
+    : min_align_(min_align) {
+  auto whole_range = free_list_.emplace(size, start);  // one free fragment: base `start`, length `size`
+  frag_map_[start] = make_fragment(whole_range, size);
+}
+
+VaMgr::~VaMgr() {
+  // A fully released manager has merged back into one free fragment; anything else hints at a leak.
+  if (free_list_.size() != 1)
+    pr_warn("free_list_ size:%zu which should be 1.\n", free_list_.size());
+  if (frag_map_.size() != 1)
+    pr_warn("frag_map_ size:%zu which should be 1.\n", frag_map_.size());
+
+  free_list_.clear();
+  frag_map_.clear();
+}
+
+uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
+  // Allocate `bytes` of address space. When `addr` is non-zero and satisfies `align`, first try to
+  // place the allocation exactly at `addr`; if that range is not entirely inside one free
+  // fragment, fall back to AllocImpl(), which picks the address. Returns the allocated address
+  // (AllocImpl() returns 0 when nothing fits).
+  if (addr > 0 &&
+      (align == 0 || (addr % align) == 0)) {
+    lock_guard<mutex> gard(lock_);
+
+    // Fragments are keyed by base address, so the only one that can contain `addr` is the last
+    // fragment whose base is <= addr. That may be the very first entry of frag_map_, and there
+    // may be none at all when `addr` lies below the managed range.
+    auto frag_it = frag_map_.upper_bound(addr);
+    if (frag_it != frag_map_.begin()) {
+      --frag_it;
+      const uint64_t base = frag_it->first;
+      const uint64_t size = frag_it->second.size;
+
+      // addr >= base holds here; the subtraction form avoids overflow in addr + bytes.
+      if (is_free(frag_it->second) && bytes <= size && addr - base <= size - bytes) {
+        auto free_it = frag_it->second.free_list_entry_;
+        assert(free_it != free_list_.end());
+        free_list_.erase(free_it);
+
+        if (addr == base) {
+          // Reuse the existing map entry as the used fragment.
+          frag_it->second.size = bytes;
+          set_used(frag_it->second);
+        } else {
+          // The used fragment must be keyed by `addr` so Free(addr) can find it; the entry at
+          // `base` becomes the free remainder [base, addr), as in AllocImpl().
+          add_used_fragment(bytes, addr);
+          add_free_fragment(addr - base, base);
+        }
+
+        // [addr + bytes, base + size)
+        if (base + size > addr + bytes) add_free_fragment(base + size - addr - bytes, addr + bytes);
+
+        return addr;
+      }
+    }
+  }
+
+  // Allocate not fixed address
+  return AllocImpl(bytes, align);
+}
+
+uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) {  // Picks the address itself; returns 0 when nothing fits.
+  uint64_t addr = 0;
+  uint64_t align_bytes = bytes;  // size searched for; grows by new_align on the retry pass
+  const int retry = align == 0 ? 0 : 1;  // one extra pass only when an alignment was requested
+  const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_);
+  // NOTE(review): padding uses new_align while the offset below is computed with `align` - confirm intended.
+  lock_guard<mutex> gard(lock_);
+  for (int i = 0; i <= retry; i++) {
+    auto free_it = free_list_.lower_bound(align_bytes);  // first free entry whose size is >= align_bytes
+    if (free_it == free_list_.end()) break;
+
+    uint64_t base = free_it->second;
+    uint64_t size = free_it->first;
+
+    assert(size >= align_bytes);
+
+    auto fragment = frag_map_.find(base);
+    // The free list entry and the fragment map entry must describe the same fragment.
+    assert(fragment != frag_map_.end());
+    assert(size == fragment->second.size);
+
+    uint64_t delta = align == 0 ? 0 : base % align;
+    if (delta == 0) {
+      // The fragment base is already aligned: allocate from its start.
+      addr = base;
+
+      free_list_.erase(free_it);
+      fragment->second.size = bytes;
+      set_used(fragment->second);
+      // Anything beyond the allocation goes back as a free fragment.
+      if (size > bytes) add_free_fragment(size - bytes, base + bytes);
+
+      break;
+    } else if (i == 0) {
+      align_bytes += new_align;  // misaligned on the first pass: search again with room for padding
+      continue;
+    } else {
+      uint64_t aligned_base = base + align - delta;  // next multiple of `align` above base
+      addr = aligned_base;
+
+      free_list_.erase(free_it);
+      // Used fragment at aligned_base; the leading gap [base, aligned_base) is added as free at `base`.
+      add_used_fragment(bytes, aligned_base);
+      add_free_fragment(aligned_base - base, base);
+      // Trailing remainder after the allocation, if any.
+      if (size > aligned_base - base + bytes)
+        add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes);
+
+      break;
+    }
+  }
+  return addr;
+}
+
+void VaMgr::Free(uint64_t addr) {
+  // Release the used fragment starting at `addr`, coalescing it with free neighbours.
+  // A zero or unknown address, or an already-free fragment, is silently ignored.
+  if (addr == 0) return;
+  lock_guard<mutex> gard(lock_);
+  auto cur = frag_map_.find(addr);
+  if (cur == frag_map_.end()) return;
+  if (is_free(cur->second)) return;
+  uint64_t merged_base = addr;
+  // Fold this fragment into its predecessor when that one is free.
+  if (cur != frag_map_.begin()) {
+    auto below = cur;
+    --below;
+    if (is_free(below->second)) {
+      remove_free_list_entry(below->second);
+      merged_base -= below->second.size;
+      below->second.size += cur->second.size;
+      frag_map_.erase(cur);
+      cur = below;
+    }
+  }
+  // Swallow the successor when it is free.
+  auto above = cur;
+  ++above;
+  if (above != frag_map_.end() && is_free(above->second)) {
+    remove_free_list_entry(above->second);
+    cur->second.size += above->second.size;
+    frag_map_.erase(above);
+  }
+  // Publish the merged range in the free list.
+  const uint64_t merged_size = cur->second.size;
+  auto entry = free_list_.insert(make_pair(merged_size, merged_base));
+  set_free(cur->second, entry);
+}
+
+} // namespace thunk
+} // namespace wsl