Add 'projects/rocr-runtime/libhsakmt/src/dxg/' from commit '029690f0a4f62fefefbb67305a066a72e99f8c0b'

git-subtree-dir: projects/rocr-runtime/libhsakmt/src/dxg
git-subtree-mainline: 8760fb4976
git-subtree-split: 029690f0a4
Esse commit está contido em:
German Andryeyev
2026-01-15 15:51:21 -05:00
38 arquivos alterados com 11831 adições e 0 exclusões
@@ -0,0 +1,39 @@
/*
* Copyright © 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
 * AIS file read/write entry point. This backend does not implement it:
 * after the CHECK_DXG_OPEN() guard it logs a one-time warning and reports
 * HSAKMT_STATUS_NOT_SUPPORTED. None of the arguments are read or written.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(
    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAint32 fd,
    HSAint64 file_offset, HsaAisFlags AisFlags,
    HSAuint64 *SizeCopiedInBytes, HSAint32 *status) {
  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
@@ -0,0 +1,126 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <cstring>
// File-local capability bitmask handed back by hsaKmtGetRuntimeCapabilities();
// it starts out as 0.
static uint32_t runtime_capabilities_mask = 0;
/**
 * Debugger registration is unavailable in this backend: after the
 * CHECK_DXG_OPEN() guard, warn once and return HSAKMT_STATUS_NOT_SUPPORTED.
 * NodeId is ignored.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Debugger unregistration is unavailable in this backend: after the
 * CHECK_DXG_OPEN() guard, warn once and return HSAKMT_STATUS_NOT_SUPPORTED.
 * NodeId is ignored.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Wavefront control stub. No argument is inspected; the function warns once
 * and returns HSAKMT_STATUS_NOT_SUPPORTED after the CHECK_DXG_OPEN() guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId,
                                                  HSA_DBG_WAVEOP Operand,
                                                  HSA_DBG_WAVEMODE Mode,
                                                  HSAuint32 TrapId,
                                                  HsaDbgWaveMessage *DbgWaveMsgRing)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Address-watch stub. The watch arrays are never touched; the function warns
 * once and returns HSAKMT_STATUS_NOT_SUPPORTED after the CHECK_DXG_OPEN()
 * guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId,
                                              HSAuint32 NumWatchPoints,
                                              HSA_DBG_WATCH_MODE WatchMode[],
                                              void *WatchAddress[],
                                              HSAuint64 WatchMask[],
                                              HsaEvent *WatchEvent[])
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Reports whether runtime debugging is available. In this backend the answer
 * is always HSAKMT_STATUS_NOT_SUPPORTED (after the CHECK_DXG_OPEN() guard and
 * a one-time warning).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Enables runtime debug support.
 *
 * The status from hsaKmtCheckRuntimeDebugSupport() is propagated unchanged
 * whenever it is non-zero. The path past that check is not implemented: it
 * asserts in debug builds and otherwise returns HSAKMT_STATUS_SUCCESS.
 * rDebug and setupTtmp are not used.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp)
{
  const HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();
  if (support) {
    return support;
  }

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Disables runtime debug support.
 *
 * When hsaKmtCheckRuntimeDebugSupport() reports a non-zero status there is
 * nothing to disable, so the call returns HSAKMT_STATUS_SUCCESS. The path
 * past that check is not implemented: it asserts in debug builds and
 * otherwise also returns HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
{
  const HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();
  if (support) {
    return HSAKMT_STATUS_SUCCESS;
  }

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Copies the file-local runtime_capabilities_mask into *caps_mask and returns
 * HSAKMT_STATUS_SUCCESS (after the CHECK_DXG_OPEN() guard).
 * caps_mask is dereferenced without a null check.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask)
{
  CHECK_DXG_OPEN();

  caps_mask[0] = runtime_capabilities_mask;
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Debug-enable stub. Neither output pointer is written; the function warns
 * once and returns HSAKMT_STATUS_NOT_SUPPORTED after the CHECK_DXG_OPEN()
 * guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, HSAuint32 *data_size)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Debug-disable stub: warns once and returns HSAKMT_STATUS_NOT_SUPPORTED
 * after the CHECK_DXG_OPEN() guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Device debug-data query stub. No output parameter is written; the function
 * warns once and returns HSAKMT_STATUS_NOT_SUPPORTED after the
 * CHECK_DXG_OPEN() guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, HSAuint32 *n_entries,
                                               HSAuint32 *entry_size)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Queue debug-data query stub. No output parameter is written and
 * suspend_queues is ignored; the function warns once and returns
 * HSAKMT_STATUS_NOT_SUPPORTED after the CHECK_DXG_OPEN() guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
                                              HSAuint32 *n_entries,
                                              HSAuint32 *entry_size,
                                              bool suspend_queues)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/**
 * Debug-trap ioctl stub. args, Queues and DebugReturn are not used; the
 * function warns once and returns HSAKMT_STATUS_NOT_SUPPORTED after the
 * CHECK_DXG_OPEN() guard.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args,
                                             HSA_QUEUEID *Queues,
                                             HSAuint64 *DebugReturn)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
@@ -0,0 +1,148 @@
/*
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*/
#include "dxcore_loader.h"
#include "librocdxg.h"
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <ntstatus.h>
namespace wsl {
namespace thunk {
namespace dxcore {
// Builds an unloaded loader: every D3DKMT entry-point pointer is null and no
// library handle is held until Initialize() succeeds.
//
// The initializer list follows the member declaration order of the class
// (all pfn_ pointers first, then dxcore_handle_, then init_flag_). Members are
// always initialized in declaration order, so listing them the same way keeps
// the list truthful and avoids -Wreorder diagnostics.
DxcoreLoader::DxcoreLoader()
    : pfn_D3DKMTCreateAllocation2(nullptr)
    , pfn_D3DKMTDestroyAllocation2(nullptr)
    , pfn_D3DKMTMapGpuVirtualAddress(nullptr)
    , pfn_D3DKMTReserveGpuVirtualAddress(nullptr)
    , pfn_D3DKMTFreeGpuVirtualAddress(nullptr)
    , pfn_D3DKMTCreateDevice(nullptr)
    , pfn_D3DKMTDestroyDevice(nullptr)
    , pfn_D3DKMTEnumAdapters2(nullptr)
    , pfn_D3DKMTQueryAdapterInfo(nullptr)
    , pfn_D3DKMTCreateContextVirtual(nullptr)
    , pfn_D3DKMTDestroyContext(nullptr)
    , pfn_D3DKMTSubmitCommand(nullptr)
    , pfn_D3DKMTCreateSynchronizationObject2(nullptr)
    , pfn_D3DKMTDestroySynchronizationObject(nullptr)
    , pfn_D3DKMTQueryStatistics(nullptr)
    , pfn_D3DKMTEscape(nullptr)
    , pfn_D3DKMTLock2(nullptr)
    , pfn_D3DKMTUnlock2(nullptr)
    , pfn_D3DKMTCreatePagingQueue(nullptr)
    , pfn_D3DKMTDestroyPagingQueue(nullptr)
    , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr)
    , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr)
    , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr)
    , pfn_D3DKMTQueryClockCalibration(nullptr)
    , pfn_D3DKMTMakeResident(nullptr)
    , pfn_D3DKMTEvict(nullptr)
    , pfn_D3DKMTShareObjects(nullptr)
    , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr)
    , pfn_D3DKMTOpenResourceFromNtHandle(nullptr)
    , pfn_D3DKMTCreateHwQueue(nullptr)
    , pfn_D3DKMTDestroyHwQueue(nullptr)
    , pfn_D3DKMTSubmitCommandToHwQueue(nullptr)
    , dxcore_handle_(nullptr)
    , init_flag_() {
}
// Releases the library handle, if one is held, by delegating to Shutdown().
DxcoreLoader::~DxcoreLoader()
{
  this->Shutdown();
}
// Loads libdxcore.so and resolves every D3DKMT entry point.
//
// Returns true when the library is loaded and all symbols were resolved,
// false otherwise (the handle is released again on a partial load).
//
// Calling Initialize() on an already-loaded instance is a no-op that returns
// true. Without this guard a second call would overwrite dxcore_handle_ with
// a fresh dlopen() reference while Shutdown() only ever releases one, so the
// library reference would leak.
//
// NOTE(review): this function takes no lock and does not use init_flag_;
// concurrent first-time callers are presumably serialized elsewhere - confirm.
bool DxcoreLoader::Initialize() {
  if (IsLoaded()) {
    return true;
  }

  dlerror();  // Clear any stale error state before dlopen().
  dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY);
  if (!dxcore_handle_) {
    pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror());
    return false;
  }
  pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n");

  if (!LoadDxcoreApis()) {
    // A partially resolved API set is unusable: drop the handle so that
    // IsLoaded() reports failure.
    dlclose(dxcore_handle_);
    dxcore_handle_ = nullptr;
    return false;
  }
  return IsLoaded();
}
// Closes the libdxcore.so handle when one is held and logs the outcome.
// The handle is cleared whether or not dlclose() succeeds; calling this with
// no handle held does nothing. The pfn_ members are left as they are.
void DxcoreLoader::Shutdown() {
  if (dxcore_handle_ == nullptr) {
    return;
  }

  const int close_rc = dlclose(dxcore_handle_);
  if (close_rc == 0) {
    pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n");
  } else {
    pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror());
  }
  dxcore_handle_ = nullptr;
}
bool DxcoreLoader::LoadDxcoreApis() {
if (!dxcore_handle_) {
pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n");
return false;
}
dlerror(); // Clear error
// Load all D3DKMT functions
#define LOAD_DXCORE_API(func_name) \
DXCORE_PFN(func_name) = (DXCORE_DEF(func_name)*)dlsym(dxcore_handle_, #func_name); \
if (!DXCORE_PFN(func_name)) { \
pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \
goto ERROR; \
}
LOAD_DXCORE_API(D3DKMTCreateAllocation2);
LOAD_DXCORE_API(D3DKMTDestroyAllocation2);
LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress);
LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress);
LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress);
LOAD_DXCORE_API(D3DKMTCreateDevice);
LOAD_DXCORE_API(D3DKMTDestroyDevice);
LOAD_DXCORE_API(D3DKMTEnumAdapters2);
LOAD_DXCORE_API(D3DKMTQueryAdapterInfo);
LOAD_DXCORE_API(D3DKMTCreateContextVirtual);
LOAD_DXCORE_API(D3DKMTDestroyContext);
LOAD_DXCORE_API(D3DKMTSubmitCommand);
LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2);
LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject);
LOAD_DXCORE_API(D3DKMTQueryStatistics);
LOAD_DXCORE_API(D3DKMTEscape);
LOAD_DXCORE_API(D3DKMTLock2);
LOAD_DXCORE_API(D3DKMTUnlock2);
LOAD_DXCORE_API(D3DKMTCreatePagingQueue);
LOAD_DXCORE_API(D3DKMTDestroyPagingQueue);
LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu);
LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu);
LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu);
LOAD_DXCORE_API(D3DKMTQueryClockCalibration);
LOAD_DXCORE_API(D3DKMTMakeResident);
LOAD_DXCORE_API(D3DKMTEvict);
LOAD_DXCORE_API(D3DKMTShareObjects);
LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle);
LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle);
LOAD_DXCORE_API(D3DKMTCreateHwQueue);
LOAD_DXCORE_API(D3DKMTDestroyHwQueue);
LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue);
#undef LOAD_DXCORE_API
pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n");
return true;
ERROR:
pr_err("[DxcoreLoader] Failed to load DXCore APIs\n");
return false;
}
} // namespace dxcore
} // namespace thunk
} // namespace wsl
@@ -0,0 +1,148 @@
/*
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef LIBROCDXG_DXCORE_LOADER_H
#define LIBROCDXG_DXCORE_LOADER_H
#include "impl/wddm/types.h"
#include <dlfcn.h>
#include <mutex>
// Expands to the pfn_<function_name> member of the DxcoreLoader singleton, so
// a D3DKMT entry point can be invoked as DXCORE_CALL(Name)(args...).
// The macro itself performs no null check on the pointer.
#define DXCORE_CALL(function_name) wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name
namespace wsl {
namespace thunk {
namespace dxcore {
/**
* @brief DxcoreLoader class for dynamic loading of libdxcore.so
*
* This class provides a singleton loader for the DXCore library, allowing
* optional loading based on environment variable LIBROCDXG_ENABLE_DXCORE.
* Supported values: "1", "true", "yes" (case-sensitive).
* If not set or invalid, fallback to stub implementations.
*
* Thread-safe initialization using std::call_once.
*/
// Macro definitions mimicking HSAKMT design
// DXCORE_DEF(Name) -> PFNName  : name of the function *type* for an entry point.
// DXCORE_PFN(Name) -> pfn_Name : name of the function-pointer *member* holding it.
#define DXCORE_DEF(function_name) PFN##function_name
#define DXCORE_PFN(function_name) pfn_##function_name
// Singleton that opens libdxcore.so with dlopen() and exposes the resolved
// D3DKMT entry points as public function-pointer members (normally reached
// through the DXCORE_CALL macro). Copying is disabled.
class DxcoreLoader {
public:
// D3DKMT function type definitions.
// Each typedef names a function type (not a pointer type); the matching
// member below is declared as a pointer to it. Every entry point except
// D3DKMTShareObjects takes a single opaque argument-struct pointer.
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommand))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle);
typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args);
typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args);
// Returns the process-wide loader. The instance is allocated with new on
// first use and never deleted, so its destructor does not run at exit.
static DxcoreLoader& Instance() {
static DxcoreLoader* instance = new DxcoreLoader();
return (*instance);
}
// Opens the library and resolves all entry points; true on full success.
bool Initialize();
// Closes the library handle if one is held.
void Shutdown();
// True while a library handle is held.
bool IsLoaded() const { return dxcore_handle_ != nullptr; }
// Function pointer declarations (one per typedef above, same order).
// The constructor sets all of them to nullptr.
DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2);
DXCORE_DEF(D3DKMTDestroyAllocation2)* DXCORE_PFN(D3DKMTDestroyAllocation2);
DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress);
DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress);
DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress);
DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice);
DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice);
DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2);
DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo);
DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual);
DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext);
DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand);
DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2);
DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject);
DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics);
DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape);
DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2);
DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2);
DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue);
DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue);
DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu);
DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu);
DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu);
DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration);
DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident);
DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict);
DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects);
DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle);
DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle);
DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue);
DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue);
DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue);
private:
// Construction and destruction are private; use Instance().
DxcoreLoader();
~DxcoreLoader();
// Resolves every pfn_ member with dlsym(); false if any symbol is missing.
bool LoadDxcoreApis();
// Handle returned by dlopen(); nullptr while the library is not loaded.
void* dxcore_handle_;
// NOTE(review): intended for thread-safe initialization, but it is only
// value-initialized in the constructor in the code visible here - confirm
// whether a std::call_once user exists elsewhere.
std::once_flag init_flag_; // For thread-safe initialization
// Disable copy
DxcoreLoader(const DxcoreLoader&) = delete;
DxcoreLoader& operator=(const DxcoreLoader&) = delete;
};
} // namespace dxcore
} // namespace thunk
} // namespace wsl
#endif // LIBROCDXG_DXCORE_LOADER_H
@@ -0,0 +1,127 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <cstdio>
#include <cassert>
#include <thread>
#include <chrono>
/**
 * Event creation is not supported by this backend. After the
 * CHECK_DXG_OPEN() guard the call warns once, asserts in debug builds and
 * otherwise returns HSAKMT_STATUS_SUCCESS without writing *Event.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
                                          bool ManualReset,
                                          bool IsSignaled,
                                          HsaEvent **Event)
{
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Destroys an event. A null Event is accepted silently and yields
 * HSAKMT_STATUS_SUCCESS. For a non-null Event the operation is unsupported:
 * warn once, assert in debug builds, otherwise return HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
{
  CHECK_DXG_OPEN();

  if (Event == nullptr) {
    return HSAKMT_STATUS_SUCCESS;
  }

  pr_warn_once("not supported\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Signals an event; unsupported in this backend. The one-time warning is
 * issued first, then a null Event yields HSAKMT_STATUS_INVALID_HANDLE. A
 * non-null Event asserts in debug builds and otherwise returns
 * HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
{
  CHECK_DXG_OPEN();
  pr_warn_once("not supported\n");

  if (Event == nullptr) {
    return HSAKMT_STATUS_INVALID_HANDLE;
  }

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Resets an event; unsupported in this backend. The one-time warning is
 * issued first, then a null Event yields HSAKMT_STATUS_INVALID_HANDLE. A
 * non-null Event asserts in debug builds and otherwise returns
 * HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
{
  CHECK_DXG_OPEN();
  pr_warn_once("not supported\n");

  if (Event == nullptr) {
    return HSAKMT_STATUS_INVALID_HANDLE;
  }

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Queries an event's state; unsupported in this backend. The one-time warning
 * is issued first, then a null Event yields HSAKMT_STATUS_INVALID_HANDLE. A
 * non-null Event asserts in debug builds and otherwise returns
 * HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
{
  CHECK_DXG_OPEN();
  pr_warn_once("not supported\n");

  if (Event == nullptr) {
    return HSAKMT_STATUS_INVALID_HANDLE;
  }

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Waits on a single event without reporting an event age; forwards to
 * hsaKmtWaitOnEvent_Ext() with a null event_age pointer.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, HSAuint32 Milliseconds)
{
  uint64_t *const no_event_age = NULL;
  return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, no_event_age);
}
/**
 * Waits on a single event. A null Event is rejected with
 * HSAKMT_STATUS_INVALID_HANDLE; otherwise the call is forwarded to
 * hsaKmtWaitOnMultipleEvents_Ext() as a one-element, wait-on-all request.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
                                              HSAuint32 Milliseconds,
                                              uint64_t *event_age)
{
  if (Event == nullptr) {
    return HSAKMT_STATUS_INVALID_HANDLE;
  }

  // The address of the by-value parameter serves as a one-element array.
  const bool wait_on_all = true;
  return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, wait_on_all, Milliseconds, event_age);
}
/**
 * Waits on several events without reporting event ages; forwards to
 * hsaKmtWaitOnMultipleEvents_Ext() with a null event_age pointer.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
                                                   HSAuint32 NumEvents,
                                                   bool WaitOnAll,
                                                   HSAuint32 Milliseconds)
{
  uint64_t *const no_event_age = NULL;
  return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, Milliseconds,
                                        no_event_age);
}
/**
 * Waits on a set of events.
 *
 * Only one shape of request is handled: a single-element array whose element
 * is null. In that case the call sleeps for 20 microseconds and returns
 * HSAKMT_STATUS_SUCCESS. A null Events array yields
 * HSAKMT_STATUS_INVALID_HANDLE. Every other request asserts in debug builds
 * and otherwise returns HSAKMT_STATUS_SUCCESS.
 * WaitOnAll, Milliseconds and event_age are not used.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
                                                       HSAuint32 NumEvents,
                                                       bool WaitOnAll,
                                                       HSAuint32 Milliseconds,
                                                       uint64_t *event_age)
{
  CHECK_DXG_OPEN();

  if (Events == nullptr) {
    return HSAKMT_STATUS_INVALID_HANDLE;
  }

  const bool lone_null_event = (NumEvents == 1) && (Events[0] == nullptr);
  if (!lone_null_event) {
    assert(false);
    return HSAKMT_STATUS_SUCCESS;
  }

  std::this_thread::sleep_for(std::chrono::microseconds(20));
  return HSAKMT_STATUS_SUCCESS;
}
/**
 * Opens an SMI event channel for a node; not implemented in this backend.
 *
 * After the CHECK_DXG_OPEN() guard the node id is logged at debug level, then
 * the call asserts in debug builds and otherwise returns
 * HSAKMT_STATUS_SUCCESS without writing *fd.
 *
 * @param NodeId  Node whose id is logged.
 * @param fd      Unused; left untouched.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) {
  CHECK_DXG_OPEN();
  // NodeId is unsigned, so it is printed with %u rather than %d.
  pr_debug("node id %u\n", NodeId);
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
+137
Ver Arquivo
@@ -0,0 +1,137 @@
#include <dlfcn.h>
#include "impl/hsa/hsa.h"
#include "impl/hsa/hsa_ven_amd_loader.h"
// File-local mutex locked by the lazy symbol-resolution code in this file.
// It is heap-allocated and not deleted anywhere in the visible code.
static std::mutex* lock_ = new std::mutex();
#if 1
// Lazily resolves the symbol named _sym into the caller's function pointer
// fn_<_sym> with dlsym(RTLD_DEFAULT, ...).
// The pointer is tested, *lock_ is taken, and the pointer is tested again
// before the lookup, so the lookup is skipped once the pointer is non-null.
// A failed lookup is logged and leaves the pointer null, so the next call
// retries it.
// NOTE(review): fn_<_sym> is a plain (non-atomic) pointer read outside the
// lock - confirm this is acceptable for the supported toolchains.
#define _HSAKMT_LOOKUP_SYMS(_sym) \
if (fn_##_sym == nullptr) { \
std::lock_guard<std::mutex> gard(*lock_); \
if (fn_##_sym == nullptr) { \
fn_##_sym = \
reinterpret_cast<decltype(fn_##_sym)>(dlsym(RTLD_DEFAULT, #_sym)); \
if (!fn_##_sym) { \
pr_err("%s not found - %s\n", #_sym, dlerror()); \
} \
} \
}
// Calls through fn_<_sym> with the given arguments and returns its result
// from the enclosing function when the pointer is non-null; otherwise falls
// through so the caller can return its own fallback value.
// The definition deliberately ends without a semicolon: the call site's own
// ';' completes the do/while(0) statement, which keeps the macro usable as a
// single statement (for example before an `else`).
#define _HSAKMT_EXEC_API(_sym, ...)     \
  do {                                  \
    if (fn_##_sym != nullptr) {         \
      return fn_##_sym(__VA_ARGS__);    \
    }                                   \
  } while (0)
// Checks that libhsa-runtime64.so can be opened with
// RTLD_NOW | RTLD_GLOBAL. The handle obtained for the check is closed again
// right away. Returns true when dlopen() succeeded; otherwise logs the
// dlerror() text and returns false.
bool hsakmt_hsa_loader_init() {
  void *const runtime_lib = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL);
  if (runtime_lib != nullptr) {
    dlclose(runtime_lib);
    return true;
  }

  pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror());
  return false;
}
// Forwards to hsa_signal_load_relaxed, resolved lazily with dlsym() on first
// use. Returns 0 when the symbol cannot be resolved.
hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal)
{
  using load_relaxed_fn = hsa_signal_value_t (*)(hsa_signal_t);
  static load_relaxed_fn fn_hsa_signal_load_relaxed = nullptr;

  _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed);
  _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal);

  return 0;
}
// Forwards to hsa_signal_wait_relaxed, resolved lazily with dlsym() on first
// use. Returns 0 when the symbol cannot be resolved.
hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(hsa_signal_t signal,
                                                  hsa_signal_condition_t condition,
                                                  hsa_signal_value_t compare_value,
                                                  uint64_t timeout_hint,
                                                  hsa_wait_state_t wait_state_hint)
{
  using wait_relaxed_fn = hsa_signal_value_t (*)(hsa_signal_t, hsa_signal_condition_t,
                                                 hsa_signal_value_t, uint64_t,
                                                 hsa_wait_state_t);
  static wait_relaxed_fn fn_hsa_signal_wait_relaxed = nullptr;

  _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed);
  _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value,
                   timeout_hint, wait_state_hint);

  return 0;
}
// Forwards to hsa_signal_store_screlease, resolved lazily with dlsym() on
// first use. Does nothing when the symbol cannot be resolved.
void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, hsa_signal_value_t value)
{
  using store_screlease_fn = void (*)(hsa_signal_t, hsa_signal_value_t);
  static store_screlease_fn fn_hsa_signal_store_screlease = nullptr;

  _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease);
  _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value);
}
// Forwards to the AMD loader extension's query_host_address entry point.
//
// On first use the extension table (version 1.3) is fetched through
// hsa_system_get_extension_table, itself resolved with dlsym(RTLD_DEFAULT).
// The entry point is cached only when that query succeeds, so a failed query
// is retried on the next call instead of caching a bogus pointer.
//
// Returns the extension's own status on success paths, or HSA_STATUS_ERROR
// when the table function or the entry point is unavailable.
hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
    const void *device_address, const void **host_address) {
  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
      const void *device_address, const void **host_address) = nullptr;

  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
    std::lock_guard<std::mutex> gard(*lock_);
    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
      hsa_status_t (*fn_hsa_system_get_extension_table)(
          uint16_t extension, uint16_t version_major, uint16_t version_minor,
          void *table);
      fn_hsa_system_get_extension_table =
          reinterpret_cast<decltype(fn_hsa_system_get_extension_table)>(
              dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table"));
      if (fn_hsa_system_get_extension_table == nullptr) {
        // dlerror() may return NULL; never hand NULL to a %s conversion.
        const char *why = dlerror();
        pr_err("%s not found - %s\n", "hsa_system_get_extension_table",
               why ? why : "unknown error");
        return HSA_STATUS_ERROR;
      }

      // Zero-initialize so that no indeterminate pointer can ever be cached.
      hsa_ven_amd_loader_1_03_pfn_t table = {};
      const hsa_status_t table_status = fn_hsa_system_get_extension_table(
          HSA_EXTENSION_AMD_LOADER, 1, 3, &table);
      if (table_status != HSA_STATUS_SUCCESS) {
        pr_err("%s failed - %d\n", "hsa_system_get_extension_table",
               static_cast<int>(table_status));
        return HSA_STATUS_ERROR;
      }
      fn_hsa_ven_amd_loader_query_host_address =
          table.hsa_ven_amd_loader_query_host_address;
    }
  }

  if (fn_hsa_ven_amd_loader_query_host_address != nullptr) {
    return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address);
  }
  return HSA_STATUS_ERROR;
}
#else
// Direct-call variant: forwards straight to hsa_signal_load_relaxed.
hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal)
{
  const hsa_signal_value_t loaded = hsa_signal_load_relaxed(signal);
  return loaded;
}
// Direct-call variant: forwards straight to hsa_signal_wait_relaxed.
hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(hsa_signal_t signal,
                                                  hsa_signal_condition_t condition,
                                                  hsa_signal_value_t compare_value,
                                                  uint64_t timeout_hint,
                                                  hsa_wait_state_t wait_state_hint)
{
  const hsa_signal_value_t observed = hsa_signal_wait_relaxed(
      signal, condition, compare_value, timeout_hint, wait_state_hint);
  return observed;
}
// Direct-call variant: forwards straight to hsa_signal_store_screlease.
void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, hsa_signal_value_t value)
{
  hsa_signal_store_screlease(hsa_signal, value);
  return;
}
// Direct-call variant: forwards to the AMD loader extension's
// query_host_address entry point, fetched on first use through
// hsa_system_get_extension_table (version 1.3).
//
// The entry point is cached only when the table query succeeds, so a failed
// query is retried on the next call instead of caching an indeterminate
// pointer. Returns HSA_STATUS_ERROR when the entry point is unavailable.
hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
    const void *device_address, const void **host_address) {
  static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)(
      const void *device_address, const void **host_address) = nullptr;

  if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
    std::lock_guard<std::mutex> gard(*lock_);
    if (fn_hsa_ven_amd_loader_query_host_address == nullptr) {
      // Zero-initialize so that no indeterminate pointer can ever be cached.
      hsa_ven_amd_loader_1_03_pfn_t table = {};
      const hsa_status_t table_status =
          hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table);
      if (table_status != HSA_STATUS_SUCCESS) {
        return HSA_STATUS_ERROR;
      }
      fn_hsa_ven_amd_loader_query_host_address =
          table.hsa_ven_amd_loader_query_host_address;
    }
  }

  if (fn_hsa_ven_amd_loader_query_host_address)
    return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address);
  return HSA_STATUS_ERROR;
}
#endif
@@ -0,0 +1,31 @@
/*
* Copyright © 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
 * Reports whether the model is enabled. This backend always writes false to
 * *enable, warns once and returns HSAKMT_STATUS_SUCCESS.
 * enable is dereferenced without a null check.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable) {
  enable[0] = false;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_SUCCESS;
}
@@ -0,0 +1,182 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include <cstdint>
#include "impl/wddm/types.h"
#include "impl/wddm/device.h"
/**
 * Returns the WDDMDevice object registered for NodeId, reinterpreted as an
 * HsaAMDGPUDeviceHandle.
 *
 * Yields HSAKMT_STATUS_ERROR (leaving *DeviceHandle untouched) when
 * get_wddmdev() has no device for the node.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId,
                                                    HsaAMDGPUDeviceHandle *DeviceHandle)
{
  CHECK_DXG_OPEN();

  wsl::thunk::WDDMDevice *device = get_wddmdev(NodeId);
  if (device == nullptr) {
    return HSAKMT_STATUS_ERROR;
  }

  *DeviceHandle = reinterpret_cast<HsaAMDGPUDeviceHandle>(device);
  return HSAKMT_STATUS_SUCCESS;
}
// libdrm-amdgpu shim: accepts the call and reports success (0) without
// reading fd or writing any of the output parameters.
HSAKMTAPI int amdgpu_device_initialize(int fd, uint32_t *major_version,
                                       uint32_t *minor_version,
                                       amdgpu_device_handle *device_handle)
{
  const int ok = 0;
  return ok;
}
// libdrm-amdgpu shim: nothing to tear down; always reports success (0).
HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle)
{
  const int ok = 0;
  return ok;
}
// libdrm-amdgpu shim: zeroes *info and fills in only gpu_counter_freq, taken
// from the device's GPUCounterFrequency() divided by 1000. dev is treated as
// a wsl::thunk::WDDMDevice pointer. Always returns 0.
HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev, struct amdgpu_gpu_info *info)
{
  auto *device = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);

  memset(info, 0, sizeof(*info));
  info->gpu_counter_freq = device->GPUCounterFrequency() / 1000ull;

  return 0;
}
// libdrm-amdgpu shim: every device reports the runtime-wide dxg_fd; dev is
// not consulted.
HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev)
{
  const int runtime_fd = dxg_runtime->dxg_fd;
  return runtime_fd;
}
// libdrm-amdgpu shim: bo is treated as a wsl::thunk::GpuMemory pointer.
// When the memory reports IsSysMemFd(), its CpuAddress() is stored in *cpu.
// For any other memory *cpu is left exactly as the caller passed it.
// Always returns 0.
HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu)
{
  auto *memory = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);

  if (memory->IsSysMemFd()) {
    *cpu = memory->CpuAddress();
  }

  return 0;
}
// libdrm-amdgpu shim: releases the GpuMemory behind buf_handle through
// hsaKmtFreeMemory(). The address passed is the GPU address when the memory
// reports IsVaAllocated(), otherwise its HandleApeAddress().
// Returns 0 on HSAKMT_STATUS_SUCCESS and -1 for any other status.
HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle)
{
  auto *memory = reinterpret_cast<wsl::thunk::GpuMemory *>(buf_handle);

  void *free_address;
  if (memory->IsVaAllocated()) {
    free_address = (void*)memory->GpuAddress();
  } else {
    free_address = (void*)memory->HandleApeAddress();
  }

  const auto status = hsaKmtFreeMemory(free_address, memory->Size());
  if (status == HSAKMT_STATUS_SUCCESS) {
    return 0;
  }
  return -1;
}
// libdrm-amdgpu shim: no handle is actually exported. *shared_handle is set
// to 0 and the call reports success (0); bo and type are ignored.
HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo, enum amdgpu_bo_handle_type type,
                               uint32_t *shared_handle)
{
  shared_handle[0] = 0;
  return 0;
}
// libdrm-amdgpu shim: imports a dma-buf file descriptor.
//
// Only amdgpu_bo_handle_type_dma_buf_fd is handled; any other type logs an
// error and returns -1. dev is treated as a wsl::thunk::WDDMDevice pointer.
// A VA is requested from import_dmabuf_fd() exactly when
// is_ipc_sysmemfd(shared_handle) is true. On success the resulting
// GpuMemoryHandle is stored in output->buf_handle and 0 is returned;
// otherwise -1.
HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev,
                               enum amdgpu_bo_handle_type type,
                               uint32_t shared_handle,
                               struct amdgpu_bo_import_result *output)
{
  if (type != amdgpu_bo_handle_type_dma_buf_fd) {
    pr_err("not implemented\n");
    return -1;
  }

  auto *device = reinterpret_cast<wsl::thunk::WDDMDevice *>(dev);
  wsl::thunk::GpuMemoryHandle imported;

  const bool from_ipc_memfd = is_ipc_sysmemfd(shared_handle);
  const bool want_va = from_ipc_memfd;

  const HSAKMT_STATUS status = import_dmabuf_fd(shared_handle, device->NodeId(),
                                                want_va, from_ipc_memfd, &imported);
  if (status != HSAKMT_STATUS_SUCCESS) {
    return -1;
  }

  // The GpuMemory object handle doubles as the drm buffer handle.
  output->buf_handle = reinterpret_cast<amdgpu_bo_handle>(imported);
  return 0;
}
// libdrm-amdgpu shim: maps or unmaps a buffer object at a GPU virtual
// address. bo is treated as a wsl::thunk::GpuMemory pointer.
//
// AMDGPU_VA_OP_MAP:
//   - already mapped at addr             -> log and return 0
//   - mapped at some other address       -> log an error and return -1
//   - otherwise map, then make resident  -> -1 if either step fails
// AMDGPU_VA_OP_UNMAP:
//   - unmap (-1 on failure), then Evict(); Evict()'s result is not checked
// Any other ops value is accepted and returns 0. flags is not used.
HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo, uint64_t offset, uint64_t size,
                              uint64_t addr, uint64_t flags, uint32_t ops)
{
  wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast<wsl::thunk::GpuMemory *>(bo);
  assert(gpu_mem != nullptr);

  if (ops == AMDGPU_VA_OP_MAP) {
    if (gpu_mem->GpuAddress() == addr) {
      pr_info("bo is mapped already\n");
      return 0;
    }
    if (gpu_mem->GpuAddress()) {
      pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n",
             reinterpret_cast<void *>(gpu_mem->GpuAddress()), reinterpret_cast<void *>(addr));
      return -1;
    }

    if (gpu_mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset) !=
        ErrorCode::Success) {
      return -1;
    }
    if (gpu_mem->MakeResident() != ErrorCode::Success) {
      return -1;
    }
    return 0;
  }

  if (ops == AMDGPU_VA_OP_UNMAP) {
    if (gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset) !=
        ErrorCode::Success) {
      return -1;
    }
    gpu_mem->Evict();
    return 0;
  }

  return 0;
}
/* Stub: nothing is queried and *info is not written; always returns 0. */
HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) {
  (void)bo;
  (void)info;
  return 0;
}
/* Stub: the metadata is ignored; always returns 0. */
HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) {
  (void)bo;
  (void)info;
  return 0;
}
/* Stub: no command is issued and `data` is left untouched; always returns 0. */
HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex,
                                  void *data, unsigned long size) {
  (void)fd;
  (void)drmCommandIndex;
  (void)data;
  (void)size;
  return 0;
}
@@ -0,0 +1,289 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef LIBHSAKMT_H_INCLUDED
#define LIBHSAKMT_H_INCLUDED
#include <pthread.h>
#include <stdint.h>
#include <limits.h>
#include "hsakmt/hsakmt.h"
#include "hsakmt/hsakmt_drm.h"
#include "impl/wddm/va_mgr.h"
#include "impl/wddm/types.h"
#include "impl/wddm/device.h"
#include "dxcore_loader.h"
/* Look up the WDDM device for a node id. Defined elsewhere; callers in this
 * library treat a null result as "no such device". */
wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id);
/* Defined elsewhere; presumably the number of WDDM devices - TODO confirm. */
uint32_t get_num_wddmdev();
/* Return the GpuMemory object tracked under exactly MemoryAddress in the
 * allocation map, or nullptr when that address is not a tracked key. */
wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress);
/*
 * Verbosity levels compared against hsakmtRuntime::hsakmt_debug_level by
 * hsakmt_print(): a message is emitted when its level is <= the runtime level.
 */
#define HSAKMT_DEBUG_LEVEL_ERR -1
#define HSAKMT_DEBUG_LEVEL_DEFAULT 3
#define HSAKMT_DEBUG_LEVEL_WARNING 4
#define HSAKMT_DEBUG_LEVEL_INFO 6
#define HSAKMT_DEBUG_LEVEL_DEBUG 7
/*
 * Process-wide state of the DXG (WSL) thunk: device fd and open count, fork
 * tracking, tunables, and the reserved address ranges with their managers.
 * A single instance is published through the global `dxg_runtime` pointer.
 */
struct hsakmtRuntime {
  /*
   * The initializer list follows member declaration order (members are
   * always initialised in that order, whatever the list says). page_size and
   * page_shift start at 0 instead of holding indeterminate values.
   */
  hsakmtRuntime()
      : hsakmt_mutex(PTHREAD_MUTEX_INITIALIZER),
        page_size(0),
        page_shift(0),
        dxg_fd(-1),
        parent_pid(getpid()),
        is_forked(false),
        hsakmt_debug_level(HSAKMT_DEBUG_LEVEL_DEFAULT),
        dxg_open_count(0),
        hsakmt_is_dgpu(false),
        is_svm_api_supported(false),
        zfb_support(0),
        vendor_packet_process(0),
        check_avail_sysram(false),
        max_single_alloc_size(0),
        enable_thunk_sub_allocator(0),
        default_node(1),
        local_heap_space_start_(0),
        local_heap_space_size_(0),
        system_heap_space_start_(0),
        system_heap_space_size_(0),
        handle_aperture_start_(0),
        handle_aperture_size_(0) {}

  void HeapInit();
  void HeapFini();

  bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align);
  bool FreeSvmSpace(uint64_t &base, uint64_t &size);

  /* Local heap: address space for buffers backed by VRAM. */
  bool ReserveLocalHeapSpace();
  bool FreeLocalHeapSpace();
  void InitLocalHeapMgr();

  /* System heap: address space for buffers backed by system RAM. */
  bool ReserveSystemHeapSpace();
  uint64_t SystemHeapSize() { return system_heap_space_size_; }
  bool FreeSystemHeapSpace();
  bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock);
  bool DecommitSystemHeapSpace(void* addr, int64_t size);
  void InitSystemHeapMgr();

  ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
                                     gpusize hit_base_addr, gpusize size,
                                     gpusize *out_gpu_virt_addr, gpusize alignment, bool lock);
  ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
                                  gpusize gpu_addr, gpusize size);

  /* IPC variants: the system memory is associated with a memfd. */
  bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false);
  bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd);
  ErrorCode ReserveIPCSysMem(gpusize size,
                             gpusize *out_gpu_virt_addr, gpusize alignment,
                             int &memfd, bool lock);
  ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd);

  /* Handle aperture: addresses used as handles for objects without a VA. */
  bool InitHandleApertureSpace();
  void InitHandleApertureMgr();
  ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr);
  void HandleApertureFree(gpusize gpu_addr);

  pthread_mutex_t hsakmt_mutex;
  const char *dxg_device_name = "/dev/dxg";
  long page_size;   /* 0 until set by the runtime; used by CHECK_PAGE_MULTIPLE / PAGE_ALIGN_UP */
  int page_shift;   /* 0 until set by the runtime */
  int dxg_fd = -1;
  pid_t parent_pid = -1;   /* the constructor records getpid() here */
  bool is_forked = false;  /* when set, CHECK_DXG_OPEN() rejects API calls */
  int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;
  unsigned long dxg_open_count;   /* 0 means "not opened" for CHECK_DXG_OPEN() */
  bool hsakmt_is_dgpu;
  bool is_svm_api_supported;
  int zfb_support;
  int vendor_packet_process;
  bool check_avail_sysram;        /* when set, system allocations first check free RAM */
  size_t max_single_alloc_size;   /* upper bound for one system-memory allocation */
  int enable_thunk_sub_allocator;
  uint32_t default_node;          /* node used when a caller passes node 0 */

  /* local heap means bo's backend is vram of all GPUs */
  uint64_t local_heap_space_start_;
  uint64_t local_heap_space_size_;
  /* manage the reserved local heap space which shared by CPU and GPUs */
  std::unique_ptr<wsl::thunk::VaMgr> local_heap_mgr_;

  /* system heap means bo's backend is system ram */
  uint64_t system_heap_space_start_;
  uint64_t system_heap_space_size_;
  /* manage the reserved system heap space which shared by CPU and GPUs */
  std::unique_ptr<wsl::thunk::VaMgr> system_heap_mgr_;

  uint64_t handle_aperture_start_;
  uint64_t handle_aperture_size_;
  std::unique_ptr<wsl::thunk::VaMgr> handle_aperture_mgr_;
};
/* The single runtime instance; dereferenced without a null check by the macros below. */
extern hsakmtRuntime *dxg_runtime;
/* Exported entry points get default ELF visibility; any earlier HSAKMTAPI definition is replaced. */
#undef HSAKMTAPI
#define HSAKMTAPI __attribute__((visibility ("default")))
/* SANITIZER_AMDGPU is defined only for clang builds with AddressSanitizer enabled. */
#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define SANITIZER_AMDGPU 1
#endif
#endif
/* Avoid pointer-to-int-cast warning; goes through unsigned long, so it assumes pointers fit in unsigned long. */
#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
/* Avoid int-to-pointer-cast warning; the value is truncated to unsigned long first. */
#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
/*
 * Guard for API entry points: returns HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED
 * from the *enclosing function* when the device has not been opened or the
 * runtime is marked as forked. Only usable in functions returning HSAKMT_STATUS.
 */
#define CHECK_DXG_OPEN() \
do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
/* 64KB BigK fragment size for TLB efficiency */
#define GPU_BIGK_PAGE_SIZE (1 << 16)
/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs; (2 << 20) == 2 MiB */
#define GPU_HUGE_PAGE_SIZE (2 << 20)
/* Returns HSAKMT_STATUS_INVALID_PARAMETER from the enclosing function unless x is a multiple of the runtime page size. */
#define CHECK_PAGE_MULTIPLE(x) \
do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
/* Round x up to a multiple of align; the mask arithmetic is only correct when align is a power of two. */
#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
/* 32-bit variant of ALIGN_UP. */
#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size)
/* 64-bit mask with the low n bits set; n == 0 yields 0. n must not exceed 64. */
#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
/* Element count of a true array (not a pointer). */
#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
/* HSA Thunk logging usage */
/* Immediately-invoked lambda yielding the current std::thread id formatted through a hex stream, as a std::string. */
#define get_thread_id() \
([]() -> std::string { \
std::stringstream str_thrd_id; \
str_thrd_id << std::hex << std::this_thread::get_id(); \
return str_thrd_id.str(); \
})()
/*
 * Unconditional formatted print to `stream`, prefixed with pid, thread id and
 * the name of the calling function (__FUNCTION__), followed by a flush.
 * `fmt` must be a string literal because it is concatenated with the prefix.
 */
#define hsakmt_print_common(stream, fmt, ...) \
do { \
fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__); \
fflush(stream); \
} while (false)
/*
 * Level-filtered print to stdout. With NDEBUG defined it expands to an empty
 * statement, so its arguments are not evaluated in such builds.
 */
#ifdef NDEBUG
#define hsakmt_print(level, fmt, ...) \
do { } while (false)
#else
#define hsakmt_print(level, fmt, ...) \
do { \
if (level <= dxg_runtime->hsakmt_debug_level) { \
hsakmt_print_common(stdout, fmt, ##__VA_ARGS__); \
} \
} while (false)
#endif
/* pr_err always prints (to stderr), independent of NDEBUG and of the debug level. */
#define pr_err(fmt, ...) \
hsakmt_print_common(stderr, fmt, ##__VA_ARGS__)
/* pr_warn / pr_info / pr_debug go through hsakmt_print() and are therefore level-filtered and compiled out under NDEBUG. */
#define pr_warn(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
#define pr_info(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
/*
 * "Once" variants: a GNU statement expression holding a static flag per
 * expansion site, so each call site prints at most one time. The flag is a
 * plain bool with no synchronisation.
 */
#define pr_err_once(fmt, ...) \
({ \
static bool __print_once; \
if (!__print_once) { \
__print_once = true; \
pr_err(fmt, ##__VA_ARGS__); \
} \
})
#define pr_warn_once(fmt, ...) \
({ \
static bool __print_once; \
if (!__print_once) { \
__print_once = true; \
pr_warn(fmt, ##__VA_ARGS__); \
} \
})
/* Expects HSA_ENGINE_ID.ui32; packs the version as (Major << 16) | (Minor << 8) | Stepping. */
#define HSA_GET_GFX_VERSION_FULL(ui32) \
(((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
/* Topology helpers; the definitions are not in this header. */
HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
bool prefer_ats(HSAuint32 node_id);
uint16_t get_device_id_by_node_id(HSAuint32 node_id);
uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
uint32_t get_direct_link_cpu(uint32_t gpu_node);
HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props);
HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
HsaNodeProperties *NodeProperties);
HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
HSAuint32 NumIoLinks,
HsaIoLinkProperties *IoLinkProperties);
void topology_setup_is_dgpu_param(HsaNodeProperties *props);
/* Translate an HSA_PAGE_SIZE_* flag into a byte count (4 KiB, 64 KiB, 2 MiB or 1 GiB). */
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
uint32_t get_num_sysfs_nodes(void);
bool is_forked_child(void);
/* Replace the allocation-tracking map and its mutex with fresh, empty instances. */
void clear_allocation_map(void);
/*
 * Backing-store policy for the thunk sub-allocator: hands out and takes back
 * large blocks whose size is a multiple of block_size().
 */
class BlockAllocator {
 public:
  /* Allocate at least request_size bytes; the size actually obtained is
   * reported through allocated_size. */
  void* alloc(size_t request_size, size_t& allocated_size) const;

  /* Give back a block previously returned by alloc(). */
  void free(void* ptr, size_t length) const;

  /* Granularity of the blocks, in bytes. */
  size_t block_size() const { return block_size_; }

 private:
  static const size_t block_size_ = 128 * 1024 * 1024;  // 128MB blocks.
};
/* Thin wrappers around the sub-allocator's reset() and trim(). */
void reset_suballocator(void);
void trim_suballocator(void);
/*
 * Allocation entry point shared by the public API and BlockAllocator.
 * SkipSubAlloc = true bypasses the sub-allocator and always creates a
 * dedicated GpuMemory object.
 */
HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
HSAuint64 SizeInBytes,
HSAuint64 Alignment,
HsaMemFlags MemFlags,
void **MemoryAddress,
bool SkipSubAlloc = false);
/* Counterpart of hsaKmtAllocMemoryAlignInternal(); SkipSubAlloc = true skips the sub-allocator lookup. */
HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
HSAuint64 SizeInBytes,
bool SkipSubAlloc = false);
/* Take / drop a queue reference on the GpuMemory object tracked at MemoryAddress. */
bool queue_acquire_buffer(void *MemoryAddress);
bool queue_release_buffer(void *MemoryAddress);
/* Calculate VGPR and SGPR register file size per CU */
uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
#define SGPR_SIZE_PER_CU 0x4000
/* True when the /proc/self/fd link target of fd contains "rocr4wsl_gtt". */
bool is_ipc_sysmemfd(int fd);
/*
 * Import a dma-buf fd on the given node and track it in the allocation map.
 * alloc_va selects whether the object is keyed by its GPU VA or by its
 * handle-aperture address; the resulting handle is stored in *GpuMemHandle.
 */
HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
uint32_t NodeId,
bool alloc_va,
bool is_ipc_memfd,
wsl::thunk::GpuMemoryHandle *GpuMemHandle);
bool hsakmt_hsa_loader_init();
#endif
@@ -0,0 +1,113 @@
/*
 * Linker version script: every symbol listed under "global:" is exported
 * under the version node HSAKMT_1; "local: *" hides everything else.
 */
HSAKMT_1
{
global:
hsaKmtOpenKFD;
hsaKmtCloseKFD;
hsaKmtGetVersion;
hsaKmtAcquireSystemProperties;
hsaKmtReleaseSystemProperties;
hsaKmtGetNodeProperties;
hsaKmtGetNodeMemoryProperties;
hsaKmtGetNodeCacheProperties;
hsaKmtGetNodeIoLinkProperties;
hsaKmtCreateEvent;
hsaKmtDestroyEvent;
hsaKmtSetEvent;
hsaKmtResetEvent;
hsaKmtQueryEventState;
hsaKmtWaitOnEvent;
hsaKmtWaitOnMultipleEvents;
hsaKmtCreateQueue;
hsaKmtCreateQueueExt;
hsaKmtUpdateQueue;
hsaKmtDestroyQueue;
hsaKmtSetQueueCUMask;
hsaKmtSetMemoryPolicy;
hsaKmtAllocMemory;
hsaKmtAllocMemoryAlign;
hsaKmtFreeMemory;
hsaKmtAvailableMemory;
hsaKmtRegisterMemory;
hsaKmtRegisterMemoryToNodes;
hsaKmtRegisterMemoryWithFlags;
hsaKmtRegisterGraphicsHandleToNodes;
hsaKmtRegisterGraphicsHandleToNodesExt;
hsaKmtShareMemory;
hsaKmtRegisterSharedHandle;
hsaKmtRegisterSharedHandleToNodes;
hsaKmtProcessVMRead;
hsaKmtProcessVMWrite;
hsaKmtDeregisterMemory;
hsaKmtMapMemoryToGPU;
hsaKmtMapMemoryToGPUNodes;
hsaKmtUnmapMemoryToGPU;
hsaKmtDbgRegister;
hsaKmtDbgUnregister;
hsaKmtDbgWavefrontControl;
hsaKmtDbgAddressWatch;
hsaKmtDbgEnable;
hsaKmtDbgDisable;
hsaKmtDbgGetDeviceData;
hsaKmtDbgGetQueueData;
hsaKmtGetClockCounters;
hsaKmtPmcGetCounterProperties;
hsaKmtPmcRegisterTrace;
hsaKmtPmcUnregisterTrace;
hsaKmtPmcAcquireTraceAccess;
hsaKmtPmcReleaseTraceAccess;
hsaKmtPmcStartTrace;
hsaKmtPmcQueryTrace;
hsaKmtPmcStopTrace;
hsaKmtMapGraphicHandle;
hsaKmtUnmapGraphicHandle;
hsaKmtSetTrapHandler;
hsaKmtGetTileConfig;
hsaKmtQueryPointerInfo;
hsaKmtSetMemoryUserData;
hsaKmtGetQueueInfo;
hsaKmtAllocQueueGWS;
hsaKmtRuntimeEnable;
hsaKmtRuntimeDisable;
hsaKmtCheckRuntimeDebugSupport;
hsaKmtGetRuntimeCapabilities;
hsaKmtDebugTrapIoctl;
hsaKmtSPMAcquire;
hsaKmtSPMRelease;
hsaKmtSPMSetDestBuffer;
hsaKmtSVMSetAttr;
hsaKmtSVMGetAttr;
hsaKmtSetXNACKMode;
hsaKmtGetXNACKMode;
hsaKmtOpenSMI;
hsaKmtExportDMABufHandle;
hsaKmtGetMemoryHandle;
hsaKmtWaitOnEvent_Ext;
hsaKmtWaitOnMultipleEvents_Ext;
hsaKmtReplaceAsanHeaderPage;
hsaKmtReturnAsanHeaderPage;
hsaKmtGetAMDGPUDeviceHandle;
hsaKmtPcSamplingQueryCapabilities;
hsaKmtPcSamplingCreate;
hsaKmtPcSamplingDestroy;
hsaKmtPcSamplingStart;
hsaKmtPcSamplingStop;
hsaKmtPcSamplingSupport;
hsaKmtAisReadWriteFile;
hsaKmtModelEnabled;
hsaKmtQueueRingDoorbell;
/* amdgpu_* and drm* entry points implemented by this library */
amdgpu_device_initialize;
amdgpu_device_deinitialize;
amdgpu_query_gpu_info;
amdgpu_bo_import;
amdgpu_bo_va_op;
amdgpu_device_get_fd;
amdgpu_bo_cpu_map;
amdgpu_bo_free;
amdgpu_bo_export;
amdgpu_bo_query_info;
amdgpu_bo_set_metadata;
drmCommandWriteRead;
local: *;
};
@@ -0,0 +1,989 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "impl/wddm/gpu_memory.h"
#include "util/simple_heap.h"
/*
 * Bookkeeping record for one tracked allocation; stored as the mapped value
 * of the allocation map.
 *
 * The constructor initializer lists follow the member declaration order
 * (userptr precedes size), which is the order the members are initialised in.
 */
struct Allocation {
  /* Empty record: null handle/addresses, zero sizes, no cached dma-buf fd. */
  Allocation()
      : handle(0), cpu_addr(0), gpu_addr(0), userptr(false), size(0),
        user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0),
        dmabuf_fd(-1), rocr_userdata(nullptr) {}

  /* Fully specified record; dmabuf_fd starts at -1 and rocr_userdata at null. */
  Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
             void *user_data_arg = nullptr, size_t user_size_arg = 0,
             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
      : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg),
        userptr(userptr_arg), size(size_arg), user_data(user_data_arg),
        size_requested(user_size_arg), node_id(node_id_arg),
        mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1), rocr_userdata(nullptr) {}

  wsl::thunk::GpuMemoryHandle handle; /* handle of the backing GpuMemory object */
  void *cpu_addr;
  uint64_t gpu_addr;
  bool userptr;            /* checked by hsaKmtDeregisterMemory() */
  size_t size;             /* actual size = align_up(size_requested, granularity) */
  void *user_data;
  size_t size_requested;   /* size requested by user */
  HSAuint32 node_id;
  HSAuint32 mem_flags_value;
  int dmabuf_fd;           /* cached exported dma-buf fd, -1 when none */
  void *rocr_userdata;
};
/*
 * Process-wide table of tracked allocations, keyed by the address handed back
 * to the caller, and the mutex guarding it. Both live on the heap and are
 * reached through raw pointers so that clear_allocation_map() can swap them.
 */
static std::map<const void *, Allocation>* allocation_map_ = new std::map<const void *, Allocation>();
static std::mutex* allocation_map_lock_ = new std::mutex();
/*
 * Start over with an empty map and a brand-new mutex.
 *
 * The previous mutex is deliberately not deleted (see the disabled line
 * below), so it is leaked. NOTE(review): presumably because its state cannot
 * be trusted when this runs (e.g. after fork) - confirm against the caller.
 * Deleting the map destroys only the Allocation records; the GpuMemory
 * objects their handles refer to are not released here.
 */
void clear_allocation_map(void)
{
//delete allocation_map_lock_;
allocation_map_lock_ = new std::mutex();
/* Hold the fresh lock while the map pointer is swapped. */
std::lock_guard<std::mutex> lock(*allocation_map_lock_);
delete allocation_map_;
allocation_map_ = new std::map<const void *, Allocation>();
}
/*
 * Not implemented for this backend. After the open check it warns once,
 * asserts in debug builds, and reports HSAKMT_STATUS_SUCCESS without acting
 * on any argument.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
                                              HSAuint32 DefaultPolicy,
                                              HSAuint32 AlternatePolicy,
                                              void *MemoryAddressAlternate,
                                              HSAuint64 MemorySizeInBytes) {
  (void)Node;
  (void)DefaultPolicy;
  (void)AlternatePolicy;
  (void)MemoryAddressAlternate;
  (void)MemorySizeInBytes;

  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Convert an HSA_PAGE_SIZE_* selector into a size in bytes.
 * An unrecognised selector asserts in debug builds and yields 4 KiB.
 */
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
  const HSAuint32 kKiB = 1024;
  const HSAuint32 kMiB = 1024 * kKiB;

  HSAuint32 bytes = 4 * kKiB;
  switch (pageSizeFlags) {
  case HSA_PAGE_SIZE_4KB:
    break;
  case HSA_PAGE_SIZE_64KB:
    bytes = 64 * kKiB;
    break;
  case HSA_PAGE_SIZE_2MB:
    bytes = 2 * kMiB;
    break;
  case HSA_PAGE_SIZE_1GB:
    bytes = 1024 * kMiB;
    break;
  default:
    assert(false);
    break;
  }
  return bytes;
}
/*
 * Allocate memory without an alignment request: forwards to
 * hsaKmtAllocMemoryAlign() with Alignment = 0.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
                                          HSAuint64 SizeInBytes,
                                          HsaMemFlags MemFlags,
                                          void **MemoryAddress) {
  const HSAuint64 no_alignment = 0;
  return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, no_alignment,
                                MemFlags, MemoryAddress);
}
/*
 * 1 when x is a non-zero power of two, 0 otherwise. The argument is fully
 * parenthesised so compound expressions such as `a | b` are tested as a
 * whole; it is still evaluated more than once, so avoid side effects.
 */
#define POWER_OF_2(x) (((x) && !((x) & ((x) - 1))) ? 1 : 0)
/*
 * Check whether at least SizeInBytes of system RAM is currently free.
 *
 * sysinfo() reports freeram in units of mem_unit bytes, so the value is
 * scaled before the comparison. Returns false when sysinfo() fails.
 */
bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
  struct sysinfo info;
  if (sysinfo(&info) != 0)
    return false;

  /* Treat a zero mem_unit as 1 so a missing unit never reports "no memory". */
  const uint64_t unit = info.mem_unit ? info.mem_unit : 1;
  const uint64_t free_bytes = static_cast<uint64_t>(info.freeram) * unit;
  return SizeInBytes <= free_bytes;
}
void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
void *address;
HsaMemFlags MemFlags;
MemFlags.Value = 0;
MemFlags.ui32.CoarseGrain = 1;
MemFlags.ui32.NoSubstitute = 1;
allocated_size = wsl::AlignUp(request_size, block_size());
if (HSAKMT_STATUS_SUCCESS == hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, MemFlags, &address, true))
return address;
return nullptr;
}
void BlockAllocator::free(void* ptr, size_t length) const {
if (HSAKMT_STATUS_SUCCESS != hsaKmtFreeMemoryInternal(ptr, length, true))
pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length);
}
/* Sub-allocator that carves small allocations out of BlockAllocator blocks. */
static wsl::SimpleHeap<BlockAllocator> fragment_allocator_;
/* Forward to SimpleHeap::reset() on the sub-allocator. */
void reset_suballocator(void) {
fragment_allocator_.reset();
}
/* Forward to SimpleHeap::trim(); the allocation path uses trim() to release
 * cached blocks before retrying a failed direct allocation. */
void trim_suballocator(void) {
fragment_allocator_.trim();
}
/*
 * Core allocation routine behind hsaKmtAllocMemoryAlign() and BlockAllocator.
 *
 * PreferredNode - target node; 0 selects dxg_runtime->default_node for the
 *                 device lookup (and, with HostAccess, the system domain).
 * SizeInBytes   - requested size.
 * Alignment     - forwarded to the GpuMemory creation request.
 * MemFlags      - HSA memory flags, translated into GpuMemoryCreateInfo.
 * MemoryAddress - in: address hint when FixedAddress is set (must be non-null);
 *                 out: the resulting address (GPU VA, or the handle-aperture
 *                 address for physical-only objects).
 * SkipSubAlloc  - true forces a dedicated GpuMemory object.
 *
 * Returns HSAKMT_STATUS_SUCCESS, INVALID_PARAMETER for bad arguments,
 * NO_MEMORY when a system allocation exceeds the configured limits, or ERROR
 * when the device lookup or the object creation fails.
 */
HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
HSAuint64 SizeInBytes,
HSAuint64 Alignment,
HsaMemFlags MemFlags,
void **MemoryAddress,
bool SkipSubAlloc) {
CHECK_DXG_OPEN();
if (!MemoryAddress)
return HSAKMT_STATUS_INVALID_PARAMETER;
/* A fixed-address request needs a caller-supplied address; otherwise the
 * output is cleared up front so failures leave it null. */
if (MemFlags.ui32.FixedAddress) {
if (*MemoryAddress == nullptr)
return HSAKMT_STATUS_INVALID_PARAMETER;
} else
*MemoryAddress = nullptr;
uint32_t node = (PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode;
wsl::thunk::WDDMDevice *dev = get_wddmdev(node);
if (!dev)
return HSAKMT_STATUS_ERROR;
wsl::thunk::GpuMemory *gpu_mem = nullptr;
wsl::thunk::GpuMemoryCreateInfo create_info{};
create_info.size = SizeInBytes;
/* If initialize scratch pool of GpuAgent, treat it as SVM reserve */
if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000)
MemFlags.ui32.OnlyAddress = 1;
create_info.alignment = Alignment;
create_info.va_hint = reinterpret_cast<gpusize>(*MemoryAddress);
/* System-memory domain: host-accessible allocations on node 0, any
 * allocation in ZFB mode, or GTT allocations. */
if ((PreferredNode == 0 && MemFlags.ui32.HostAccess)
|| dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) {
if (SizeInBytes > dxg_runtime->max_single_alloc_size)
return HSAKMT_STATUS_NO_MEMORY;
if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes))
return HSAKMT_STATUS_NO_MEMORY;
/* If allocate VRAM under ZFB mode */
if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1)
MemFlags.ui32.CoarseGrain = 1;
// AllocateNonPaged == AllocateIPC
create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
create_info.domain = thunk_proxy::AllocDomain::kSystem;
} else {
create_info.domain = thunk_proxy::AllocDomain::kLocal;
}
if (!MemFlags.ui32.CoarseGrain)
create_info.mem_flags = thunk_proxy::kFineGrain;
//In hsa-runtime, only kernarg region set Uncached.
if (MemFlags.ui32.Uncached)
create_info.mem_flags |= thunk_proxy::kKernarg;
/* Translate the remaining HSA flags into creation flags. */
create_info.flags.physical_only = MemFlags.ui32.NoAddress;
create_info.flags.alloc_va = !create_info.flags.physical_only;
create_info.flags.interprocess = MemFlags.ui32.NoAddress;
create_info.flags.interprocess |= MemFlags.ui32.Contiguous;
create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous;
create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned
create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress;
create_info.flags.blit_kernel_object =
(MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess &&
(create_info.domain == thunk_proxy::AllocDomain::kSystem));
/* Virtual-only, physical-only and contiguous requests are VMM allocations:
 * force them to the local domain and keep them out of the sub-allocator. */
if (create_info.flags.virtual_alloc || create_info.flags.physical_only
|| create_info.flags.physical_contiguous) {
create_info.domain = thunk_proxy::AllocDomain::kLocal;
SkipSubAlloc = true;
}
/* Only allow using the suballocator for ordinary VRAM.*/
bool trim_safe = false;
if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) {
/* just quickly skip SA if size is bigger than SA block size.*/
gpusize real_size;
/* Round to 2 MiB for requests above 2 MiB, else to the CPU page size. */
if (create_info.size > GPU_HUGE_PAGE_SIZE)
real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE);
else
real_size = wsl::AlignUp(create_info.size, getpagesize());
if (real_size < fragment_allocator_.default_block_size()) {
/* Sub-allocated addresses are returned directly and are not entered
 * into allocation_map_. */
*MemoryAddress = fragment_allocator_.alloc(real_size);
if (*MemoryAddress)
return HSAKMT_STATUS_SUCCESS;
}
/* SA might keep a lot of free blocks as *cache*.
* We can trim them if direct allocation fails at first time.
*/
trim_safe = true;
}
/* Direct allocation; re-entered at most once after trimming the sub-allocator. */
after_trim:
auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
if (code == ErrorCode::Success) {
std::lock_guard<std::mutex> gard(*allocation_map_lock_);
/* For these physical allocations, use the GpuMemory object's handle-aperture address as the thunk handle */
if (create_info.flags.physical_only || create_info.dmabuf_fd > 0)
*MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
else
*MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
/* Track the object; GTT allocations are recorded against node 0. */
(*allocation_map_)[*MemoryAddress] = Allocation(
gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
create_info.size, false, nullptr, SizeInBytes,
MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);
return HSAKMT_STATUS_SUCCESS;
} else if (trim_safe) {
/* attempt to release memory from the block allocator and retry */
fragment_allocator_.trim();
trim_safe = false;
goto after_trim;
}
return HSAKMT_STATUS_ERROR;
}
/*
 * Public aligned-allocation entry point. The sub-allocator is used only when
 * dxg_runtime->enable_thunk_sub_allocator is non-zero.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
                                               HSAuint64 SizeInBytes,
                                               HSAuint64 Alignment,
                                               HsaMemFlags MemFlags,
                                               void **MemoryAddress) {
  const bool bypass_sub_allocator = !dxg_runtime->enable_thunk_sub_allocator;
  return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, Alignment,
                                        MemFlags, MemoryAddress,
                                        bypass_sub_allocator);
}
/*
 * Release an allocation made by hsaKmtAllocMemoryAlignInternal() or tracked
 * by import_dmabuf_fd().
 *
 * Unless SkipSubAlloc is set the sub-allocator gets the first chance to claim
 * the address. Otherwise the address must be an exact key of the allocation
 * map. A buffer still referenced by a queue is refused; an imported VRAM IPC
 * buffer with remaining shared references is kept alive and reported as
 * success. SizeInBytes is not consulted.
 *
 * Returns SUCCESS, INVALID_PARAMETER for a null address, or ERROR when the
 * address is unknown or the buffer is queue-referenced.
 */
HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
                                       HSAuint64 SizeInBytes,
                                       bool SkipSubAlloc) {
  CHECK_DXG_OPEN();

  if (MemoryAddress == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  if (!SkipSubAlloc && fragment_allocator_.free(MemoryAddress))
    return HSAKMT_STATUS_SUCCESS;

  wsl::thunk::GpuMemory *doomed = nullptr;
  {
    std::lock_guard<std::mutex> guard(*allocation_map_lock_);

    auto entry = allocation_map_->find(MemoryAddress);
    if (entry == allocation_map_->end())
      return HSAKMT_STATUS_ERROR;

    doomed = wsl::thunk::GpuMemory::Convert(entry->second.handle);
    if (doomed->IsQueueReferenced())
      return HSAKMT_STATUS_ERROR;

    wsl::thunk::GpuMemoryDescFlags desc;
    desc.reserved = doomed->Flags();
    if (desc.is_imported_vram_ipc && doomed->DecSharedReference()) {
      pr_info("memory is still referenced\n");
      return HSAKMT_STATUS_SUCCESS;
    }

    /* Drop the cached export fd, if one was created for this buffer. */
    int &cached_fd = entry->second.dmabuf_fd;
    if (cached_fd >= 0) {
      close(cached_fd);
      cached_fd = -1;
    }

    allocation_map_->erase(entry);
  }

  /* The object itself is destroyed after the map lock has been released. */
  delete doomed;
  return HSAKMT_STATUS_SUCCESS;
}
/* Public free entry point; forwards with the default SkipSubAlloc setting. */
HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
                                         HSAuint64 SizeInBytes) {
  const HSAKMT_STATUS status =
      hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes);
  return status;
}
/*
 * Take a queue reference on the GpuMemory object tracked at MemoryAddress.
 *
 * Returns true when the reference was taken; false when the address is null,
 * is not an exact key of the allocation map, or maps to a null object.
 *
 * The not-found path used to `return HSAKMT_STATUS_ERROR`, which converts to
 * `true` in a bool function and so reported success.
 * NOTE(review): callers that pass untracked addresses (for example
 * sub-allocated ones, which are never entered into the map) now see false -
 * confirm that this is what they expect.
 */
bool queue_acquire_buffer(void *MemoryAddress) {
  if (!MemoryAddress)
    return false;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return false;

  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  /* Check before dereferencing; the old code tested the pointer only after using it. */
  if (gpu_mem == nullptr)
    return false;

  gpu_mem->GetQueueReference();
  return true;
}
/*
 * Drop a queue reference on the GpuMemory object tracked at MemoryAddress.
 *
 * Returns true when the reference was dropped; false when the address is
 * null, is not an exact key of the allocation map, or maps to a null object.
 *
 * The not-found path used to `return HSAKMT_STATUS_ERROR`, which converts to
 * `true` in a bool function and so reported success.
 * NOTE(review): callers that pass untracked addresses now see false - confirm
 * that this is what they expect.
 */
bool queue_release_buffer(void *MemoryAddress) {
  if (!MemoryAddress)
    return false;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return false;

  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  /* Check before dereferencing; the old code tested the pointer only after using it. */
  if (gpu_mem == nullptr)
    return false;

  gpu_mem->PutQueueReference();
  return true;
}
/*
 * Resolve an address to its GpuMemory object. Only exact keys of the
 * allocation map match; anything else yields nullptr.
 */
wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) {
  std::lock_guard<std::mutex> guard(*allocation_map_lock_);

  auto entry = allocation_map_->find(MemoryAddress);
  const bool tracked = (entry != allocation_map_->end());
  if (!tracked)
    return nullptr;

  return wsl::thunk::GpuMemory::Convert(entry->second.handle);
}
/*
 * Report the VRAM currently available on a node, as given by the device's
 * VramAvail(). Returns INVALID_PARAMETER for a null output pointer and ERROR
 * when the node has no device.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
                                              HSAuint64 *AvailableBytes) {
  CHECK_DXG_OPEN();

  if (AvailableBytes == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  wsl::thunk::WDDMDevice *device = get_wddmdev(Node);
  if (device == nullptr)
    return HSAKMT_STATUS_ERROR;

  *AvailableBytes = device->VramAvail();
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented for this backend: warns once, asserts in debug builds and
 * reports HSAKMT_STATUS_SUCCESS without registering anything.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
                                             HSAuint64 MemorySizeInBytes) {
  (void)MemoryAddress;
  (void)MemorySizeInBytes;

  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented for this backend: asserts in debug builds and reports
 * HSAKMT_STATUS_SUCCESS without registering anything. No warning is logged.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
                                                    HSAuint64 MemorySizeInBytes,
                                                    HSAuint64 NumberOfNodes,
                                                    HSAuint32 *NodeArray) {
  (void)MemoryAddress;
  (void)MemorySizeInBytes;
  (void)NumberOfNodes;
  (void)NodeArray;

  CHECK_DXG_OPEN();

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Validate a host-memory registration request. Nothing is recorded: the call
 * only checks the arguments and the platform.
 *
 * Returns INVALID_PARAMETER for a null address or for ExtendedCoherent
 * combined with CoarseGrain; NOT_SUPPORTED when the memory is not ordinary
 * paged host memory or the runtime is not flagged as dGPU; SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(
    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
  CHECK_DXG_OPEN();

  if (MemoryAddress == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  pr_debug("address %p\n", MemoryAddress);

  const bool conflicting_coherence =
      MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain;
  if (conflicting_coherence)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  // Registered memory should be ordinary paged host memory.
  const bool paged_host_memory =
      (MemFlags.ui32.HostAccess == 1) && (MemFlags.ui32.NonPaged != 1);
  if (!paged_host_memory)
    return HSAKMT_STATUS_NOT_SUPPORTED;

  /* TODO: support mixed APU and dGPU configurations */
  if (!dxg_runtime->hsakmt_is_dgpu)
    return HSAKMT_STATUS_NOT_SUPPORTED;

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Decide whether fd refers to an IPC system-memory file: the symlink
 * /proc/self/fd/<fd> is read and its target searched for "rocr4wsl_gtt".
 * Returns false when the link cannot be read.
 */
bool is_ipc_sysmemfd(int fd) {
  char target[256];

  const std::string link = std::string("/proc/self/fd/") + std::to_string(fd);
  const ssize_t length = readlink(link.c_str(), target, sizeof(target) - 1);
  if (length < 0)
    return false;

  /* readlink() does not terminate the buffer. */
  target[length] = '\0';
  const char *marker = strstr(target, "rocr4wsl_gtt");
  return marker != nullptr;
}
/*
 * Register a graphics handle with all registration flags cleared; forwards to
 * hsaKmtRegisterGraphicsHandleToNodesExt().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
                                                            HsaGraphicsResourceInfo *GraphicsResourceInfo,
                                                            HSAuint64 NumberOfNodes,
                                                            HSAuint32 *NodeArray) {
  HSA_REGISTER_MEM_FLAGS no_flags;
  no_flags.Value = 0;

  return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
                                                GraphicsResourceInfo,
                                                NumberOfNodes, NodeArray,
                                                no_flags);
}
/*
 * Register (import) a dma-buf fd passed as a graphics resource handle.
 *
 * An IPC system-memory fd is not imported: only NodeId is filled in (with the
 * default node) and SUCCESS is returned. With NumberOfNodes == 0 the default
 * node is used and no GPU VA is requested. Otherwise the fd is imported on
 * NodeArray[0] and GraphicsResourceInfo receives the node, the client size
 * and either the GPU VA or the handle-aperture address, depending on
 * RegisterFlags.ui32.requiresVAddr.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
                                                               HsaGraphicsResourceInfo *GraphicsResourceInfo,
                                                               HSAuint64 NumberOfNodes,
                                                               HSAuint32 *NodeArray,
                                                               HSA_REGISTER_MEM_FLAGS RegisterFlags) {
  CHECK_DXG_OPEN();

  if (is_ipc_sysmemfd(GraphicsResourceHandle)) {
    GraphicsResourceInfo->NodeId = dxg_runtime->default_node;
    pr_info("skip register sysmemfd. It would be released in next step\n");
    return HSAKMT_STATUS_SUCCESS;
  }

  if (NumberOfNodes == 0) {
    RegisterFlags.ui32.requiresVAddr = 0;
    NumberOfNodes = 1;
    NodeArray = (HSAuint32*)&(dxg_runtime->default_node);
  }
  pr_debug("number of nodes %lu\n", NumberOfNodes);

  wsl::thunk::GpuMemoryHandle imported;
  HSAKMT_STATUS status = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0],
                                          RegisterFlags.ui32.requiresVAddr,
                                          false, &imported);
  if (status != HSAKMT_STATUS_SUCCESS) {
    pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, "
           "GraphicsResourceHandle: %lu, NodeId: %u\n",
           GraphicsResourceHandle, NodeArray[0]);
    return status;
  }

  wsl::thunk::GpuMemory *mem = wsl::thunk::GpuMemory::Convert(imported);
  GraphicsResourceInfo->NodeId = mem->GetDevice()->NodeId();
  GraphicsResourceInfo->SizeInBytes = mem->ClientSize();
  if (RegisterFlags.ui32.requiresVAddr)
    GraphicsResourceInfo->MemoryAddress = reinterpret_cast<void *>(mem->GpuAddress());
  else
    GraphicsResourceInfo->MemoryAddress = reinterpret_cast<void*>(mem->HandleApeAddress());

  return status;
}
/*
 * Export the allocation containing MemoryAddress as a dma-buf fd.
 *
 * MemoryAddress     - any address inside a tracked allocation.
 * MemorySizeInBytes - not consulted.
 * DMABufFd          - receives a fresh duplicate of the allocation's cached
 *                     export fd; the caller owns (and closes) the duplicate.
 * Offset            - receives MemoryAddress minus the allocation's gpu_addr.
 *
 * The first export of an allocation creates the fd and caches it in the map
 * entry; the cached fd is closed when the allocation is freed.
 *
 * Returns INVALID_PARAMETER for null output pointers, ERROR when no tracked
 * allocation contains the address or when exporting/duplicating fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
                                                 HSAuint64 MemorySizeInBytes,
                                                 int *DMABufFd,
                                                 HSAuint64 *Offset) {
  CHECK_DXG_OPEN();

  if (!DMABufFd || !Offset)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);

  /* Find the entry with the greatest key <= MemoryAddress. */
  auto it = allocation_map_->upper_bound(MemoryAddress);
  if (it == allocation_map_->begin())
    return HSAKMT_STATUS_ERROR;
  --it;

  /* upper_bound() only proves the key is not above the address; reject
   * addresses that lie beyond the end of that allocation instead of
   * exporting an unrelated buffer with a bogus offset. */
  const uint64_t addr = reinterpret_cast<uint64_t>(MemoryAddress);
  const uint64_t base = reinterpret_cast<uint64_t>(it->first);
  if (addr - base >= it->second.size)
    return HSAKMT_STATUS_ERROR;

  if (it->second.dmabuf_fd == -1) {
    auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
    auto code = gpu_mem->ExportPhysicalHandle(DMABufFd);
    if (code != ErrorCode::Success)
      return HSAKMT_STATUS_ERROR;
    it->second.dmabuf_fd = *DMABufFd;
  }

  const int duplicate = dup(it->second.dmabuf_fd);
  if (duplicate < 0) {
    /* Never hand the cached fd itself to the caller. */
    *DMABufFd = -1;
    pr_err("failed to duplicate dma-buf fd %d\n", it->second.dmabuf_fd);
    return HSAKMT_STATUS_ERROR;
  }

  *DMABufFd = duplicate;
  *Offset = addr - it->second.gpu_addr;
  return HSAKMT_STATUS_SUCCESS;
}
/* Not supported by this backend; *SharedMemoryHandle is never written. */
HSAKMT_STATUS HSAKMTAPI
hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes,
                      uint64_t *SharedMemoryHandle) {
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)SharedMemoryHandle;

  CHECK_DXG_OPEN();

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * Import a dma-buf fd as a GpuMemory object on a node and track it in the
 * allocation map.
 *
 * DMABufFd     - fd to import.
 * NodeId       - node whose device performs the import.
 * alloc_va     - request a GPU VA; the map key is then the GPU VA, otherwise
 *                the handle-aperture address.
 * is_ipc_memfd - the fd is an IPC system-memory file; its size is taken from
 *                fstat() when it is page-granular and smaller than the system
 *                heap.
 * GpuMemHandle - receives the handle of the imported (or already existing)
 *                object; set to null on failure.
 *
 * When the device reports that the buffer was exported by this same process
 * and device, the existing object gains a shared reference and is returned
 * instead of creating a new one.
 *
 * Returns SUCCESS, INVALID_PARAMETER for a null GpuMemHandle, or ERROR when
 * the node has no device, fstat() fails, or the import fails.
 */
HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
                               uint32_t NodeId,
                               bool alloc_va,
                               bool is_ipc_memfd,
                               wsl::thunk::GpuMemoryHandle *GpuMemHandle) {
  CHECK_DXG_OPEN();

  if (!GpuMemHandle)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  *GpuMemHandle = nullptr;

  wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId);
  if (!dev) {
    /* Other callers of get_wddmdev() check for null as well. */
    pr_err("no device for node %u\n", NodeId);
    return HSAKMT_STATUS_ERROR;
  }

  wsl::thunk::GpuMemory *gpu_mem = nullptr;
  wsl::thunk::GpuMemoryCreateInfo create_info{};
  create_info.dmabuf_fd = DMABufFd;
  create_info.flags.alloc_va = alloc_va;

  if (is_ipc_memfd) {
    struct stat st;
    if (fstat(DMABufFd, &st) != 0) {
      /* Without a valid stat result the size below would be garbage. */
      pr_err("fstat failed on fd %d\n", DMABufFd);
      return HSAKMT_STATUS_ERROR;
    }
    uint64_t sz = st.st_size;
    if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
      pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
      create_info.flags.sysmem_ipc_sig_importer = 1; // set to 1 when backend is system memory
      create_info.size = st.st_size;
    }
  }

  gpusize gpu_va = 0;
  auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va);
  if (code == ErrorCode::SameProcessSameDevice) {
    /* Unit_hipMemPoolExportToShareableHandle_SameProc */
    pr_info("imported from same process, use the old one\n");
    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
    auto it = allocation_map_->find((void*)gpu_va);
    if (it == allocation_map_->end()) {
      /* Log the address that was actually looked up. */
      pr_err("where's the conflict buffer? va %#lx\n", gpu_va);
      return HSAKMT_STATUS_ERROR;
    }
    wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
    conflict_mem->IncSharedReference();
    *GpuMemHandle = it->second.handle;
    return HSAKMT_STATUS_SUCCESS;
  } else if (code != ErrorCode::Success) {
    pr_err("fail to import fd, ret %d\n", (int)code);
    return HSAKMT_STATUS_ERROR;
  }

  void *MemoryAddress;
  if (alloc_va)
    MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
  else
    MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());

  *GpuMemHandle = gpu_mem->GetGpuMemoryHandle();

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  /*
   * the gpu_mem->Flags() need convert back from GpuMemoryCreateFlags to
   * HsaMemFlags, reference hsaKmtAllocMemoryAlign
   * */
  (*allocation_map_)[MemoryAddress] = Allocation(
      *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress,
      gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(),
      NodeId, gpu_mem->Flags());
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented for this backend: warns once, asserts in debug builds and
 * reports HSAKMT_STATUS_SUCCESS. *SharedMemoryHandle is not written.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes,
                  HsaSharedMemoryHandle *SharedMemoryHandle) {
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)SharedMemoryHandle;

  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented for this backend: warns once, asserts in debug builds and
 * reports HSAKMT_STATUS_SUCCESS. Neither output parameter is written.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
                           void **MemoryAddress, HSAuint64 *SizeInBytes) {
  (void)SharedMemoryHandle;
  (void)MemoryAddress;
  (void)SizeInBytes;

  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented for this backend: warns once, asserts in debug builds and
 * reports HSAKMT_STATUS_SUCCESS. Neither output parameter is written.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
  (void)SharedMemoryHandle;
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)NumberOfNodes;
  (void)NodeArray;

  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Deprecated cross-process read entry point.
 *
 * No copy is performed and SizeCopied is never written: the call logs a
 * one-time "deprecated" warning, asserts, and reports
 * HSAKMT_STATUS_NOT_IMPLEMENTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
                                            HsaMemoryRange *LocalMemoryArray,
                                            HSAuint64 LocalMemoryArrayCount,
                                            HsaMemoryRange *RemoteMemoryArray,
                                            HSAuint64 RemoteMemoryArrayCount,
                                            HSAuint64 *SizeCopied) {
  CHECK_DXG_OPEN();

  pr_warn_once("has been deprecated\n");
  assert(false);

  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Deprecated cross-process write entry point.
 *
 * No copy is performed and SizeCopied is never written: the call logs a
 * one-time "deprecated" warning, asserts, and reports
 * HSAKMT_STATUS_NOT_IMPLEMENTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
                                             HsaMemoryRange *LocalMemoryArray,
                                             HSAuint64 LocalMemoryArrayCount,
                                             HsaMemoryRange *RemoteMemoryArray,
                                             HSAuint64 RemoteMemoryArrayCount,
                                             HSAuint64 *SizeCopied) {
  CHECK_DXG_OPEN();

  pr_warn_once("has been deprecated\n");
  assert(false);

  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Drop the bookkeeping (and, where applicable, the backing GpuMemory) for a
 * previously registered address.
 *
 * MemoryAddress: key to look up in the allocation map; NULL is rejected with
 *                HSAKMT_STATUS_INVALID_PARAMETER.
 *
 * Behaviour, all under allocation_map_lock_:
 *  - unknown address: nothing to do, success;
 *  - imported VRAM IPC memory whose shared reference count drops to zero:
 *    the map entry is erased and the GpuMemory object deleted;
 *  - userptr memory: both map entries (the one found for MemoryAddress and
 *    the one keyed by the GPU address) are erased and the GpuMemory object
 *    deleted;
 *  - anything else: left untouched, success.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
  CHECK_DXG_OPEN();
  if (!MemoryAddress)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  pr_debug("address %p\n", MemoryAddress);

  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return HSAKMT_STATUS_SUCCESS;

  auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  wsl::thunk::GpuMemoryDescFlags flags;
  flags.reserved = gpu_mem->Flags();

  // IPC mem(vram): only the last reference tears the allocation down.
  if (flags.is_imported_vram_ipc && gpu_mem->DecSharedReference() == 0) {
    allocation_map_->erase(it);
    delete gpu_mem;
    return HSAKMT_STATUS_SUCCESS;
  }

  if (it->second.userptr) {
    // Read the second key BEFORE erasing: erase(it) invalidates `it`, so
    // touching it->second afterwards would be undefined behaviour.
    void *gpu_key = (void *)it->second.gpu_addr;
    allocation_map_->erase(it);
    allocation_map_->erase(gpu_key);
    delete gpu_mem;
    return HSAKMT_STATUS_SUCCESS;
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Convenience wrapper around hsaKmtMapMemoryToGPUNodes(): maps the range on
 * the single node dxg_runtime->default_node with all map flags cleared.
 * The status of the forwarded call is returned unchanged.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
                                             HSAuint64 MemorySizeInBytes,
                                             HSAuint64 *AlternateVAGPU) {
  HsaMemMapFlags no_flags;
  no_flags.Value = 0;

  HSAuint32 target_nodes[] = {dxg_runtime->default_node};
  HSAuint64 target_count = 1;

  return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes,
                                   AlternateVAGPU, no_flags, target_count,
                                   target_nodes);
}
/*
 * Make a memory range usable by the GPU and report the GPU virtual address
 * for it through AlternateVAGPU.
 *
 * MemoryAddress / MemorySizeInBytes: range to map; the work is done on the
 *     enclosing 4K-aligned range.
 * AlternateVAGPU: receives the GPU address on the GTT/local and userptr
 *     paths. NOTE(review): the fragment-allocator and IPC paths return
 *     success without writing it - confirm callers tolerate that.
 * MemMapFlags: not consulted by this implementation.
 * NumberOfNodes / NodeArray: only NodeArray[0] is used, and only when a new
 *     userptr mapping has to be created.
 *
 * Paths, in order:
 *  1. address owned by the fragment allocator  -> success, nothing to do;
 *  2. known imported-VRAM IPC allocation       -> map VA, make resident,
 *                                                 wait for the paging fence;
 *  3. known non-userptr (GTT/local) allocation -> GPU VA == CPU VA, provided
 *                                                 the allocation is large
 *                                                 enough;
 *  4. already registered userptr               -> reuse the recorded mapping;
 *  5. otherwise                                -> create a userptr GpuMemory
 *                                                 and record it under both the
 *                                                 CPU and the GPU address.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER when a new
 * mapping is needed but no node was supplied, or HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
    HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
  CHECK_DXG_OPEN();
  if (!MemoryAddress || !AlternateVAGPU) {
    pr_err("FIXME: mapping NULL pointer\n");
    return HSAKMT_STATUS_ERROR;
  }

  uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096);
  uint64_t end =
      wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
  void *aligned_ptr = (void *)start;
  size_t aligned_size = end - start;

  if (nullptr != fragment_allocator_.block_base(aligned_ptr))
    return HSAKMT_STATUS_SUCCESS;

  {
    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
    auto it = allocation_map_->find(aligned_ptr);
    if (it != allocation_map_->end()) {
      wsl::thunk::GpuMemory *gpu_mem =
          wsl::thunk::GpuMemory::Convert(it->second.handle);
      wsl::thunk::GpuMemoryDescFlags flags;
      flags.reserved = gpu_mem->Flags();

      // IPC mem
      if (flags.is_imported_vram_ipc) {
        auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(),
                                                  gpu_mem->Size());
        if (code != ErrorCode::Success)
          return HSAKMT_STATUS_ERROR;
        code = gpu_mem->MakeResident();
        if (code != ErrorCode::Success)
          return HSAKMT_STATUS_ERROR;
        wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice();
        if (!dev->WaitOnPagingFenceFromCpu())
          return HSAKMT_STATUS_ERROR;
        return HSAKMT_STATUS_SUCCESS;
      }

      if (!it->second.userptr) {
        // GTT/Local mem
        if (it->second.size >= MemorySizeInBytes) {
          *AlternateVAGPU = (uint64_t)MemoryAddress;
          return HSAKMT_STATUS_SUCCESS;
        } else {
          return HSAKMT_STATUS_ERROR;
        }
      }
    }

    // userptr mem: registrations are keyed by the caller's (unaligned) pointer
    it = allocation_map_->find(MemoryAddress);
    if (it != allocation_map_->end()) {
      if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
        *AlternateVAGPU =
            (uintptr_t)it->second.gpu_addr +
            ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
        return HSAKMT_STATUS_SUCCESS;
      }
    }
  }

  // map userptr
  // A device is required from here on; reject an empty node list instead of
  // dereferencing NodeArray blindly.
  if (!NodeArray || NumberOfNodes == 0) {
    pr_err("no node given for userptr mapping\n");
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }
  wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]);
  if (!dev)
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemory *gpu_mem = nullptr;
  wsl::thunk::GpuMemoryCreateInfo create_info{};
  create_info.domain = thunk_proxy::kUserMemory;
  create_info.size = aligned_size;
  create_info.user_ptr = aligned_ptr;
  auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
  if (code != ErrorCode::Success)
    return HSAKMT_STATUS_ERROR;

  uint64_t addr = gpu_mem->GpuAddress();
  wsl::thunk::GpuMemoryHandle handle = gpu_mem->GetGpuMemoryHandle();

  {
    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
    // Two entries per userptr mapping: one keyed by the CPU pointer, one by
    // the GPU address.
    (*allocation_map_)[MemoryAddress] =
        Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
                   MemorySizeInBytes);
    (*allocation_map_)[(void *)addr] =
        Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
                   MemorySizeInBytes);
  }

  // Re-apply the offset that page alignment removed from the caller's pointer.
  *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Undo a GPU mapping for MemoryAddress.
 *
 *  - NULL is logged and treated as success (workaround for a runtime bug).
 *  - Addresses owned by the fragment allocator are a no-op success.
 *  - Unknown addresses, or memory still referenced by a queue, are errors.
 *  - Imported VRAM IPC memory that was not shared from this same process has
 *    its GPU virtual address unmapped and is evicted.
 *  - Everything else is a no-op success.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
  CHECK_DXG_OPEN();
  if (!MemoryAddress) {
    /* Workaround for runtime bug */
    pr_err("FIXME: Unmapping NULL pointer\n");
    return HSAKMT_STATUS_SUCCESS;
  }
  pr_debug("address %p\n", MemoryAddress);

  if (nullptr != fragment_allocator_.block_base(MemoryAddress))
    return HSAKMT_STATUS_SUCCESS;

  std::lock_guard<std::mutex> guard(*allocation_map_lock_);

  auto entry = allocation_map_->find(MemoryAddress);
  if (entry == allocation_map_->end())
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemory *mem =
      wsl::thunk::GpuMemory::Convert(entry->second.handle);
  if (mem->IsQueueReferenced())
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemoryDescFlags flags;
  flags.reserved = mem->Flags();

  // Only IPC imports coming from another process need real work.
  const bool foreign_ipc =
      flags.is_imported_vram_ipc && !mem->IsSharedFromSameProcess();
  if (!foreign_ipc)
    return HSAKMT_STATUS_SUCCESS;

  if (mem->UnmapGpuVirtualAddress(mem->GpuAddress(), mem->Size()) !=
      ErrorCode::Success)
    return HSAKMT_STATUS_ERROR;

  mem->Evict();
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Legacy graphics-interop entry point; not implemented.
 *
 * This API was only ever implemented in KFD for Kaveri and was never
 * upstreamed. There are no open-source users of this interface. It has been
 * superseded by RegisterGraphicsHandleToNodes.
 *
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_IMPLEMENTED;
 * FlatMemoryAddress is never written.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
                                               HSAuint64 GraphicDeviceHandle,
                                               HSAuint64 GraphicResourceHandle,
                                               HSAuint64 GraphicResourceOffset,
                                               HSAuint64 GraphicResourceSize,
                                               HSAuint64 *FlatMemoryAddress) {
  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");

  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Placeholder: unmapping a graphics handle is not implemented here.
 *
 * Logs a one-time warning and asserts.
 *
 * NOTE(review): returns HSAKMT_STATUS_SUCCESS when assert() is compiled
 * out, while hsaKmtMapGraphicHandle() reports NOT_IMPLEMENTED - confirm the
 * asymmetry is intended.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
                                                 HSAuint64 FlatMemoryAddress,
                                                 HSAuint64 SizeInBytes) {
  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Placeholder: tile configuration query is not implemented here.
 *
 * Logs a one-time warning and asserts; `config` is never written.
 *
 * NOTE(review): returns HSAKMT_STATUS_SUCCESS when assert() is compiled
 * out, leaving *config as the caller passed it - confirm callers.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId,
                                            HsaGpuTileConfig *config) {
  CHECK_DXG_OPEN();

  pr_warn_once("not implemented\n");
  assert(false);

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Describe the allocation that contains Pointer.
 *
 * Pointer:     address to classify (may point into the middle of a range).
 * PointerInfo: zeroed, then filled from the allocation map entry whose
 *              [key, key + size_requested) range contains Pointer.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for NULL arguments,
 * HSAKMT_STATUS_ERROR (with Type = HSA_POINTER_UNKNOWN) when no allocation
 * contains the pointer, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
                                               HsaPointerInfo *PointerInfo) {
  CHECK_DXG_OPEN();
  if (!Pointer || !PointerInfo)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  pr_debug("pointer %p\n", Pointer);
  memset(PointerInfo, 0, sizeof(HsaPointerInfo));

  Allocation record;
  wsl::thunk::GpuMemory *backing = nullptr;
  bool matched = false;
  {
    std::lock_guard<std::mutex> guard(*allocation_map_lock_);
    // upper_bound() yields the first key above Pointer, so the entry just
    // before it is the only one that can contain Pointer.
    auto candidate = allocation_map_->upper_bound(Pointer);
    if (candidate != allocation_map_->begin()) {
      --candidate;
      const uint8_t *range_end =
          reinterpret_cast<const uint8_t *>(candidate->first) +
          candidate->second.size_requested;
      if (Pointer >= candidate->first && Pointer < range_end) {
        record = candidate->second;
        backing = wsl::thunk::GpuMemory::Convert(candidate->second.handle);
        matched = true;
      }
    }
  }

  if (!matched) {
    pr_debug("can't found allocation for %p\n", Pointer);
    PointerInfo->Type = HSA_POINTER_UNKNOWN;
    return HSAKMT_STATUS_ERROR;
  }

  if (record.userptr) {
    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
    PointerInfo->SizeInBytes = record.size;
  } else if (backing->IsVirtual()) {
    // Reserved-only address ranges report no size.
    PointerInfo->Type = HSA_POINTER_RESERVED_ADDR;
  } else {
    PointerInfo->Type = HSA_POINTER_ALLOCATED;
    PointerInfo->SizeInBytes = record.size_requested;
  }

  PointerInfo->Node = record.node_id;
  PointerInfo->MemFlags.Value = record.mem_flags_value;
  PointerInfo->CPUAddress = record.cpu_addr;
  PointerInfo->GPUAddress = record.gpu_addr;
  PointerInfo->UserData = record.rocr_userdata;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Attach an opaque UserData pointer to the allocation map entry keyed by
 * Pointer rounded down to a 4K boundary.
 *
 * Returns HSAKMT_STATUS_SUCCESS when such an entry exists,
 * HSAKMT_STATUS_ERROR otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
                                                void *UserData) {
  CHECK_DXG_OPEN();

  const uint64_t page_base = wsl::AlignDown((uint64_t)Pointer, 4096);

  std::lock_guard<std::mutex> guard(*allocation_map_lock_);
  auto entry = allocation_map_->find((void *)page_base);
  if (entry == allocation_map_->end())
    return HSAKMT_STATUS_ERROR;

  entry->second.rocr_userdata = UserData;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * ASAN header page replacement; not supported here.
 *
 * Always logs a one-time warning and asserts first. Past the assert, builds
 * with SANITIZER_AMDGPU defined log the address and return
 * HSAKMT_STATUS_SUCCESS; all other builds return
 * HSAKMT_STATUS_NOT_SUPPORTED. Nothing is done with `addr` beyond logging.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  assert(false);

#ifdef SANITIZER_AMDGPU
  pr_debug("address %p\n", addr);
  CHECK_DXG_OPEN();
  return HSAKMT_STATUS_SUCCESS;
#else
  return HSAKMT_STATUS_NOT_SUPPORTED;
#endif
}
/*
 * ASAN header page return; not supported here.
 *
 * Always logs a one-time warning and asserts first. Past the assert, builds
 * with SANITIZER_AMDGPU defined log the address and return
 * HSAKMT_STATUS_SUCCESS; all other builds return
 * HSAKMT_STATUS_NOT_SUPPORTED. Nothing is done with `addr` beyond logging.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  assert(false);

#ifdef SANITIZER_AMDGPU
  pr_debug("address %p\n", addr);
  CHECK_DXG_OPEN();
  return HSAKMT_STATUS_SUCCESS;
#else
  return HSAKMT_STATUS_NOT_SUPPORTED;
#endif
}
@@ -0,0 +1,626 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <cstring>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <linux/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <strings.h>
#include <cassert>
/*
 * Process-wide runtime state, heap-allocated during static initialization.
 * clear_after_fork() deletes this object and installs a fresh one, so the
 * pointer value can change over the life of the process.
 */
hsakmtRuntime *dxg_runtime = new hsakmtRuntime();
/*
 * Set up all address-space bookkeeping: first the three address ranges
 * (local heap, system heap, handle aperture), then one VA manager per range.
 * The managers read the ranges established by the first three calls, so the
 * order matters.
 *
 * NOTE(review): the bool results of the Reserve and Init-space steps are
 * ignored here.
 */
void hsakmtRuntime::HeapInit() {
  /* address ranges */
  ReserveLocalHeapSpace();
  ReserveSystemHeapSpace();
  InitHandleApertureSpace();

  /* managers carving allocations out of those ranges */
  InitLocalHeapMgr();
  InitSystemHeapMgr();
  InitHandleApertureMgr();
}
/*
 * Release the system and local heap address ranges reserved by HeapInit().
 * The results of both calls are ignored.
 */
void hsakmtRuntime::HeapFini() {
  FreeSystemHeapSpace();
  FreeLocalHeapSpace();
}
/*
 * Reserve one virtual address range that is valid at the same addresses on
 * the CPU and on every WDDM adapter.
 *
 * base  [out]    start of the reserved range; 0 on failure.
 * size  [in/out] requested length in bytes; set to 0 on failure.
 * align          slack added to the CPU reservation (size + align bytes are
 *                mapped) so the GPU reservation can land somewhere inside.
 *
 * Up to 16 attempts are made. Each attempt mmap()s a PROT_NONE anonymous CPU
 * range and asks every adapter to reserve `size` bytes of GPU VA, passing
 * the CPU range (or, once `base` is set, [base, base + size + 1)) as the
 * start/end arguments. The attempt wins when every adapter succeeds. The
 * unused head and tail of the winning CPU mapping and all earlier CPU
 * mappings are unmapped before returning.
 *
 * Returns true when a common range was found.
 *
 * NOTE(review): `base` is not reset when only some adapters succeed in an
 * attempt, so the next attempt requests the previous GPU address instead of
 * the new CPU range, and GPU VA reserved during a failed attempt is not
 * released in this function - confirm whether that is intended.
 * NOTE(review): the result of get_wddmdev() is dereferenced without a null
 * check here, unlike ReserveLocalHeapSpace() and FreeSvmSpace().
 */
bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) {
/* CPU base address of each attempt; 0 means "attempt not made". */
uint64_t sys_va[16] = {0};
uint64_t local_va;
uint64_t sys_va_size;
/* index of the winning attempt, -1 while none has succeeded */
int match_index = -1;
void* ptr = NULL;
wsl::thunk::WDDMDevice* device;
size_t num_adapters = get_num_wddmdev();
base = 0;
sys_va_size = size + align;
/* Retry up to 16 times to find a range available on CPU and all GPUs. */
for (int i = 0; i < 16; i++) {
local_va = 0;
ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED) {
pr_err("fail to reserve cpu va in %d time!\n", i);
break;
}
sys_va[i] = (uint64_t)ptr;
int match_cnt = 0;
/* adapters are addressed with 1-based ids */
for (uint32_t j = 0; j < num_adapters; j++) {
device = get_wddmdev(j+1);
/* The first adapter searches the whole CPU range; once an address has
 * been obtained, the remaining adapters are asked for that address. */
uint64_t start = (base == 0) ? (uint64_t)ptr : base;
uint64_t end = start + ((base == 0) ? sys_va_size : size) + 1;
if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress(
device->GetAdapter(), size,
start,
end, &local_va) == ErrorCode::Success) {
match_cnt++;
base = local_va;
pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n",
local_va, ptr, i);
} else {
pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n",
__FUNCTION__, ptr, i);
}
}
if (match_cnt == num_adapters) {
match_index = i;
break;
}
}
if (match_index >= 0) {
/* Trim the CPU mapping down to [local_va, local_va + size): release the
 * part below the GPU address and the remainder of the `align` slack
 * above it. */
uint64_t left_size = local_va - sys_va[match_index];
uint64_t right_size = align - left_size;
if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size))
pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size);
if ((right_size > 0) && munmap((void*)(local_va + size), right_size))
pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size);
} else {
pr_err("fail to reserve Local Heap Space!\n");
base = 0;
size = 0;
}
/* Unmap the CPU ranges of the attempts that did not win: every attempt
 * before the winner, or all 16 slots when nothing matched. */
int free = match_index >= 0 ? match_index : 16;
for (int j = 0; j < free; j++) {
if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) {
pr_err("fail to unmap %d %lx\n", j, sys_va[j]);
}
}
return match_index >= 0;
}
/*
 * Find a range that is available at the same addresses in both the CPU
 * virtual address space and the GPU virtual address space.
 * The CPU reservation (sys_va_size) is 1G larger than the GPU range;
 * otherwise ReserveGPUVirtualAddress returns an error.
 */
/*
 * Reserve the CPU/GPU shared virtual range used for local (device) memory.
 *
 * The range is sized from the largest heap over all adapters: the local heap
 * for discrete GPUs, the larger of local and non-local heap otherwise. That
 * size is rounded up to 1G and multiplied by four before being handed to
 * ReserveSvmSpace().
 *
 * Returns true when the range was reserved; false when an adapter could not
 * be looked up or ReserveSvmSpace() failed.
 */
bool hsakmtRuntime::ReserveLocalHeapSpace() {
  uint64_t total_local_size = 0;
  uint64_t align = 0x40000000; /* 1G */
  const size_t num_adapters = get_num_wddmdev();

  for (uint32_t j = 0; j < num_adapters; j++) {
    /* adapters are addressed with 1-based ids */
    wsl::thunk::WDDMDevice *device = get_wddmdev(j + 1);
    if (device == nullptr)
      return false; /* was `return -1`, which converts to true in a bool function */

    /*
     * For APU, use non local memory(shared GPU memory) as GPU memory,
     * because it has small local memory
     */
    if (device->IsDgpu())
      total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size);
    else
      total_local_size = wsl::Max(device->LocalHeapSize(),
                                  device->NonLocalHeapSize(), total_local_size);
  }

  total_local_size = wsl::AlignUp(total_local_size, align) * 4;
  local_heap_space_start_ = 0;
  local_heap_space_size_ = total_local_size;
  return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align);
}
/*
 * Release a range obtained from ReserveSvmSpace(): free the GPU virtual
 * address range on every adapter, then unmap the CPU range.
 *
 * base, size [in/out]: the range to release; both are reset to 0 once the
 *                      CPU range has been handed to munmap().
 *
 * Returns true when munmap() succeeded. Returns false, leaving base and size
 * untouched, when an adapter could not be looked up. The result of the
 * per-adapter FreeGpuVirtualAddress() call is not checked.
 */
bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) {
  const size_t num_adapters = get_num_wddmdev();

  for (uint32_t j = 0; j < num_adapters; j++) {
    /* adapters are addressed with 1-based ids */
    wsl::thunk::WDDMDevice *device = get_wddmdev(j + 1);
    if (device == nullptr)
      return false; /* was `return -1`, which converts to true in a bool function */
    wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size);
  }

  const bool unmapped = (munmap((void *)base, size) == 0);
  base = 0;
  size = 0;
  return unmapped;
}
/*
 * Release the local heap range via FreeSvmSpace(), which also zeroes the
 * stored start and size. Returns FreeSvmSpace()'s result.
 */
bool hsakmtRuntime::FreeLocalHeapSpace() {
  return FreeSvmSpace(local_heap_space_start_,
                      local_heap_space_size_);
}
/*
 * Create the VA manager for the local heap, covering
 * [local_heap_space_start_, + local_heap_space_size_) with
 * DEFAULT_GPU_PAGE_SIZE as its third constructor argument.
 */
void hsakmtRuntime::InitLocalHeapMgr() {
  local_heap_mgr_ = std::make_unique<wsl::thunk::VaMgr>(
      local_heap_space_start_, local_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
}
bool hsakmtRuntime::ReserveSystemHeapSpace() {
struct sysinfo info;
int ret = sysinfo(&info);
uint64_t max_ram = 0x10000000000;
uint64_t alignment = 0x100000000;
assert(!ret);
int32_t protFlags = PROT_NONE;
// minimum of reserve size is 8G, maximum of reserve size is 1T.
system_heap_space_size_ = std::min(wsl::AlignUp(info.totalram, alignment) * 2, max_ram);
return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment);
}
/*
 * Release the system heap range via FreeSvmSpace(), which also zeroes the
 * stored start and size. Returns FreeSvmSpace()'s result.
 */
bool hsakmtRuntime::FreeSystemHeapSpace(void) {
  return FreeSvmSpace(system_heap_space_start_,
                      system_heap_space_size_);
}
/*
 * Turn part of the reserved system heap into usable memory by mapping
 * anonymous read/write/execute pages exactly at `addr` (MAP_FIXED).
 *
 * addr, size: range to commit.
 * lock:       additionally request MAP_LOCKED.
 *
 * Returns false when mmap() fails, true otherwise. A failing
 * madvise(MADV_DONTFORK) is logged but does not fail the call.
 * (A former early-out on !PinWARequired() is disabled in the source.)
 */
bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) {
  const int32_t prot = PROT_READ | PROT_WRITE | PROT_EXEC;
  const int32_t flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
                        MAP_NORESERVE | MAP_UNINITIALIZED |
                        (lock ? MAP_LOCKED : 0);

  void* paddr = mmap(addr, size, prot, flags, -1, 0);
  if (paddr == MAP_FAILED) {
    pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
    return false;
  }
  assert(addr == paddr);

  /*
   * Do not make the pages in this range available to the child
   * after a fork(2). This is useful to prevent copy-on-write
   * semantics from changing the physical location of a page if
   * the parent writes to it after a fork(2). (Such page
   * relocations cause problems for hardware that DMAs into the
   * page.)
   *
   * https://man7.org/linux/man-pages/man2/madvise.2.html
   */
  const int advise_rc = madvise(addr, size, MADV_DONTFORK);
  if (advise_rc != 0)
    pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);

  return true;
}
/*
 * Give committed system heap pages back while keeping the address range
 * reserved: the range is re-mapped in place (MAP_FIXED) as an anonymous
 * PROT_NONE mapping.
 *
 * Returns false when mmap() fails, true otherwise.
 */
bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) {
  const int32_t prot = PROT_NONE;
  const int32_t flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
                        MAP_NORESERVE | MAP_UNINITIALIZED;

  void* paddr = mmap(addr, size, prot, flags, -1, 0);
  if (paddr != MAP_FAILED) {
    assert(addr == paddr);
    return true;
  }

  pr_err("fail to decommit addr = %p, paddr = %p\n", addr, paddr);
  return false;
}
/*
 * Create the VA manager for the system heap, covering
 * [system_heap_space_start_, + system_heap_space_size_) with
 * DEFAULT_GPU_PAGE_SIZE as its third constructor argument.
 */
void hsakmtRuntime::InitSystemHeapMgr() {
  system_heap_mgr_ = std::make_unique<wsl::thunk::VaMgr>(
      system_heap_space_start_, system_heap_space_size_, DEFAULT_GPU_PAGE_SIZE);
}
/*
 * Allocate a virtual address range from the system or the local heap.
 *
 * domain:            thunk_proxy::kSystem selects the system heap (the range
 *                    is also committed on the CPU); any other value selects
 *                    the local heap.
 * hit_base_addr:     passed through to the VA manager's Alloc().
 * size:              length in bytes.
 * out_gpu_virt_addr: receives the address, or 0 on failure.
 * alignment:         0 selects 64K; requests of GPU_HUGE_PAGE_SIZE or more
 *                    are always aligned to GPU_HUGE_PAGE_SIZE.
 * lock:              forwarded to CommitSystemHeapSpace().
 *
 * Returns ErrorCode::Success, OutOfMemory / OutOfGpuMemory when the VA
 * manager has no room, or SyscallFail when committing system memory failed.
 */
ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
                                                  gpusize hit_base_addr, gpusize size,
                                                  gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) {
  gpusize gpu_addr = 0;
  ErrorCode code = ErrorCode::Success;

  uint64_t align = alignment == 0 ? (64 * 1024) : alignment;  // default 64K alignment
  if (size >= GPU_HUGE_PAGE_SIZE)
    align = GPU_HUGE_PAGE_SIZE;

  if (domain == thunk_proxy::kSystem) {
    gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr);
    if (gpu_addr == 0) {
      // Nothing was allocated: do not try to commit (MAP_FIXED mmap) at
      // address 0, and keep the out-of-memory code.
      code = ErrorCode::OutOfMemory;
    } else if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) {
      system_heap_mgr_->Free(gpu_addr);
      code = ErrorCode::SyscallFail;
    }
  } else {
    gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr);
    if (gpu_addr == 0)
      code = ErrorCode::OutOfGpuMemory;
  }

  *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0;
  return code;
}
/*
 * Return a range obtained from ReserveGpuVirtualAddress() to its heap.
 * System-domain ranges are decommitted on the CPU first (the decommit result
 * is not checked); every other domain only updates the local heap manager.
 *
 * Always returns ErrorCode::Success.
 */
ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain,
                                               gpusize gpu_addr, gpusize size) {
  const bool is_system = (domain == thunk_proxy::kSystem);

  if (!is_system) {
    local_heap_mgr_->Free(gpu_addr);
    return ErrorCode::Success;
  }

  DecommitSystemHeapSpace((void *)gpu_addr, size);
  system_heap_mgr_->Free(gpu_addr);
  return ErrorCode::Success;
}
/*
 * Commit part of the system heap as shareable memory: the range is mapped
 * MAP_SHARED | MAP_FIXED at `addr` on top of a memfd.
 *
 * addr, size: range to commit.
 * memfd [in/out]: -1 asks for a new memfd ("rocr4wsl_gtt") to be created and
 *                 sized to `size`; any other value is used as the backing
 *                 descriptor as-is. On success it holds the descriptor that
 *                 backs the mapping; on failure it is left unchanged.
 * lock:       additionally request MAP_LOCKED.
 *
 * Returns true on success. On failure a descriptor created by this call is
 * closed again. A failing madvise(MADV_DONTFORK) is only logged.
 */
bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
  const bool created_here = (memfd == -1);
  int fd = memfd;

  if (created_here) {
    fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
    if (fd < 0) {
      pr_err("memfd_create failed\n");
      return false;
    }
    // A new memfd has length zero; without a successful resize the mapping
    // below would have no pages behind it.
    if (ftruncate(fd, size) != 0) {
      pr_err("ftruncate failed for memfd\n");
      close(fd);
      return false;
    }
  }

  int32_t protFlags = PROT_READ | PROT_WRITE;
  int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
                     MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);
  void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
  if (paddr == MAP_FAILED) {
    pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
    if (created_here)
      close(fd);
    return false;
  }
  assert(addr == paddr);
  memfd = fd;

  if (madvise(addr, size, MADV_DONTFORK))
    pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
  return true;
}
/*
 * Undo CommitSystemHeapSpaceIPC(): unmap the range, then close the backing
 * descriptor and reset `memfd` to -1.
 *
 * Returns false, leaving `memfd` open and unchanged, when munmap() fails.
 */
bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
  const int unmap_rc = munmap(addr, size);
  if (unmap_rc != 0) {
    pr_err("fail to unmap = %p \n", addr);
    return false;
  }

  close(memfd);
  memfd = -1;
  return true;
}
/*
 * Allocate a system heap range and commit it as memfd-backed shareable
 * memory.
 *
 * size, alignment:   forwarded to the system heap VA manager.
 * out_gpu_virt_addr: receives the address on success and 0 when committing
 *                    fails; it is not written when the VA manager has no
 *                    room.
 * memfd, lock:       forwarded to CommitSystemHeapSpaceIPC().
 *
 * Returns ErrorCode::Success, OutOfMemory or SyscallFail.
 */
ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size,
                                          gpusize *out_gpu_virt_addr, gpusize alignment,
                                          int &memfd, bool lock) {
  const gpusize va = system_heap_mgr_->Alloc(size, alignment, 0);
  if (va == 0)
    return ErrorCode::OutOfMemory;

  if (!CommitSystemHeapSpaceIPC((void*)va, size, memfd, lock)) {
    system_heap_mgr_->Free(va);
    *out_gpu_virt_addr = 0;
    return ErrorCode::SyscallFail;
  }

  *out_gpu_virt_addr = va;
  return ErrorCode::Success;
}
/*
 * Release memory obtained from ReserveIPCSysMem(): decommit the memfd-backed
 * mapping (result not checked) and return the range to the system heap
 * manager.
 *
 * Always returns ErrorCode::Success.
 */
ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
  DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd);
  system_heap_mgr_->Free(gpu_addr);

  return ErrorCode::Success;
}
/*
 * Pick the address window used as the handle aperture.
 *
 * The window is 1 << 47 bytes long and starts at START_NON_CANONICAL_ADDR.
 * Whenever it overlaps an adapter's private or shared aperture it is moved
 * up by 1 << 47 and the same adapter is checked again.
 *
 * Returns true once every adapter has been checked; false when an adapter
 * could not be looked up or the start is not below
 * END_NON_CANONICAL_ADDR - 1.
 *
 * NOTE(review): the outer `while` body always returns on its first pass, so
 * the END_NON_CANONICAL_ADDR bound is only tested before the first bump, and
 * adapters already checked are not re-validated after a later bump - confirm
 * whether that is intended.
 */
bool hsakmtRuntime::InitHandleApertureSpace() {
  const size_t num_adapters = get_num_wddmdev();

  handle_aperture_start_ = START_NON_CANONICAL_ADDR;
  handle_aperture_size_ = 1ULL << 47;

  while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) {
    /* j only advances once adapter j+1 no longer overlaps the window */
    for (uint32_t j = 0; j < num_adapters;) {
      wsl::thunk::WDDMDevice *device = get_wddmdev(j + 1);
      if (device == nullptr)
        return false; /* was `return -1`, which converts to true in a bool function */

      if (device->PrivateApertureBase() &&
          IS_OVERLAPPING(device->PrivateApertureBase(),
                         device->PrivateApertureSize(),
                         handle_aperture_start_,
                         handle_aperture_size_)) {
        handle_aperture_start_ += (1ULL << 47);
        continue;
      }

      if (device->SharedApertureBase() &&
          IS_OVERLAPPING(device->SharedApertureBase(),
                         device->SharedApertureSize(),
                         handle_aperture_start_,
                         handle_aperture_size_)) {
        handle_aperture_start_ += (1ULL << 47);
        continue;
      }

      j++;
    }

    pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_);
    return true;
  }

  handle_aperture_start_ = 0;
  pr_err("fail\n");
  return false;
}
/*
 * Create the VA manager for the handle aperture, covering
 * [handle_aperture_start_, + handle_aperture_size_) with
 * DEFAULT_GPU_PAGE_SIZE as its third constructor argument.
 */
void hsakmtRuntime::InitHandleApertureMgr() {
  handle_aperture_mgr_ = std::make_unique<wsl::thunk::VaMgr>(
      handle_aperture_start_, handle_aperture_size_, DEFAULT_GPU_PAGE_SIZE);
}
/*
 * Allocate `size` bytes of address space from the handle aperture.
 * Requests of GPU_HUGE_PAGE_SIZE or more are aligned to GPU_HUGE_PAGE_SIZE,
 * smaller ones to DEFAULT_GPU_PAGE_SIZE.
 *
 * out_gpu_virt_addr receives the address (0 on failure).
 * Returns ErrorCode::Success or ErrorCode::OutOfHandleApeMemory.
 */
ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
  const uint64_t align =
      (size >= GPU_HUGE_PAGE_SIZE) ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;

  *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, align);

  return (*out_gpu_virt_addr == 0) ? ErrorCode::OutOfHandleApeMemory
                                   : ErrorCode::Success;
}
/*
 * Return an address obtained from HandleApertureAlloc() to the handle
 * aperture manager.
 */
void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) {
  handle_aperture_mgr_->Free(gpu_addr);
}
/* is_forked_child reports whether this process is a fork of the one that
 * last recorded its pid in dxg_runtime->parent_pid. pthread_atfork alone is
 * not enough because a process can fork without going through libc's fork
 * (clone, or a raw system call), so the current pid is compared against the
 * recorded one as well.
 *
 * Once a fork has been detected the answer is latched in
 * dxg_runtime->is_forked and the recorded pid is updated.
 */
bool is_forked_child(void) {
  /* already latched, either here or by child_fork_handler() */
  if (dxg_runtime->is_forked)
    return true;

  const pid_t current = getpid();
  if (dxg_runtime->parent_pid == current)
    return false;

  dxg_runtime->is_forked = true;
  dxg_runtime->parent_pid = current;
  return true;
}
/* Callbacks from pthread_atfork */

/* prepare: take hsakmt_mutex before the fork. */
static void prepare_fork_handler(void) {
  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);
}
/* pthread_atfork parent callback: release the hsakmt_mutex taken by
 * prepare_fork_handler(). */
static void parent_fork_handler(void) {
  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
}
/* pthread_atfork child callback: re-initialize hsakmt_mutex (rather than
 * unlocking it) and flag the runtime as forked so that is_forked_child()
 * reports true. */
static void child_fork_handler(void) {
  pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL);
  dxg_runtime->is_forked = true;
}
/* Call this from the child process after fork. This will clear all
* data that is duplicated from the parent process, that is not valid
* in the child.
* The topology information is duplicated from the parent is valid
* in the child process so it is not cleared
*/
static void clear_after_fork(void) {
reset_suballocator();
clear_allocation_map();
if (dxg_runtime->dxg_fd >= 0) {
close(dxg_runtime->dxg_fd);
dxg_runtime->dxg_fd = -1;
}
delete dxg_runtime;
dxg_runtime = new hsakmtRuntime();
}
/* Cache the system page size and its shift. The shift is the index of the
 * lowest set bit of the page size, ffs() - 1. */
static inline void init_page_size(void) {
  dxg_runtime->page_size = sysconf(_SC_PAGESIZE);

  const int lowest_set_bit = ffs(dxg_runtime->page_size);
  dxg_runtime->page_shift = lowest_set_bit - 1;
}
/*
 * Load runtime tunables from the environment into dxg_runtime. Variables
 * that are not set leave the corresponding field untouched.
 *
 *   HSAKMT_DEBUG_LEVEL             -> hsakmt_debug_level (atoi)
 *   HSA_ZFB                        -> zfb_support (atoi)
 *   WSLKMT_VENDOR_PACKET           -> vendor_packet_process (atoi)
 *   WSL_CHECK_AVAIL_SYSRAM         -> check_avail_sysram (true only for "1")
 *   WSL_ENABLE_THUNK_SUB_ALLOCATOR -> enable_thunk_sub_allocator (atoi)
 *   ROCR_VISIBLE_DEVICES           -> default_node = first number found + 1
 *
 * Always returns HSAKMT_STATUS_SUCCESS.
 */
static HSAKMT_STATUS init_vars_from_env(void) {
  const char *envvar;

  /* Normally libraries don't print messages. For debugging purpose, we'll
   * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set.
   */
  envvar = getenv("HSAKMT_DEBUG_LEVEL");
  if (envvar)
    dxg_runtime->hsakmt_debug_level = atoi(envvar);

  /* Check whether to support Zero frame buffer */
  envvar = getenv("HSA_ZFB");
  if (envvar)
    dxg_runtime->zfb_support = atoi(envvar);

  /* Check whether to handle vendor specific aql packet */
  envvar = getenv("WSLKMT_VENDOR_PACKET");
  if (envvar)
    dxg_runtime->vendor_packet_process = atoi(envvar);

  /* Decide whether to check available system memory before allocation */
  envvar = getenv("WSL_CHECK_AVAIL_SYSRAM");
  if (envvar)
    dxg_runtime->check_avail_sysram = !strcmp(envvar, "1");

  envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR");
  if (envvar)
    dxg_runtime->enable_thunk_sub_allocator = atoi(envvar);

  envvar = getenv("ROCR_VISIBLE_DEVICES");
  if (envvar) {
    const std::string devices(envvar);
    const size_t first_num_pos = devices.find_first_of("0123456789");
    if (first_num_pos != std::string::npos) {
      /* strtoul() instead of std::stoi(): an oversized number must not throw
       * out of this C entry point. Values that would not fit in an int after
       * the +1 below are ignored. */
      const unsigned long kIndexLimit = 0x7fffffffUL; /* INT_MAX */
      const unsigned long index = strtoul(envvar + first_num_pos, nullptr, 10);
      if (index < kIndexLimit)
        dxg_runtime->default_node = static_cast<int>(index) + 1;
      else
        pr_err("ROCR_VISIBLE_DEVICES index out of range, ignored\n");
    }
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Open (or add a reference to) the connection to the DXG device.
 *
 * First open: loads the environment settings, opens the device node, loads
 * dxcore, initializes the HSA loader and page size, and installs the fork
 * handlers once per process. Later opens only bump dxg_open_count.
 *
 * Returns HSAKMT_STATUS_SUCCESS on the first open,
 * HSAKMT_STATUS_KERNEL_ALREADY_OPENED on later opens,
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the device node cannot be
 * opened, and HSAKMT_STATUS_ERROR when dxcore cannot be loaded.
 *
 * NOTE(review): when a fork is detected, clear_after_fork() replaces
 * dxg_runtime while its hsakmt_mutex is held here, so the unlock below acts
 * on the new object's mutex - confirm this is safe.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) {
  HSAKMT_STATUS result;
  int fd = -1;

  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);

  /* If the process has forked, the child process must re-initialize
   * its connection to DXG. Any references tracked by dxg_open_count
   * belong to the parent
   */
  if (is_forked_child())
    clear_after_fork();

  if (dxg_runtime->dxg_open_count == 0) {
    static bool atfork_installed = false;

    result = init_vars_from_env();
    if (result != HSAKMT_STATUS_SUCCESS)
      goto open_failed;

    if (dxg_runtime->dxg_fd < 0) {
      fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC);
      if (fd == -1) {
        result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
        goto open_failed;
      }
      dxg_runtime->dxg_fd = fd;
    }

    if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) {
      pr_err("Failed to load libdxcore.so\n");
      result = HSAKMT_STATUS_ERROR;
      goto dxcore_loader_failed;
    }

    hsakmt_hsa_loader_init();
    init_page_size();

    /* The trailing `&& false` keeps the SVM API reported as unsupported
     * regardless of HSA_USE_SVM. */
    char *useSvmStr = getenv("HSA_USE_SVM");
    dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false;

    dxg_runtime->dxg_open_count = 1;

    if (!atfork_installed) {
      /* Atfork handlers cannot be uninstalled and
       * must be installed only once. Otherwise
       * prepare will deadlock when trying to take
       * the same lock multiple times.
       */
      pthread_atfork(prepare_fork_handler, parent_fork_handler,
                     child_fork_handler);
      atfork_installed = true;
    }
  } else {
    dxg_runtime->dxg_open_count++;
    result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
  }

  reset_suballocator();
  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
  return result;

dxcore_loader_failed:
  /* Close the descriptor AND forget it: leaving the closed value in dxg_fd
   * would make the next open attempt skip open() and reuse a dead fd. */
  if (dxg_runtime->dxg_fd >= 0) {
    close(dxg_runtime->dxg_fd);
    dxg_runtime->dxg_fd = -1;
  }
open_failed:
  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
  return result;
}
/*
 * Drop one reference to the DXG connection. When the last reference goes
 * away the device descriptor is closed and dxcore is shut down.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when nothing was open.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) {
  HSAKMT_STATUS result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;

  pthread_mutex_lock(&dxg_runtime->hsakmt_mutex);

  if (dxg_runtime->dxg_open_count > 0) {
    result = HSAKMT_STATUS_SUCCESS;

    --dxg_runtime->dxg_open_count;
    if (dxg_runtime->dxg_open_count == 0) {
      /* last user: tear the connection down */
      close(dxg_runtime->dxg_fd);
      dxg_runtime->dxg_fd = -1;
      wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown();
    }
  }

  pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex);
  return result;
}
@@ -0,0 +1,78 @@
/*
* Copyright © 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * PC sampling (used by profiling tools) is not supported here: logs a
 * one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * PC sampling capability query (used by profiling tools); not supported.
 * No output is written: logs a one-time warning and returns
 * HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
                                  HSAuint32 sample_info_sz, HSAuint32 *size) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * PC sampling session creation (used by profiling tools); not supported.
 * traceId is never written: logs a one-time warning and returns
 * HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId,
                                               HsaPcSamplingInfo *sample_info,
                                               HsaPcSamplingTraceId *traceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * PC sampling session teardown (used by profiling tools); not supported.
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId,
                                                HsaPcSamplingTraceId traceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * PC sampling start (used by profiling tools); not supported.
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId,
                                              HsaPcSamplingTraceId traceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * PC sampling stop (used by profiling tools); not supported.
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId,
                                             HsaPcSamplingTraceId traceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
@@ -0,0 +1,90 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
 * Performance counter property query; not supported here.
 * CounterProperties is never written: logs a one-time warning and returns
 * HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* Registers a set of (HW) counters to be used for tracing/profiling.
 * Not supported here: TraceRoot is never written, a one-time warning is
 * logged and HSAKMT_STATUS_NOT_SUPPORTED is returned. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
                                               HSAuint32 NumberOfCounters,
                                               HsaCounter *Counters,
                                               HsaPmcTraceRoot *TraceRoot) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* Unregisters a set of (HW) counters used for tracing/profiling.
 * Not supported here: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
                                                 HSATraceId TraceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * Trace access acquisition; not supported here.
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
                                                    HSATraceId TraceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * Trace access release; not supported here.
 * Logs a one-time warning and returns HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
                                                    HSATraceId TraceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* Starts tracing operation on a previously established set of performance
 * counters.
 * Not supported here: TraceBuffer is not touched, a one-time warning is
 * logged and HSAKMT_STATUS_NOT_SUPPORTED is returned. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
                                            void *TraceBuffer,
                                            HSAuint64 TraceBufferSizeBytes) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* Forces an update of all the counters that a previously started trace
 * operation has registered.
 * Not supported here: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* Stops tracing operation on a previously established set of performance
 * counters.
 * Not supported here: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
@@ -0,0 +1,216 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <cinttypes>
#include "impl/wddm/device.h"
#include "impl/wddm/queue.h"
#include "impl/hsa/amd_hsa_signal.h"
/*
 * Returns the per-CU VGPR size value for the given engine id.
 *
 * The value is 0x60000 when HSA_GET_GFX_VERSION_FULL(id.ui32) yields one of
 * 0x1100, 0x1101, 0x1151, 0x1200 or 0x1201, and 0x40000 for every other
 * version. (Units are presumably bytes -- confirm against the callers.)
 */
uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) {
  const uint32_t gfx_version = HSA_GET_GFX_VERSION_FULL(id.ui32);

  switch (gfx_version) {
  case 0x1100:
  case 0x1101:
  case 0x1151:
  case 0x1200:
  case 0x1201:
    return 0x60000;
  default:
    return 0x40000;
  }
}
/*
 * Creates a queue without an explicit SDMA engine selection.
 *
 * HSA_QUEUE_SDMA_BY_ENG_ID is rejected with HSAKMT_STATUS_ERROR (this entry
 * point has no engine-id parameter). Every other type is forwarded to
 * hsaKmtCreateQueueExt() with an SDMA engine id of 0, and that call's status
 * is returned unchanged.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
                                          HSA_QUEUE_TYPE Type,
                                          HSAuint32 QueuePercentage,
                                          HSA_QUEUE_PRIORITY Priority,
                                          void *QueueAddress,
                                          HSAuint64 QueueSizeInBytes,
                                          HsaEvent *Event,
                                          HsaQueueResource *QueueResource)
{
  return (Type == HSA_QUEUE_SDMA_BY_ENG_ID)
             ? HSAKMT_STATUS_ERROR
             : hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority,
                                    0 /* SdmaEngineId */, QueueAddress,
                                    QueueSizeInBytes, Event, QueueResource);
}
/*
 * Creates a compute (AQL) or SDMA queue on the given node.
 *
 * Parameters:
 *   NodeId           - node whose WDDM device hosts the queue.
 *   Type             - HSA_QUEUE_COMPUTE_AQL, HSA_QUEUE_SDMA or
 *                      HSA_QUEUE_SDMA_BY_ENG_ID; anything else is rejected.
 *   QueuePercentage  - not used by this implementation.
 *   Priority         - must lie in [HSA_QUEUE_PRIORITY_MINIMUM,
 *                      HSA_QUEUE_PRIORITY_MAXIMUM].
 *   SdmaEngineId     - only logged; engine 0 is always used (see TODO).
 *   QueueAddress     - ring buffer, claimed through queue_acquire_buffer().
 *   QueueSizeInBytes - ring size; for AQL it is divided by 64 to obtain the
 *                      packet count.
 *   Event            - asserted to be null.
 *   QueueResource    - in/out. For AQL the caller supplies the read/write
 *                      pointers and ErrorReason; for SDMA the read/write
 *                      pointers are filled in from the new queue. QueueId and
 *                      the doorbell pointer are always outputs.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_INVALID_PARAMETER when an
 * argument is rejected or the ring buffer cannot be acquired.
 *
 * All argument validation happens before queue_acquire_buffer() so that no
 * failure path leaves the ring buffer acquired.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
                                             HSA_QUEUE_TYPE Type,
                                             HSAuint32 QueuePercentage,
                                             HSA_QUEUE_PRIORITY Priority,
                                             HSAuint32 SdmaEngineId,
                                             void *QueueAddress,
                                             HSAuint64 QueueSizeInBytes,
                                             HsaEvent *Event,
                                             HsaQueueResource *QueueResource) {
  CHECK_DXG_OPEN();

  assert(Event == nullptr);

  if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
      Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  // Every path below writes through QueueResource.
  if (QueueResource == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  // Reject unsupported queue types up front, before any resource is claimed.
  switch (Type) {
  case HSA_QUEUE_COMPUTE_AQL:
  case HSA_QUEUE_SDMA:
  case HSA_QUEUE_SDMA_BY_ENG_ID:
    break;
  default:
    assert(false);
    QueueResource->QueueId = 0;
    QueueResource->Queue_DoorBell = nullptr;
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }

  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
  assert(device_);
  // Release builds compile the assert out; fail cleanly instead of crashing.
  if (device_ == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  if (!queue_acquire_buffer(QueueAddress))
    return HSAKMT_STATUS_INVALID_PARAMETER;

  if (Type == HSA_QUEUE_COMPUTE_AQL) {
    assert(QueueResource->ErrorReason == nullptr);

    // The ring is sized in 64-byte packets.
    uint64_t pkg_num = QueueSizeInBytes / 64;
    uint32_t cmdbuf_size = device_->GetCmdbufSize();
    uint32_t queue_engine = device_->GetComputeEngine();
    bool use_hws = device_->IsHwsEnabled(queue_engine);

    auto queue_ = new wsl::thunk::ComputeQueue(
        device_, QueueAddress, pkg_num,
        reinterpret_cast<std::atomic<uint64_t> *>(
            QueueResource->Queue_write_ptr_aql),
        reinterpret_cast<std::atomic<uint64_t> *>(
            QueueResource->Queue_read_ptr_aql),
        QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);

    // The queue object's address doubles as the opaque queue id.
    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
    // for doorbell_signal.hardware_doorbell_ptr
    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
  } else {
    // HSA_QUEUE_SDMA or HSA_QUEUE_SDMA_BY_ENG_ID
    pr_debug("create sdma queue in engine %u\n", SdmaEngineId);

    uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId
    bool use_hws = device_->IsHwsEnabled(queue_engine);

    auto queue_ = new wsl::thunk::SDMAQueue(
        device_, QueueAddress, QueueSizeInBytes,
        queue_engine, use_hws);

    // The queue object's address doubles as the opaque queue id.
    QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
    QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
    QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr();
    QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr();
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Queue update entry point.
 *
 * Only validates its inputs: Priority must lie within
 * [HSA_QUEUE_PRIORITY_MINIMUM, HSA_QUEUE_PRIORITY_MAXIMUM] and QueueId must
 * be non-null, otherwise HSAKMT_STATUS_INVALID_PARAMETER is returned. None of
 * the requested values are applied to the queue; a valid call simply reports
 * HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
    HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
    void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) {
  CHECK_DXG_OPEN();

  const bool priority_in_range = Priority >= HSA_QUEUE_PRIORITY_MINIMUM &&
                                 Priority <= HSA_QUEUE_PRIORITY_MAXIMUM;
  if (!priority_in_range) {
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }

  const auto *target = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
  return (target == nullptr) ? HSAKMT_STATUS_INVALID_PARAMETER
                             : HSAKMT_STATUS_SUCCESS;
}
/*
 * Destroys a queue previously returned through HsaQueueResource::QueueId.
 *
 * QueueId is reinterpreted as a wsl::thunk::WDDMQueue pointer. A null id
 * yields HSAKMT_STATUS_INVALID_PARAMETER. Otherwise the queue object is
 * deleted and its ring buffer is handed back via queue_release_buffer().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
  CHECK_DXG_OPEN();

  auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
  // Validate before the first dereference below.
  if (!queue_)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  // Fetch the ring address first: it is needed after the queue is deleted.
  void *QueueAddress = queue_->GetHsaQueueAddr();

  delete queue_;
  queue_release_buffer(QueueAddress);

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * CU-mask entry point.
 *
 * Validates the arguments only: QueueId and QueueCUMask must be non-null and
 * CUMaskCount must be a non-zero multiple of 32, otherwise
 * HSAKMT_STATUS_INVALID_PARAMETER is returned. The mask itself is not
 * applied; a one-time "not implemented" warning is logged and
 * HSAKMT_STATUS_SUCCESS is reported.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
                                             HSAuint32 CUMaskCount,
                                             HSAuint32 *QueueCUMask) {
  CHECK_DXG_OPEN();

  const auto *target = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
  const bool mask_is_valid = QueueCUMask != nullptr && CUMaskCount != 0 &&
                             (CUMaskCount % 32) == 0;

  if (target == nullptr || !mask_is_valid) {
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }

  pr_warn_once("not implemented\n");
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Queue-info query entry point (not implemented).
 *
 * A null QueueInfo yields HSAKMT_STATUS_INVALID_PARAMETER. Otherwise the
 * structure is zero-filled and the function hits assert(false); when asserts
 * are compiled out it returns HSAKMT_STATUS_SUCCESS with the zeroed
 * structure. QueueId is not inspected.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
                                           HsaQueueInfo *QueueInfo) {
  CHECK_DXG_OPEN();

  if (!QueueInfo) {
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }

  memset(QueueInfo, 0, sizeof(HsaQueueInfo));

  // Reaching this point means a caller relies on an unimplemented query.
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Trap-handler registration entry point (not implemented).
 *
 * No argument is read. A one-time "not implemented" warning is logged and
 * HSAKMT_STATUS_SUCCESS is returned, so callers proceed as if the handler
 * had been installed.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
                                             void *TrapHandlerBaseAddress,
                                             HSAuint64 TrapHandlerSizeInBytes,
                                             void *TrapBufferBaseAddress,
                                             HSAuint64 TrapBufferSizeInBytes) {
  CHECK_DXG_OPEN();

  // Arguments are deliberately ignored by this stub.
  (void)Node;
  (void)TrapHandlerBaseAddress;
  (void)TrapHandlerSizeInBytes;
  (void)TrapBufferBaseAddress;
  (void)TrapBufferSizeInBytes;

  pr_warn_once("not implemented\n");
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * GWS allocation entry point (not implemented).
 *
 * A null QueueId yields HSAKMT_STATUS_INVALID_PARAMETER. Otherwise the
 * function hits assert(false); when asserts are compiled out it returns
 * HSAKMT_STATUS_SUCCESS without using nGWS or writing *firstGWS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
                                            HSAuint32 *firstGWS) {
  CHECK_DXG_OPEN();

  // These arguments are not used by this stub.
  (void)nGWS;
  (void)firstGWS;

  if (reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId) == nullptr) {
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }

  // Reaching this point means a caller relies on unimplemented functionality.
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Rings the doorbell of the queue identified by QueueId.
 *
 * QueueId is reinterpreted as a wsl::thunk::WDDMQueue pointer. Returns
 * HSAKMT_STATUS_INVALID_PARAMETER for a null id, otherwise calls
 * RingDoorbell() on the queue and returns HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
  CHECK_DXG_OPEN();

  if (auto *target = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId)) {
    target->RingDoorbell();
    return HSAKMT_STATUS_SUCCESS;
  }

  return HSAKMT_STATUS_INVALID_PARAMETER;
}
@@ -0,0 +1,50 @@
/*
* Copyright © 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
/*
 * SPM acquire entry point, used by profiling tools.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) {
  CHECK_DXG_OPEN();

  // The node argument is deliberately ignored by this stub.
  (void)PreferredNode;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * SPM destination-buffer entry point, used by profiling tools.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned. None of the output arguments
 * (timeout, SizeCopied, isSPMDataLoss) are written.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(
    HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout,
    HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) {
  CHECK_DXG_OPEN();

  // Arguments are deliberately ignored by this stub.
  (void)PreferredNode;
  (void)SizeInBytes;
  (void)timeout;
  (void)SizeCopied;
  (void)DestMemoryAddress;
  (void)isSPMDataLoss;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * SPM release entry point, used by profiling tools.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) {
  CHECK_DXG_OPEN();

  // The node argument is deliberately ignored by this stub.
  (void)PreferredNode;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
@@ -0,0 +1,55 @@
/*
* Copyright © 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
 * SVM attribute setter.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned; the attribute array is not read.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size,
                                         unsigned int nattr,
                                         HSA_SVM_ATTRIBUTE *attrs) {
  CHECK_DXG_OPEN();

  /* Arguments are deliberately ignored by this stub. */
  (void)start_addr;
  (void)size;
  (void)nattr;
  (void)attrs;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * SVM attribute getter.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned; the attribute array is not
 * written.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size,
                                         unsigned int nattr,
                                         HSA_SVM_ATTRIBUTE *attrs) {
  CHECK_DXG_OPEN();

  /* Arguments are deliberately ignored by this stub. */
  (void)start_addr;
  (void)size;
  (void)nattr;
  (void)attrs;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * XNACK mode setter.
 * Not available in this implementation: a one-time warning is logged and
 * HSAKMT_STATUS_NOT_SUPPORTED is returned regardless of the requested mode.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) {
  CHECK_DXG_OPEN();

  /* The requested mode is deliberately ignored by this stub. */
  (void)enable;

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * XNACK mode getter.
 *
 * XNACK is not supported by this implementation, so the mode is always
 * reported as disabled: *enable is set to 0, a one-time warning is logged
 * and HSAKMT_STATUS_SUCCESS is returned.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER when enable is null.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) {
  CHECK_DXG_OPEN();

  /* The result is written through this pointer; reject null up front. */
  if (enable == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  pr_warn_once("not supported\n");
  *enable = 0;
  return HSAKMT_STATUS_SUCCESS;
}
@@ -0,0 +1,49 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <ctime>
#include <cstring>
#include <cassert>
#include "impl/wddm/device.h"
/*
 * Samples the GPU, CPU and system clock counters for a node.
 *
 * Parameters:
 *   NodeId   - node whose WDDM device is queried for the GPU/CPU counters.
 *   Counters - output structure; zero-filled first, then populated.
 *
 * SystemClockCounter is the CLOCK_MONOTONIC_RAW time in nanoseconds and
 * SystemClockFrequencyHz is fixed at 1 GHz to match. If clock_gettime()
 * fails, SystemClockCounter stays 0.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_INVALID_PARAMETER when
 * Counters is null or no device exists for NodeId.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
                                               HsaClockCounters *Counters) {
  CHECK_DXG_OPEN();

  if (Counters == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  std::memset(Counters, 0, sizeof(*Counters));

  wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId);
  assert(device_);
  // Release builds compile the assert out; fail cleanly instead of crashing.
  if (device_ == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter);

  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0) {
    // Integer arithmetic keeps the value exact; going through double
    // (tv_sec * 1e9) drops low-order nanoseconds once the total exceeds 2^53.
    Counters->SystemClockCounter =
        static_cast<unsigned long long>(ts.tv_sec) * 1000000000ULL +
        static_cast<unsigned long long>(ts.tv_nsec);
  }

  Counters->SystemClockFrequencyHz = 1000000000;

  return HSAKMT_STATUS_SUCCESS;
}
Diferenças do arquivo suprimidas por serem muito extensas Carregar Diff
@@ -0,0 +1,519 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
/*
Helpers to use native types with C++11 atomic operations.
Fixes GCC builtin functionality for x86 with respect to WC and non-temporal
stores.
*/
#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
#include <atomic>
#include "utils.h"
//ALWAYS_CONSERVATIVE will very likely overfence your code.
//For use as a debugging aid only.
#define ALWAYS_CONSERVATIVE 0
#if !ALWAYS_CONSERVATIVE
#if defined(__x86_64__) || defined(_M_X64)
#define X64_ORDER_WC 1
#endif
#if X64_ORDER_WC
#include <xmmintrin.h>
#endif
#endif
namespace wsl {
namespace atomic {
/// @brief: Translate a std::memory_order into the matching __ATOMIC_* flag
/// accepted by the compiler's atomic builtins.
/// When ALWAYS_CONSERVATIVE or X64_ORDER_WC is enabled the result is always
/// __ATOMIC_RELAXED; ordering is then provided by the explicit fences issued
/// in PreFence()/PostFence(). Unrecognized values map to __ATOMIC_SEQ_CST.
/// @param: order(Input), C++11 memory order to translate.
/// @return: int, the corresponding __ATOMIC_* constant.
static constexpr int c11ToBuiltInFlags(std::memory_order order)
{
#if ALWAYS_CONSERVATIVE || X64_ORDER_WC
  return __ATOMIC_RELAXED;
#else
  return (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST
       : (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL
       : (order == std::memory_order_release) ? __ATOMIC_RELEASE
       : (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE
       : (order == std::memory_order_consume) ? __ATOMIC_CONSUME
       : (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED
                                              : __ATOMIC_SEQ_CST;
#endif
}
/// @brief: Fence issued immediately before an atomic operation.
/// For release, seq_cst and acq_rel orders this emits a full seq_cst thread
/// fence (ALWAYS_CONSERVATIVE) or an sfence (X64_ORDER_WC). Other orders, and
/// builds with neither option enabled, emit nothing.
/// @param: order(Input), memory order of the operation that follows.
/// @return: void.
static __forceinline void PreFence(std::memory_order order) {
#if ALWAYS_CONSERVATIVE
  if (order == std::memory_order_release ||
      order == std::memory_order_seq_cst ||
      order == std::memory_order_acq_rel) {
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
  }
#elif X64_ORDER_WC
  if (order == std::memory_order_release ||
      order == std::memory_order_seq_cst ||
      order == std::memory_order_acq_rel) {
    _mm_sfence();
  }
#endif
}
/// @brief: Fence issued immediately after an atomic operation.
/// ALWAYS_CONSERVATIVE: seq_cst, acq_rel and acquire get a full seq_cst
/// thread fence. X64_ORDER_WC: seq_cst gets an mfence, acq_rel and acquire
/// get an lfence. Other orders, and builds with neither option enabled, emit
/// nothing.
/// @param: order(Input), memory order of the operation that preceded.
/// @return: void.
static __forceinline void PostFence(std::memory_order order) {
#if ALWAYS_CONSERVATIVE
  if (order == std::memory_order_seq_cst ||
      order == std::memory_order_acq_rel ||
      order == std::memory_order_acquire) {
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
  }
#elif X64_ORDER_WC
  if (order == std::memory_order_seq_cst) {
    _mm_mfence();
  } else if (order == std::memory_order_acq_rel ||
             order == std::memory_order_acquire) {
    _mm_lfence();
  }
#endif
}
/// @brief: Stand-alone memory fence.
/// ALWAYS_CONSERVATIVE: always a seq_cst thread fence. X64_ORDER_WC: mfence
/// for seq_cst/acq_rel, lfence for acquire, sfence for release, nothing for
/// other orders. Otherwise defers to std::atomic_thread_fence(order).
/// @param: order(Input), requested ordering, seq_cst by default.
/// @return: void.
static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) {
#if ALWAYS_CONSERVATIVE
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#elif X64_ORDER_WC
  if (order == std::memory_order_seq_cst ||
      order == std::memory_order_acq_rel) {
    _mm_mfence();
  } else if (order == std::memory_order_acquire) {
    _mm_lfence();
  } else if (order == std::memory_order_release) {
    _mm_sfence();
  }
#else
  std::atomic_thread_fence(order);
#endif
}
/// @brief: Compile-time check that objects of sizeof(T) are always lock-free
/// for the compiler's atomic builtins. The pointer is used only for overload
/// selection and template deduction; it is never dereferenced.
template <class T>
static __forceinline void BasicCheck(const T*) {
  static_assert(__atomic_always_lock_free(sizeof(T), 0),
                "Atomic type may not be compatible with peripheral atomics.");
}

/// @brief: Overload of the lock-free check for pointers to volatile T.
template <class T>
static __forceinline void BasicCheck(const volatile T*) {
  static_assert(__atomic_always_lock_free(sizeof(T), 0),
                "Atomic type may not be compatible with peripheral atomics.");
}
/// @brief: Atomically read a value of type T with the requested memory order.
/// The builtin load is bracketed by PreFence()/PostFence() for that order.
/// @param: ptr(Input), address of the value to read.
/// @param: order(Input), memory order for the load, relaxed by default.
/// @return: T, the value that was read.
template <class T>
static __forceinline T Load(
    const T* ptr, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  T loaded;
  PreFence(order);
  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
  PostFence(order);

  return loaded;
}
/// @brief: Atomically read a value through a pointer to volatile T with the
/// requested memory order. The builtin load is bracketed by
/// PreFence()/PostFence() for that order.
/// @param: ptr(Input), address of the volatile value to read.
/// @param: order(Input), memory order for the load, relaxed by default.
/// @return: T, the value that was read.
template <class T>
static __forceinline T Load(
    const volatile T* ptr,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  T loaded;
  PreFence(order);
  __atomic_load(ptr, &loaded, c11ToBuiltInFlags(order));
  PostFence(order);

  return loaded;
}
/// @brief: Atomically write a value of type T with the requested memory
/// order. The builtin store is bracketed by PreFence()/PostFence().
/// @param: ptr(Input), address of the object to overwrite.
/// @param: val(Input), value to write.
/// @param: order(Input), memory order for the store, relaxed by default.
/// @return: void.
template <class T>
static __forceinline void Store(T* ptr, T val,
                                std::memory_order order =
                                    std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
  PostFence(order);
}
/// @brief: Atomically write a value through a pointer to volatile T with the
/// requested memory order. The builtin store is bracketed by
/// PreFence()/PostFence().
/// @param: ptr(Input), address of the volatile object to overwrite.
/// @param: val(Input), value to write.
/// @param: order(Input), memory order for the store, relaxed by default.
/// @return: void.
template <class T>
static __forceinline void Store(volatile T* ptr, T val,
                                std::memory_order order =
                                    std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  __atomic_store(ptr, &val, c11ToBuiltInFlags(order));
  PostFence(order);
}
/// @brief: Atomic strong compare-and-swap with the requested memory order.
/// If *ptr equals expected, val is stored; otherwise *ptr is left alone. The
/// failure ordering passed to the builtin is always relaxed.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), value stored when the comparison succeeds.
/// @param: expected(Input), value *ptr is compared against.
/// @param: order(Input), memory order on success, relaxed by default.
/// @return: T, the value observed in *ptr (equals expected on success).
template <class T>
static __forceinline T Cas(
    T* ptr, T val, T expected,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  // On failure the builtin overwrites 'expected' with the observed value.
  __atomic_compare_exchange(ptr, &expected, &val, /*weak=*/false,
                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
  PostFence(order);

  return expected;
}
/// @brief: Atomic strong compare-and-swap through a pointer to volatile T.
/// If *ptr equals expected, val is stored; otherwise *ptr is left alone. The
/// failure ordering passed to the builtin is always relaxed.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), value stored when the comparison succeeds.
/// @param: expected(Input), value *ptr is compared against.
/// @param: order(Input), memory order on success, relaxed by default.
/// @return: T, the value observed in *ptr (equals expected on success).
template <class T>
static __forceinline T Cas(
    volatile T* ptr, T val, T expected,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  // On failure the builtin overwrites 'expected' with the observed value.
  __atomic_compare_exchange(ptr, &expected, &val, /*weak=*/false,
                            c11ToBuiltInFlags(order), __ATOMIC_RELAXED);
  PostFence(order);

  return expected;
}
/// @brief: Atomically replace *ptr with val using the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), value to store.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value held by *ptr before the exchange.
template <class T>
static __forceinline T Exchange(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  T previous;
  PreFence(order);
  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomically replace *ptr with val through a pointer to volatile T,
/// using the requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), value to store.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value held by *ptr before the exchange.
template <class T>
static __forceinline T Exchange(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  T previous;
  PreFence(order);
  __atomic_exchange(ptr, &val, &previous, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-add with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), amount to add.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the addition.
template <class T>
static __forceinline T Add(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-subtract with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), amount to subtract.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the subtraction.
template <class T>
static __forceinline T Sub(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-AND with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), mask that is ANDed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T And(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-OR with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), mask that is ORed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T Or(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-XOR with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: val(Input), mask that is XORed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T Xor(
    T* ptr, T val, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomically add 1 to the variable with the requested memory order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the increment.
template <class T>
static __forceinline T Increment(
    T* ptr, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomically subtract 1 from the variable with the requested memory
/// order.
/// @param: ptr(Input), address of the variable operated on.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the decrement.
template <class T>
static __forceinline T Decrement(
    T* ptr, std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-add through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), amount to add.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the addition.
template <class T>
static __forceinline T Add(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-subtract through a pointer to volatile T, with
/// the requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), amount to subtract.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the subtraction.
template <class T>
static __forceinline T Sub(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-AND through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), mask that is ANDed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T And(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-OR through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), mask that is ORed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T Or(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomic fetch-and-XOR through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: val(Input), mask that is XORed into the variable.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the operation.
template <class T>
static __forceinline T Xor(
    volatile T* ptr, T val,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomically add 1 through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the increment.
template <class T>
static __forceinline T Increment(
    volatile T* ptr,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
/// @brief: Atomically subtract 1 through a pointer to volatile T, with the
/// requested memory order.
/// @param: ptr(Input), address of the volatile variable operated on.
/// @param: order(Input), memory order, relaxed by default.
/// @return: T, the value of the variable before the decrement.
template <class T>
static __forceinline T Decrement(
    volatile T* ptr,
    std::memory_order order = std::memory_order_relaxed) {
  BasicCheck<T>(ptr);

  PreFence(order);
  T previous = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order));
  PostFence(order);

  return previous;
}
} // namespace atomic
} // namespace wsl
#ifdef X64_ORDER_WC
#undef X64_ORDER_WC
#endif
#ifdef ALWAYS_CONSERVATIVE
#undef ALWAYS_CONSERVATIVE
#endif
#endif // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_
@@ -0,0 +1,155 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
#include <memory>
#include <utility>
#include <functional>
#include "core/util/locks.h"
#include "core/util/utils.h"
namespace wsl {
/*
 * Wrapper for a std::unique_ptr that initializes its object at first use.
 *
 * A lazy_ptr holds an optional factory (std::function<T*()>). The first
 * dereference, or a call to touch(), runs the factory under an internal lock,
 * stores the result in the owned unique_ptr and clears the factory. A null
 * factory therefore means "nothing left to construct".
 *
 * Movable but not copyable. The move operations do not take the internal
 * lock.
 */
template <typename T> class lazy_ptr {
 public:
  // Creates an empty lazy_ptr with no factory and no object.
  lazy_ptr() {}

  // Creates a lazy_ptr whose object will be produced by Constructor on first
  // use.
  explicit lazy_ptr(std::function<T*()> Constructor) { reset(Constructor); }

  // Takes over rhs's object and factory.
  lazy_ptr(lazy_ptr&& rhs) {
    obj = std::move(rhs.obj);
    func = std::move(rhs.func);
  }

  // Takes over rhs's object and factory; any object currently owned is
  // destroyed by the unique_ptr assignment.
  lazy_ptr& operator=(lazy_ptr&& rhs) {
    if (this != &rhs) {
      obj = std::move(rhs.obj);
      func = std::move(rhs.func);
    }
    // Flowing off the end of a non-void function is undefined behaviour, so
    // the reference must be returned explicitly.
    return *this;
  }

  lazy_ptr(lazy_ptr&) = delete;
  lazy_ptr& operator=(lazy_ptr&) = delete;

  // Destroys any current object and installs a new factory (or none).
  void reset(std::function<T*()> Constructor = nullptr) {
    obj.reset();
    func = Constructor;
  }

  // Adopts an already-constructed object and drops any pending factory.
  void reset(T* ptr) {
    obj.reset(ptr);
    func = nullptr;
  }

  // Raw-pointer comparisons; these do NOT trigger construction.
  bool operator==(T* rhs) const { return obj.get() == rhs; }
  bool operator!=(T* rhs) const { return obj.get() != rhs; }

  // Constructs the object if needed (blocking on the lock) and yields the
  // owning pointer for member access. Asserts that the object is non-null.
  const std::unique_ptr<T>& operator->() const {
    make(true);
    assert(obj != nullptr && "Null dereference through lazy_ptr.");
    return obj;
  }

  // Constructs the object if needed (blocking on the lock) and returns the
  // owning unique_ptr itself, not a T&. The result may hold nullptr.
  std::unique_ptr<T>& operator*() {
    make(true);
    return obj;
  }

  const std::unique_ptr<T>& operator*() const {
    make(true);
    return obj;
  }

  /*
   * Ensures that the object is created or is being created.
   * This is useful when early construction of the object is required.
   * Does not block: if the lock cannot be taken immediately the call returns
   * without constructing.
   */
  void touch() const { make(false); }

  // Tells if the lazy object has been constructed or not.
  // Construction may fail silently (return nullptr).
  // Also true when no factory was ever installed, since only func is tested.
  bool created() const {
    std::atomic_thread_fence(std::memory_order_acquire);
    return func == nullptr;
  }

  // Tells if the lazy object exists or not.
  bool empty() const {
    std::atomic_thread_fence(std::memory_order_acquire);
    return obj == nullptr;
  }

 private:
  // All state is mutable so that const accessors can construct on demand.
  mutable std::unique_ptr<T> obj;
  mutable std::function<T*(void)> func;
  mutable KernelMutex lock;

  // Separated from make to improve inlining.
  // block == true waits for the lock; block == false gives up if lock.Try()
  // fails.
  void make_body(bool block) const {
    if (block) {
      lock.Acquire();
    } else if (!lock.Try()) {
      return;
    }
    MAKE_SCOPE_GUARD([&]() { lock.Release(); });

    // Re-check under the lock: the factory may already have been run and
    // cleared.
    if (func == nullptr) return;

    T* ptr = func();
    obj.reset(ptr);
    // Order the obj store before the func clear that created() tests.
    std::atomic_thread_fence(std::memory_order_release);
    func = nullptr;
  }

  // Fast path: skip the lock entirely once the factory has been cleared.
  __forceinline void make(bool block) const {
    if (!created()) {
      make_body(block);
    }
  }
};
} // namespace wsl
#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
@@ -0,0 +1,769 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifdef __linux__
#include "core/util/os.h"
#include "core/util/utils.h"
#include <link.h>
#include <dlfcn.h>
#include <pthread.h>
#include <limits.h>
#include <sched.h>
#include <sys/sysinfo.h>
#include <sys/time.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <errno.h>
#include <cstring>
#include <atomic>
#include <memory>
#include <string>
#include <utility>
#include <semaphore.h>
#include "core/inc/runtime.h"
#if defined(__i386__) || defined(__x86_64__)
#include <cpuid.h>
#endif
namespace wsl {
namespace os {
// Argument bundle handed to ThreadTrampoline through pthread_create. The
// trampoline copies both fields out and deletes the bundle before it calls
// the entry function.
struct ThreadArgs {
// Opaque pointer forwarded unchanged to entry_function.
void* entry_args;
// Routine executed on the new thread.
ThreadEntry entry_function;
};
// pthread start routine. Takes ownership of the heap allocated ThreadArgs,
// frees it, then runs the requested entry function with its argument.
// Always returns nullptr as the thread result.
void* __stdcall ThreadTrampoline(void* arg) {
  ThreadArgs* bundle = (ThreadArgs*)arg;

  // Copy the fields out first so the bundle can be released before user code
  // starts running.
  void* payload = bundle->entry_args;
  ThreadEntry entry = bundle->entry_function;
  delete bundle;

  entry(payload);
  return nullptr;
}
// Thread container allows multiple waits and separate close (destroy).
class os_thread {
public:
explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize)
: thread(0), lock(nullptr), state(RUNNING) {
int err;
std::unique_ptr<ThreadArgs> args(new ThreadArgs);
lock = CreateMutex();
if (lock == nullptr) return;
args->entry_args = threadArgument;
args->entry_function = function;
pthread_attr_t attrib;
err = pthread_attr_init(&attrib);
if (err != 0) {
pr_err("pthread_attr_init failed: %s\n", strerror(err));
return;
}
if (stackSize != 0) {
stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize);
stackSize = AlignUp(stackSize, 4096);
err = pthread_attr_setstacksize(&attrib, stackSize);
if (err != 0) {
pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
err = pthread_attr_destroy(&attrib);
if (err != 0) {
pr_err("pthread_attr_destroy failed: %s\n", strerror(err));
return;
}
}
}
int cores = 0;
cpu_set_t* cpuset = nullptr;
if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) {
cores = get_nprocs_conf();
cpuset = CPU_ALLOC(cores);
if (cpuset == nullptr) {
pr_err("CPU_ALLOC failed: %s\n", strerror(errno));
return;
}
CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset);
for (int i = 0; i < cores; i++) {
CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset);
}
err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset);
CPU_FREE(cpuset);
if (err != 0) {
pr_err("pthread_setaffinity_np failed: %s\n", strerror(err));
return;
}
}
err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
// Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN
// Attempt to grow the stack within reason.
if ((err == EINVAL) && stackSize != 0) {
while (stackSize < 20 * 1024 * 1024) {
stackSize *= 2;
err = pthread_attr_setstacksize(&attrib, stackSize);
if (err != 0) {
pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err));
return;
}
err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get());
if (err != EINVAL) break;
pr_debug("pthread_create returned EINVAL, doubling stack size\n");
}
}
if (err == 0)
args.release();
else
thread = 0;
err = pthread_attr_destroy(&attrib);
if (err != 0) {
pr_err("pthread_attr_destroy failed: %s\n", strerror(err));
}
}
os_thread(os_thread&& rhs) {
thread = rhs.thread;
lock = rhs.lock;
state = int(rhs.state);
rhs.thread = 0;
rhs.lock = nullptr;
}
os_thread(os_thread&) = delete;
~os_thread() {
if (lock != nullptr) DestroyMutex(lock);
if ((state == RUNNING) && (thread != 0)) {
int err = pthread_detach(thread);
if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err));
}
}
bool Valid() { return (lock != nullptr) && (thread != 0); }
bool Wait() {
if (state == FINISHED) return true;
AcquireMutex(lock);
if (state == FINISHED) {
ReleaseMutex(lock);
return true;
}
int err = pthread_join(thread, NULL);
bool success = (err == 0);
if (success) state = FINISHED;
ReleaseMutex(lock);
return success;
}
private:
pthread_t thread;
Mutex lock;
std::atomic<int> state;
enum { FINISHED = 0, RUNNING = 1 };
};
// The functions in this file convert between the opaque OS abstraction handle
// types and native pointers by reinterpreting the handle's storage, so each
// handle type must be exactly pointer sized.
static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch");
static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch");
static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch");
static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch");
static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch");
// Opens the shared library `filename` with lazy symbol binding.
// Logs the dlerror() text and returns a null handle when dlopen fails.
LibHandle LoadLib(std::string filename) {
  const char* path = filename.c_str();
  void* handle = dlopen(path, RTLD_LAZY);
  if (handle == nullptr) {
    pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror());
  }
  return *(LibHandle*)&handle;
}
// Looks up `export_name` in `lib` and returns its address, or null when the
// symbol is missing or is only provided by one of the library's dependencies.
void* GetExportAddress(LibHandle lib, std::string export_name) {
  void* const handle = *(void**)&lib;

  void* symbol = dlsym(handle, export_name.c_str());
  if (symbol == NULL) return symbol;

  // dlsym searches the given library and all the library's load dependencies.
  // The checks below limit the result to symbols that live in the library the
  // handle refers to. This lookup pattern matches Windows.
  link_map* lib_map;
  if (dlinfo(handle, RTLD_DI_LINKMAP, &lib_map) == -1) {
    pr_err("dlinfo failed: %s\n", dlerror());
    return nullptr;
  }

  Dl_info owner;
  if (dladdr(symbol, &owner) == 0) {
    pr_err("dladdr failed.\n");
    return nullptr;
  }

  // Accept the symbol only when the object that defines it has the same path
  // as the requested library.
  const bool defined_in_lib = (strcmp(owner.dli_fname, lib_map->l_name) == 0);
  if (!defined_in_lib) return NULL;
  return symbol;
}
// Releases a library handle obtained from LoadLib. The dlclose result is
// ignored.
void CloseLib(LibHandle lib) {
  void* handle = *(void**)&lib;
  dlclose(handle);
}
/*
 * @brief dl_iterate_phdr callback. Scans the dynamic string table of one
 * loaded shared object for the string "HSA_AMD_TOOL_PRIORITY" and, if it is
 * present, records the object's name.
 *
 * @param[in]: info A dl_phdr_info struct pointer, which contains information
 * about library's load address, header, and name.
 *
 * @param[in]: size integer size of dl_phdr_info struct (unused)
 *
 * @param[in,out]: data copy of the data argument to the dl_iterate_phdr call;
 * must point to a std::vector<std::string> that receives matching names.
 *
 * @retval:: Always 0, so dl_iterate_phdr() keeps visiting the remaining
 * shared objects. (A non-zero value would stop the iteration.)
 */
static int callback(struct dl_phdr_info* info, size_t size, void* data) {
  std::vector<std::string>* loadedToolsLib = (std::vector<std::string>*)data;
  assert(loadedToolsLib != nullptr);

  // Entries without a name are skipped.
  if (!info || info->dlpi_name[0] == '\0') return 0;

  /*
   * The vDSO is a special shared object that is built into the Linux kernel.
   * It is not a regular shared library; how it is laid out in memory differs
   * and it is not guaranteed to have a specific segment or section, so it is
   * skipped as well.
   */
  if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0;

  for (int i = 0; i < info->dlpi_phnum; i++) {
    // Only PT_DYNAMIC program headers describe a dynamic section.
    if (info->dlpi_phdr[i].p_type != PT_DYNAMIC) continue;

    // Load bias + segment virtual address = in-memory dynamic section.
    Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);

    /*
     * Walk the _DYNAMIC array up to its DT_NULL terminator, picking up
     * DT_STRTAB (address of string table) and DT_STRSZ (its size).
     */
    char* table = nullptr;
    Elf64_Xword table_size = 0;
    for (Elf64_Dyn* entry = dyn_section; entry->d_tag != DT_NULL; ++entry) {
      if (entry->d_tag == DT_STRTAB) table = (char*)(entry->d_un.d_ptr);
      if (entry->d_tag == DT_STRSZ) table_size = entry->d_un.d_val;
    }

    if (table == nullptr) {
      pr_debug("String table not found\n");
      continue;
    }

    /*
     * Hacky lookup: step through the NUL separated strings of the table and
     * compare each one with "HSA_AMD_TOOL_PRIORITY". The first hit adds the
     * library name to the result vector and ends the scan of this object.
     */
    char* const table_end = table + table_size;
    for (char* cursor = table; cursor < table_end; cursor += (strlen(cursor) + 1)) {
      if (strcmp(cursor, "HSA_AMD_TOOL_PRIORITY") == 0) {
        loadedToolsLib->push_back(info->dlpi_name);
        return 0;
      }
    }
  }
  return 0;
}
std::vector<LibHandle> GetLoadedToolsLib() {
std::vector<LibHandle> ret;
std::vector<std::string> names;
/* Iterate through all of the loaded shared libraries in the process */
dl_iterate_phdr(callback, &names);
if (!names.empty()) {
for (auto& name : names) ret.push_back(LoadLib(name));
}
return ret;
}
// Returns the path name recorded in the library's link map, or an empty
// string when dlinfo does not report success.
std::string GetLibraryName(LibHandle lib) {
  link_map* lib_map;
  const int rc = dlinfo(lib, RTLD_DI_LINKMAP, &lib_map);
  if (rc != 0) {
    return "";
  }
  return lib_map->l_name;
}
// Allocates a process-private semaphore with an initial count of zero.
// The sem_init result is not checked.
Semaphore CreateSemaphore() {
  sem_t* native = new sem_t;
  sem_init(native, /*pshared=*/0, /*value=*/0);
  return *(Semaphore*)&native;
}
// Blocks until the semaphore can be decremented. Waits interrupted by a
// signal (EINTR) are retried; any other failure returns false.
bool WaitSemaphore(Semaphore sem) {
  for (;;) {
    if (sem_wait(*(sem_t**)&sem) == 0) return true;
    if (errno != EINTR) return false;
  }
}
// Increments the semaphore. A failing sem_post trips an assert in builds
// where asserts are enabled.
void PostSemaphore(Semaphore sem) {
  sem_t* native = *(sem_t**)&sem;
  const int rc = sem_post(native);
  if (rc != 0) {
    assert(false && "Failed to post semaphore");
  }
}
// Destroys a semaphore made by CreateSemaphore and frees its storage.
void DestroySemaphore(Semaphore sem) {
  sem_t* native = *(sem_t**)&sem;
  sem_destroy(native);
  delete native;
}
// Allocates a pthread mutex with default attributes.
// The pthread_mutex_init result is not checked.
Mutex CreateMutex() {
  pthread_mutex_t* native = new pthread_mutex_t;
  pthread_mutex_init(native, NULL);
  return *(Mutex*)&native;
}
// Attempts to take the mutex without blocking; true when it was acquired.
bool TryAcquireMutex(Mutex lock) {
  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
  const int rc = pthread_mutex_trylock(native);
  return rc == 0;
}
// Blocks until the mutex is acquired; false if pthread_mutex_lock fails.
bool AcquireMutex(Mutex lock) {
  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
  const int rc = pthread_mutex_lock(native);
  return rc == 0;
}
// Unlocks the mutex. The pthread_mutex_unlock result is ignored.
void ReleaseMutex(Mutex lock) {
  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
  pthread_mutex_unlock(native);
}
// Destroys a mutex made by CreateMutex and frees its storage.
void DestroyMutex(Mutex lock) {
  pthread_mutex_t* native = *(pthread_mutex_t**)&lock;
  pthread_mutex_destroy(native);
  delete native;
}
// Suspends the calling thread for the given number of milliseconds by
// converting to microseconds and calling usleep.
void Sleep(int delay_in_millisec) {
  const int delay_in_microsec = delay_in_millisec * 1000;
  usleep(delay_in_microsec);
}
// Suspends the calling thread for the given number of microseconds.
void uSleep(int delayInUs) {
  usleep(delayInUs);
}
// Gives up the processor through sched_yield.
void YieldThread() {
  sched_yield();
}
// Starts a thread running function(threadArgument). stackSize is passed on
// to os_thread. Returns nullptr when the os_thread reports it is not valid.
Thread CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) {
  std::unique_ptr<os_thread> worker(new os_thread(function, threadArgument, stackSize));
  if (!worker->Valid()) {
    // unique_ptr deletes the half constructed wrapper.
    return nullptr;
  }
  return reinterpret_cast<Thread>(worker.release());
}
// Destroys the os_thread wrapper behind a handle returned by CreateThread.
void CloseThread(Thread thread) {
  os_thread* worker = reinterpret_cast<os_thread*>(thread);
  delete worker;
}
// Waits for the thread to finish; forwards the result of os_thread::Wait.
bool WaitForThread(Thread thread) {
  os_thread* worker = reinterpret_cast<os_thread*>(thread);
  return worker->Wait();
}
// Waits for each of the threadCount threads in turn. Individual wait
// results are not inspected; the function always returns true.
bool WaitForAllThreads(Thread* threads, uint threadCount) {
  Thread* const last = threads + threadCount;
  for (Thread* current = threads; current != last; ++current) {
    WaitForThread(*current);
  }
  return true;
}
// True when the environment variable exists, even if its value is empty.
bool IsEnvVarSet(std::string env_var_name) {
  return getenv(env_var_name.c_str()) != NULL;
}
// Sets the environment variable, replacing any existing value.
// The setenv result is ignored.
void SetEnvVar(std::string env_var_name, std::string env_var_value) {
  const char* name = env_var_name.c_str();
  const char* value = env_var_value.c_str();
  setenv(name, value, /*overwrite=*/1);
}
// Returns the id of the calling process.
int GetProcessId() {
  const int pid = ::getpid();
  return pid;
}
// Returns the value of the environment variable, or an empty string when it
// is not set.
std::string GetEnvVar(std::string env_var_name) {
  const char* value = getenv(env_var_name.c_str());
  if (value == nullptr) {
    return std::string();
  }
  return std::string(value);
}
// Returns the size of the user mode virtual address range as a fixed,
// build-time constant.
size_t GetUserModeVirtualMemorySize() {
#ifdef _LP64
  // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt :
  // user space is 0000000000000000 - 00007fffffffffff (=47 bits)
  const size_t user_space_bytes = static_cast<size_t>(1ull << 47);
#else
  const size_t user_space_bytes = static_cast<size_t>(0xffffffff);  // ~4GB
#endif
  return user_space_bytes;
}
// Returns the total RAM reported by sysinfo (totalram * mem_unit), capped at
// the user mode virtual address space size. Returns 0 if sysinfo fails.
size_t GetUsablePhysicalHostMemorySize() {
  struct sysinfo mem_info = {};
  const int rc = sysinfo(&mem_info);
  if (rc != 0) return 0;

  const size_t total_ram = static_cast<size_t>(mem_info.totalram * mem_info.mem_unit);
  const size_t address_space = GetUserModeVirtualMemorySize();
  return std::min(address_space, total_ram);
}
// Returns the base of the user mode virtual address range, which is 0 here.
uintptr_t GetUserModeVirtualMemoryBase() {
  const uintptr_t base = 0;
  return base;
}
// Os event implementation
//
// State behind an EventHandle: a condition variable and the mutex that
// guards the two flags.
struct EventDescriptor_ {
  pthread_cond_t event;   // waiters block on this
  pthread_mutex_t mutex;  // guards state and auto_reset
  bool state;             // true while the event is signaled
  bool auto_reset;        // a successful wait clears state when true
};
using EventDescriptor = EventDescriptor_;
// Creates an event object.
//
// auto_reset: when true, a successful wait clears the signaled state.
// init_state: initial signaled state.
// Returns the new handle, or a null handle if the allocation or the
// initialization of the mutex or condition variable fails. The other event
// functions in this file reject a null handle with -1.
EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
  EventDescriptor* eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor));
  // malloc was previously dereferenced without a check.
  if (eventDescrp == NULL) return NULL;

  if (pthread_mutex_init(&eventDescrp->mutex, NULL) != 0) {
    free(eventDescrp);
    return NULL;
  }
  if (pthread_cond_init(&eventDescrp->event, NULL) != 0) {
    pthread_mutex_destroy(&eventDescrp->mutex);
    free(eventDescrp);
    return NULL;
  }

  eventDescrp->auto_reset = auto_reset;
  eventDescrp->state = init_state;
  return reinterpret_cast<EventHandle>(eventDescrp);
}
// Destroys an event made by CreateOsEvent and frees its storage.
// Returns -1 for a null handle, otherwise the bitwise OR of the results of
// destroying the condition variable and the mutex (0 when both succeed).
int DestroyOsEvent(EventHandle event) {
  if (event == NULL) return -1;

  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);
  const int cond_rc = pthread_cond_destroy(&descriptor->event);
  const int mutex_rc = pthread_mutex_destroy(&descriptor->mutex);
  free(descriptor);
  return cond_rc | mutex_rc;
}
// Waits for an event to become signaled.
//
// milli_seconds: 0 polls the event without blocking; any other value is the
//                maximum time to wait.
// Returns:
//   0        the event was signaled (and was cleared if it is auto-reset)
//   1        polling (milli_seconds == 0) found the event not signaled, or
//            its mutex was busy
//   0x14003  the timed wait expired
//   -1       event is a null handle
//   other    error code reported by the pthread call that failed
int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
  if (event == NULL) {
    return -1;
  }
  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);

  if (milli_seconds == 0) {
    // Poll. A successful trylock already owns the mutex, so it must not be
    // locked a second time (the previous code did, which self-deadlocks on a
    // default mutex).
    int try_ret = pthread_mutex_trylock(&eventDescrp->mutex);
    if (try_ret == EBUSY) {
      // Timeout
      return 1;
    }
    if (try_ret != 0) {
      return try_ret;
    }
    int poll_ret = 1;
    if (eventDescrp->state) {
      poll_ret = 0;
      if (eventDescrp->auto_reset) eventDescrp->state = false;
    }
    pthread_mutex_unlock(&eventDescrp->mutex);
    return poll_ret;
  }

  int ret_code = 0;
  pthread_mutex_lock(&eventDescrp->mutex);

  if (!eventDescrp->state) {
    // Absolute deadline = now + milli_seconds.
    struct timespec ts;
    struct timeval tp;
    gettimeofday(&tp, NULL);
    ts.tv_sec = tp.tv_sec;
    ts.tv_nsec = tp.tv_usec * 1000;

    unsigned int sec = milli_seconds / 1000;
    unsigned int mSec = milli_seconds % 1000;
    ts.tv_sec += sec;
    ts.tv_nsec += mSec * 1000000;

    // tv_nsec must stay below one second; carry the overflow into tv_sec.
    // (">=": exactly 1000000000 is already out of range.)
    if (ts.tv_nsec >= 1000000000) {
      ts.tv_sec += 1;
      ts.tv_nsec = ts.tv_nsec - 1000000000;
    }

    // Condition waits may wake up spuriously, so re-check the state and keep
    // waiting against the same deadline until it is set or the wait fails.
    while (!eventDescrp->state && ret_code == 0) {
      ret_code = pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts);
    }

    // Time out
    if (ret_code == ETIMEDOUT) {
      ret_code = 0x14003;  // value callers receive for a timed-out wait
    }
  }

  // A successful wait consumes the signal of an auto-reset event.
  if (ret_code == 0 && eventDescrp->auto_reset) {
    eventDescrp->state = false;
  }

  pthread_mutex_unlock(&eventDescrp->mutex);
  return ret_code;
}
// Marks the event as signaled and wakes one waiter.
// Returns -1 for a null handle, otherwise the bitwise OR of the unlock and
// signal results (the lock result is not reported).
int SetOsEvent(EventHandle event) {
  if (event == NULL) return -1;

  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);

  pthread_mutex_lock(&descriptor->mutex);
  descriptor->state = true;
  const int unlock_rc = pthread_mutex_unlock(&descriptor->mutex);

  // The condition variable is signaled after the mutex has been released.
  const int signal_rc = pthread_cond_signal(&descriptor->event);
  return unlock_rc | signal_rc;
}
// Clears the signaled state of the event.
// Returns -1 for a null handle, otherwise the result of unlocking the mutex
// (the lock result is not reported).
int ResetOsEvent(EventHandle event) {
  if (event == NULL) return -1;

  EventDescriptor* descriptor = reinterpret_cast<EventDescriptor*>(event);

  pthread_mutex_lock(&descriptor->mutex);
  descriptor->state = false;
  const int unlock_rc = pthread_mutex_unlock(&descriptor->mutex);
  return unlock_rc;
}
// Reciprocal of the clock resolution in nanoseconds; 0.0 until
// AccurateClockFrequency() has stored it.
static double invPeriod = 0.0;

// Returns the CLOCK_MONOTONIC_RAW time in nanoseconds scaled by invPeriod.
// Aborts the process if the clock cannot be read.
uint64_t ReadAccurateClock() {
  // First use: let AccurateClockFrequency() fill in invPeriod.
  if (invPeriod == 0.0) AccurateClockFrequency();

  timespec now;
  if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
    pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno));
    abort();
  }

  const uint64_t nanoseconds = uint64_t(now.tv_sec) * 1000000000ull + uint64_t(now.tv_nsec);
  return nanoseconds * invPeriod;
}
// Returns the frequency in Hz of the clock behind ReadAccurateClock, derived
// from clock_getres, and stores the reciprocal of the resolution in
// invPeriod the first time it is known. Aborts if the resolution cannot be
// read or is one second or coarser.
uint64_t AccurateClockFrequency() {
  static clockid_t clock = CLOCK_MONOTONIC;
  static std::atomic<bool> first(true);
  // Check kernel version - not a concurrency concern.
  // use non-RAW for getres due to bug in older 2.6.x kernels
  if (first.load(std::memory_order_acquire)) {
    utsname kernelInfo;
    if (uname(&kernelInfo) == 0) {
      try {
        // Release strings look like "<major>.<minor>...".
        std::string ver = kernelInfo.release;
        size_t idx;
        int major = std::stoi(ver, &idx);
        int minor = std::stoi(ver.substr(idx + 1));
        // Kernel 4.4 or newer. The components must be compared as a version
        // pair: testing (major >= 4 && minor >= 4) rejected releases such as
        // 5.0 or 6.1.
        if ((major > 4) || ((major == 4) && (minor >= 4))) {
          clock = CLOCK_MONOTONIC_RAW;
        }
      } catch (...) {
        // Kernel version string doesn't conform to the standard pattern.
        // Keep using the "safe" (non-RAW) clock.
      }
    }
    first.store(false, std::memory_order_release);
  }

  timespec time;
  int err = clock_getres(clock, &time);
  if (err != 0) {
    pr_err("clock_getres failed %s\n", strerror(errno));
    abort();
  }
  if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) {
    pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n");
    abort();
  }

  if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
  return 1000000000ull / uint64_t(time.tv_nsec);
}
// Creates a read/write lock configured to prefer writers (non-recursive).
// Returns a null handle, after logging, if any pthread call fails. Every
// path releases the attribute object and, on failure, the lock storage.
SharedMutex CreateSharedMutex() {
  pthread_rwlockattr_t attrib;
  int err = pthread_rwlockattr_init(&attrib);
  if (err != 0) {
    pr_err("rw lock attribute init failed: %s\n", strerror(err));
    return nullptr;
  }

#ifdef __GLIBC__
  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
#else
  err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
#endif
  if (err != 0) {
    pr_err("Set rw lock attribute failure: %s\n", strerror(err));
    pthread_rwlockattr_destroy(&attrib);  // was leaked on this path
    return nullptr;
  }

  pthread_rwlock_t* lock = new pthread_rwlock_t;
  err = pthread_rwlock_init(lock, &attrib);
  // The attribute object is no longer needed whether or not init worked.
  pthread_rwlockattr_destroy(&attrib);
  if (err != 0) {
    pr_err("rw lock init failed: %s\n", strerror(err));
    delete lock;  // was leaked on this path
    return nullptr;
  }

  return lock;
}
// Attempts to take the lock for writing without blocking.
bool TryAcquireSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  return pthread_rwlock_trywrlock(native) == 0;
}
// Takes the lock for writing; false if pthread_rwlock_wrlock fails.
bool AcquireSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  return pthread_rwlock_wrlock(native) == 0;
}
// Releases a lock held for writing. A failing unlock is logged and aborts
// the process.
void ReleaseSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  const int rc = pthread_rwlock_unlock(native);
  if (rc == 0) return;

  pr_err("SharedMutex unlock failed: %s\n", strerror(rc));
  abort();
}
// Attempts to take the lock for reading without blocking.
bool TrySharedAcquireSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  return pthread_rwlock_tryrdlock(native) == 0;
}
// Takes the lock for reading; false if pthread_rwlock_rdlock fails.
bool SharedAcquireSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  return pthread_rwlock_rdlock(native) == 0;
}
// Releases a lock held for reading. A failing unlock is logged and aborts
// the process.
void SharedReleaseSharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  const int rc = pthread_rwlock_unlock(native);
  if (rc == 0) return;

  pr_err("SharedMutex unlock failed: %s\n", strerror(rc));
  abort();
}
// Destroys a lock made by CreateSharedMutex and frees its storage.
void DestroySharedMutex(SharedMutex lock) {
  pthread_rwlock_t* native = *(pthread_rwlock_t**)&lock;
  pthread_rwlock_destroy(native);
  delete native;
}
// Resolution of CLOCK_BOOTTIME in nanoseconds. 0 means "not queried yet".
static uint64_t sys_clock_period_ = 0;

// Returns the CLOCK_BOOTTIME time expressed in units of the clock's
// resolution (nanoseconds when the resolution is 1 ns).
// Returns 0 if the clock cannot be read.
uint64_t ReadSystemClock() {
  struct timespec ts;
  if (clock_gettime(CLOCK_BOOTTIME, &ts) != 0) return 0;
  const uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));

  // The period used to be divided by unconditionally, which is a division
  // by zero whenever this function ran before the period had been stored.
  // Query the resolution here on first use instead.
  uint64_t period = sys_clock_period_;
  if (period == 0) {
    struct timespec res;
    if (clock_getres(CLOCK_BOOTTIME, &res) == 0) {
      period = (uint64_t(res.tv_sec) * 1000000000 + uint64_t(res.tv_nsec));
    }
    if (period == 0) period = 1;  // never divide by zero
    sys_clock_period_ = period;
  }

  if (period != 1)
    return time / period;
  else
    return time;
}
// Queries the resolution of CLOCK_BOOTTIME, stores it (in nanoseconds) in
// sys_clock_period_ and returns the matching frequency in Hz.
uint64_t SystemClockFrequency() {
  struct timespec resolution;
  clock_getres(CLOCK_BOOTTIME, &resolution);

  const uint64_t seconds_part = uint64_t(resolution.tv_sec) * 1000000000;
  const uint64_t nanos_part = uint64_t(resolution.tv_nsec);
  sys_clock_period_ = (seconds_part + nanos_part);

  return 1000000000 / sys_clock_period_;
}
// Fills *cpuinfo from the CPUID instruction.
//
// cpuinfo is zeroed first, then ManufacturerID receives the vendor string.
// For "AuthenticAMD" parts, mwaitx is taken from bit 29 of ECX of leaf
// 0x80000001. Returns false on non-x86 builds, when the CPU does not report
// extended leaves up to 0x80000004, or when leaf 0 cannot be read.
bool ParseCpuID(cpuid_t* cpuinfo) {
#if defined(__i386__) || defined(__x86_64__)
  uint32_t eax, ebx, ecx, edx, max_eax = 0;

  memset(cpuinfo, 0, sizeof(*cpuinfo));

  /* Make sure current CPU supports at least extended leaf 0x80000004.
   * __get_cpuid_max() expects the base of the range (0x80000000) and returns
   * the highest supported leaf in it. Passing 0x80000004 itself returned the
   * EAX output of that leaf (brand string bytes), not a leaf count. */
  if (__get_cpuid_max(0x80000000, NULL) < 0x80000004) return false;

  // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX.
  if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0],
                   (uint32_t*)&cpuinfo->ManufacturerID[8],
                   (uint32_t*)&cpuinfo->ManufacturerID[4])) {
    return false;
  }

  if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) {
    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
      cpuinfo->mwaitx = !!((ecx >> 29) & 0x1);
    }
  }
  return true;
#else
  return false;
#endif
}
} // namespace os
} // namespace wsl
#endif
@@ -0,0 +1,290 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// Library of synchronization primitives - to be added to as needed.
#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_
#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_
#include "utils.h"
#include "os.h"
namespace wsl {
/// @brief: a mutex that first retries an atomic flag and then falls back to
/// blocking on an OS semaphore.
/// lock_ holds 0 when the mutex is free and 1 when it is held.
class HybridMutex {
public:
// Starts unlocked and creates the semaphore that blocked waiters sleep on.
HybridMutex():lock_(0) {
sem_ = os::CreateSemaphore();
}
~HybridMutex() {
os::DestroySemaphore(sem_);
}
// Single attempt to switch lock_ from 0 to 1; true when the lock was taken.
bool Try() {
int old = 0;
return lock_.compare_exchange_strong(old, 1);
}
// Retries until lock_ is switched from 0 to 1. Always returns true.
bool Acquire() {
// Retry budget; reloaded after every semaphore wait.
int cnt = maxSpinIterPause + maxSpinIterYield;
int old = 0;
while (!lock_.compare_exchange_strong(old, 1)) {
cnt--;
if (cnt > maxSpinIterPause) {
// Early retries: CPU pause hint only.
_mm_pause();
// The condition below decrements cnt again as a side effect.
// NOTE(review): cnt is an int compared with uint32_t limits, and with both
// limits equal to 55 this yield branch looks unreachable - confirm intent.
} else if (cnt-- > maxSpinIterYield) {
os::YieldThread();
} else {
// Budget used up: block until a Release() posts the semaphore.
os::WaitSemaphore(sem_);
cnt = maxSpinIterPause + maxSpinIterYield;
}
// A failed compare-exchange overwrites `old`; restore the expected value.
old = 0;
}
return true;
}
// Switches lock_ from 1 to 0 and, when that succeeds, posts the semaphore.
void Release() {
int old = 1;
if (lock_.compare_exchange_strong(old, 0))
os::PostSemaphore(sem_);
}
private:
std::atomic<int> lock_;
os::Semaphore sem_;
// Limits used by Acquire() to pick between pausing, yielding and blocking.
const uint32_t maxSpinIterPause = 55;
const uint32_t maxSpinIterYield = 55;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(HybridMutex);
};
/// @brief: a class represents a kernel mutex.
/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
/// until the lock is released (Best for long waits, though anything using
/// a kernel object is a long wait).
class KernelMutex {
public:
KernelMutex() { lock_ = os::CreateMutex(); }
~KernelMutex() { os::DestroyMutex(lock_); }
bool Try() { return os::TryAcquireMutex(lock_); }
bool Acquire() { return os::AcquireMutex(lock_); }
void Release() { os::ReleaseMutex(lock_); }
private:
os::Mutex lock_;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(KernelMutex);
};
/// @brief: represents a spin lock.
/// For very short hold durations on the order of the thread scheduling
/// quanta or less.
/// lock_ is 0 when free and 1 when held.
class SpinMutex {
 public:
  SpinMutex() { lock_.store(0); }

  /// @brief: Single attempt to switch the flag from 0 to 1.
  bool Try() {
    int expected = 0;
    return lock_.compare_exchange_strong(expected, 1);
  }

  /// @brief: Retries until the flag is taken, yielding the thread after
  /// every failed attempt. Always returns true.
  bool Acquire() {
    for (;;) {
      int expected = 0;
      if (lock_.compare_exchange_strong(expected, 1)) return true;
      os::YieldThread();
    }
  }

  /// @brief: Marks the lock as free.
  void Release() { lock_.store(0); }

 private:
  std::atomic<int> lock_;

  /// @brief: Disable copiable and assignable ability.
  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
};
class KernelEvent {
public:
KernelEvent() { evt_ = os::CreateOsEvent(true, true); }
~KernelEvent() { os::DestroyOsEvent(evt_); }
bool IsSet() { return os::WaitForOsEvent(evt_, 0)==0; }
bool WaitForSet() { return os::WaitForOsEvent(evt_, 0xFFFFFFFF)==0; }
void Set() { os::SetOsEvent(evt_); }
void Reset() { os::ResetOsEvent(evt_); }
private:
os::EventHandle evt_;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(KernelEvent);
};
/// @brief: represents a yielding shared mutex.
/// aka read/write mutex
class KernelSharedMutex {
public:
/// @brief: Interfaces ScopedAcquire to shared operations.
class Shared {
public:
explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
bool Try() { return lock_->TryShared(); }
bool Acquire() { return lock_->AcquireShared(); }
void Release() { lock_->ReleaseShared(); }
private:
KernelSharedMutex* lock_;
};
KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
// Exclusive mode operations
bool Try() { return os::TryAcquireSharedMutex(lock_); }
bool Acquire() { return os::AcquireSharedMutex(lock_); }
void Release() { os::ReleaseSharedMutex(lock_); }
// Shared mode operations
bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
// Return shared operations interface
Shared shared() { return Shared(this); }
private:
os::SharedMutex lock_;
/// @brief: Disable copiable and assignable ability.
DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
};
/// @brief: Type trait to identify mutex types
/// `value` is false for every type except the mutex classes specialized
/// below.
template <class T> struct isMutex {
  enum { value = false };
};

template <> struct isMutex<HybridMutex> {
  enum { value = true };
};

template <> struct isMutex<KernelMutex> {
  enum { value = true };
};

template <> struct isMutex<SpinMutex> {
  enum { value = true };
};

template <> struct isMutex<KernelSharedMutex> {
  enum { value = true };
};
/// @brief: Holds a lock for the duration of a scope. To enter a critical
/// section, create an object of this class; when control leaves the scope the
/// lock is released automatically.
template <class LockType> class ScopedAcquire {
 public:
  /// @brief: When constructing, acquire the lock.
  /// @param: lock(Input), pointer to an existing lock.
  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
    lock_.Acquire();
  }

  /// @brief: Variant for lock interface objects passed by value (types that
  /// are not one of the mutex classes).
  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
    lock_.Acquire();
  }

  /// @brief: when destructing, release the lock.
  ~ScopedAcquire() {
    if (!doRelease) return;
    lock_.Release();
  }

  /// @brief: Release the lock early. Avoid using when possible.
  void Release() {
    lock_.Release();
    // The destructor must not release a second time.
    doRelease = false;
  }

 private:
  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
  /// Primary template: keeps a pointer to the mutex.
  template <class T, bool B> class container {
   public:
    container(T* lock) : target_(lock) {}
    __forceinline bool Acquire() { return target_->Acquire(); }
    __forceinline void Release() { target_->Release(); }

   private:
    T* target_;
  };

  /// @brief: Specialization for mutex pointer types.
  /// Keeps its own copy of the lock interface object.
  template <class T> class container<T, false> {
   public:
    container(T lock) : target_(lock) {}
    __forceinline bool Acquire() { return target_.Acquire(); }
    __forceinline void Release() { target_.Release(); }

   private:
    T target_;
  };

  container<LockType, isMutex<LockType>::value> lock_;
  bool doRelease;

  /// @brief: Disable copiable and assignable ability.
  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
};
} // namespace wsl
#endif // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
@@ -0,0 +1,327 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// Minimal operating system abstraction interfaces.
#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
#define HSA_RUNTIME_CORE_UTIL_OS_H_
#include <string>
#include <vector>
#include "utils.h"
namespace wsl {
namespace os {
/// Opaque handle types used by this interface. Each one is an alias of void*;
/// what a handle refers to is not visible in this header.
typedef void* LibHandle;
typedef void* Semaphore;
typedef void* Mutex;
typedef void* SharedMutex;
typedef void* Thread;
typedef void* EventHandle;
/// Operating systems known to this layer. COUNT follows the last real entry,
/// so its underlying value equals the number of operating systems listed.
enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
/// @brief: Converts an os_t enumerator to its underlying integer value.
/// @param: val(Input), enumerator to convert.
/// @return: the value of val expressed in the underlying type of os_t.
static __forceinline std::underlying_type<os_t>::type os_index(os_t val) {
  using index_type = std::underlying_type<os_t>::type;
  return static_cast<index_type>(val);
}
/// @brief: Operating system this translation unit is compiled for, selected
/// from the _WIN32 and __linux__ predefined macros. Compilation fails with a
/// static_assert when neither macro identifies the target.
#ifdef _WIN32
static const os_t current_os = os_t::OS_WIN;
#elif __linux__
static const os_t current_os = os_t::OS_LINUX;
#else
static_assert(false, "Operating System not detected!");
#endif
/// @brief: Loads a dynamic library based on its file name. Return value will
/// be NULL if failed.
/// @param: filename(Input), file name of the library.
/// @return: LibHandle.
LibHandle LoadLib(std::string filename);
/// @brief: Gets the address of an exported symbol. Returns NULL if failed.
/// @param: lib(Input), handle of the library to export from.
/// @param: export_name(Input), the name of the exported symbol.
/// @return: void*.
void* GetExportAddress(LibHandle lib, std::string export_name);
/// @brief: Unloads the dynamic library.
/// @param: lib(Input), library handle which will be unloaded.
void CloseLib(LibHandle lib);
/// @brief: Lists loaded tool libraries that contain
/// symbol HSA_AMD_TOOL_PRIORITY
/// @return: List of library handles
std::vector<LibHandle> GetLoadedToolsLib();
/// @brief: Returns the library's path name.
/// @param: lib(Input), library handle
/// @return: Path name of library
std::string GetLibraryName(LibHandle lib);
/// @brief: Creates a Semaphore, will return NULL if failed.
/// @param: void.
/// @return: Semaphore.
Semaphore CreateSemaphore();
/// @brief: Waits for the semaphore. This is a blocking wait.
/// If the Semaphore is signalled, this function will return.
/// @param: sem(Input), handle to the semaphore.
/// @return: bool. NOTE(review): this header does not say what the value
/// means; presumably true when the wait succeeded - confirm against the
/// implementation.
bool WaitSemaphore(Semaphore sem);
/// @brief: Post/Signal/Wake-up the semaphore
/// @param: sem(Input), handle to the semaphore.
/// @return: void.
void PostSemaphore(Semaphore sem);
/// @brief: Destroys the semaphore.
/// @param: sem(Input), handle to the semaphore.
/// @return: void.
void DestroySemaphore(Semaphore sem);
/// @brief: Creates a mutex, will return NULL if failed.
/// @param: void.
/// @return: Mutex.
Mutex CreateMutex();
/// @brief: Tries to acquire the mutex once; returns true if it succeeded.
/// @param: lock(Input), handle to the mutex.
/// @return: bool.
bool TryAcquireMutex(Mutex lock);
/// @brief: Acquires the mutex. If the mutex is locked, it will wait until it
/// is released. If the mutex is acquired successfully, it will return true.
/// @param: lock(Input), handle to the mutex.
/// @return: bool.
bool AcquireMutex(Mutex lock);
/// @brief: Releases the mutex.
/// @param: lock(Input), handle to the mutex.
/// @return: void.
void ReleaseMutex(Mutex lock);
/// @brief: Destroys the mutex.
/// @param: lock(Input), handle to the mutex.
/// @return: void.
void DestroyMutex(Mutex lock);
/// @brief: Creates a shared mutex, will return NULL if failed.
/// @param: void.
/// @return: SharedMutex.
SharedMutex CreateSharedMutex();
/// @brief: Tries to acquire the mutex in exclusive mode once; returns true if
/// it succeeded.
/// @param: lock(Input), handle to the shared mutex.
/// @return: bool.
bool TryAcquireSharedMutex(SharedMutex lock);
/// @brief: Acquires the mutex in exclusive mode. If the mutex is locked, it
/// will wait until it is released. If the mutex is acquired successfully, it
/// will return true.
/// @param: lock(Input), handle to the mutex.
/// @return: bool.
bool AcquireSharedMutex(SharedMutex lock);
/// @brief: Releases the mutex from exclusive mode.
/// @param: lock(Input), handle to the mutex.
/// @return: void.
void ReleaseSharedMutex(SharedMutex lock);
/// @brief: Tries to acquire the mutex in shared mode once; returns true if it
/// succeeded.
/// @param: lock(Input), handle to the mutex.
/// @return: bool.
bool TrySharedAcquireSharedMutex(SharedMutex lock);
/// @brief: Acquires the mutex in shared mode. If the mutex is held in
/// exclusive mode, it will wait until it is released. If the mutex is
/// acquired successfully, it will return true.
/// @param: lock(Input), handle to the mutex.
/// @return: bool.
bool SharedAcquireSharedMutex(SharedMutex lock);
/// @brief: Releases the mutex from shared mode.
/// @param: lock(Input), handle to the mutex.
/// @return: void.
void SharedReleaseSharedMutex(SharedMutex lock);
/// @brief: Destroys the mutex.
/// @param: lock(Input), handle to the mutex.
/// @return: void.
void DestroySharedMutex(SharedMutex lock);
/// @brief: Puts current thread to sleep.
/// @param: delayInMs(Input), time in milliseconds for sleeping.
/// @return: void.
void Sleep(int delayInMs);
/// @brief: Puts current thread to sleep.
/// @param: delayInUs(Input), time for sleeping; the parameter name indicates
/// microseconds (the previous comment said milliseconds - confirm against the
/// implementation).
/// @return: void.
void uSleep(int delayInUs);
/// @brief: Yields current thread.
/// @param: void.
/// @return: void.
void YieldThread();
/// Signature of a thread entry function: takes a single void* argument and
/// returns nothing.
typedef void (*ThreadEntry)(void*);
/// @brief: Creates a thread; will return NULL if failed.
/// @param: entry_function(Input), a pointer to the function which the thread
/// starts from.
/// @param: entry_argument(Input), a pointer to the argument of the thread
/// function.
/// @param: stack_size(Input), size of the thread's stack, 0 by default.
/// @return: Thread, a handle to thread created.
Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
uint stack_size = 0);
/// @brief: Destroys the thread.
/// @param: thread(Input), handle of the thread to be destroyed.
/// @return: void.
void CloseThread(Thread thread);
/// @brief: Waits for a specific thread to finish; returns true if successful.
/// @param: thread(Input), handle of the thread to wait for.
/// @return: bool.
bool WaitForThread(Thread thread);
/// @brief: Waits for multiple threads to finish; returns true if successful.
/// @param: threads(Input), a pointer to a list of thread handles.
/// @param: thread_count(Input), number of threads to be waited on.
/// @return: bool.
bool WaitForAllThreads(Thread* threads, uint thread_count);
/// @brief: Determines if environment key is set.
/// @param: env_var_name(Input), name of the environment variable.
/// @return: bool, true for binding any value to environment key,
/// including an empty string. False otherwise
bool IsEnvVarSet(std::string env_var_name);
/// @brief: Sets the value of an environment variable.
/// @param: env_var_name(Input), name of the environment variable.
/// @param: env_var_value(Input), value to assign to the environment variable.
/// @return: void.
void SetEnvVar(std::string env_var_name, std::string env_var_value);
/// @brief: Gets the value of an environment variable.
/// @param: env_var_name(Input), name of the environment variable.
/// @return: std::string, value of the environment variable, returned as string.
std::string GetEnvVar(std::string env_var_name);
/// @brief: Gets the process ID.
/// @param: void
/// @return: int, process ID returned as int.
int GetProcessId();
/// @brief: Gets the max virtual memory size accessible to the application.
/// @param: void.
/// @return: size_t, size of the accessible memory to the application.
size_t GetUserModeVirtualMemorySize();
/// @brief: Gets the max physical host system memory size.
/// @param: void.
/// @return: size_t, size of the physical host system memory.
size_t GetUsablePhysicalHostMemorySize();
/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
/// @param: void.
/// @return: uintptr_t, always 0.
uintptr_t GetUserModeVirtualMemoryBase();
/// @brief os event api, create an event
/// @param: auto_reset whether an event can reset the status automatically
/// @param: init_state initial state of the event
/// @return: event handle
EventHandle CreateOsEvent(bool auto_reset, bool init_state);
/// @brief os event api, destroy an event
/// @param: event Event handle
/// @return: whether the destroy succeeded. NOTE(review): the int encoding of
/// success/failure is not visible in this header - confirm against the
/// implementation.
int DestroyOsEvent(EventHandle event);
/// @brief os event api, wait on event
/// @param: event Event handle
/// @param: milli_seconds wait time
/// @return: Indicate success or timeout
int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
/// @brief os event api, set event state
/// @param: event Event handle
/// @return: Whether the event was set correctly
int SetOsEvent(EventHandle event);
/// @brief os event api, reset event state
/// @param: event Event handle
/// @return: Whether the event was reset correctly
int ResetOsEvent(EventHandle event);
/// @brief reads a clock which is deemed to be accurate for elapsed time
/// measurements, though not necessarily fast to query
/// @return clock counter value
uint64_t ReadAccurateClock();
/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
/// It does not necessarily reflect the resolution of the clock, but is the
/// value needed to convert a difference in the clock's counter value to elapsed
/// seconds. This frequency does not change at runtime.
/// @return returns the frequency
uint64_t AccurateClockFrequency();
/// @brief read the system clock which serves as the HSA system clock
/// counter in KFD.
/// @return clock counter value
uint64_t ReadSystemClock();
/// @brief read the system clock frequency
/// @return returns the frequency
uint64_t SystemClockFrequency();
/// @brief CPU identification data filled in by ParseCpuID.
typedef struct cpuid_s {
char ManufacturerID[13]; // 12 char, NULL terminated
bool mwaitx; // NOTE(review): presumably whether the MWAITX instruction is available - confirm
} cpuid_t;
/// @brief parse CPUID
/// @param: cpuinfo struct to be filled
/// @return: bool. NOTE(review): meaning not stated in this header; presumably
/// true when cpuinfo was filled in - confirm against the implementation.
bool ParseCpuID(cpuid_t* cpuinfo);
} // namespace os
} // namespace wsl
#endif // HSA_RUNTIME_CORE_UTIL_OS_H_
@@ -0,0 +1,394 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// A simple best fit memory allocator with eager compaction. Manages block sub-allocation.
// For use when memory efficiency is more important than allocation speed.
// O(log n) time.
#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
#include <map>
#include <deque>
#include <utility>
namespace wsl {
/// @brief: Best fit sub-allocator that carves fragments out of blocks obtained
/// from a block allocator.
///
/// The Allocator type is used as follows by this class:
///   - alloc(bytes, size): returns a block of at least bytes bytes (or nullptr)
///     and is expected to store the real block length in its second argument,
///     which is read back after the call.
///   - free(ptr, length): releases a block.
///   - block_size(): default block size, callable on a const allocator.
///
/// Blocks that become completely free are kept in a cache for reuse unless
/// they were marked with discardBlock(). No locking is performed here.
template <typename Allocator> class SimpleHeap {
 private:
  // One contiguous address range inside a block, either allocated or free.
  struct Fragment_T {
    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
    // Entry in free_list_ describing this fragment, or free_list_.end() if it has none.
    ptr_t free_list_entry_;
    struct {
      size_t size : 62;  // Fragment length in bytes.
      bool discard : 1;  // Fragment belongs to a block marked by discardBlock().
      bool free : 1;     // Fragment is not currently allocated.
    };

    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
    Fragment_T() = default;
  };

  // Base address and length of a whole block held in the block cache.
  struct Block {
    uintptr_t base_ptr_;
    size_t length_;

    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
    Block() = default;
  };

  Allocator block_allocator_;

  // Free fragments available for allocation: size -> fragment address.
  std::multimap<size_t, uintptr_t> free_list_;

  // Block base address -> (fragment address -> fragment) for every tracked block.
  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;

  // Completely free blocks kept for reuse; newest at the back.
  std::deque<Block> block_cache_;

  // Size of blocks that are at least partially in use (discarded blocks are excluded).
  size_t in_use_size_;

  // Total size of block cache
  size_t cache_size_;

  __forceinline bool isFree(const Fragment_T& node) { return node.free; }

  __forceinline void setUsed(Fragment_T& node) {
    node.free = false;
    node.free_list_entry_ = free_list_.end();
  }

  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
    node.free_list_entry_ = Iterator;
    node.free = true;
  }

  // Builds an allocated fragment of Len bytes.
  __forceinline Fragment_T makeFragment(size_t Len) {
    return Fragment_T(free_list_.end(), Len, false);
  }

  // Builds a free fragment of Len bytes whose free list entry is Iterator.
  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
    return Fragment_T(Iterator, Len, true);
  }

  // Erases the fragment's free list entry, if it has one.
  __forceinline void removeFreeListEntry(Fragment_T& node) {
    if (node.free_list_entry_ != free_list_.end()) {
      free_list_.erase(node.free_list_entry_);
      node.free_list_entry_ = free_list_.end();
    }
  }

  // Marks a fragment as discarded and withdraws it from the free list.
  __forceinline void discard(Fragment_T& node) {
    removeFreeListEntry(node);
    node.discard = true;
  }

 public:
  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}

  // Releases cached blocks only; blocks that still hold allocations are not freed here.
  ~SimpleHeap() {
    trim();
    // Leak here may be due to the user.  Check is for debugging only.
    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
  }

  SimpleHeap(const SimpleHeap& rhs) = delete;
  SimpleHeap(SimpleHeap&& rhs) = delete;
  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;

  /// @brief: Allocates bytes bytes.
  ///
  /// When served from an existing free fragment, requests of at least
  /// GPU_HUGE_PAGE_SIZE are placed on a GPU_HUGE_PAGE_SIZE boundary and smaller
  /// requests on a DEFAULT_GPU_PAGE_SIZE boundary. Memory taken from a cached
  /// or newly allocated block is returned at the block base without an
  /// alignment check.
  ///
  /// @param: bytes(Input), number of bytes requested.
  /// @return: address of the allocation, or nullptr when bytes is 0 or the
  /// block allocator returned nullptr.
  void* alloc(size_t bytes) {
    // A zero-length fragment would share its address with the free remainder
    // and corrupt the bookkeeping, so reject it up front.
    if (bytes == 0) return nullptr;

    // Find best fit.
    uintptr_t base;
    size_t size;

    // For bytes >= 2MB, the requested mem should be aligned
    size_t align_bytes = bytes;
    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;
    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;

    for (int i = 0; i <= retry; i++) {
      auto free_fragment = free_list_.lower_bound(align_bytes);
      if (free_fragment == free_list_.end()) break;

      uintptr_t addr = free_fragment->second;
      size = free_fragment->first;

      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");

      // Find the containing block and fragment
      auto it = block_list_.upper_bound(addr);
      it--;
      auto& frag_map = it->second;
      auto fragment = frag_map.find(addr);
      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");

      // Offset of addr past the previous alignment boundary.
      size_t delta = addr & (align - 1);
      if (!delta) {
        // already find aligned address
        base = addr;
        free_list_.erase(free_fragment);

        // Sub-allocate from fragment.
        fragment->second.size = bytes;
        setUsed(fragment->second);

        // Record remaining free space.
        if (size > bytes) {
          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
        }
      } else {
        // If this is the first request and the requested size is not enough for alignment,
        // then request for a bigger hole and do trim.
        if (i == 0 && size < bytes + align - delta) {
          align_bytes += align;
          continue;
        }

        uintptr_t aligned_base = addr + align - delta;
        base = aligned_base;

        // Erase the old free list
        free_list_.erase(free_fragment);

        // fragment 1 - free (the padding in front of the aligned address)
        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);

        // fragment 2 - used
        frag_map[base] = makeFragment(bytes);

        // fragment 3 - free (whatever is left behind the allocation)
        if (size > aligned_base - addr + bytes) {
          free_fragment = free_list_.insert(
              std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
          frag_map[aligned_base + bytes] =
              makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
        }
      }
      return reinterpret_cast<void*>(base);
    }

    // No usable fragment, check block cache
    if (bytes < default_block_size() && !block_cache_.empty()) {
      const auto& block = block_cache_.back();
      base = block.base_ptr_;
      size = block.length_;
      block_cache_.pop_back();
      cache_size_ -= size;
    } else {  // Alloc new block - new block may be larger than default.
      void* ptr = block_allocator_.alloc(bytes, size);
      if (ptr == nullptr) {
        fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n");
        return nullptr;
      }
      base = reinterpret_cast<uintptr_t>(ptr);
    }

    in_use_size_ += size;
    assert(size >= bytes && "Alloc exceeds block size.");

    // Sub alloc and insert free region.
    if (size > bytes) {
      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
    }

    // Track used region
    block_list_[base][base] = makeFragment(bytes);

    // Disallow multiple suballocation from large blocks.
    // Prevents a small allocation from retaining a large block.
    if (bytes > default_block_size()) {
      bool err = discardBlock(reinterpret_cast<void*>(base));
      assert(err && "Large block discard failed.");
      (void)err;  // Only read by the assert above.
    }

    return reinterpret_cast<void*>(base);
  }

  /* Return block-base the ptr belongs to if the ptr is a valid ptr which is allocated
   * from this simpleheap and the block-base is allocated from block_allocator_.
   * Returns nullptr when ptr is null, is not the start of a fragment, or the
   * fragment is free. */
  void* block_base(void* ptr) {
    if (ptr == nullptr)
      return nullptr;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find fragment and validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin())
      return nullptr;
    frag_map_it--;
    auto& frag_map = frag_map_it->second;
    auto fragment = frag_map.find(base);
    if (fragment == frag_map.end() || isFree(fragment->second))
      return nullptr;

    return reinterpret_cast<void*>(frag_map_it->first);
  }

  // Forgets all bookkeeping. The block allocator's free() is not called, so
  // any blocks still tracked (in use or cached) are not returned by this call.
  void reset() {
    free_list_.clear();
    block_list_.clear();
    block_cache_.clear();
    in_use_size_ = 0;
    cache_size_ = 0;
  }

  /// @brief: Frees an allocation previously returned by alloc().
  /// @param: ptr(Input), start address of the allocation; nullptr is accepted.
  /// @return: true on success (or when ptr is nullptr); false when ptr is not
  /// the start of an allocated fragment of this heap.
  bool free(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find fragment and validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    frag_map_it--;
    auto& frag_map = frag_map_it->second;
    auto fragment = frag_map.find(base);
    if (fragment == frag_map.end() || isFree(fragment->second)) return false;

    bool discard = fragment->second.discard;

    // Merge lower
    if (fragment != frag_map.begin()) {
      auto lower = fragment;
      lower--;
      if (isFree(lower->second)) {
        removeFreeListEntry(lower->second);
        lower->second.size += fragment->second.size;
        frag_map.erase(fragment);
        fragment = lower;
      }
    }

    // Merge upper
    {
      auto upper = fragment;
      upper++;
      if ((upper != frag_map.end()) && isFree(upper->second)) {
        removeFreeListEntry(upper->second);
        fragment->second.size += upper->second.size;
        frag_map.erase(upper);
      }
    }

    // Release whole free blocks.
    if (frag_map.size() == 1) {
      Block block(fragment->first, fragment->second.size);
      block_list_.erase(frag_map_it);

      // Discard or add to the block cache.
      if (discard) {
        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
      } else {
        block_cache_.push_back(block);
        cache_size_ += block.length_;
        in_use_size_ -= block.length_;
      }
      balance();

      // Don't publish free space since block was moved to the cache.
      return true;
    }

    // Don't report free memory if discarding the fragment. The fragment must
    // still be flagged free (without a free list entry): otherwise neighbours
    // freed later cannot merge with it, the block never collapses to a single
    // fragment and is never handed back to the block allocator.
    if (discard) {
      setFree(fragment->second, free_list_.end());
      return true;
    }

    // Report free fragment
    const auto freeEntry =
        free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
    setFree(fragment->second, freeEntry);

    return true;
  }

  // Shrinks the block cache, oldest block first, while it holds more than one
  // block and its size exceeds twice the in-use size.
  void balance() {
    // Release old blocks when over cache limit.
    while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
      const auto& block = block_cache_.front();
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
      cache_size_ -= block.length_;
      block_cache_.pop_front();
    }
  }

  // Returns every cached block to the block allocator.
  void trim() {
    for (const auto& block : block_cache_)
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
    block_cache_.clear();
    cache_size_ = 0;
  }

  size_t cache_size() const { return cache_size_; }

  size_t default_block_size() const { return block_allocator_.block_size(); }

  // Prevent reuse of the block containing ptr.  No further fragments will be allocated from the
  // block and the block will not be added to the block cache when it is free.
  // Returns true on success (also for nullptr or an already discarded block),
  // false when ptr does not fall inside a tracked block.
  bool discardBlock(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find block validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    frag_map_it--;
    auto& frag_map = frag_map_it->second;
    if ((base < frag_map.begin()->first) ||
        (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base))
      return false;

    // Is block already discarded?
    if (frag_map.begin()->second.discard) return true;

    // Mark all fragments for discard and compute block size.  Removes freelist records for all
    // fragments in the block.
    size_t size = 0;
    for (auto& frag : frag_map) {
      discard(frag.second);
      size += frag.second.size;
    }

    // Remove discarded block from in-use tracking and rebalance the block cache.
    in_use_size_ -= size;
    balance();

    return true;
  }
};
} // namespace wsl
#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
@@ -0,0 +1,185 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "small_heap.h"
namespace wsl {
// Links node into the freelist directly behind place.
// place must already be on the freelist and must sort before node. Guard
// nodes at both ends of the list guarantee that place always has a successor.
void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) {
  assert(place->first < node->first && "Order violation");
  assert(isfree(place->second) && "Freelist operation error.");

  Node& anchor = place->second;
  Node& inserted = node->second;
  const iterator_t successor = anchor.next;

  // Point the new node at both neighbours, then splice it in between them.
  inserted.prior = place;
  inserted.next = successor;
  successor->second.prior = node;
  anchor.next = node;
}
// Unlinks node from the freelist and marks it as used.
// node must be a free node that is not one of the list's guard nodes, so both
// of its neighbours exist.
void SmallHeap::remove(SmallHeap::iterator_t node) {
  assert(isfree(node->second) && "Freelist operation error.");

  Node& self = node->second;
  const iterator_t before = self.prior;
  const iterator_t after = self.next;

  // Bridge the two neighbours over the removed node.
  before->second.next = after;
  after->second.prior = before;

  setused(self);
}
// Fuses two free nodes when high starts exactly where low ends.
// Returns low (now covering both ranges) after a successful merge; returns
// high unchanged when the two ranges are not adjacent.
SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low,
                                               SmallHeap::memory_t::iterator high) {
  assert(isfree(low->second) && "Merge with allocated block");
  assert(isfree(high->second) && "Merge with allocated block");

  char* const low_end = (char*)low->first + low->second.len;
  const bool adjacent = (low_end == (char*)high->first);
  if (!adjacent) return high;

  assert(!islastfree(high->second) && "Illegal merge.");

  Node& lower = low->second;
  const Node& upper = high->second;

  // Absorb the upper range and take over its successor link.
  lower.len += upper.len;
  lower.next = upper.next;
  upper.next->second.prior = low;

  memory.erase(high);
  return low;
}
// Returns the allocation that starts at ptr to the heap and merges it with
// adjacent free space. Freeing nullptr is a no-op. A pointer that is not the
// start of a live allocation (unknown address, double free, or the upper guard
// node) trips the assert when asserts are enabled and is otherwise ignored.
void SmallHeap::free(void* ptr) {
  if (ptr == nullptr) return;

  auto iterator = memory.find(ptr);

  // Check for illegal free. A node that is already on the freelist must be
  // rejected as well: relinking it would corrupt the list and count its
  // length into total_free a second time.
  if (iterator == memory.end() || isfree(iterator->second)) {
    assert(false && "Illegal free.");
    return;
  }

  // Return memory to total and link node into free list
  total_free += iterator->second.len;

  // Walk back to the closest free node in address order; the lower guard node
  // is always free, so the walk terminates.
  // Could also traverse the free list which might be faster in some cases.
  auto before = iterator;
  before--;
  while (!isfree(before->second)) before--;
  assert(before->second.next->first > iterator->first && "Inconsistency in small heap.");
  insertafter(before, iterator);

  // Attempt compaction
  iterator = merge(before, iterator);
  merge(iterator, iterator->second.next);

  // Update the low/high boundary: ptr no longer counts as a high allocation.
  high.erase(ptr);
}
void* SmallHeap::alloc(size_t bytes) {
// Is enough memory available?
if ((bytes > total_free) || (bytes == 0)) return nullptr;
iterator_t current;
// Walk the free list and allocate at first fitting location
current = firstfree();
while (!islastfree(current->second)) {
if (bytes <= current->second.len) {
// Decrement from total
total_free -= bytes;
// Split node
if (bytes != current->second.len) {
void* remaining = (char*)current->first + bytes;
Node& node = memory[remaining];
node.len = current->second.len - bytes;
current->second.len = bytes;
insertafter(current, memory.find(remaining));
}
remove(current);
return current->first;
}
current = current->second.next;
}
assert(current->second.len == 0 && "Freelist corruption.");
// Can't service the request due to fragmentation
return nullptr;
}
void* SmallHeap::alloc_high(size_t bytes) {
// Is enough memory available?
if ((bytes > total_free) || (bytes == 0)) return nullptr;
iterator_t current;
// Walk the free list and allocate at first fitting location
current = lastfree();
while (!isfirstfree(current->second)) {
if (bytes <= current->second.len) {
// Decrement from total
total_free -= bytes;
void* alloc;
// Split node
if (bytes != current->second.len) {
alloc = (char*)current->first + current->second.len - bytes;
current->second.len -= bytes;
Node& node = memory[alloc];
node.len = bytes;
setused(node);
} else {
alloc = current->first;
remove(current);
}
high.insert(alloc);
return alloc;
}
current = current->second.prior;
}
assert(current->second.len == 0 && "Freelist corruption.");
// Can't service the request due to fragmentation
return nullptr;
}
} // namespace wsl
@@ -0,0 +1,131 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// A simple first fit memory allocator with eager compaction. For use with few
// items (where list iteration is faster than trees).
// Not thread safe!
#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
#include <map>
#include <set>
#include "utils.h"
namespace wsl {
// First fit allocator over a caller supplied address range [base, base + length).
// All bookkeeping lives in a map keyed by address; the heap never reads or
// writes the managed range itself in the code shown here.
class SmallHeap {
 private:
  struct Node;
  typedef std::map<void*, Node> memory_t;
  typedef memory_t::iterator iterator_t;

  // Bookkeeping for one address range. next/prior link free nodes in address
  // order; a used node is tagged by next == memory.begin().
  struct Node {
    size_t len;
    iterator_t next;
    iterator_t prior;
  };

  SmallHeap(const SmallHeap& rhs) = delete;
  SmallHeap& operator=(const SmallHeap& rhs) = delete;

  void* const pool;      // Start of the managed range.
  const size_t length;   // Size of the managed range in bytes.
  size_t total_free;     // Sum of the lengths of all free ranges.
  memory_t memory;       // Every range (used, free and the two guards) by address.
  std::set<void*> high;  // Addresses handed out by alloc_high plus the upper guard.

  // Node state queries and tags. The guards are recognised by memory.end():
  // the first free node has prior == end(), the last has next == end().
  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
  __forceinline void setused(Node& node) { node.next = memory.begin(); }

  // First/last real free node, i.e. the neighbours of the two guard nodes.
  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }

  void insertafter(iterator_t place, iterator_t node);
  void remove(iterator_t node);
  iterator_t merge(iterator_t low, iterator_t high);

 public:
  // Creates an empty heap that manages no memory.
  SmallHeap() : pool(nullptr), length(0), total_free(0) {}

  // Creates a heap managing length bytes starting at base. The freelist is
  // seeded with one node for the whole range, bracketed by zero-length guard
  // nodes at address 0 and at the all-ones address.
  SmallHeap(void* base, size_t length)
      : pool(base), length(length), total_free(length) {
    assert(pool != nullptr && "Invalid base address.");
    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");

    void* const upper_guard = (void*)0xFFFFFFFFFFFFFFFFull;

    Node& head = memory[0];
    Node& body = memory[pool];
    Node& tail = memory[upper_guard];

    // Lower guard: empty, nothing before it, followed by the pool node.
    head.len = 0;
    setfirstfree(head);
    head.next = memory.find(pool);

    // The pool itself, linked between the two guards.
    body.len = length;
    body.prior = memory.begin();
    body.next = --memory.end();

    // Upper guard: empty, nothing after it, preceded by the pool node.
    tail.len = 0;
    setlastfree(tail);
    tail.prior = head.next;

    high.insert(upper_guard);
  }

  void* alloc(size_t bytes);
  void* alloc_high(size_t bytes);
  void free(void* ptr);

  void* base() const { return pool; }
  size_t size() const { return length; }
  size_t remaining() const { return total_free; }

  // Lowest address recorded in the high set.
  void* high_split() const { return *high.begin(); }
};
} // namespace wsl
#endif
@@ -0,0 +1,111 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include "core/util/timer.h"
namespace wsl {
namespace timer {
// Queries the accurate clock's tick rate once and derives the length of one
// tick in nanoseconds from it.
accurate_clock::init::init() {
  accurate_clock::freq = os::AccurateClockFrequency();
  const double ticks_per_second = double(accurate_clock::freq);
  accurate_clock::period_ns = 1e9 / ticks_per_second;
}
// Calibrates the fast clock using the accurate clock: counts how many raw
// fast-clock ticks elapse across an interval timed with the accurate clock,
// then derives fast_clock::freq (ticks per second) and fast_clock::period_ps.
fast_clock::init::init() {
typedef accurate_clock clock;
// Busy-wait interval used per sample; doubled after every round.
clock::duration delay(std::chrono::milliseconds(1));
// calibrate clock
// Raw fast-clock ticks counted by the best sample accepted so far.
fast_clock::raw_rep min = 0;
// Accurate-clock time spanned by the best sample of the current round.
clock::duration elapsed;
do {
elapsed = clock::duration::max();
// Take 10 samples per round and keep the accepted one with the shortest span.
for (int t = 0; t < 10; t++) {
fast_clock::raw_rep r1, r2;
clock::time_point t0, t1, t2, t3;
// Bracket each fast-clock read (r1, r2) between two accurate-clock reads.
// The signal fences constrain compiler reordering of the reads.
t0 = clock::now();
std::atomic_signal_fence(std::memory_order_acq_rel);
r1 = fast_clock::raw_now();
std::atomic_signal_fence(std::memory_order_acq_rel);
t1 = clock::now();
std::atomic_signal_fence(std::memory_order_acq_rel);
// Spin until at least `delay` has passed on the accurate clock.
do {
t2 = clock::now();
} while (t2 - t1 < delay);
std::atomic_signal_fence(std::memory_order_acq_rel);
r2 = fast_clock::raw_now();
std::atomic_signal_fence(std::memory_order_acq_rel);
t3 = clock::now();
// If elapsed time is shorter than last recorded time and both the start
// and end times are confirmed correlated then record the clock readings.
// This protects against inaccuracy due to thread switching
// "Correlated" here means each bracketing gap (t1 - t0 and t3 - t2) is less
// than a tenth of the busy-wait span (t2 - t1).
if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
((t3 - t2) * 10 < (t2 - t1))) {
elapsed = t3 - t1;
min = r2 - r1;
}
}
delay += delay;
// Repeat with the longer delay until a sample of at least 1000 raw ticks has
// been accepted (min stays 0 while no sample passes the checks above).
} while (min < 1000);
fast_clock::freq = double(min) / duration_in_seconds(elapsed);
fast_clock::period_ps = 1e12 / fast_clock::freq;
// printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
// printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
}
// Storage for the clocks' static members. Constructing the two *_init objects
// runs the init constructors above.
// Keep the accurate_clock definitions ahead of fast_clock_init: the fast-clock
// calibration calls accurate_clock::now(), which multiplies by period_ns, so
// period_ns must already have been set by accurate_clock_init.
double accurate_clock::period_ns;
accurate_clock::raw_frequency accurate_clock::freq;
accurate_clock::init accurate_clock::accurate_clock_init;
double fast_clock::period_ps;
fast_clock::raw_frequency fast_clock::freq;
fast_clock::init fast_clock::fast_clock_init;
} // namespace timer
} // namespace wsl
@@ -0,0 +1,173 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_
#define HSA_RUNTIME_CORE_UTIL_TIMER_H_
#include "core/util/utils.h"
#include "core/util/os.h"
#include <chrono>
#include <time.h>
#include <type_traits>
namespace wsl {
namespace timer {
// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of
// VS 2013. wide_type picks the intermediate representation type used by the
// patched duration_cast below:
//   floating point representation    -> double (primary template)
//   signed integral representation   -> intmax_t
//   unsigned integral representation -> uintmax_t
template <bool isFloat, bool isSigned>
struct wide_type {
  using type = double;
};
template <>
struct wide_type<false, true> {
  using type = intmax_t;
};
template <>
struct wide_type<false, false> {
  using type = uintmax_t;
};
// Converts a duration to the duration type `To` in two steps: first to To's
// period using the wide representation chosen by wide_type, then the tick
// count is narrowed to To's representation with a static_cast.
template <typename To, typename Rep, typename Period>
static __forceinline To
duration_cast(const std::chrono::duration<Rep, Period>& d) {
  using wide = typename wide_type<std::is_floating_point<Rep>::value,
                                  std::is_signed<Rep>::value>::type;
  using widened_target = std::chrono::duration<wide, typename To::period>;
  const widened_target converted = std::chrono::duration_cast<widened_target>(d);
  return To(static_cast<typename To::rep>(converted.count()));
}
// End patch
// Returns the length of `delta` as a floating point number of seconds.
template <typename Rep, typename Period>
static __forceinline double duration_in_seconds(
    std::chrono::duration<Rep, Period> delta) {
  using fp_seconds = std::chrono::duration<double, std::ratio<1, 1>>;
  const fp_seconds as_seconds(delta);
  return as_seconds.count();
}
// Converts a floating point number of seconds into the duration type `rep`
// using std::chrono::duration_cast.
template <typename rep>
static __forceinline rep duration_from_seconds(double delta) {
  using fp_seconds = std::chrono::duration<double, std::ratio<1, 1>>;
  const fp_seconds as_seconds(delta);
  return std::chrono::duration_cast<rep>(as_seconds);
}
// Provides a C++11 standard clock interface to the os::AccurateClock functions
class accurate_clock {
 public:
  using rep = double;
  using period = std::nano;
  using duration = std::chrono::duration<rep, period>;
  using time_point = std::chrono::time_point<accurate_clock>;

  static const bool is_steady = true;

  // Current time: the raw tick count scaled by the tick period in nanoseconds.
  static __forceinline time_point now() {
    const duration since_epoch(raw_now() * period_ns);
    return time_point(since_epoch);
  }

  // These two extra APIs and types let us use clocks without conversion to the
  // arbitrary period unit
  using raw_rep = uint64_t;
  using raw_frequency = uint64_t;
  static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); }
  static __forceinline raw_frequency raw_freq() { return freq; }

 private:
  // Length of one raw tick in nanoseconds and the raw tick rate.
  static double period_ns;
  static raw_frequency freq;

  // Helper whose constructor fills in the static state above.
  class init {
   public:
    init();
  };
  static init accurate_clock_init;
};
// Provides a C++11 standard clock interface to the lowest latency approximate
// clock
class fast_clock {
 public:
  typedef double rep;
  typedef std::pico period;
  typedef std::chrono::duration<rep, period> duration;
  typedef std::chrono::time_point<fast_clock> time_point;
  static const bool is_steady = true;
  // Current time: the raw tick count scaled by the tick period in picoseconds.
  static __forceinline time_point now() {
    return time_point(duration(raw_now() * period_ps));
  }
  // These two extra APIs and types let us use clocks without conversion to the
  // arbitrary period unit
  typedef uint64_t raw_rep;
  typedef double raw_frequency;
#if defined(__x86_64__) || defined(_M_X64)
  // x86-64: raw ticks come from __rdtsc(); the tick rate is the `freq` value
  // stored by the init constructor.
  static __forceinline raw_rep raw_now() { return __rdtsc(); }
  static __forceinline raw_frequency raw_freq() { return freq; }
#else
  // Other targets: raw ticks are CLOCK_MONOTONIC_RAW expressed in nanoseconds.
  static __forceinline raw_rep raw_now() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
    return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec));
  }
  // One raw tick is one nanosecond, so the raw frequency is 1e9 ticks per
  // second. (The previous value, 1e-9, was the tick period in seconds.)
  static __forceinline raw_frequency raw_freq() { return 1.e9; }
#endif
 private:
  // Length of one raw tick in picoseconds and the raw tick rate.
  static double period_ps;
  static raw_frequency freq;
  // Helper whose constructor fills in the static state above.
  class init {
   public:
    init();
  };
  static init fast_clock_init;
};
} // namespace timer
} // namespace wsl
#endif
@@ -0,0 +1,389 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// Generally useful utility functions
#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_
#define HSA_RUNTIME_CORE_UTIL_UTILS_H_
#include "stdint.h"
#include "stddef.h"
#include "stdlib.h"
#include "stdarg.h"
#include "unistd.h"
#include <assert.h>
#include <iostream>
#include <string>
#include <algorithm>
#include <sstream>
#include <thread>
namespace wsl {
extern FILE* log_file;
extern uint8_t log_flags[8];
typedef unsigned int uint;
typedef uint64_t uint64;
#if defined(__GNUC__)
#if defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#endif
// 2MB huge page size
#define GPU_HUGE_PAGE_SIZE (2 << 20)
// 4KB page size
#define DEFAULT_GPU_PAGE_SIZE (1 << 12)
#define __forceinline __inline__ __attribute__((always_inline))
#define __declspec(x) __attribute__((x))
#undef __stdcall
#define __stdcall // __attribute__((__stdcall__))
#define __ALIGNED__(x) __attribute__((aligned(x)))
void log_printf(const char* file, int line, const char* format, ...);
// GCC-side stand-in for MSVC's _aligned_malloc: returns `size` bytes aligned
// to `alignment`, or NULL when the allocation fails. Note the argument order
// is (size, alignment), the reverse of aligned_alloc/posix_memalign.
static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {
#ifdef _ISOC11_SOURCE
  void* const block = aligned_alloc(alignment, size);
  return block;
#else
  void* block = NULL;
  const int status = posix_memalign(&block, alignment, size);
  if (status != 0) return NULL;
  return block;
#endif
}
// Releases memory obtained from _aligned_malloc above.
static __forceinline void _aligned_free(void* ptr) {
  free(ptr);
}
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include "intrin.h"
#define __ALIGNED__(x) __declspec(align(x))
#if (_MSC_VER < 1800) // < VS 2013
// Supplies strtoull for MSVC versions older than VS 2013 by forwarding to
// _strtoui64.
static __forceinline unsigned long long int strtoull(const char* str,
                                                     char** endptr, int base) {
  const auto parsed = _strtoui64(str, endptr, base);
  return static_cast<unsigned long long>(parsed);
}
#endif
#if (_MSC_VER < 1900) // < VS 2015
#define thread_local __declspec(thread)
#endif
#else
#error "Compiler and/or processor not identified."
#endif
// STRING(x) turns x into a string literal after macro-expanding it; STRING2 is
// the extra level of indirection that makes the expansion happen first.
#define STRING2(x) #x
#define STRING(x) STRING2(x)
// PASTE(x, y) joins two tokens after macro-expanding them (PASTE2 is the
// indirection level), e.g. to build unique names with __COUNTER__.
#define PASTE2(x, y) x##y
#define PASTE(x, y) PASTE2(x, y)
// __FILE__ with everything up to and including the last '/' removed.
// NOTE(review): relies on strrchr being declared; this header does not
// include <string.h> itself - confirm an including file provides it.
#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
// Calls wsl::log_printf with the current file name and line when `flag` is set
// in log_flags (as tested by hsa_flag_isset64).
// NOTE(review): the macro body already ends in ';' after `while (false)`, so
// `LogPrint(...);` expands to two statements and cannot be used as the sole
// body of an `if` that is followed by `else`.
#define LogPrint(flag, format, ...) \
do { \
if (hsa_flag_isset64(log_flags, flag)) \
wsl::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__); \
} while (false);
// A macro to disallow the copy and move constructor and operator= functions.
// Place it inside the class definition of TypeName; it declares the copy
// constructor, move constructor, copy assignment and move assignment as
// deleted.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&) = delete; \
TypeName(TypeName&&) = delete; \
void operator=(const TypeName&) = delete; \
void operator=(TypeName&&) = delete;
/// @brief: Runs a callable when the guard is destroyed, unless the guard has
/// been dismissed. Copying or assigning from another guard transfers the
/// pending call: the source guard is dismissed and the destination takes over
/// its callable and dismissed state.
template <typename lambda>
class ScopeGuard {
 public:
  /// @param: release(Input), callable to invoke on destruction.
  explicit __forceinline ScopeGuard(const lambda& release)
      : release_(release), dismiss_(false) {}

  // Takes over rhs's pending call. The members are copy-constructed directly
  // rather than default-constructed and assigned, so this also works for
  // callables that are not default-constructible or assignable (such as
  // closure types).
  ScopeGuard(ScopeGuard& rhs)
      : release_(rhs.release_), dismiss_(rhs.dismiss_) {
    rhs.dismiss_ = true;
  }

  __forceinline ~ScopeGuard() {
    if (!dismiss_) release_();
  }

  // Takes over rhs's pending call. The callable currently held by this guard
  // is replaced without being invoked. Self-assignment is a no-op; without the
  // check it would dismiss the guard.
  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
    if (this != &rhs) {
      dismiss_ = rhs.dismiss_;
      release_ = rhs.release_;
      rhs.dismiss_ = true;
    }
    return *this;
  }

  // Cancels the pending call.
  __forceinline void Dismiss() { dismiss_ = true; }

 private:
  lambda release_;
  bool dismiss_;
};
// Builds a ScopeGuard around `rel`, deducing the callable's type.
template <typename lambda>
static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
return ScopeGuard<lambda>(rel);
}
// Declares a local variable `lname` holding the callable passed as the
// variadic arguments and a ScopeGuard named `sname` that wraps it.
// ScopeGuard is named unqualified, so the macros must be used where that name
// is visible.
#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
auto lname = __VA_ARGS__; \
ScopeGuard<decltype(lname)> sname(lname);
// Declares an anonymous guard; both variable names are made unique with
// __COUNTER__.
#define MAKE_SCOPE_GUARD(...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
// Declares a guard called `name`, so that it can be referred to later (for
// example to call Dismiss()).
#define MAKE_NAMED_SCOPE_GUARD(name, ...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
__VA_ARGS__)
/// @brief: Returns the smaller of two inputs; the input type must support the
/// ">" operator. When a is not greater than b, a is returned.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @return: T.
template <class T>
static __forceinline T Min(const T& a, const T& b) {
  if (a > b) return b;
  return a;
}
/// @brief: Variadic form; reduces three or more inputs with the two-input Min,
/// starting from the right-most pair.
template <class T, class... Arg>
static __forceinline T Min(const T& a, const T& b, Arg... args) {
  const T smallest_of_rest = Min(b, args...);
  return Min(a, smallest_of_rest);
}
/// @brief: Returns the larger of two inputs; the input type must support the
/// ">" operator. When b is not greater than a, a is returned.
/// @param: a(Input), a reference to type T.
/// @param: b(Input), a reference to type T.
/// @return: T.
template <class T>
static __forceinline T Max(const T& a, const T& b) {
  if (b > a) return b;
  return a;
}
/// @brief: Variadic form; reduces three or more inputs with the two-input Max,
/// starting from the right-most pair.
template <class T, class... Arg>
static __forceinline T Max(const T& a, const T& b, Arg... args) {
  const T largest_of_rest = Max(b, args...);
  return Max(a, largest_of_rest);
}
/// @brief: Function object that frees an object previously allocated with
/// new by applying `delete` to the pointer it is called with.
/// @param: ptr(Input), a pointer to memory space. Can't be NULL.
/// @return: void.
struct DeleteObject {
template <typename T>
void operator()(const T* ptr) const {
delete ptr;
}
};
/// @brief: Checks whether a value is a power of two by testing that clearing
/// its lowest set bit leaves zero. Be careful when passing 0: the test also
/// returns true for 0.
/// @param: val(Input), the data to be checked.
/// @return: bool.
template <typename T>
static __forceinline bool IsPowerOfTwo(T val) {
  return !(val & (val - 1));
}
/// @brief: Rounds a value down to the nearest multiple of alignment.
/// If value is already a multiple of alignment, it is unchanged.
/// @param: value(Input), value to be calculated.
/// @param: alignment(Input), alignment value.
/// @return: T.
template <typename T>
static __forceinline T AlignDown(T value, size_t alignment) {
  const auto whole_units = value / alignment;
  return (T)(whole_units * alignment);
}
/// @brief: Pointer overload of AlignDown: rounds the address held by the
/// pointer down to the nearest multiple of alignment.
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value.
/// @return: T*, pointer to type T.
template <typename T>
static __forceinline T* AlignDown(T* value, size_t alignment) {
  const intptr_t address = (intptr_t)value;
  return (T*)AlignDown(address, alignment);
}
/// @brief: Rounds a value up to the nearest multiple of alignment by adding
/// (alignment - 1) and rounding the sum down.
/// If value is already a multiple of alignment, it is unchanged.
/// @param: value(Input), value to be calculated.
/// @param: alignment(Input), alignment value.
/// @return: T.
template <typename T>
static __forceinline T AlignUp(T value, size_t alignment) {
  const T bumped = (T)(value + alignment - 1);
  return AlignDown(bumped, alignment);
}
/// @brief: Pointer overload of AlignUp: rounds the address held by the pointer
/// up to the nearest multiple of alignment (byte granularity).
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value.
/// @return: T*, pointer to type T.
template <typename T>
static __forceinline T* AlignUp(T* value, size_t alignment) {
  uint8_t* const bumped = (uint8_t*)value + alignment - 1;
  return (T*)AlignDown((intptr_t)bumped, alignment);
}
/// @brief: Checks whether the input value is a multiple of alignment, i.e.
/// whether rounding it up to the alignment leaves it unchanged.
/// @param: value(Input), value to be checked.
/// @param: alignment(Input), alignment value.
/// @return: bool, true if value is at an alignment boundary.
template <typename T>
static __forceinline bool IsMultipleOf(T value, size_t alignment) {
  const T rounded = AlignUp(value, alignment);
  return rounded == value;
}
/// @brief: Pointer overload of IsMultipleOf: checks whether the address held
/// by the pointer is a multiple of alignment.
/// @param: value(Input), pointer to type T.
/// @param: alignment(Input), alignment value.
/// @return: bool.
template <typename T>
static __forceinline bool IsMultipleOf(T* value, size_t alignment) {
  T* const rounded = AlignUp(value, alignment);
  return rounded == value;
}
// Returns the smallest power of two that is >= value (1 for an input of 0).
// Works by smearing the highest set bit of (value - 1) into all lower bits
// and adding one.
static __forceinline uint32_t NextPow2(uint32_t value) {
  if (value == 0) return 1;
  uint32_t v = value - 1;
  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
    v |= v >> shift;
  }
  return v + 1;
}
// 64-bit overload: returns the smallest power of two that is >= value (1 for
// an input of 0), using the same bit-smearing approach over 64 bits.
static __forceinline uint64_t NextPow2(uint64_t value) {
  if (value == 0) return 1;
  uint64_t v = value - 1;
  for (uint32_t shift = 1; shift < 64; shift <<= 1) {
    v |= v >> shift;
  }
  return v + 1;
}
// True when the C string starts with its terminator. The pointer itself is
// dereferenced without a null check.
static __forceinline bool strIsEmpty(const char* str) noexcept {
  return *str == '\0';
}
// Removes leading whitespace (as classified by the classic "C" locale) from s
// in place and returns s.
static __forceinline std::string& ltrim(std::string& s) {
  std::string::iterator first_kept = s.begin();
  while (first_kept != s.end() &&
         std::isspace<char>(*first_kept, std::locale::classic())) {
    ++first_kept;
  }
  s.erase(s.begin(), first_kept);
  return s;
}
// Removes trailing whitespace (as classified by the classic "C" locale) from s
// in place and returns s.
static __forceinline std::string& rtrim(std::string& s) {
  std::string::reverse_iterator last_kept = s.rbegin();
  while (last_kept != s.rend() &&
         std::isspace<char>(*last_kept, std::locale::classic())) {
    ++last_kept;
  }
  s.erase(last_kept.base(), s.end());
  return s;
}
// Removes trailing and then leading whitespace from s in place and returns s.
static __forceinline std::string& trim(std::string& s) {
  rtrim(s);
  return ltrim(s);
}
} // namespace wsl
// Extracts bits [lowBit, highBit] of p, shifted down so that lowBit becomes
// bit 0, and returns them truncated to 32 bits.
template <uint32_t lowBit, uint32_t highBit, typename T>
static __forceinline uint32_t BitSelect(T p) {
  static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range.");
  static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range.");
  uintptr_t ptr = p;
  // Selecting up to the top bit needs no mask (and a mask could not be built
  // with a shift by the full width).
  if (highBit == (sizeof(uintptr_t) * 8 - 1)) return (uint32_t)(ptr >> lowBit);
  const unsigned long long keep_mask = (1ull << (highBit + 1)) - 1;
  return (uint32_t)((ptr & keep_mask) >> lowBit);
}
// Returns bits 8..15 of the pointer value.
inline uint32_t PtrLow16Shift8(const void* p) {
  const uintptr_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>((bits >> 8) & 0xFFULL);
}
// Returns the pointer value shifted right by 16 bits, truncated to 32 bits
// (i.e. bits 16..47).
inline uint32_t PtrHigh64Shift16(const void* p) {
  const uint64_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(bits >> 16);
}
// Returns bits 8..39 of the pointer value.
inline uint32_t PtrLow40Shift8(const void* p) {
  const uint64_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(bits >> 8);
}
// Returns bits 40..63 of the pointer value.
inline uint32_t PtrHigh64Shift40(const void* p) {
  const uint64_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(bits >> 40);
}
// Returns bits 40..47 of the pointer value (the top byte of a 48-bit address).
static inline uint8_t Ptr48High8(const void* p) {
  const uint64_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint8_t>(bits >> 40);
}
// Returns bits 8..39 of the pointer value. In debug builds asserts that the
// pointer has no bits set outside bits 8..47.
static inline uint32_t Ptr48Low32(const void* p) {
  const uint64_t bits = reinterpret_cast<uintptr_t>(p);
  assert((bits & 0xFFFFFFFFFF00ULL) == bits);
  return static_cast<uint32_t>(bits >> 8);
}
// Returns the low 32 bits of the pointer value.
inline uint32_t PtrLow32(const void* p) {
  const uintptr_t bits = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(bits & 0xFFFFFFFFu);
}
// Returns bits 32..63 of the pointer value when HSA_LARGE_MODEL is defined;
// otherwise always returns 0.
inline uint32_t PtrHigh32(const void* p) {
#ifdef HSA_LARGE_MODEL
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
#else
  return 0;
#endif
}
// Returns the upper 32 bits of a 64-bit value.
inline uint32_t HighPart(uint64_t value) {
  return static_cast<uint32_t>(value >> 32);
}
// Returns the lower 32 bits of a 64-bit value.
inline uint32_t LowPart(uint64_t value) {
  return static_cast<uint32_t>(value & 0xFFFFFFFFu);
}
#include "atomic_helpers.h"
#endif // HSA_RUNTIME_CORE_UTIL_UTILS_H_
@@ -0,0 +1,327 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32 // Are we compiling for windows?
#define NOMINMAX
#include "core/util/os.h"
#include <algorithm>
#include <process.h>
#include <string>
#include <windows.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <xmmintrin.h>
#undef Yield
#undef CreateMutex
namespace wsl {
namespace os {
static_assert(sizeof(LibHandle) == sizeof(HMODULE),
"OS abstraction size mismatch");
static_assert(sizeof(LibHandle) == sizeof(::HANDLE),
"OS abstraction size mismatch");
static_assert(sizeof(Semaphore) == sizeof(::HANDLE),
"OS abstraction size mismatch");
static_assert(sizeof(Mutex) == sizeof(::HANDLE),
"OS abstraction size mismatch");
static_assert(sizeof(Thread) == sizeof(::HANDLE),
"OS abstraction size mismatch");
static_assert(sizeof(EventHandle) == sizeof(::HANDLE),
"OS abstraction size mismatch");
// Loads the named library with LoadLibrary and returns the module handle
// reinterpreted as a LibHandle.
LibHandle LoadLib(std::string filename) {
  HMODULE module = LoadLibrary(filename.c_str());
  LibHandle* const as_handle = (LibHandle*)&module;
  return *as_handle;
}
// Looks up an exported symbol in a library returned by LoadLib.
void* GetExportAddress(LibHandle lib, std::string export_name) {
  HMODULE* const module = (HMODULE*)&lib;
  return GetProcAddress(*module, export_name.c_str());
}
// Releases a library returned by LoadLib.
void CloseLib(LibHandle lib) {
  ::HMODULE* const module = (::HMODULE*)&lib;
  FreeLibrary(*module);
}
// Not implemented on Windows. Fails at run time (assert in debug builds, then
// abort) in the same way as the other unimplemented functions in this file,
// instead of breaking the build with static_assert(false).
// An implementation could use EnumProcessModulesEx.
std::vector<LibHandle> GetLoadedLibs() {
  assert(false && "Not implemented.");
  abort();
  return std::vector<LibHandle>();
}
// Not implemented on Windows; see GetLoadedLibs.
std::string GetLibraryName(LibHandle lib) {
  assert(false && "Not implemented.");
  abort();
  return std::string();
}
// Creates a Win32 semaphore with an initial count of 0 and a maximum count of
// LONG_MAX, and returns its handle reinterpreted as a Semaphore.
// The Win32 call is qualified with :: so that it does not resolve to this
// function, and the result is stored in a declared local.
// NOTE(review): <windows.h> defines CreateSemaphore as a macro and this file
// does not #undef it (unlike CreateMutex) - confirm the resulting function
// name matches the declaration in os.h.
Semaphore CreateSemaphore() {
  ::HANDLE sem = ::CreateSemaphore(NULL, 0, LONG_MAX, NULL);
  assert(sem != NULL && "CreateSemaphore failed");
  return *(Semaphore*)&sem;
}
// Blocks until the semaphore is signaled. Returns true when the wait was
// satisfied by the semaphore (WAIT_OBJECT_0), false otherwise.
bool WaitSemaphore(Semaphore sem) {
  // Wait on the parameter `sem`; the previous code referred to an undeclared
  // `lock`.
  return WaitForSingleObject(*(::HANDLE*)&sem, INFINITE) == WAIT_OBJECT_0;
}
// Increments the semaphore count by one.
void PostSemaphore(Semaphore sem) {
  // The Semaphore value is the handle itself (see CreateSemaphore), so it is
  // reinterpreted rather than dereferenced.
  ReleaseSemaphore(*(::HANDLE*)&sem, 1, NULL);
}
// Closes the semaphore handle. A failing CloseHandle trips an assert in debug
// builds.
void DestroySemaphore(Semaphore sem) {
  // The Semaphore value is the handle itself (see CreateSemaphore), so it is
  // reinterpreted rather than dereferenced. It is passed by value, so there is
  // no caller-side handle to reset here.
  if (!CloseHandle(*(::HANDLE*)&sem)) {
    // `assert("text")` is always true; `false &&` makes the assert fire.
    assert(false && "CloseHandle() failed");
  }
}
// Mutexes are built on a Win32 event created as auto-reset and initially
// signaled: a successful wait takes the lock, SetEvent releases it.
Mutex CreateMutex() {
  return CreateEvent(NULL, false, true, NULL);
}
// Attempts to take the lock without blocking; true on success.
bool TryAcquireMutex(Mutex lock) {
  const DWORD wait_result = WaitForSingleObject(*(::HANDLE*)&lock, 0);
  return wait_result == WAIT_OBJECT_0;
}
// Blocks until the lock is taken; true on success.
bool AcquireMutex(Mutex lock) {
  const DWORD wait_result = WaitForSingleObject(*(::HANDLE*)&lock, INFINITE);
  return wait_result == WAIT_OBJECT_0;
}
// Releases the lock by signaling the event.
void ReleaseMutex(Mutex lock) {
  ::HANDLE* const event = (::HANDLE*)&lock;
  SetEvent(*event);
}
// Closes the underlying event handle.
void DestroyMutex(Mutex lock) {
  ::HANDLE* const event = (::HANDLE*)&lock;
  CloseHandle(*event);
}
// Sleeps for the given number of milliseconds.
void Sleep(int delay_in_millisecond) {
  ::Sleep(delay_in_millisecond);
}
// Sleeps for the given number of microseconds, converted to whole
// milliseconds by integer division (so values below 1000 become ::Sleep(0)).
void uSleep(int delayInUs) {
  const int whole_milliseconds = delayInUs / 1000;
  ::Sleep(whole_milliseconds);
}
// Yields the calling thread via ::Sleep(0).
void YieldThread() {
  ::Sleep(0);
}
// Heap-allocated bundle handed from CreateThread to ThreadTrampoline.
struct ThreadArgs {
  void* entry_args;
  ThreadEntry entry_function;
};
// Thread start routine passed to _beginthreadex: unpacks the ThreadArgs
// bundle, frees it, then runs the user's entry function.
unsigned __stdcall ThreadTrampoline(void* arg) {
  ThreadArgs* const bundle = (ThreadArgs*)arg;
  // Copy the fields out first: the bundle is released before the entry
  // function runs.
  void* const user_data = bundle->entry_args;
  const ThreadEntry user_entry = bundle->entry_function;
  delete bundle;
  user_entry(user_data);
  _endthreadex(0);
  return 0;
}
// Starts a thread that runs entry_function(entry_argument) with the requested
// stack size. Returns the thread handle reinterpreted as a Thread; the value
// is null when the thread could not be created.
Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
                    uint stack_size) {
  // Ownership of this bundle passes to ThreadTrampoline, which deletes it.
  ThreadArgs* thread_args = new ThreadArgs();
  thread_args->entry_args = entry_argument;
  thread_args->entry_function = entry_function;
  uintptr_t ret =
      _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
  // _beginthreadex returns 0 on failure; the trampoline never runs in that
  // case, so release the bundle here instead of leaking it.
  if (ret == 0) delete thread_args;
  return *(Thread*)&ret;
}
// Closes the thread handle.
void CloseThread(Thread thread) {
  ::HANDLE* const handle = (::HANDLE*)&thread;
  CloseHandle(*handle);
}
// Blocks until the thread's handle is signaled; true when the wait returned
// WAIT_OBJECT_0.
bool WaitForThread(Thread thread) {
  const DWORD wait_result = WaitForSingleObject(*(::HANDLE*)&thread, INFINITE);
  return wait_result == WAIT_OBJECT_0;
}
// Blocks until all thread_count handles in `threads` are signaled; true when
// the wait returned WAIT_OBJECT_0.
bool WaitForAllThreads(Thread* threads, uint thread_count) {
  const DWORD wait_result =
      WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE);
  return wait_result == WAIT_OBJECT_0;
}
// Sets an environment variable for the current process.
void SetEnvVar(std::string env_var_name, std::string env_var_value) {
  const char* const name = env_var_name.c_str();
  const char* const value = env_var_value.c_str();
  SetEnvironmentVariable(name, value);
}
std::string GetEnvVar(std::string env_var_name) {
char* buff;
DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0);
if (char_count == 0) return "";
buff = (char*)alloca(sizeof(char) * char_count);
GetEnvironmentVariable(env_var_name.c_str(), buff, char_count);
buff[char_count - 1] = '\0';
std::string ret = buff;
return ret;
}
size_t GetUserModeVirtualMemorySize() {
SYSTEM_INFO system_info = {0};
GetSystemInfo(&system_info);
return ((size_t)system_info.lpMaximumApplicationAddress + 1);
}
size_t GetUsablePhysicalHostMemorySize() {
MEMORYSTATUSEX memory_status = {0};
memory_status.dwLength = sizeof(memory_status);
if (GlobalMemoryStatusEx(&memory_status) == 0) {
return 0;
}
const size_t physical_size = static_cast<size_t>(memory_status.ullTotalPhys);
return std::min(GetUserModeVirtualMemorySize(), physical_size);
}
// Returns the base of the user-mode virtual address range, reported as 0.
uintptr_t GetUserModeVirtualMemoryBase() {
  const uintptr_t base = 0;
  return base;
}
// Os event wrappers
// Creates an unnamed Win32 event. auto_reset selects an auto-reset event
// (CreateEvent receives the inverse as its manual-reset flag); init_state is
// the initial signaled state.
EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
  const BOOL manual_reset = (BOOL)(!auto_reset);
  const BOOL initially_signaled = (BOOL)init_state;
  return reinterpret_cast<EventHandle>(
      CreateEvent(NULL, manual_reset, initially_signaled, NULL));
}
// Closes the event handle. Returns -1 for a null event, otherwise the result
// of CloseHandle.
int DestroyOsEvent(EventHandle event) {
  if (event == NULL) return -1;
  ::HANDLE const handle = reinterpret_cast<::HANDLE>(event);
  return CloseHandle(handle);
}
// Waits up to milli_seconds for the event. Returns -1 for a null event,
// 0x14003 when the wait timed out, and otherwise the WaitForSingleObject
// result converted to int.
int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
  if (event == NULL) return -1;
  const DWORD wait_result =
      WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds);
  if (wait_result == WAIT_TIMEOUT) {
    return 0x14003;  // 0x14003 indicates timeout
  }
  return static_cast<int>(wait_result);
}
// Signals the event. Returns -1 for a null event, otherwise the result of
// SetEvent.
int SetOsEvent(EventHandle event) {
  if (event == NULL) return -1;
  ::HANDLE const handle = reinterpret_cast<::HANDLE>(event);
  return SetEvent(handle);
}
// Clears the event. Returns -1 for a null event, otherwise the result of
// ResetEvent.
int ResetOsEvent(EventHandle event) {
  if (event == NULL) return -1;
  ::HANDLE const handle = reinterpret_cast<::HANDLE>(event);
  return ResetEvent(handle);
}
uint64_t ReadAccurateClock() {
uint64_t ret;
QueryPerformanceCounter((LARGE_INTEGER*)&ret);
return ret;
}
uint64_t AccurateClockFrequency() {
uint64_t ret;
QueryPerformanceFrequency((LARGE_INTEGER*)&ret);
return ret;
}
// The functions below are not implemented on Windows. Each one asserts (in
// debug builds) and then calls abort(); the return statements after abort()
// are never reached and only satisfy the function signatures.

// Shared (reader/writer) mutex API - not implemented.
SharedMutex CreateSharedMutex() {
assert(false && "Not implemented.");
abort();
return nullptr;
}
bool TryAcquireSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
return false;
}
bool AcquireSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
return false;
}
void ReleaseSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
}
bool TrySharedAcquireSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
return false;
}
bool SharedAcquireSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
return false;
}
void SharedReleaseSharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
}
void DestroySharedMutex(SharedMutex lock) {
assert(false && "Not implemented.");
abort();
}
// System clock API - not implemented.
uint64_t ReadSystemClock() {
assert(false && "Not implemented.");
abort();
return 0;
}
uint64_t SystemClockFrequency() {
assert(false && "Not implemented.");
abort();
return 0;
}
// CPUID parsing - not implemented.
bool ParseCpuID(cpuid_t* cpuinfo) {
assert(false && "Not implemented.");
abort();
return false;
}
} // namespace os
} // namespace wsl
#endif
@@ -0,0 +1,36 @@
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
// Build identification string: the literal prefix followed by the stringized
// ROCDXG_VERSION macro. The `used` attribute asks the compiler to emit the
// array even though nothing in this file references it.
const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION);
// Reports the kernel interface version implemented by this library as a fixed
// value (1.17) in *VersionInfo. CHECK_DXG_OPEN() runs first.
// NOTE(review): CHECK_DXG_OPEN presumably returns early when the device is
// not open - confirm against its definition.
HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) {
  CHECK_DXG_OPEN();

  // Interface version 1.17.
  VersionInfo->KernelInterfaceMinorVersion = 17;
  VersionInfo->KernelInterfaceMajorVersion = 1;

  return HSAKMT_STATUS_SUCCESS;
}
@@ -0,0 +1,320 @@
/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
#include "impl/wddm/cmd_util.h"
namespace wsl {
namespace thunk {
/*
* Builds a COPY_DATA packet that copies data.
*/
size_t CmdUtil::BuildCopyData(
uint64_t *pDstAddr,
void *pBuffer,
uint32_t dstSel,
uint32_t dstCachePolicy,
uint32_t srcSel,
uint32_t srcCachePolicy,
uint32_t countSel,
uint32_t wrConfirm) {
PM4MEC_COPY_DATA copy_data = {0};
GenerateCmdHeader(&copy_data, IT_COPY_DATA);
copy_data.bitfields2.dst_sel = dstSel;
copy_data.bitfields2.src_sel = srcSel;
copy_data.bitfields2.dst_cache_policy = dstCachePolicy;
copy_data.bitfields2.src_cache_policy = srcCachePolicy;
copy_data.bitfields2.count_sel = countSel;
copy_data.bitfields2.wr_confirm = wrConfirm;
copy_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3);
copy_data.dst_addr_hi = PtrHigh32(pDstAddr);
memcpy(pBuffer, &copy_data, sizeof(copy_data));
return sizeof(copy_data);
}
/*
* Builds a EVENT_WRITE packet.
* Applications can use Barrier command to ensure their
* command is executed only after all other commands have
* completed their execution.
*/
size_t CmdUtil::BuildBarrier(
void *pBuffer,
uint32_t eventIndex,
uint32_t eventType) {
BarrierTemplate barrier = {0};
GenerateCmdHeader(&barrier.event_write, IT_EVENT_WRITE);
barrier.event_write.bitfields2.event_index = eventIndex;
barrier.event_write.bitfields2.event_type = eventType;
memcpy(pBuffer, &barrier, sizeof(barrier));
return sizeof(barrier);
}
/**
* Builds a WRITE_DATA packet.
* Writes two DWORDs into the GPU memory address "write_addr"
*/
size_t CmdUtil::BuildWriteData64Command(
void* pBuffer,
uint64_t* write_addr,
uint64_t write_value) {
WriteDataTemplate command = {0};
GenerateCmdHeader(&command.write_data, IT_WRITE_DATA);
// Encode the user specified address to write to
uint64_t addr = uintptr_t(write_addr);
assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");
// Set the bit to confirm the write operation and cache policy
command.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
command.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass;
// Specify the command to increment address if writing more than one DWord
command.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
// Specify the class to which the write destination belongs
command.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory;
command.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
command.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);
// Specify the value to write
command.write_data.write_data_value = write_value;
memcpy(pBuffer, &command, sizeof(command));
return sizeof(command);
}
/*
 * Builds an ACQUIRE_MEM packet and copies it into pBuffer.
 * Users can submit this command to invalidate GPU caches - L1 and/or L2.
 *
 * major selects the packet layout: 9 uses the gfx9 template (coher_cntl),
 * 10 and above use the gfx10 template (gcr_cntl).
 *
 * Returns the number of bytes written to pBuffer, or 0 when major is below 9
 * (no packet layout is defined here for it, so pBuffer is left untouched).
 */
size_t CmdUtil::BuildAcquireMem(
    uint8_t major,
    void *pBuffer) {
  // Must start at 0: neither branch below runs for major < 9, and the value
  // is returned unconditionally.
  size_t ret = 0;

  if (major == 9) {
    gfx9::AcquireMemTemplate acq = {0};
    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);

    // Specify the size of memory to invalidate. Size is
    // specified in terms of 256 byte chunks. A coher_size
    // of 0xFFFFFFFF actually specifies 0xFFFFFFFF00 (40 bits)
    // of memory. The field coher_size_hi specifies memory from
    // bits 40-64 for a total of 256 TB.
    acq.acquire_mem.coher_size = 0xFFFFFFFF;
    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;

    // Specify the address of memory to invalidate. The
    // address must be 256 byte aligned.
    acq.acquire_mem.coher_base_lo = 0;
    acq.acquire_mem.bitfields6.coher_base_hi = 0;

    // Specify the poll interval for determining if operation is complete
    acq.acquire_mem.bitfields7.poll_interval = 4;

    acq.acquire_mem.bitfields2.coher_cntl =
        (1 << 29) |  // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK
        (1 << 27) |  // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK
        (1 << 28);   // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK

    memcpy(pBuffer, &acq, sizeof(acq));
    ret = sizeof(acq);
  } else if (major >= 10) {
    gfx10::AcquireMemTemplate acq = {0};
    GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM);

    // Same full-range size/base/poll settings as the gfx9 path.
    acq.acquire_mem.coher_size = 0xFFFFFFFF;
    acq.acquire_mem.bitfields4.coher_size_hi = 0xFF;
    acq.acquire_mem.coher_base_lo = 0;
    acq.acquire_mem.bitfields6.coher_base_hi = 0;
    acq.acquire_mem.bitfields7.poll_interval = 4;

    acq.acquire_mem.bitfields8.gcr_cntl =
        (1 << 16) |  // SEQ = FORWARD
        (1 << 15) |  // GL2_WB
        (1 << 14) |  // GL2_INV
        (1 << 9) |   // GL1_INV
        (1 << 8) |   // GLV_INV
        (1 << 7) |   // GLK_INV
        (1 << 6) |   // GLK_WB
        (1 << 5) |   // GLM_INV
        (1 << 4) |   // GLM_WB
        (1 << 0);    // GLI_INV = ALL

    memcpy(pBuffer, &acq, sizeof(acq));
    ret = sizeof(acq);
  }

  return ret;
}
/*
* Builds a scratch packet.
*/
size_t CmdUtil::BuildScratch(
void *pScratchBase,
void *pBuffer) {
struct SetScratchTemplate scratch = {0};
GenerateSetShRegHeader(&scratch, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
scratch.scratch_lo = Ptr48Low32(pScratchBase);
scratch.scratch_hi = Ptr48High8(pScratchBase);
memcpy(pBuffer, &scratch, sizeof(scratch));
return sizeof(scratch);
}
/**
* @ Set Compute Shader parameter for gfx11 and above
*/
size_t CmdUtil::BuildComputeShaderParams(void *pBuffer) {
struct DispatchProgramResourceRegs compute_shader_params = {0};
GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
// IMAGE_OP: Indicates the compute program contains an image op
// instruction and should be stalled by its WAIT_SYNC fence.
compute_shader_params.compute_pgm_rsrc3 = (1 << 31);
memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
return sizeof(compute_shader_params);
}
/*
* Builds the command sequence for one kernel dispatch and copies it into
* pBuffer. The sequence is a DispatchTemplate made of register-write groups
* (thread dimensions, program address, program resources, resource limits,
* user data) followed by a DISPATCH_DIRECT packet.
*
* pInfo supplies the AQL packet (pPacket), the kernel object, the queue
* (pAmdQueue), the kernel entry address, scratch parameters, the gfx major
* version and the wave32 flag.
*
* Side effect: for every user-data slot that receives a scratch-related value,
* the slot's byte offset inside DispatchTemplate is appended to
* pInfo->scratchBaseOffset and pInfo->offsetCnt is incremented.
*
* Returns the number of bytes written to pBuffer (sizeof(DispatchTemplate)).
*/
size_t CmdUtil::BuildDispatch(
struct DispatchInfo *pInfo,
void *pBuffer) {
DispatchTemplate dispatch = {0};
// Workgroup size taken directly from the AQL packet.
GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x;
dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y;
dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z;
// TODO: Add AQL packet index for debugger
// Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
// Kernel entry address, split by the Ptr48Low32/Ptr48High8 helpers.
GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO);
dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1;
// gfx11 only: additionally set the AMD_COMPUTE_PGM_RSRC_ONE_PRIV field to 1.
if (pInfo->major == 11) {
AMD_HSA_BITS_SET(dispatch.program_resource_regs.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1);
}
// The LDS block count is OR-ed into the kernel's rsrc2 value starting at bit 15.
dispatch.program_resource_regs.compute_pgm_rsrc2 =
(pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2;
GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
dispatch.resource_regs.compute_resource_limits = 0x3ff;
// All bits set in each of the four static thread management masks.
dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size;
GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
// User-data slots are filled in the order of the blocks below; sgpr_no is the
// next free slot, so reordering these blocks changes the register layout.
// NOTE(review): sgpr_no is not checked against the size of compute_user_data - confirm the array is large enough for all features enabled at once.
uint32_t sgpr_no = 0;
// Private segment buffer: four slots (three queue descriptor words plus
// pInfo->srd). The offset of the first slot is recorded in scratchBaseOffset.
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
assert(pInfo->major < 11);
pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
sgpr_no * sizeof(uint32_t);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[0];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[1];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->pAmdQueue->scratch_resource_descriptor[2];
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->srd;
}
// Dispatch pointer: address of the AQL packet, low then high 32 bits.
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket);
}
// Queue pointer: address of the queue structure, low then high 32 bits.
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue);
}
// Kernarg segment pointer: kernarg_address from the AQL packet.
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrLow32(pInfo->pPacket->kernarg_address);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrHigh32(pInfo->pPacket->kernarg_address);
}
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
// This feature may be enabled as a side effect of indirect calls.
// However, the compiler team confirmed that the dispatch id itself is not used,
// so safe to send 0 for each dispatch.
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
}
// Flat scratch init: scratch base address in two slots; the offset of the
// first slot is recorded in scratchBaseOffset.
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
assert(pInfo->major < 11);
pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
sgpr_no * sizeof(uint32_t);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrLow32(pInfo->pScratchBase);
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
PtrHigh32(pInfo->pScratchBase);
}
// Private segment size: per-wave scratch size divided by the wave width (32 or 64).
if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) {
dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64);
}
// Final packet: launch with the grid size taken from the AQL packet.
GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT);
dispatch.dispatch_direct.dispatch_initiator =
(1 << 0) | // COMPUTE_SHADER_EN
(1 << 2) | // FORCE_START_AT_000
(1 << 5); // USE_THREAD_DIMENSIONS
if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN
dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x;
dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y;
dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z;
memcpy(pBuffer, &dispatch, sizeof(dispatch));
return sizeof(dispatch);
}
/*
* Builds a ATOMIC_MEM packet.
* Users can submit this command
* to perform atomic operations.
*/
size_t CmdUtil::BuildAtomicMem(
uint64_t *pAddr,
uint32_t atomic,
void *pBuffer,
uint32_t cachePolicy,
uint64_t srcData) {
AtomicTemplate atom = {0};
GenerateCmdHeader(&atom.atomic, IT_ATOMIC_MEM);
atom.atomic.addr_lo = PtrLow32(pAddr);
atom.atomic.addr_hi = PtrHigh32(pAddr);
atom.atomic.bitfields2.atomic = atomic;
atom.atomic.bitfields2.cache_policy = cachePolicy;
atom.atomic.src_data_lo = LowPart(srcData);
atom.atomic.src_data_hi = HighPart(srcData);
memcpy(pBuffer, &atom, sizeof(atom));
return sizeof(atom);
}
} // namespace thunk
} // namespace wsl
@@ -0,0 +1,780 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include <cinttypes>
#include <bitset>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <linux/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include "impl/wddm/status.h"
#include "impl/wddm/types.h"
#include "impl/wddm/device.h"
#include "impl/wddm/queue.h"
namespace wsl {
namespace thunk {
// Number of AQL frames (0x1000 = 4096) the command buffer is sized for;
// InitCmdbufInfo() multiplies it by the per-frame size to get cmdbuf_size_.
const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000;
// Constructs the wrapper for one adapter and runs the bring-up sequence below.
// The order matters: SetPowerOptimization() and CreatePagingQueue() use the
// device_ handle set by CreateDevice(), and InitCmdbufInfo() reads
// device_info_.major filled in by ParseDeviceInfo().
// NOTE(review): the bool results of ParseDeviceInfo(), CreateDevice(),
// CreatePagingQueue() and QuerySegmentInfo() are discarded, so a failed step
// is not reported to the caller.
WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id)
: adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) {
memset(&device_info_, 0, sizeof(device_info_));
ParseDeviceInfo();
CreateDevice();
// restore == false; the destructor makes the matching call with true.
SetPowerOptimization(false);
CreatePagingQueue();
InitCmdbufInfo();
QuerySegmentInfo();
}
// Tears the device down: paging queue first, then the power-optimization call
// with restore == true (while device_ is still valid), then the device handle,
// and finally the adapter info buffer held in device_info_.
WDDMDevice::~WDDMDevice() {
DestroyPagingQueue();
SetPowerOptimization(true);
DestroyDevice();
DestroyDeviceInfo();
}
// Thin wrapper around D3DKMTQueryAdapterInfo: queries `type` on `adapter`,
// with the result written into the caller's buffer `data` of `size` bytes.
// Returns the NTSTATUS of the call.
static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type,
                                 void *data, int size)
{
  D3DKMT_QUERYADAPTERINFO query = {0};

  query.pPrivateDriverData = data;
  query.PrivateDriverDataSize = size;
  query.Type = type;
  query.hAdapter = adapter;

  return DXCORE_CALL(D3DKMTQueryAdapterInfo(&query));
}
bool WDDMDevice::QuerySegmentInfo()
{
uint32_t segmentCount = 0;
segment_infos_.clear();
// Get the number of segments
D3DKMT_QUERYSTATISTICS adapterQuery = {};
adapterQuery.Type = D3DKMT_QUERYSTATISTICS_ADAPTER;
adapterQuery.AdapterLuid = adapter_luid_;
NTSTATUS ret = DXCORE_CALL(D3DKMTQueryStatistics(&adapterQuery));
if (ret == STATUS_SUCCESS) {
segmentCount = adapterQuery.QueryResult.AdapterInformation.NbSegments;
pr_debug("Total Segments: %u\n", segmentCount);
} else {
pr_err("Failed to query adapter info\n");
return false;
}
for (uint32_t i = 0; i < segmentCount; i++) {
D3DKMT_QUERYSTATISTICS segQuery = {};
segQuery.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
segQuery.AdapterLuid = adapter_luid_;
segQuery.QuerySegment.SegmentId = i;
ret = DXCORE_CALL(D3DKMTQueryStatistics(&segQuery));
if (ret != STATUS_SUCCESS) {
pr_err("Failed to query segment %u info\n", i);
return false;
}
auto& seg = segQuery.QueryResult.SegmentInformation;
SegmentInfo info;
info.segment_id = i;
info.segment_type = seg.SegmentProperties.SegmentType;
info.system_memory = seg.SegmentProperties.SystemMemory;
info.aperture = seg.Aperture;
info.commit_limit = seg.CommitLimit;
segment_infos_.push_back(info);
}
return true;
}
// Looks up the first entry in segment_infos_ whose type equals segment_type.
// On a match, stores its id in segment_id and returns true. Otherwise logs an
// error, leaves segment_id untouched and returns false.
bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type,
                              uint32_t &segment_id)
{
  for (auto it = segment_infos_.begin(); it != segment_infos_.end(); ++it) {
    if (it->segment_type == segment_type) {
      segment_id = it->segment_id;
      return true;
    }
  }

  pr_err("Failed to get segment id for type %u\n", segment_type);
  return false;
}
/*
 * Returns the number of bytes of GPU memory currently available.
 *
 * The local heap (dedicated GPU memory) includes a CPU-visible heap and an
 * invisible heap. The non-local heap refers to shared GPU memory, which is
 * system memory.
 *
 *  - dGPU: LocalHeapSize() minus the resident bytes of the visible local
 *    segment and, when the device reports an invisible heap, of the segment
 *    that follows it.
 *  - APU:  NonLocalHeapSize() minus the resident bytes of the system-memory
 *    segment.
 *
 * Returns 0 when the paging fence wait or the segment lookup fails. The
 * return value is a byte count, so a status code must never be returned
 * through it. If a statistics query fails, that segment counts as 0 bytes
 * used. The result is clamped at 0 instead of wrapping around.
 */
uint64_t WDDMDevice::VramAvail(void) {
  // Let outstanding paging operations finish so residency numbers are current.
  uint64_t fence = page_fence_value_.load();
  if (!CpuWait(&page_syncobj_, &fence, 1, false))
    return 0;

  D3DKMT_QUERYSTATISTICS stats;
  NTSTATUS ret;
  uint64_t total = 0;
  uint64_t used = 0;
  uint32_t segmentId = 0;

  if (IsDgpu()) {
    // local cpu-visible memory
    if (!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId))
      return 0;

    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
    stats.AdapterLuid = adapter_luid_;
    stats.QuerySegment.SegmentId = segmentId;
    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
    if (ret == 0)
      used += stats.QueryResult.SegmentInformation.BytesResident;

    // local invisible memory: the segment right after the visible one
    if (device_info_.local_invisible_heap_size) {
      segmentId++;
      memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
      stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
      stats.AdapterLuid = adapter_luid_;
      // Use the computed id rather than a hard-coded 1 so the query stays
      // correct when the visible segment is not segment 0.
      stats.QuerySegment.SegmentId = segmentId;
      ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
      if (ret == 0)
        used += stats.QueryResult.SegmentInformation.BytesResident;
    }

    total = LocalHeapSize();
  } else {
    // APU - NonLocal memory
    if (!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId))
      return 0;

    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
    stats.AdapterLuid = adapter_luid_;
    stats.QuerySegment.SegmentId = segmentId;
    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
    if (ret == 0)
      used += stats.QueryResult.SegmentInformation.BytesResident;

    total = NonLocalHeapSize();
  }

  // Unsigned arithmetic: guard against residency exceeding the heap size.
  return (used < total) ? (total - used) : 0;
}
bool WDDMDevice::CreateDevice(void) {
D3DKMT_CREATEDEVICE args = {0};
args.hAdapter = adapter_;
NTSTATUS ret = DXCORE_CALL(D3DKMTCreateDevice(&args));
if (ret == STATUS_SUCCESS) {
device_ = args.hDevice;
return true;
}
pr_err("fail %x\n", ret);
return false;
}
bool WDDMDevice::DestroyDevice(void) {
D3DKMT_DESTROYDEVICE args = {0};
args.hDevice = device_;
NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyDevice(&args));
if (ret == STATUS_SUCCESS)
return true;
pr_err("fail %x\n", ret);
return false;
}
bool WDDMDevice::CreatePagingQueue(void) {
D3DKMT_CREATEPAGINGQUEUE args = {0};
args.hDevice = device_;
args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL;
NTSTATUS ret = DXCORE_CALL(D3DKMTCreatePagingQueue(&args));
if (ret == STATUS_SUCCESS) {
page_queue_ = args.hPagingQueue;
page_syncobj_ = args.hSyncObject;
page_fence_addr_ = (uint64_t *)args.FenceValueCPUVirtualAddress;
page_fence_value_ = 0;
return true;
}
pr_err("fail %x\n", ret);
return false;
}
bool WDDMDevice::DestroyPagingQueue(void) {
D3DDDI_DESTROYPAGINGQUEUE args = {0};
args.hPagingQueue = page_queue_;
NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyPagingQueue(&args));
if (ret == STATUS_SUCCESS)
return true;
pr_err("fail %x\n", ret);
return false;
}
// Sends the driver-private "power optimization" escape for this device.
// `restore` is passed through to thunk_proxy::FillinPowerOptPrivData(); the
// constructor calls this with false and the destructor with true.
//
// Best effort: the escape status is only logged. If the private-data buffer
// cannot be allocated, the escape is skipped (and logged) rather than writing
// through a null pointer - the previous assert-only check vanished in NDEBUG
// builds.
void WDDMDevice::SetPowerOptimization(bool restore) {
  const int priv_size = thunk_proxy::GetPowerOptPrivDataSize();

  // calloc gives the zero-filled buffer the fill-in helper starts from.
  void *priv_data = calloc(1, priv_size);
  if (!priv_data) {
    pr_debug("failed to allocate %d bytes of private data, restore %d\n", priv_size, restore);
    return;
  }
  thunk_proxy::FillinPowerOptPrivData(priv_data, restore);

  D3DKMT_ESCAPE d3dkmt_escape;
  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
  d3dkmt_escape.hAdapter = adapter_;
  d3dkmt_escape.hDevice = device_;
  d3dkmt_escape.hContext = 0;  // KMD only uses the device to identify the process
  d3dkmt_escape.Type = D3DKMT_ESCAPE_DRIVERPRIVATE;
  d3dkmt_escape.pPrivateDriverData = priv_data;
  d3dkmt_escape.PrivateDriverDataSize = priv_size;
  d3dkmt_escape.Flags.HardwareAccess = true;

  NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape));
  pr_debug("status %d, restore %d\n", status, restore);

  free(priv_data);
}
// Raises page_fence_value_ to fence_value if fence_value is larger than the
// value currently stored; never lowers it. Implemented as a compare-exchange
// loop on the atomic member.
void WDDMDevice::UpdatePageFence(uint64_t fence_value) {
  uint64_t observed = page_fence_value_.load();

  // A failed compare_exchange_weak refreshes `observed` with the stored value,
  // so each retry re-checks against the latest value.
  while (observed < fence_value &&
         !page_fence_value_.compare_exchange_weak(observed, fence_value)) {
  }
}
// Allocates a GpuMemory object bound to this device and initializes it.
// When create_info.dmabuf_fd is positive the memory is imported via
// ImportPhysicalHandle() (which also receives gpu_va); otherwise Init() is
// used. On success *gpu_mem receives the new object (owned by the caller); on
// failure the object is deleted and *gpu_mem stays nullptr.
// Returns the ErrorCode of the initialization step.
ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info,
                                      GpuMemory **gpu_mem, gpusize *gpu_va) {
  *gpu_mem = nullptr;

  GpuMemory *candidate = new GpuMemory(this);

  ErrorCode status;
  if (create_info.dmabuf_fd > 0) {
    status = candidate->ImportPhysicalHandle(create_info, gpu_va);
  } else {
    status = candidate->Init(create_info);
  }

  if (status == ErrorCode::Success) {
    *gpu_mem = candidate;
    return status;
  }

  delete candidate;
  return status;
}
// Locks the allocation `handle` on device_ via D3DKMTLock2.
// Returns the pointer reported by the call, or a null pointer (after logging
// the status) on failure.
void *WDDMDevice::Lock(D3DKMT_HANDLE handle) {
  D3DKMT_LOCK2 request = {0};
  request.hAllocation = handle;
  request.hDevice = device_;

  NTSTATUS status = DXCORE_CALL(D3DKMTLock2(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return nullptr;
  }

  return request.pData;
}
// Unlocks the allocation `handle` on device_ via D3DKMTUnlock2.
// Returns true on success; otherwise logs the status and returns false.
bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
  D3DKMT_UNLOCK2 request = {0};
  request.hAllocation = handle;
  request.hDevice = device_;

  NTSTATUS status = DXCORE_CALL(D3DKMTUnlock2(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return false;
  }

  return true;
}
// Creates a virtual context on device_ for `engine` and returns its handle
// through *handle.
//
// The node ordinal comes from EngineOrdinal(); a negative ordinal means the
// engine is unavailable and the call fails. When IsHwsEnabled(engine) is true
// the context is flagged as supporting hardware queues; otherwise the GPU
// timeout flag is set from thunk_proxy::ShouldDisableGpuTimeout().
//
// Returns true on success. Returns false (after logging) if the engine has no
// ordinal, the private-data buffer cannot be allocated, or the create call
// fails; *handle is only written on success.
bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
  const int ordinal = EngineOrdinal(engine, &device_info_);
  if (ordinal < 0)
    return false;

  const int priv_size = thunk_proxy::GetContextPrivDataSize();

  // Explicit check instead of assert: an NDEBUG build must not write through
  // a null pointer. calloc also provides the zero-filled starting state.
  void *priv_data = calloc(1, priv_size);
  if (!priv_data) {
    pr_err("failed to allocate %d bytes of private data\n", priv_size);
    return false;
  }
  thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw());

  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
  args.hDevice = device_;
  args.EngineAffinity = 1 << 0;
  args.NodeOrdinal = ordinal;
  args.pPrivateDriverData = priv_data;
  args.PrivateDriverDataSize = priv_size;
  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
  if (IsHwsEnabled(engine))
    args.Flags.HwQueueSupported = 1;
  else
    args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_);

  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args));

  // The private data is only needed for the duration of the call.
  free(priv_data);

  if (ret != STATUS_SUCCESS) {
    pr_err("fail %x\n", ret);
    return false;
  }

  *handle = args.hContext;
  return true;
}
// Destroys the context identified by `handle`.
// Returns true on success; otherwise logs the status and returns false.
bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
  D3DKMT_DESTROYCONTEXT request = {0};
  request.hContext = handle;

  NTSTATUS status = DXCORE_CALL(D3DKMTDestroyContext(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return false;
  }

  return true;
}
// Queues a GPU-side wait on the context of `queue`: the `count` sync objects
// in `syncobjs` are waited on with the matching fence values in `values`.
// Returns true on success; otherwise logs the status and returns false.
bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
                         uint64_t *values, int count) {
  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU request = {0};
  request.hContext = queue->context;
  request.ObjectHandleArray = syncobjs;
  request.MonitoredFenceValueArray = values;
  request.ObjectCount = count;

  NTSTATUS status = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return false;
  }

  return true;
}
// Queues a GPU-side signal on `context`: the `count` sync objects in
// `syncobjs` are signalled with the matching fence values in `value`.
// Returns true on success; otherwise logs the status and returns false.
bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
                           uint64_t *value, int count) {
  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU request = {0};
  request.hContext = context;
  request.ObjectHandleArray = syncobjs;
  request.MonitoredFenceValueArray = value;
  request.ObjectCount = count;

  NTSTATUS status = DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return false;
  }

  return true;
}
bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
int count, bool wait_any) {
D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU args = {0};
args.hDevice = device_;
args.ObjectCount = count;
args.ObjectHandleArray = syncobjs;
args.FenceValueArray = value;
args.Flags.WaitAny = wait_any;
NTSTATUS ret = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&args));
if (ret == STATUS_SUCCESS)
return true;
pr_err("fail %x\n", ret);
return false;
}
bool WDDMDevice::WaitOnPagingFenceFromCpu() {
uint64_t page_fence_value = 0;
page_fence_value = page_fence_value_.load();
if (CpuWait(&page_syncobj_, &page_fence_value, 1, false))
return true;
return false;
}
// Creates a monitored-fence sync object on device_.
// On success *handle receives the object handle and *addr the CPU address of
// its fence value; both addresses are logged at debug level.
// Returns true on success; otherwise logs the status and returns false
// without touching the outputs.
bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 request = {0};
  request.hDevice = device_;
  request.Info.Type = D3DDDI_MONITORED_FENCE;
  request.Info.MonitoredFence.EngineAffinity = 1 << 0;

  NTSTATUS status = DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&request));
  if (status != STATUS_SUCCESS) {
    pr_err("fail %x\n", status);
    return false;
  }

  *handle = request.hSyncObject;
  *addr = (uint64_t *)request.Info.MonitoredFence.FenceValueCPUVirtualAddress;
  pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
           request.Info.MonitoredFence.FenceValueCPUVirtualAddress,
           request.Info.MonitoredFence.FenceValueGPUVirtualAddress);
  return true;
}
// Destroys the sync object `handle`. A failure is logged but not reported to
// the caller.
void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT request = {0};
  request.hSyncObject = handle;

  NTSTATUS status = DXCORE_CALL(D3DKMTDestroySynchronizationObject(&request));
  if (status == STATUS_SUCCESS)
    return;

  pr_err("fail %x\n", status);
}
// Computes the per-AQL-frame command buffer footprint
// (cmdbuf_aql_frame_size_) and the total command buffer size (cmdbuf_size_).
//
// A frame is sized as the sum of every packet template that may be emitted
// for one AQL packet, plus a 128-byte safety margin, rounded up to 16 bytes.
// The total is cmdbuf_aql_frame_num_ frames rounded up to 4 KiB.
void WDDMDevice::InitCmdbufInfo(void) {
  // Start from a known value: for a major version below 9 neither branch
  // below assigns the member, and the accumulation that follows would
  // otherwise build on an indeterminate value.
  cmdbuf_aql_frame_size_ = 0;

  // Two ACQUIRE_MEM packets; the template layout depends on the generation.
  if (device_info_.major == 9) {
    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
  } else if (device_info_.major >= 10) {
    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
  }

  // gfx11+ additionally emits the scratch and compute-shader-params packets.
  if (device_info_.major >= 11) {
    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate);
    cmdbuf_aql_frame_size_ += sizeof(DispatchProgramResourceRegs);  // BuildComputeShaderParams
  }

  cmdbuf_aql_frame_size_ +=
      sizeof(PM4MEC_COPY_DATA) * 2 +
      sizeof(BarrierTemplate) * 2 +
      sizeof(DispatchTemplate) +
      sizeof(AtomicTemplate) * 2;

  // Add safety margin to account for alignment and future additions
  cmdbuf_aql_frame_size_ += 128;
  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);

  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
}
// Returns the number of 512-byte blocks needed to hold the packet's
// group_segment_size, rounding up.
uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
  constexpr uint32_t kBlockBytes = 512;
  const uint32_t segment_bytes = pkt->group_segment_size;
  return (segment_bytes + kBlockBytes - 1) / kBlockBytes;
}
// Enumerates the adapters and appends one WDDMDevice to `devices` for every
// adapter whose vendor id is 0x1002 and whose device id is accepted by
// thunk_proxy::QueryAdapterSupported(). Node ids are assigned as
// devices.size() + 1 at the time each device is created.
//
// Returns STATUS_SUCCESS when enumeration completed (including the case of no
// adapters at all). On failure returns the failing NTSTATUS; every device in
// `devices` is deleted and the vector is emptied, so the caller never sees
// dangling pointers.
NTSTATUS WDDMCreateDevices(std::vector<WDDMDevice *> &devices)
{
  // First call without a buffer to obtain the adapter count.
  D3DKMT_ENUMADAPTERS2 args = {0};
  NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
  if (ret != STATUS_SUCCESS)
    return ret;

  if (!args.NumAdapters)
    return STATUS_SUCCESS;

  // The vector owns the adapter array, so it is released on every exit path,
  // including an exception thrown while constructing a device.
  std::vector<D3DKMT_ADAPTERINFO> info(args.NumAdapters);
  args.pAdapters = info.data();

  // Second call fills the array.
  ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
  if (ret != STATUS_SUCCESS)
    return ret;

  for (decltype(args.NumAdapters) i = 0; i < args.NumAdapters; i++) {
    D3DKMT_QUERY_DEVICE_IDS query = {0};
    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS,
                           &query, sizeof(query));
    if (ret != STATUS_SUCCESS)
      break;

    // Only adapters with vendor id 0x1002 are considered.
    if (query.DeviceIds.VendorID != 0x1002)
      continue;

    if (!thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID))
      continue;

    devices.push_back(new WDDMDevice(
        info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1));
  }

  if (ret != STATUS_SUCCESS) {
    // Roll back: destroy the devices and drop the now-invalid pointers.
    for (auto *device : devices)
      delete device;
    devices.clear();
  }

  return ret;
}
// Clears device_info_ and asks thunk_proxy::ParseAdapterInfo() to fill it in
// for adapter_. Returns true when parsing succeeded, false otherwise.
bool WDDMDevice::ParseDeviceInfo() {
  memset(&device_info_, 0, sizeof(device_info_));

  const bool parsed = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_);
  return parsed;
}
// Releases the adapter_info buffer held in device_info_ with free().
// NOTE(review): the pointer is not reset afterwards, so a second call would
// free it twice - presumably this is only called from the destructor; confirm.
void WDDMDevice::DestroyDeviceInfo() {
free(device_info_.adapter_info);
}
void WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
uint32_t engine = GetComputeEngine();
int ordinal = EngineOrdinal(engine, &device_info_);
D3DKMT_QUERYCLOCKCALIBRATION args = {0};
/* LDA(Linked Display Adapter)
* In the LDA design multiple physical GPUs are linked together to be controlled
* as a single object from the point of view of power manager, GPU scheduler and
* GPU memory manager. The physical GPUs are represented by a signal logical adapter
* object. There is a single DXGADAPTER objects, a single KMD adapter object.
*
* Set PhysicalAdapterIndex to 0 by default with None LDA mode.
*/
args.hAdapter = adapter_;
args.NodeOrdinal = ordinal;
args.PhysicalAdapterIndex = 0;
NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&args));
if (status) {
pr_debug("status %d \n", status);
} else {
if (gpu)
*gpu = args.ClockData.GpuClockCounter;
if (cpu)
*cpu = args.ClockData.CpuClockCounter;
}
}
// Brings up `queue`: creates its context, allocates a system-memory command
// buffer when the queue does not already have one (cmdbuf_addr == 0), then
// calls queue->Init().
// Returns true on success. On failure everything created here is undone
// (command buffer memory deleted, context destroyed) and false is returned.
bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
  if (!CreateContext(queue->queue_engine, &queue->context))
    return false;

  // Stays null when the queue came with its own command buffer.
  GpuMemory *cmdbuf_mem = nullptr;

  if (queue->cmdbuf_addr == 0) {
    GpuMemoryCreateInfo create_info{};
    create_info.domain = thunk_proxy::kSystem;
    create_info.size = queue->cmdbuf_size;

    if (CreateGpuMemory(create_info, &cmdbuf_mem) != ErrorCode::Success) {
      DestroyContext(queue->context);
      return false;
    }

    queue->cmdbuf = cmdbuf_mem->GetGpuMemoryHandle();
    queue->cmdbuf_addr = cmdbuf_mem->GpuAddress();
  }

  // A non-zero result from Init() is treated as failure.
  if (queue->Init()) {
    delete cmdbuf_mem;
    DestroyContext(queue->context);
    return false;
  }

  return true;
}
// Tears down `queue`: finalizes it, deletes the GpuMemory object behind its
// command buffer handle, then destroys its context.
void WDDMDevice::DestroyQueue(WDDMQueue *queue) {
  queue->Fini();

  delete GpuMemory::Convert(queue->cmdbuf);

  DestroyContext(queue->context);
}
// Submits the command range [command_addr, command_addr + command_size) on the
// queue's context via D3DKMTSubmitCommand, then queues a GPU signal of the
// queue's sync object to fence_value.
//
// Returns true when both the submit and the signal succeed. Returns false
// (after logging) if the private-data buffer cannot be allocated, the submit
// fails, or the signal fails.
bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
                                 uint64_t command_size, uint64_t fence_value) {
  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();

  // Explicit check instead of assert: an NDEBUG build must not write through
  // a null pointer. calloc also provides the zero-filled starting state.
  void *priv_data = calloc(1, priv_size);
  if (!priv_data) {
    pr_err("failed to allocate %d bytes of private data\n", priv_size);
    return false;
  }
  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false);

  D3DKMT_SUBMITCOMMAND args = {0};
  args.Commands = command_addr;
  args.CommandLength = command_size;
  args.BroadcastContextCount = 1;
  args.BroadcastContext[0] = queue->context;
  args.pPrivateDriverData = priv_data;
  args.PrivateDriverDataSize = priv_size;

  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args));

  // The private data is only needed for the duration of the call.
  free(priv_data);

  if (ret != STATUS_SUCCESS) {
    pr_err("fail %x\n", ret);
    return false;
  }

  return GpuSignal(queue->context, &queue->syncobj, &fence_value, 1);
}
// Creates a hardware queue on the queue's context. On success stores the
// hardware queue handle, its progress fence handle and the CPU address of the
// progress fence value in `queue`.
//
// Returns true on success. Returns false (after logging) if the private-data
// buffer cannot be allocated or the create call fails; `queue` is only
// modified on success.
bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) {
  const int priv_size = thunk_proxy::GetHwQueuePrivDataSize();

  // Explicit check instead of assert: an NDEBUG build must not write through
  // a null pointer. calloc also provides the zero-filled starting state.
  void *priv_data = calloc(1, priv_size);
  if (!priv_data) {
    pr_err("failed to allocate %d bytes of private data\n", priv_size);
    return false;
  }

  bool FwManagedGfxState = SupportStateShadowingByCpFw();
  thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio);

  D3DKMT_CREATEHWQUEUE createHwQueue = {0};
  createHwQueue.hHwContext = queue->context;
  createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_);
  createHwQueue.pPrivateDriverData = priv_data;
  createHwQueue.PrivateDriverDataSize = priv_size;

  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue));

  // The private data is only needed for the duration of the call.
  free(priv_data);

  if (ret != STATUS_SUCCESS) {
    pr_err("fail %x\n", ret);
    return false;
  }

  queue->queue = createHwQueue.hHwQueue;
  queue->syncobj = createHwQueue.hHwQueueProgressFence;
  queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress;
  return true;
}
// Destroys the hardware queue referenced by queue->queue.
// Returns true on success; otherwise logs the status and returns false.
bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) {
  D3DKMT_DESTROYHWQUEUE request = {};
  request.hHwQueue = queue->queue;

  NTSTATUS status = DXCORE_CALL(D3DKMTDestroyHwQueue(&request));
  if (status == STATUS_SUCCESS)
    return true;

  pr_err("fail %x\n", status);
  return false;
}
// Submits the command range [command_addr, command_addr + command_size) to the
// queue's hardware queue via D3DKMTSubmitCommandToHwQueue, with fence_value
// passed as the progress fence id for this submission.
//
// Returns true on success. Returns false (after logging) if the private-data
// buffer cannot be allocated or the submit call fails.
bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
                                 uint64_t command_size, uint64_t fence_value) {
  const int priv_size = thunk_proxy::GetSubmitPrivDataSize();

  // Explicit check instead of assert: an NDEBUG build must not write through
  // a null pointer. calloc also provides the zero-filled starting state.
  void *priv_data = calloc(1, priv_size);
  if (!priv_data) {
    pr_err("failed to allocate %d bytes of private data\n", priv_size);
    return false;
  }
  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true);

  D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0};
  args.hHwQueue = queue->queue;
  args.HwQueueProgressFenceId = fence_value;
  args.CommandBuffer = command_addr;
  args.CommandLength = command_size;
  args.pPrivateDriverData = priv_data;
  args.PrivateDriverDataSize = priv_size;

  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args));

  // The private data is only needed for the duration of the call.
  free(priv_data);

  if (ret != STATUS_SUCCESS) {
    pr_err("fail %x\n", ret);
    return false;
  }

  return true;
}
} // namespace thunk
} // namespace wsl
@@ -0,0 +1,594 @@
#include <sys/stat.h>
#include <cinttypes>
#include <cassert>
#include "impl/wddm/gpu_memory.h"
#include "impl/wddm/device.h"
#include "util/utils.h"
using namespace std;
namespace wsl {
namespace thunk {
// Returns how many chunks of WDDMDevice::GpuMemoryChunkSize bytes are needed
// to cover `size` bytes, rounding up.
size_t GpuMemory::CalcChunkNumbers(gpusize size) {
  const auto chunk = WDDMDevice::GpuMemoryChunkSize;
  const auto rounded = size + chunk - 1;
  return rounded / chunk;
}
// Rounds a requested allocation size up to the alignment used for this
// allocation's domain.
//
// - When the device enables big-page alignment and the domain is kLocal, the
//   alignment starts at big_page_alignment_size and may be raised to the
//   hardware big-page values (see below). An alignment of 0 leaves the size
//   untouched.
// - In every other case the size is rounded up to 4096 bytes.
gpusize GpuMemory::AdjustSize(gpusize size) const {
  const auto &info = device_->DeviceInfo();

  const bool use_big_page_rules =
      info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal;
  if (!use_big_page_rules) {
    const size_t page_size = 4096;
    size = AlignUp(size, page_size);
    return size;
  }

  uint32_t alignment = info.big_page_alignment_size;

  // BigPage only applies to allocations larger than the hardware minimum
  // alignment; a minimum of 0 means the KMD does not support BigPage at all.
  // Whichever of LargePage / BigPage alignment is larger wins.
  const bool big_page_applies = (info.hw_big_page_min_alignment_size > 0) &&
                                (size > info.hw_big_page_min_alignment_size);
  if (big_page_applies) {
    alignment = std::max(alignment, info.hw_big_page_min_alignment_size);
    if (size > info.hw_big_page_alignment_size)
      alignment = std::max(alignment, info.hw_big_page_alignment_size);
  }

  if (alignment > 0)
    size = AlignUp(size, alignment);
  return size;
}
// Creates an empty memory object bound to `device`. Nothing is allocated or
// reserved here; the handle bookkeeping is simply reset.
GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
  // No allocation handles yet.
  alloc_handles_ptr_ = nullptr;
  alloc_handle_ = 0;
  num_allocations_ = 0;

  // No resource handle and no memfd yet (-1 marks "no fd").
  resource_ = 0;
  mem_fd_ = -1;
}
// Tears the object down in three steps: release the GPU virtual address
// range, destroy the physical allocation(s), then return the handle-aperture
// address if one was assigned.
GpuMemory::~GpuMemory() {
  FreeGpuVirtualAddress(GpuAddress(), Size());
  FreePhysicalMemory();

  const bool has_handle_aperture_addr = desc_.handle_ape_addr > 0;
  if (has_handle_aperture_addr) {
    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
  }
}
// Initializes this object from `create_info` and creates the backing memory.
//
// Steps:
//   1. Copy the creation parameters into desc_ and compute the adjusted size.
//   2. Set up the allocation-handle storage (inline slot for one chunk, heap
//      array otherwise).
//   3. Physical-only allocations: create the physical memory, obtain a
//      handle-aperture address and return.
//   4. Otherwise reserve a GPU virtual address range. Virtual-only
//      allocations stop here.
//   5. Create the physical memory, map it, make it resident and wait for the
//      paging fence.
//
// Returns ErrorCode::Success or the first error encountered. When step 5
// fails, whatever was created in steps 4-5 is released again before returning.
ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
  desc_.domain = create_info.domain;
  desc_.adapter_luid = device_->GetLuid();
  desc_.client_size = create_info.size;
  desc_.alignment = create_info.alignment;
  desc_.mem_flags = create_info.mem_flags;
  desc_.engine_flag = create_info.engine_flag;
  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
  desc_.flags.is_physical_only = create_info.flags.physical_only;
  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
  desc_.flags.is_va_required = create_info.flags.alloc_va;
  desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object;

  /* we can't tell the allocation is regular vmm or ipc mem at creation stage,
     they share same creation parameters, so forcing all vram allocations to
     sharable to support IPC mem */
  if (create_info.flags.interprocess ||
      desc_.domain == thunk_proxy::AllocDomain::kLocal)
    desc_.flags.is_shared = true;

  desc_.flags.is_locked = create_info.flags.locked;
  desc_.size = AdjustSize(desc_.client_size);

  if (IsUserMemory() || IsSystem())
    desc_.cpu_addr = create_info.user_ptr;

  // A single chunk uses the inline handle slot; more chunks need an array.
  num_allocations_ = CalcChunkNumbers(Size());
  if (num_allocations_ == 1)
    alloc_handles_ptr_ = &alloc_handle_;
  else
    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));

  auto code = ErrorCode::Success;

  if (IsPhysicalOnly()) {
    code = CreatePhysicalMemory();
    if (code == ErrorCode::Success)
      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
    return code;
  }

  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
  if (IsVirtual() || (code != ErrorCode::Success))
    return code;

  // From here on any failure must undo the reservation (and the physical
  // allocation once it exists). The guard inspects `code` when it runs.
  bool physical_created = false;
  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
    if (code != ErrorCode::Success) {
      if (physical_created) {
        FreePhysicalMemory();
      }
      // The destructor releases the virtual address range unconditionally.
      // Forget the range (and the memfd tied to it) once it has been
      // released here, so the same range is not released a second time.
      if (FreeGpuVirtualAddress(GpuAddress(), Size()) == ErrorCode::Success) {
        desc_.gpu_addr = 0;
        mem_fd_ = -1;
      }
    }
  });
  (void)guard;

  code = CreatePhysicalMemory();
  if (code != ErrorCode::Success)
    return code;
  physical_created = true;

  code = MapGpuVirtualAddress(GpuAddress(), Size());
  if (code != ErrorCode::Success)
    return code;

  code = MakeResident();
  if (code != ErrorCode::Success)
    return code;

  if (!GetDevice()->WaitOnPagingFenceFromCpu())
    code = ErrorCode::Unknown;
  return code;
}
// Replaces the mapping of [addr, addr + size) with a NoAccess mapping, one
// chunk-sized block at a time.
//
// `offset` is only used to pick the first allocation handle: it is reduced by
// whole chunks and each chunk skipped advances the handle index by one.
//
// A NotReady result from the thunk records the paging fence on the device and
// the loop continues; any other failure stops the loop. The result of the
// last thunk call is returned as-is (so NotReady can be returned).
ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
  // Index of the chunk that contains `offset`.
  size_t chunk_index = 0;
  for (; offset >= WDDMDevice::GpuMemoryChunkSize; offset -= WDDMDevice::GpuMemoryChunkSize)
    chunk_index += 1;

  auto result = ErrorCode::Success;
  auto cursor = addr;
  auto remaining = size;

  for (; remaining > 0; chunk_index += 1) {
    auto span = std::min(remaining, WDDMDevice::GpuMemoryChunkSize);

    D3DDDI_MAPGPUVIRTUALADDRESS args{};
    args.hPagingQueue = device_->PagingQueue();
    args.BaseAddress = cursor;
    args.hAllocation = GetAllocationHandle(chunk_index);
    args.SizeInPages = span / 0x1000;
    args.Protection.NoAccess = 1;

    result = d3dthunk::MapGpuVirtualAddress(&args);
    if (result == ErrorCode::NotReady) {
      // Operation queued: remember the fence that signals its completion.
      device_->UpdatePageFence(args.PagingFenceValue);
    } else if (result != ErrorCode::Success) {
      break;
    }

    cursor += span;
    remaining -= span;
  }
  return result;
}
// Maps `size` bytes of this object's allocation(s), starting at byte `offset`
// into the object, to the GPU virtual range that begins at `addr`.
//
// The object is backed by chunks of WDDMDevice::GpuMemoryChunkSize bytes, each
// with its own allocation handle, so the request is split into one
// MapGpuVirtualAddress call per chunk touched. The first block starts at the
// intra-chunk offset and ends at the chunk boundary; later blocks start at
// offset 0.
//
// A NotReady result records the paging fence on the device and counts as
// success. On any other failure the blocks mapped so far are replaced with
// NoAccess mappings and the failing code is returned.
ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
  auto code = ErrorCode::Success;
  size_t i = 0;
  auto map_addr = addr;
  auto map_size = size;
  const size_t _4K = 0x1000;

  // Split `offset` into a chunk index and an offset inside that chunk.
  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
    offset -= WDDMDevice::GpuMemoryChunkSize;
    i += 1;
  }
  const size_t first_chunk = i;
  const auto first_chunk_offset = offset;

  /* Found two limitation for local vram:
   * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail
   * 2. visible vram can not be cpu mapped when command submission or after gpu mapped
   */
  while (map_size > 0) {
    // Never map past the end of the current chunk: with a non-zero offset
    // only (chunk size - offset) bytes of this allocation remain.
    const gpusize chunk_room = WDDMDevice::GpuMemoryChunkSize - offset;
    const gpusize block_size = std::min<gpusize>(map_size, chunk_room);

    D3DDDI_MAPGPUVIRTUALADDRESS args{};
    args.hPagingQueue = device_->PagingQueue();
    args.BaseAddress = map_addr;
    args.hAllocation = GetAllocationHandle(i);
    args.OffsetInPages = offset / _4K;
    args.SizeInPages = block_size / _4K;
    args.Protection.Write = 1;

    code = d3dthunk::MapGpuVirtualAddress(&args);
    if (code != ErrorCode::Success) {
      if (code == ErrorCode::NotReady) {
        // Queued: remember the fence and treat the block as mapped.
        const uint64_t fence_value = args.PagingFenceValue;
        device_->UpdatePageFence(fence_value);
        code = ErrorCode::Success;
      } else
        break;
    }

    map_addr += block_size;
    map_size -= block_size;
    offset = 0; // every chunk after the first is mapped from its start
    i++;
  }

  if (code != ErrorCode::Success) {
    // Map failed: walk the blocks that were mapped (chunks
    // [first_chunk, i)) with the same partitioning as above and turn them
    // into NoAccess mappings.
    gpusize chunk_offset = first_chunk_offset;
    map_addr = addr;
    map_size = size;
    for (size_t j = first_chunk; j < i; j++) {
      const gpusize chunk_room = WDDMDevice::GpuMemoryChunkSize - chunk_offset;
      const gpusize block_size = std::min<gpusize>(map_size, chunk_room);

      D3DDDI_MAPGPUVIRTUALADDRESS args{};
      args.hPagingQueue = device_->PagingQueue();
      args.BaseAddress = map_addr;
      args.hAllocation = 0;
      args.OffsetInPages = chunk_offset / _4K;
      args.SizeInPages = block_size / _4K;
      args.Protection.NoAccess = 1;

      auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
      if (unmap_code == ErrorCode::NotReady)
        device_->UpdatePageFence(args.PagingFenceValue);

      map_addr += block_size;
      map_size -= block_size;
      chunk_offset = 0;
    }
  }
  return code;
}
// Reserves a GPU virtual address range for this object and stores it in
// desc_.gpu_addr.
//
// System-domain memory flagged as sysmem exporter or imported sysmem memfd is
// reserved through ReserveIPCSysMem(); that path uses Size() and
// desc_.alignment rather than the `size` / `alignment` arguments, passes the
// current memfd (or -1) and stores the fd back into mem_fd_ on success.
// Everything else goes through the runtime's ReserveGpuVirtualAddress() with
// `base_virt_addr` as the hint.
//
// For system memory the CPU address is set to the reserved GPU address.
// Returns the status of the reservation call.
ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
  gpusize reserved_va = 0;
  ErrorCode result;

  const bool is_ipc_sysmem =
      (desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd) &&
      desc_.domain == thunk_proxy::AllocDomain::kSystem;

  if (is_ipc_sysmem) {
    int fd = (mem_fd_ > -1) ? mem_fd_ : -1;
    result = dxg_runtime->ReserveIPCSysMem(Size(), &reserved_va, desc_.alignment, fd, desc_.flags.is_locked);
    if (result == ErrorCode::Success)
      mem_fd_ = fd;
  } else {
    result = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &reserved_va,
                                                   alignment, desc_.flags.is_locked);
  }

  if (result != ErrorCode::Success)
    return result;

  desc_.gpu_addr = reserved_va;
  if (IsSystem())
    desc_.cpu_addr = reinterpret_cast<void *>(desc_.gpu_addr);
  return result;
}
// Releases a GPU virtual address range.
//
// When the object holds a memfd (mem_fd_ > -1) the arguments are ignored and
// the object's own range (GpuAddress(), Size()) is released through
// FreeIPCSysMem(). Otherwise [base_addr, base_addr + size) is released through
// the runtime; a base address of 0 is a no-op that reports success.
ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
  const bool backed_by_memfd = mem_fd_ > -1;
  if (backed_by_memfd)
    return dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);

  if (base_addr == 0)
    return ErrorCode::Success;

  return dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size);
}
// Creates the kernel allocation(s) backing this object, one per chunk.
//
// A single scratch buffer holds, back to back:
//   [ private driver data | per-allocation private data x N | D3DDDI_ALLOCATIONINFO2 x N ]
// where N == NumChunks(). User/system memory allocations pass the CPU address
// of each chunk as pSystemMem; other allocations pass the GPU address of each
// chunk to thunk_proxy::SetAllocationInfo().
//
// Shared allocations additionally attach a SharedHandleInfo as private
// runtime data and are created as an NT-security-shared resource.
//
// On success the allocation handles are stored in alloc_handles_ptr_ and the
// resource handle in resource_. Returns ErrorCode::OutOfMemory when the
// scratch buffer cannot be allocated, otherwise the status of
// d3dthunk::CreateAllocation().
ErrorCode GpuMemory::CreatePhysicalMemory() {
  assert(!IsVirtual() && NumChunks() > 0);

  const auto num_allocations = NumChunks();
  int priv_drv_data_size;
  int priv_alloc_data_size;
  thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size);

  // Computed in size_t so a large chunk count cannot overflow an int.
  const size_t total_size = static_cast<size_t>(priv_drv_data_size) +
                            static_cast<size_t>(num_allocations) * static_cast<size_t>(priv_alloc_data_size) +
                            static_cast<size_t>(num_allocations) * sizeof(D3DDDI_ALLOCATIONINFO2);

  void *priv_drv_data = malloc(total_size);
  if (!priv_drv_data)
    return ErrorCode::OutOfMemory;
  memset(priv_drv_data, 0, total_size);

  thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size);

  // Carve the per-allocation private data and the allocation-info array out
  // of the same buffer.
  // NOTE(review): the allocation-info array starts at an offset derived from
  // the private data sizes; confirm those sizes keep it suitably aligned.
  void *priv_alloc_data = static_cast<unsigned char*>(priv_drv_data) + priv_drv_data_size;
  auto alloc_info = reinterpret_cast<D3DDDI_ALLOCATIONINFO2*>(
      static_cast<unsigned char*>(priv_alloc_data) + priv_alloc_data_size * num_allocations);

  size_t size = desc_.size;
  uint64_t addr = desc_.gpu_addr;
  char *cpu_addr = static_cast<char *>(desc_.cpu_addr);
  const auto &device_info = GetDevice()->DeviceInfo();

  for (size_t i = 0; i < num_allocations; i++) {
    void* priv_data = static_cast<unsigned char*>(priv_alloc_data) + priv_alloc_data_size * i;
    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);

    if (IsUserMemory() || IsSystem()) {
      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
      alloc_info[i].pSystemMem = static_cast<void *>(cpu_addr);
      cpu_addr += block_size;
    } else {
      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
    }

    size -= block_size;
    addr += block_size;

    alloc_info[i].pPrivateDriverData = priv_data;
    alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size;
    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
  }

  D3DKMT_CREATEALLOCATION args = {};
  args.hDevice = device_->DeviceHandle();
  args.pPrivateDriverData = priv_drv_data;
  args.PrivateDriverDataSize = priv_drv_data_size;
  args.NumAllocations = num_allocations;
  args.pAllocationInfo2 = alloc_info;
  /* The PhysicallyContiguous flag causes allocation failure
   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
   */

  // Value-initialized: the whole struct (sizeof(shared_info) bytes) is handed
  // to the kernel as runtime data readable by importers, so no byte of it may
  // be left holding stale stack contents.
  SharedHandleInfo shared_info{};
  if (IsShared()) {
    shared_info.size = desc_.size;
    shared_info.client_size = desc_.client_size;
    shared_info.domain = desc_.domain;
    shared_info.adapter_luid = desc_.adapter_luid;
    shared_info.flags = static_cast<uint32_t>(desc_.flags.reserved);
    shared_info.mem_flags = desc_.mem_flags;
    shared_info.pid = dxg_runtime->parent_pid;
    shared_info.gpu_addr = desc_.gpu_addr;

    args.pPrivateRuntimeData = &shared_info;
    args.PrivateRuntimeDataSize = sizeof(shared_info);
    args.Flags.NtSecuritySharing = 1;
    args.Flags.CreateShared = 1;
    args.Flags.CreateResource = 1;
  }

  auto status = d3dthunk::CreateAllocation(&args);
  if (status == ErrorCode::Success) {
    for (size_t i = 0; i < num_allocations; i++)
      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;
    resource_ = args.hResource;
  }

  free(priv_drv_data);
  return status;
}
// Destroys the kernel allocation(s) of this object and releases the handle
// storage.
//
// Nothing is destroyed when there is no handle storage, or when the object
// has a single chunk whose handle is still 0 (never created).
//
// The handle storage is either the inline slot alloc_handle_ or a heap array.
// Ownership is decided by comparing the pointer with the inline slot rather
// than by the chunk count, because the import path allocates a heap array
// even for a single allocation.
//
// Returns ErrorCode::Success when there was nothing to destroy, otherwise the
// status of d3dthunk::DestroyAllocation().
ErrorCode GpuMemory::FreePhysicalMemory() {
  auto code = ErrorCode::Success;
  if (alloc_handles_ptr_ == nullptr)
    return code;

  const bool owns_heap_array = alloc_handles_ptr_ != &alloc_handle_;

  if (NumChunks() == 1 && *alloc_handles_ptr_ == 0) {
    // Nothing was created. The inline slot stays installed; a heap array
    // still has to be released.
    if (owns_heap_array) {
      delete[] alloc_handles_ptr_;
      alloc_handles_ptr_ = nullptr;
    }
    return code;
  }

  code = d3dthunk::DestroyAllocation(device_->DeviceHandle(),
                                     resource_,
                                     NumChunks(),
                                     alloc_handles_ptr_);

  if (owns_heap_array)
    delete[] alloc_handles_ptr_;
  alloc_handles_ptr_ = nullptr;
  return code;
}
// Asks the device's paging queue to make every chunk of this object resident,
// with the CantTrimFurther flag set.
//
// A NotReady result records the returned paging fence on the device and is
// reported as success; any other result is returned unchanged.
ErrorCode GpuMemory::MakeResident() {
  D3DDDI_MAKERESIDENT request = {};
  request.hPagingQueue = device_->PagingQueue();
  request.NumAllocations = NumChunks();
  request.AllocationList = alloc_handles_ptr_;
  request.Flags.CantTrimFurther = 1;

  const auto result = d3dthunk::MakeResident(&request);
  if (result != ErrorCode::NotReady)
    return result;

  // Queued: remember the fence that signals completion.
  device_->UpdatePageFence(request.PagingFenceValue);
  return ErrorCode::Success;
}
// Evicts every chunk of this object through d3dthunk::Evict() and returns its
// status.
ErrorCode GpuMemory::Evict() {
  D3DKMT_EVICT evict_request = {};
  evict_request.AllocationList = alloc_handles_ptr_;
  evict_request.NumAllocations = NumChunks();
  evict_request.hDevice = device_->DeviceHandle();

  return d3dthunk::Evict(&evict_request);
}
// Produces a file descriptor through which this memory can be shared.
//
// - If the object holds a memfd, that fd itself is written to *dmabuf_fd.
// - Otherwise, shared allocations are exported with d3dthunk::ShareObjects().
// - Non-shared allocations report ErrorCode::UnSupported.
ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
  const bool has_memfd = mem_fd_ > -1;
  if (has_memfd) {
    *dmabuf_fd = mem_fd_;
    return ErrorCode::Success;
  }

  if (!IsShared())
    return ErrorCode::UnSupported;

  return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd);
}
// Imports memory described by `create_info.dmabuf_fd` into this object.
//
// Two import flavours exist:
//
// 1. IPC-signal system memory (create_info.flags.sysmem_ipc_sig_importer):
//    the fd is duplicated and kept in mem_fd_, a virtual address range is
//    reserved for it, and physical memory is created, mapped and made
//    resident the same way Init() does. Any failure after the reservation
//    rolls back what was created.
//
// 2. Everything else (vmem importer / ipc vram importer): the fd is treated
//    as an NT handle of a shared resource. The resource is queried and opened,
//    the exporter's SharedHandleInfo is read back, and the object's
//    description is filled from it. When a VA is required a range is
//    reserved; otherwise a handle-aperture address is allocated.
//
// `gpu_addr` (optional) receives the exporter's GPU address when the resource
// comes from this same process and device and a VA is required; in that case
// ErrorCode::SameProcessSameDevice is returned and nothing is imported.
//
// Returns ErrorCode::InvalidateParams for a non-positive fd or when the
// resource cannot be queried/opened, ErrorCode::UnSupported on a runtime-data
// size mismatch, ErrorCode::OutOfMemory when the scratch buffer cannot be
// allocated, ErrorCode::Unknown when the fd cannot be duplicated or the paging
// fence wait fails, otherwise the status of the last step performed.
ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) {
  D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args;
  int dmabuf_fd = create_info.dmabuf_fd;
  if (dmabuf_fd <= 0)
    return ErrorCode::InvalidateParams;

  if(create_info.flags.sysmem_ipc_sig_importer) {
    // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
    mem_fd_ = dup(dmabuf_fd);
    if (mem_fd_ < 0) {
      // Without a valid fd the reservation below would be asked to work with
      // fd -1 instead of the memory being imported.
      pr_err("failed to duplicate ipc sys mem fd %d\n", dmabuf_fd);
      return ErrorCode::Unknown;
    }

    desc_.client_size = create_info.size;
    desc_.size = AdjustSize(desc_.client_size);
    desc_.domain = thunk_proxy::AllocDomain::kSystem;
    desc_.adapter_luid = device_->GetLuid();
    desc_.alignment = 0x1000;
    desc_.mem_flags = create_info.mem_flags;
    desc_.engine_flag = create_info.engine_flag;
    desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
    desc_.flags.is_va_required = create_info.flags.alloc_va;
    desc_.flags.is_virtual = create_info.flags.virtual_alloc;
    desc_.flags.is_physical_only = create_info.flags.physical_only;
    desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
    desc_.flags.is_locked = create_info.flags.locked;

    auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
    if (code != ErrorCode::Success)
      return code;

    // From here on any failure must undo the reservation (and the physical
    // allocation once it exists). The guard inspects `code` when it runs.
    bool physical_created = false;
    auto guard = MakeScopeGuard([this, &physical_created, &code]() {
      if (code != ErrorCode::Success) {
        if (physical_created)
          FreePhysicalMemory();
        // The destructor releases the range unconditionally; forget it once
        // it has been released here so it is not released a second time.
        if (FreeGpuVirtualAddress(GpuAddress(), Size()) == ErrorCode::Success) {
          desc_.gpu_addr = 0;
          mem_fd_ = -1;
        }
      }
    });
    (void)guard;

    // A single chunk uses the inline handle slot; more chunks need an array.
    num_allocations_ = CalcChunkNumbers(Size());
    if (num_allocations_ == 1)
      alloc_handles_ptr_ = &alloc_handle_;
    else
      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
    memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));

    code = CreatePhysicalMemory();
    if (code != ErrorCode::Success)
      return code;
    physical_created = true;

    code = MapGpuVirtualAddress(GpuAddress(), Size());
    if (code != ErrorCode::Success)
      return code;

    code = MakeResident();
    if (code != ErrorCode::Success)
      return code;

    if (!GetDevice()->WaitOnPagingFenceFromCpu())
      code = ErrorCode::Unknown;
    return code;
  } else {
    // vmem importer / ipc vram importer
    memset(&query_args, 0, sizeof(query_args));
    query_args.hDevice = device_->DeviceHandle();
    query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
    auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args);
    if (ret != ErrorCode::Success) {
      pr_err("query resource info from nt handle failed %d\n", static_cast<int>(ret));
      return ErrorCode::InvalidateParams;
    }

    pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d,"
             " runtime data size %#x total driver data size %#x resource data size=%#x\n",
             dmabuf_fd,
             query_args.NumAllocations,
             query_args.PrivateRuntimeDataSize,
             query_args.TotalPrivateDriverDataSize,
             query_args.ResourcePrivateDriverDataSize);

    // The exporter must have attached exactly one SharedHandleInfo.
    SharedHandleInfo shared_info;
    if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) {
      pr_err("shared hanle info size mismatch:%d vs %ld\n",
             query_args.PrivateRuntimeDataSize, sizeof(shared_info));
      return ErrorCode::UnSupported;
    }

    // One scratch buffer holds, back to back: the open-allocation-info array,
    // the total private driver data and the resource private driver data.
    uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) +
                          query_args.TotalPrivateDriverDataSize +
                          query_args.ResourcePrivateDriverDataSize;
    D3DDDI_OPENALLOCATIONINFO2 *open_info =
        reinterpret_cast<D3DDDI_OPENALLOCATIONINFO2*> (calloc(1, total_size));
    if (!open_info) {
      pr_err("alloc open_info failed, NumAllocations:%d\n",
             query_args.NumAllocations);
      return ErrorCode::OutOfMemory;
    }
    auto guard = MakeScopeGuard([&open_info]() { free(open_info); });

    D3DKMT_OPENRESOURCEFROMNTHANDLE open_args;
    memset(&open_args, 0, sizeof(open_args));
    open_args.hDevice = query_args.hDevice;
    open_args.hNtHandle = query_args.hNtHandle;
    open_args.NumAllocations = query_args.NumAllocations;
    open_args.pOpenAllocationInfo2 = open_info;
    open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize;
    open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast<void*>
        (open_args.pOpenAllocationInfo2 + open_args.NumAllocations);
    open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize;
    open_args.pResourcePrivateDriverData = reinterpret_cast<void*>
        (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) +
        open_args.TotalPrivateDriverDataBufferSize);
    open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize;
    open_args.pPrivateRuntimeData = reinterpret_cast<void*> (&shared_info);

    ret = d3dthunk::OpenResourceFromNtHandle(&open_args);
    if (ret != ErrorCode::Success) {
      // Log the real failure before mapping it to the code callers expect.
      pr_err("open resource failed %d\n", static_cast<int>(ret));
      return ErrorCode::InvalidateParams;
    }

    if (shared_info.pid == dxg_runtime->parent_pid &&
        create_info.flags.alloc_va &&
        IsSameAdapter(shared_info.adapter_luid) &&
        shared_info.gpu_addr) {
      pr_info("import from same device and samve process, va is required. "
              "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n");
      // NOTE(review): the resource opened above is not recorded in this
      // object on this path; confirm who releases open_args.hResource.
      if (gpu_addr)
        *gpu_addr = shared_info.gpu_addr;
      return ErrorCode::SameProcessSameDevice;
    }

    desc_.size = shared_info.size;
    desc_.client_size = shared_info.client_size;
    desc_.domain = shared_info.domain;
    desc_.flags.reserved = shared_info.flags;
    desc_.mem_flags = shared_info.mem_flags;
    desc_.adapter_luid = shared_info.adapter_luid;

    resource_ = open_args.hResource;
    num_allocations_ = open_args.NumAllocations;

    // Handle storage is set up only now that the import is certain to be
    // kept, so the early returns above cannot leak it. A single allocation
    // uses the inline slot, exactly as Init() does.
    if (open_args.NumAllocations == 1)
      alloc_handles_ptr_ = &alloc_handle_;
    else
      alloc_handles_ptr_ = new WinAllocationHandle[open_args.NumAllocations];
    for (uint32_t i = 0; i < open_args.NumAllocations; i++)
      alloc_handles_ptr_[i] = open_info[i].hAllocation;

    desc_.flags.is_va_required = create_info.flags.alloc_va;
    if (desc_.flags.is_va_required) {
      desc_.flags.is_imported_vram_ipc = 1;
      ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment);
      if (ret != ErrorCode::Success)
        pr_err("failed to allocate svm range, error:%d\n", static_cast<int>(ret));
      return ret;
    } else {
      desc_.flags.is_imported_vram_vmem = 1;
      return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
    }
  }
}
} // namespace thunk
} // namespace wsl
Diferenças do arquivo suprimidas por serem muito extensas Carregar Diff
@@ -0,0 +1,165 @@
#include <cassert>
#include <map>
#include <algorithm>
#include "impl/wddm/va_mgr.h"
using namespace std;
namespace wsl {
namespace thunk {
// Creates a manager for the address range [start, start + size). The whole
// range begins as one free fragment, registered both in the size-keyed free
// list and in the address-keyed fragment map.
VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align) {
  min_align_ = min_align;

  auto whole_range_entry = free_list_.insert(make_pair(size, start));
  frag_map_[start] = make_fragment(whole_range_entry, size);
}
// Warns when either container does not hold exactly one entry at destruction
// time, then empties both containers.
VaMgr::~VaMgr() {
  const bool free_list_collapsed = free_list_.size() == 1;
  if (!free_list_collapsed)
    pr_warn("free_list_ size:%ld which should be 1.\n", free_list_.size());

  const bool frag_map_collapsed = frag_map_.size() == 1;
  if (!frag_map_collapsed)
    pr_warn("frag_map_ size:%ld which should be 1.\n", frag_map_.size());

  free_list_.clear();
  frag_map_.clear();
}
uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
if (addr > 0 &&
(align == 0 || (addr % align) == 0)) {
lock_guard<mutex> gard(lock_);
auto frag_it = frag_map_.upper_bound(addr);
assert(frag_it != frag_map_.begin());
--frag_it;
while (frag_it != frag_map_.begin()) {
const uint64_t base = frag_it->first;
const uint64_t size = frag_it->second.size;
// Cannot find free fragment contains the target `addr`
if (bytes > size || addr < base || addr + bytes > base + size ||
!is_free(frag_it->second)) {
--frag_it;
continue;
} else if (addr >= base + size)
break;
// Try to allocate target `addr` from this free fragment
auto free_it = frag_it->second.free_list_entry_;
assert(free_it != free_list_.end());
free_list_.erase(free_it);
frag_it->second.size = bytes;
set_used(frag_it->second);
// [base, addr)
if (addr > base) add_free_fragment(addr - base, base);
// [addr, addr + bytes) is used
// [addr + bytes, base + size)
if (base + size > addr + bytes) add_free_fragment(base + size - addr - bytes, addr + bytes);
return addr;
}
}
// Allocate not fixed address
return AllocImpl(bytes, align);
}
uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) {
uint64_t addr = 0;
uint64_t align_bytes = bytes;
const int retry = align == 0 ? 0 : 1;
const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_);
lock_guard<mutex> gard(lock_);
for (int i = 0; i <= retry; i++) {
auto free_it = free_list_.lower_bound(align_bytes);
if (free_it == free_list_.end()) break;
uint64_t base = free_it->second;
uint64_t size = free_it->first;
assert(size >= align_bytes);
auto fragment = frag_map_.find(base);
assert(fragment != frag_map_.end());
assert(size == fragment->second.size);
uint64_t delta = align == 0 ? 0 : base % align;
if (delta == 0) {
// already find aligned address
addr = base;
free_list_.erase(free_it);
fragment->second.size = bytes;
set_used(fragment->second);
if (size > bytes) add_free_fragment(size - bytes, base + bytes);
break;
} else if (i == 0) {
align_bytes += new_align;
continue;
} else {
uint64_t aligned_base = base + align - delta;
addr = aligned_base;
free_list_.erase(free_it);
add_used_fragment(bytes, aligned_base);
add_free_fragment(aligned_base - base, base);
if (size > aligned_base - base + bytes)
add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes);
break;
}
}
return addr;
}
// Returns the fragment that starts at `addr` to the free pool, merging it
// with a free neighbour on either side.
//
// The call is a no-op when addr is 0, when no fragment starts at `addr`, or
// when that fragment is already free.
void VaMgr::Free(uint64_t addr) {
  if (addr == 0) return;

  lock_guard<mutex> gard(lock_);
  auto target = frag_map_.find(addr);
  if (frag_map_.end() == target) return;
  if (is_free(target->second)) return;

  uint64_t merged_base = addr;

  // Absorb this fragment into a free predecessor, if there is one.
  if (target != frag_map_.begin()) {
    auto below = target;
    --below;
    if (is_free(below->second)) {
      remove_free_list_entry(below->second);
      merged_base -= below->second.size;
      below->second.size += target->second.size;
      frag_map_.erase(target);
      target = below;
    }
  }

  // Absorb a free successor into the (possibly already merged) fragment.
  auto above = target;
  ++above;
  if (above != frag_map_.end() && is_free(above->second)) {
    remove_free_list_entry(above->second);
    target->second.size += above->second.size;
    frag_map_.erase(above);
  }

  // Register the resulting fragment in the free list and mark it free.
  uint64_t merged_size = target->second.size;
  auto free_entry = free_list_.insert(make_pair(merged_size, merged_base));
  set_free(target->second, free_entry);
}
} // namespace thunk
} // namespace wsl