diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp new file mode 100644 index 0000000000..e32c28b1d4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp @@ -0,0 +1,39 @@ +/* + * Copyright © 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + + /* Stub for AIS file read/write: after CHECK_DXG_OPEN() it warns once with "not implemented" and returns HSAKMT_STATUS_NOT_SUPPORTED. None of the parameters is read and neither *SizeCopiedInBytes nor *status is written. CHECK_DXG_OPEN is defined outside this file; presumably it returns early when the device is not open - confirm. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAint32 fd, + HSAint64 file_offset, + HsaAisFlags AisFlags, + HSAuint64 *SizeCopiedInBytes, + HSAint32 *status) +{ + CHECK_DXG_OPEN(); + + pr_warn_once("not implemented\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp new file mode 100644 index 0000000000..2b4425599a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp @@ -0,0 +1,126 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + + /* Capability bitmask reported by hsaKmtGetRuntimeCapabilities(); initialized to 0 and not modified anywhere in the code visible here. */ +static uint32_t runtime_capabilities_mask = 0; + /* hsaKmtDbgRegister, hsaKmtDbgUnregister, hsaKmtDbgWavefrontControl and hsaKmtDbgAddressWatch are stubs: each runs CHECK_DXG_OPEN(), warns once with "not supported" and returns HSAKMT_STATUS_NOT_SUPPORTED without reading its arguments. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl( + HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode, + HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch( + HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[], + void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + /* Reports HSAKMT_STATUS_NOT_SUPPORTED whenever execution gets past CHECK_DXG_OPEN(); that macro is defined elsewhere and presumably returns early when the device is not open - confirm. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + /* Forwards any non-zero status from hsaKmtCheckRuntimeDebugSupport(); rDebug and setupTtmp are never used. The assert(false) marks the unimplemented path that would run if the check ever returned zero. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return result; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* Returns HSAKMT_STATUS_SUCCESS when the support check reports a non-zero status, i.e. disabling is treated as a no-op; otherwise it reaches assert(false). NOTE(review): with asserts compiled out both paths return HSAKMT_STATUS_SUCCESS. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return HSAKMT_STATUS_SUCCESS; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* Copies runtime_capabilities_mask into *caps_mask and reports success. NOTE(review): caps_mask is dereferenced without a null check. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) { + CHECK_DXG_OPEN(); + *caps_mask = runtime_capabilities_mask; + return HSAKMT_STATUS_SUCCESS; +} + /* hsaKmtDbgEnable, hsaKmtDbgDisable, hsaKmtDbgGetDeviceData, hsaKmtDbgGetQueueData and hsaKmtDebugTrapIoctl are stubs of the same shape: CHECK_DXG_OPEN(), one-time "not supported" warning, HSAKMT_STATUS_NOT_SUPPORTED. Their output pointers are left unwritten. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, + HSAuint32 *data_size) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) { + 
CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, + HSAuint32 *n_entries, + HSAuint32 *entry_size) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries, + HSAuint32 *entry_size, + bool suspend_queues) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues, + HSAuint64 *DebugReturn) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp new file mode 100644 index 0000000000..5d38d69c8d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ */ + +#include "dxcore_loader.h" +#include "librocdxg.h" +#include +#include +#include +#include + +namespace wsl { +namespace thunk { +namespace dxcore { + /* Starts with no library handle and every pfn_D3DKMT* pointer null; nothing is loaded until Initialize() is called. */ +DxcoreLoader::DxcoreLoader() + : dxcore_handle_(nullptr) + , init_flag_() + , pfn_D3DKMTCreateAllocation2(nullptr) + , pfn_D3DKMTDestroyAllocation2(nullptr) + , pfn_D3DKMTMapGpuVirtualAddress(nullptr) + , pfn_D3DKMTReserveGpuVirtualAddress(nullptr) + , pfn_D3DKMTFreeGpuVirtualAddress(nullptr) + , pfn_D3DKMTCreateDevice(nullptr) + , pfn_D3DKMTDestroyDevice(nullptr) + , pfn_D3DKMTEnumAdapters2(nullptr) + , pfn_D3DKMTQueryAdapterInfo(nullptr) + , pfn_D3DKMTCreateContextVirtual(nullptr) + , pfn_D3DKMTDestroyContext(nullptr) + , pfn_D3DKMTSubmitCommand(nullptr) + , pfn_D3DKMTCreateSynchronizationObject2(nullptr) + , pfn_D3DKMTDestroySynchronizationObject(nullptr) + , pfn_D3DKMTQueryStatistics(nullptr) + , pfn_D3DKMTEscape(nullptr) + , pfn_D3DKMTLock2(nullptr) + , pfn_D3DKMTUnlock2(nullptr) + , pfn_D3DKMTCreatePagingQueue(nullptr) + , pfn_D3DKMTDestroyPagingQueue(nullptr) + , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr) + , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr) + , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr) + , pfn_D3DKMTQueryClockCalibration(nullptr) + , pfn_D3DKMTMakeResident(nullptr) + , pfn_D3DKMTEvict(nullptr) + , pfn_D3DKMTShareObjects(nullptr) + , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr) + , pfn_D3DKMTOpenResourceFromNtHandle(nullptr) + , pfn_D3DKMTCreateHwQueue(nullptr) + , pfn_D3DKMTDestroyHwQueue(nullptr) + , pfn_D3DKMTSubmitCommandToHwQueue(nullptr) { +} + /* Releases the library handle, if one is held, through Shutdown(). */ +DxcoreLoader::~DxcoreLoader() { + Shutdown(); +} + /* dlopen()s libdxcore.so with RTLD_LAZY and resolves every D3DKMT entry point through LoadDxcoreApis(). Returns false if either step fails; after a symbol-loading failure the handle is dlclose()d and reset to nullptr. NOTE(review): init_flag_ is not used in the code visible here, so repeated or concurrent calls are not serialized, and a repeated call overwrites dxcore_handle_ without closing the previous handle. */ +bool DxcoreLoader::Initialize() { + dlerror(); // Clear error + dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY); + + if (!dxcore_handle_) { + pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror()); + return false; + } + + pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n"); + if (!LoadDxcoreApis()) { + // If API loading failed, 
close the handle to indicate failure + dlclose(dxcore_handle_); + dxcore_handle_ = nullptr; + return false; + } + + return IsLoaded(); +} + /* dlclose()s the library if it is loaded, logs the outcome, and clears dxcore_handle_ whether or not dlclose() succeeded. NOTE(review): the pfn_* members keep their previous values, so a DXCORE_CALL made after Shutdown() would go through a pointer into the closed library. */ +void DxcoreLoader::Shutdown() { + if (dxcore_handle_) { + if (dlclose(dxcore_handle_) != 0) { + pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror()); + } else { + pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n"); + } + dxcore_handle_ = nullptr; + } +} + /* Resolves each D3DKMT symbol with dlsym() into the matching pfn_* member. The first missing symbol is logged and control jumps to the ERROR label, which returns false; pointers resolved before that failure are not reset. Requires dxcore_handle_ to be non-null. */ +bool DxcoreLoader::LoadDxcoreApis() { + if (!dxcore_handle_) { + pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n"); + return false; + } + + dlerror(); // Clear error + + // Load all D3DKMT functions + #define LOAD_DXCORE_API(func_name) \ + DXCORE_PFN(func_name) = (DXCORE_DEF(func_name)*)dlsym(dxcore_handle_, #func_name); \ + if (!DXCORE_PFN(func_name)) { \ + pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \ + goto ERROR; \ + } + + LOAD_DXCORE_API(D3DKMTCreateAllocation2); + LOAD_DXCORE_API(D3DKMTDestroyAllocation2); + LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTCreateDevice); + LOAD_DXCORE_API(D3DKMTDestroyDevice); + LOAD_DXCORE_API(D3DKMTEnumAdapters2); + LOAD_DXCORE_API(D3DKMTQueryAdapterInfo); + LOAD_DXCORE_API(D3DKMTCreateContextVirtual); + LOAD_DXCORE_API(D3DKMTDestroyContext); + LOAD_DXCORE_API(D3DKMTSubmitCommand); + LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2); + LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject); + LOAD_DXCORE_API(D3DKMTQueryStatistics); + LOAD_DXCORE_API(D3DKMTEscape); + LOAD_DXCORE_API(D3DKMTLock2); + LOAD_DXCORE_API(D3DKMTUnlock2); + LOAD_DXCORE_API(D3DKMTCreatePagingQueue); + LOAD_DXCORE_API(D3DKMTDestroyPagingQueue); + LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu); + LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu); + LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu); + LOAD_DXCORE_API(D3DKMTQueryClockCalibration); + 
LOAD_DXCORE_API(D3DKMTMakeResident); + LOAD_DXCORE_API(D3DKMTEvict); + LOAD_DXCORE_API(D3DKMTShareObjects); + LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle); + LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle); + LOAD_DXCORE_API(D3DKMTCreateHwQueue); + LOAD_DXCORE_API(D3DKMTDestroyHwQueue); + LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue); + + #undef LOAD_DXCORE_API + + pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n"); + return true; +ERROR: + pr_err("[DxcoreLoader] Failed to load DXCore APIs\n"); + return false; +} + +} // namespace dxcore +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h new file mode 100644 index 0000000000..3f649a4da0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBROCDXG_DXCORE_LOADER_H +#define LIBROCDXG_DXCORE_LOADER_H + +#include "impl/wddm/types.h" +#include +#include + /* DXCORE_CALL(Foo) expands to the singleton's pfn_Foo function pointer, so DXCORE_CALL(Foo)(args) calls the dynamically loaded entry point. It does not check that the pointer is non-null. */ +#define DXCORE_CALL(function_name) wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name + +namespace wsl { +namespace thunk { +namespace dxcore { + +/** + * @brief DxcoreLoader class for dynamic loading of libdxcore.so + * + * This class provides a singleton loader for the DXCore library, allowing + * optional loading based on environment variable LIBROCDXG_ENABLE_DXCORE. + * Supported values: "1", "true", "yes" (case-sensitive). + * If not set or invalid, fallback to stub implementations. + * + * Thread-safe initialization using std::call_once. + */ + /* NOTE(review): the code visible in this patch neither reads LIBROCDXG_ENABLE_DXCORE nor passes init_flag_ to std::call_once; confirm whether the description above is implemented elsewhere or is out of date. */ +// Macro definitions mimicking HSAKMT design +#define DXCORE_DEF(function_name) PFN##function_name +#define DXCORE_PFN(function_name) pfn_##function_name + +class DxcoreLoader { +public: + // D3DKMT function type definitions + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args); + typedef NTSTATUS 
(DXCORE_DEF(D3DKMTSubmitCommand))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle); /* the only entry point declared here that does not take a single void* argument block */ + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args); + /* Returns the single shared loader. It is created with new on first use and never deleted in the code visible here, so its destructor (and the Shutdown() it calls) does not run for this instance. */ + static DxcoreLoader& Instance() { + static DxcoreLoader* instance = new DxcoreLoader(); + return (*instance); + } + /* Initialize() loads the library and its entry points, Shutdown() releases the handle, IsLoaded() is true while a handle is held. */ + bool Initialize(); + void Shutdown(); + bool IsLoaded() const { return dxcore_handle_ != nullptr; } + + // Function pointer declarations + DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2); + DXCORE_DEF(D3DKMTDestroyAllocation2)* 
DXCORE_PFN(D3DKMTDestroyAllocation2); + DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress); + DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress); + DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress); + DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice); + DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice); + DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2); + DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo); + DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual); + DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext); + DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand); + DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2); + DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject); + DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics); + DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape); + DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2); + DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2); + DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue); + DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue); + DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu); + DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu); + DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu); + DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration); + DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident); + DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict); + DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects); + 
DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle); + DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle); + DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue); + DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue); + DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue); + +private: + DxcoreLoader(); + ~DxcoreLoader(); + + bool LoadDxcoreApis(); + + void* dxcore_handle_; + std::once_flag init_flag_; // For thread-safe initialization + + // Disable copy + DxcoreLoader(const DxcoreLoader&) = delete; + DxcoreLoader& operator=(const DxcoreLoader&) = delete; +}; + +} // namespace dxcore +} // namespace thunk +} // namespace wsl + +#endif // LIBROCDXG_DXCORE_LOADER_H diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp new file mode 100644 index 0000000000..1a360832de --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp @@ -0,0 +1,127 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + /* Event creation is not implemented: warns once and hits assert(false). NOTE(review): with asserts compiled out this returns HSAKMT_STATUS_SUCCESS without writing *Event. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, + bool ManualReset, bool IsSignaled, + HsaEvent **Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* A null Event is accepted as a no-op; any other value reaches the unimplemented assert(false) path. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + if (!Event) + return HSAKMT_STATUS_SUCCESS; + + pr_warn_once("not supported\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* hsaKmtSetEvent, hsaKmtResetEvent and hsaKmtQueryEventState share one shape: warn once, reject a null Event with HSAKMT_STATUS_INVALID_HANDLE, otherwise assert(false) and return HSAKMT_STATUS_SUCCESS. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* Same as hsaKmtWaitOnEvent_Ext() with no event_age pointer. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, + HSAuint32 Milliseconds) { + return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL); +} + /* Rejects a null Event with HSAKMT_STATUS_INVALID_HANDLE, then waits on it as a one-element array with WaitOnAll set to true. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event, + HSAuint32 Milliseconds, + uint64_t *event_age) { + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, + event_age); +} + /* Same as hsaKmtWaitOnMultipleEvents_Ext() with no event_age pointer. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + 
HSAuint32 Milliseconds) { + return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, + Milliseconds, NULL); +} + /* Only one case is handled: a single null entry sleeps for 20 microseconds and reports success, ignoring Milliseconds, WaitOnAll and event_age. A null Events array yields HSAKMT_STATUS_INVALID_HANDLE; every other input reaches assert(false). */ +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds, + uint64_t *event_age) { + CHECK_DXG_OPEN(); + + if (!Events) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (NumEvents == 1 && Events[0] == nullptr) { + std::this_thread::sleep_for(std::chrono::microseconds(20)); + return HSAKMT_STATUS_SUCCESS; + } + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + /* Not implemented: logs the node id and asserts; *fd is never written. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) { + CHECK_DXG_OPEN(); + pr_debug("node id %d\n", NodeId); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp new file mode 100755 index 0000000000..431e7bb91a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp @@ -0,0 +1,137 @@ +#include +#include "impl/hsa/hsa.h" +#include "impl/hsa/hsa_ven_amd_loader.h" + /* Mutex taken by the lazy symbol lookups in this file; allocated with new and never freed in the code visible here. */ +static std::mutex* lock_ = new std::mutex(); + /* Active branch: HSA entry points are resolved at run time with dlsym(RTLD_DEFAULT, ...). _HSAKMT_LOOKUP_SYMS(sym) fills the function-local static fn_sym under lock_ and logs if the symbol is missing; _HSAKMT_EXEC_API(sym, ...) returns through fn_sym when it is non-null and otherwise falls through to the caller's own return. NOTE(review): the first, unlocked test of fn_sym reads a plain (non-atomic) pointer that another thread may be writing. */ +#if 1 +#define _HSAKMT_LOOKUP_SYMS(_sym) \ +if (fn_##_sym == nullptr) { \ + std::lock_guard gard(*lock_); \ + if (fn_##_sym == nullptr) { \ + fn_##_sym = \ + reinterpret_cast(dlsym(RTLD_DEFAULT, #_sym)); \ + if (!fn_##_sym) { \ + pr_err("%s not found - %s\n", #_sym, dlerror()); \ + } \ + } \ +} + +#define _HSAKMT_EXEC_API(_sym, ...) 
\ +do { \ + if (fn_##_sym != nullptr) { \ + return fn_##_sym(__VA_ARGS__); \ + } \ +} while(0); + /* Checks that libhsa-runtime64.so can be dlopen()ed with RTLD_NOW | RTLD_GLOBAL and immediately dlclose()s the handle again. NOTE(review): presumably the library is already loaded in the process so its symbols stay resolvable afterwards - confirm. */ +bool hsakmt_hsa_loader_init() { + void *hsa_loader_handle = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (hsa_loader_handle == nullptr) { + pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror()); + return false; + } + dlclose(hsa_loader_handle); + return true; +} + /* Forwards to hsa_signal_load_relaxed when the symbol can be resolved; returns 0 otherwise. */ +hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) { + static hsa_signal_value_t (*fn_hsa_signal_load_relaxed)(hsa_signal_t signal) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed); + _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal); + + return 0; +} + /* Forwards to hsa_signal_wait_relaxed when the symbol can be resolved; returns 0 otherwise. */ +hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) { +static hsa_signal_value_t (*fn_hsa_signal_wait_relaxed)( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed); + _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value, + timeout_hint, wait_state_hint); + + return 0; +} + /* Forwards to hsa_signal_store_screlease when the symbol can be resolved; if it cannot, nothing is stored and the only trace is the error logged by the lookup. */ +void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value){ +static void (*fn_hsa_signal_store_screlease)(hsa_signal_t hsa_signal, + hsa_signal_value_t value) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease); + _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value); +} + /* Resolves hsa_system_get_extension_table with dlsym, requests the AMD loader extension table (version 1.3) and caches that table's hsa_ven_amd_loader_query_host_address entry; returns HSA_STATUS_ERROR if the symbol or the cached entry is null. NOTE(review): the status returned by the extension-table call is ignored and table is not initialized beforehand, so a failed call would cache an indeterminate pointer. */ +hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address) { + static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, const void **host_address) = nullptr; + + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + std::lock_guard gard(*lock_); + if 
(fn_hsa_ven_amd_loader_query_host_address == nullptr) { + hsa_status_t (*fn_hsa_system_get_extension_table)( + uint16_t extension, uint16_t version_major, uint16_t version_minor, void *table); + fn_hsa_system_get_extension_table = + reinterpret_cast(dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table")); + if (fn_hsa_system_get_extension_table == nullptr) { + pr_err("%s not found - %s\n", "hsa_system_get_extension_table", dlerror()); + return HSA_STATUS_ERROR; + } + + hsa_ven_amd_loader_1_03_pfn_t table; + fn_hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table); + fn_hsa_ven_amd_loader_query_host_address = + table.hsa_ven_amd_loader_query_host_address; + } + } + + _HSAKMT_EXEC_API(hsa_ven_amd_loader_query_host_address, device_address, host_address); + return HSA_STATUS_ERROR; +} + /* Disabled alternative (the condition above is #if 1): the same wrappers calling the HSA entry points directly instead of resolving them with dlsym. */ +#else +hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) { + return hsa_signal_load_relaxed(signal); +} + +hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) { + return hsa_signal_wait_relaxed(signal, condition, compare_value, timeout_hint, + wait_state_hint); +} + +void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value) { + hsa_signal_store_screlease(hsa_signal, value); +} + +hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address) { + static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, const void **host_address) = nullptr; + + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + std::lock_guard gard(*lock_); + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + hsa_ven_amd_loader_1_03_pfn_t table; + hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table); + fn_hsa_ven_amd_loader_query_host_address = + 
table.hsa_ven_amd_loader_query_host_address; + } + } + + if (fn_hsa_ven_amd_loader_query_host_address) + return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address); + + return HSA_STATUS_ERROR; +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp new file mode 100644 index 0000000000..6799f5d891 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp @@ -0,0 +1,31 @@ +/* +* Copyright © 2025 Advanced Micro Devices, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including +* the next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*/ + /* Always reports the model as disabled: writes false to *enable, warns once and returns HSAKMT_STATUS_SUCCESS. NOTE(review): enable is dereferenced without a null check, and CHECK_DXG_OPEN() is not invoked here, unlike most other entry points in this patch. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable) +{ + *enable = false; + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp new file mode 100644 index 0000000000..2e125dfb3e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp @@ -0,0 +1,182 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// +#include + +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" + /* Hands back the WDDMDevice pointer that get_wddmdev() returns for NodeId, reinterpreted as the opaque device handle; HSAKMT_STATUS_ERROR if no device is found. NOTE(review): DeviceHandle is written without a null check. */ +HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle( + HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) { + CHECK_DXG_OPEN(); + + wsl::thunk::WDDMDevice *pDevice = get_wddmdev(NodeId); + if (pDevice != nullptr) { + *DeviceHandle = reinterpret_cast(pDevice); + return HSAKMT_STATUS_SUCCESS; + } + return HSAKMT_STATUS_ERROR; +} + /* amdgpu_device_initialize and amdgpu_device_deinitialize do nothing and return 0; the version and device_handle outputs are not written. */ +HSAKMTAPI int amdgpu_device_initialize(int fd, + uint32_t *major_version, + uint32_t *minor_version, + amdgpu_device_handle *device_handle) { + return 0; +} + +HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle) { + return 0; +} + /* Zeroes *info and fills only gpu_counter_freq with the device's GPUCounterFrequency() divided by 1000. dev is reinterpreted as a WDDMDevice pointer and is not null-checked. */ +HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev, + struct amdgpu_gpu_info *info) { + wsl::thunk::WDDMDevice *pDevice = + reinterpret_cast(dev); + memset(info, 0, sizeof(*info)); + info->gpu_counter_freq = pDevice->GPUCounterFrequency() / 1000ull; + return 0; +} + /* Returns dxg_runtime->dxg_fd regardless of which dev is passed. */ +HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) { + return dxg_runtime->dxg_fd; +} + /* bo is reinterpreted as a GpuMemory; *cpu receives CpuAddress() only when IsSysMemFd() is true. NOTE(review): in the other case 0 is still returned and *cpu is left untouched. */ +HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(bo); + if (gpu_mem->IsSysMemFd()) + *cpu = gpu_mem->CpuAddress(); + return 0; +} + /* Frees the buffer through hsaKmtFreeMemory(), passing GpuAddress() when IsVaAllocated() is true and HandleApeAddress() otherwise; the status is mapped to 0 on success and -1 on failure. */ +HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(buf_handle); + void *MemoryAddress = 
gpu_mem->IsVaAllocated() ? (void*)gpu_mem->GpuAddress() : (void*)gpu_mem->HandleApeAddress(); + auto ret = hsaKmtFreeMemory((void*)MemoryAddress, gpu_mem->Size()); + return ret == HSAKMT_STATUS_SUCCESS ? 0 : -1; +} + /* Export is not implemented: writes 0 to *shared_handle and returns 0 for every bo and type. */ +HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo, + enum amdgpu_bo_handle_type type, + uint32_t *shared_handle) { + *shared_handle = 0; + return 0; +} + /* Imports a dma-buf fd through import_dmabuf_fd() and returns the resulting GpuMemory handle as output->buf_handle; any other handle type is rejected with -1. The alloc_va argument passed on mirrors is_ipc_sysmemfd(shared_handle). Other fields of *output are not written here. */ +HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev, + enum amdgpu_bo_handle_type type, + uint32_t shared_handle, + struct amdgpu_bo_import_result *output) { + if (type != amdgpu_bo_handle_type_dma_buf_fd) { + pr_err("not implemented\n"); + return -1; + } + + + wsl::thunk::WDDMDevice *pDevice = reinterpret_cast(dev); + wsl::thunk::GpuMemoryHandle mem_handle; + bool is_ipc_memfd = is_ipc_sysmemfd(shared_handle); + bool alloc_va = is_ipc_memfd; + + HSAKMT_STATUS ret = import_dmabuf_fd(shared_handle, pDevice->NodeId(), + alloc_va, is_ipc_memfd, &mem_handle); + if (ret == HSAKMT_STATUS_SUCCESS) { + //use GpuMemory object handle as drm buf handle + output->buf_handle = reinterpret_cast(mem_handle); + return 0; + } else { + return -1; + } +} + /* AMDGPU_VA_OP_MAP: returns 0 if the buffer is already mapped at addr, -1 if it is mapped at a different address, otherwise maps it and calls MakeResident(). AMDGPU_VA_OP_UNMAP: unmaps and then calls Evict(), whose result is ignored. Any other ops value falls out of the switch and returns 0; flags is unused. */ +HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo, + uint64_t offset, + uint64_t size, + uint64_t addr, + uint64_t flags, + uint32_t ops) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(bo); + assert(gpu_mem != nullptr); + + switch(ops) { + case AMDGPU_VA_OP_MAP: + { + if (gpu_mem->GpuAddress() == addr) { + pr_info("bo is mapped already\n"); + return 0; + } else if (gpu_mem->GpuAddress()) { + pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n", + reinterpret_cast(gpu_mem->GpuAddress()), reinterpret_cast(addr)); + return -1; + } + auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast(addr), size, offset); + if (code != ErrorCode::Success) + return -1; + + code = gpu_mem->MakeResident(); + if (code != ErrorCode::Success) + return -1; + } + break; + case AMDGPU_VA_OP_UNMAP: + { + auto code = 
gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast(addr), size, offset); + if (code != ErrorCode::Success) + return -1; + gpu_mem->Evict(); + } + break; + } + return 0; +} + /* amdgpu_bo_query_info, amdgpu_bo_set_metadata and drmCommandWriteRead are no-ops that return 0 without reading or writing their arguments. */ +HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) { + return 0; +} + +HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) { + return 0; +} + +HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size) { + return 0; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h new file mode 100644 index 0000000000..02826b22b0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h @@ -0,0 +1,289 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE.
+ */ + +#ifndef LIBHSAKMT_H_INCLUDED +#define LIBHSAKMT_H_INCLUDED + +#include +#include +#include +#include "hsakmt/hsakmt.h" +#include "hsakmt/hsakmt_drm.h" + +#include "impl/wddm/va_mgr.h" +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "dxcore_loader.h" + +wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id); +uint32_t get_num_wddmdev(); +wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress); + +#define HSAKMT_DEBUG_LEVEL_ERR -1 +#define HSAKMT_DEBUG_LEVEL_DEFAULT 3 +#define HSAKMT_DEBUG_LEVEL_WARNING 4 +#define HSAKMT_DEBUG_LEVEL_INFO 6 +#define HSAKMT_DEBUG_LEVEL_DEBUG 7 + +struct hsakmtRuntime { + hsakmtRuntime() + : dxg_fd(-1), + parent_pid(getpid()), + is_forked(false), + hsakmt_debug_level(HSAKMT_DEBUG_LEVEL_DEFAULT), + dxg_open_count(0), + hsakmt_mutex(PTHREAD_MUTEX_INITIALIZER), + hsakmt_is_dgpu(false), + is_svm_api_supported(false), + zfb_support(0), + vendor_packet_process(0), + check_avail_sysram(false), + max_single_alloc_size(0), + enable_thunk_sub_allocator(0), + local_heap_space_start_(0), + local_heap_space_size_(0), + system_heap_space_start_(0), + system_heap_space_size_(0), + handle_aperture_start_(0), + handle_aperture_size_(0), + default_node(1) {} + + void HeapInit(); + void HeapFini(); + bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align); + bool FreeSvmSpace(uint64_t &base, uint64_t &size); + bool ReserveLocalHeapSpace(); + bool FreeLocalHeapSpace(); + void InitLocalHeapMgr(); + bool ReserveSystemHeapSpace(); + uint64_t SystemHeapSize() { return system_heap_space_size_; } + bool FreeSystemHeapSpace(); + bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock); + bool DecommitSystemHeapSpace(void* addr, int64_t size); + void InitSystemHeapMgr(); + ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize hit_base_addr, gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, bool lock); + ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain, 
+ gpusize gpu_addr, gpusize size); + bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); + bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); + ErrorCode ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, + int &memfd, bool lock); + ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd); + bool InitHandleApertureSpace(); + void InitHandleApertureMgr(); + ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr); + void HandleApertureFree(gpusize gpu_addr); + + pthread_mutex_t hsakmt_mutex; + const char *dxg_device_name = "/dev/dxg"; + long page_size; + int page_shift; + int dxg_fd = -1; + pid_t parent_pid = -1; + bool is_forked = false; + int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT; + unsigned long dxg_open_count; + bool hsakmt_is_dgpu; + bool is_svm_api_supported; + int zfb_support; + int vendor_packet_process; + bool check_avail_sysram; + size_t max_single_alloc_size; + int enable_thunk_sub_allocator; + uint32_t default_node; + + /* local heap means bo's backend is vram of all GPUs */ + uint64_t local_heap_space_start_; + uint64_t local_heap_space_size_; + + /* manage the reserved local heap space which shared by CPU and GPUs */ + std::unique_ptr local_heap_mgr_; + + /* system heap means bo's backend is system ram */ + uint64_t system_heap_space_start_; + uint64_t system_heap_space_size_; + + /* manage the reserved system heap space which shared by CPU and GPUs */ + std::unique_ptr system_heap_mgr_; + + uint64_t handle_aperture_start_; + uint64_t handle_aperture_size_; + std::unique_ptr handle_aperture_mgr_; +}; + +extern hsakmtRuntime *dxg_runtime; + +#undef HSAKMTAPI +#define HSAKMTAPI __attribute__((visibility ("default"))) + +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define SANITIZER_AMDGPU 1 +#endif +#endif + +/*Avoid pointer-to-int-cast warning*/ +#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr)) + 
+/*Avoid int-to-pointer-cast warning*/ +#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v)) + +#define CHECK_DXG_OPEN() \ + do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0) + +/* 64KB BigK fragment size for TLB efficiency */ +#define GPU_BIGK_PAGE_SIZE (1 << 16) + +/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */ +#define GPU_HUGE_PAGE_SIZE (2 << 20) + +#define CHECK_PAGE_MULTIPLE(x) \ + do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0) + +#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1)) +#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1)) +#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size) +#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0) +#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0])) + +/* HSA Thunk logging usage */ +#define get_thread_id() \ + ([]() -> std::string { \ + std::stringstream str_thrd_id; \ + str_thrd_id << std::hex << std::this_thread::get_id(); \ + return str_thrd_id.str(); \ + })() +#define hsakmt_print_common(stream, fmt, ...) \ + do { \ + fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__); \ + fflush(stream); \ + } while (false) +#ifdef NDEBUG +#define hsakmt_print(level, fmt, ...) \ + do { } while (false) +#else +#define hsakmt_print(level, fmt, ...) \ + do { \ + if (level <= dxg_runtime->hsakmt_debug_level) { \ + hsakmt_print_common(stdout, fmt, ##__VA_ARGS__); \ + } \ + } while (false) +#endif + +#define pr_err(fmt, ...) \ + hsakmt_print_common(stderr, fmt, ##__VA_ARGS__) +#define pr_warn(fmt, ...) \ + hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) 
\ + hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__) +#define pr_err_once(fmt, ...) \ +({ \ + static bool __print_once; \ + if (!__print_once) { \ + __print_once = true; \ + pr_err(fmt, ##__VA_ARGS__); \ + } \ +}) +#define pr_warn_once(fmt, ...) \ +({ \ + static bool __print_once; \ + if (!__print_once) { \ + __print_once = true; \ + pr_warn(fmt, ##__VA_ARGS__); \ + } \ +}) + +/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */ +#define HSA_GET_GFX_VERSION_FULL(ui32) \ + (((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping)) + +HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); +HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); +bool prefer_ats(HSAuint32 node_id); +uint16_t get_device_id_by_node_id(HSAuint32 node_id); +uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id); +uint32_t get_direct_link_cpu(uint32_t gpu_node); + +HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props); +HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, + HsaNodeProperties *NodeProperties); +HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, + HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties); +void topology_setup_is_dgpu_param(HsaNodeProperties *props); + +HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); + +uint32_t get_num_sysfs_nodes(void); + +bool is_forked_child(void); + +void clear_allocation_map(void); + +class BlockAllocator { +private: + static const size_t block_size_ = 128 * 1024 * 1024; // 128MB blocks. 
+ +public: + void* alloc(size_t request_size, size_t& allocated_size) const; + void free(void* ptr, size_t length) const; + size_t block_size() const { return block_size_; } +}; + +void reset_suballocator(void); +void trim_suballocator(void); + +HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress, + bool SkipSubAlloc = false); + +HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress, + HSAuint64 SizeInBytes, + bool SkipSubAlloc = false); + +bool queue_acquire_buffer(void *MemoryAddress); +bool queue_release_buffer(void *MemoryAddress); +/* Calculate VGPR and SGPR register file size per CU */ +uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id); +#define SGPR_SIZE_PER_CU 0x4000 + +bool is_ipc_sysmemfd(int fd); + +HSAKMT_STATUS import_dmabuf_fd(int DMABufFd, + uint32_t NodeId, + bool alloc_va, + bool is_ipc_memfd, + wsl::thunk::GpuMemoryHandle *GpuMemHandle); + +bool hsakmt_hsa_loader_init(); +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver new file mode 100644 index 0000000000..d91b29ec90 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver @@ -0,0 +1,113 @@ +HSAKMT_1 +{ +global: +hsaKmtOpenKFD; +hsaKmtCloseKFD; +hsaKmtGetVersion; +hsaKmtAcquireSystemProperties; +hsaKmtReleaseSystemProperties; +hsaKmtGetNodeProperties; +hsaKmtGetNodeMemoryProperties; +hsaKmtGetNodeCacheProperties; +hsaKmtGetNodeIoLinkProperties; +hsaKmtCreateEvent; +hsaKmtDestroyEvent; +hsaKmtSetEvent; +hsaKmtResetEvent; +hsaKmtQueryEventState; +hsaKmtWaitOnEvent; +hsaKmtWaitOnMultipleEvents; +hsaKmtCreateQueue; +hsaKmtCreateQueueExt; +hsaKmtUpdateQueue; +hsaKmtDestroyQueue; +hsaKmtSetQueueCUMask; +hsaKmtSetMemoryPolicy; +hsaKmtAllocMemory; +hsaKmtAllocMemoryAlign; +hsaKmtFreeMemory; +hsaKmtAvailableMemory; +hsaKmtRegisterMemory; +hsaKmtRegisterMemoryToNodes; 
+hsaKmtRegisterMemoryWithFlags; +hsaKmtRegisterGraphicsHandleToNodes; +hsaKmtRegisterGraphicsHandleToNodesExt; +hsaKmtShareMemory; +hsaKmtRegisterSharedHandle; +hsaKmtRegisterSharedHandleToNodes; +hsaKmtProcessVMRead; +hsaKmtProcessVMWrite; +hsaKmtDeregisterMemory; +hsaKmtMapMemoryToGPU; +hsaKmtMapMemoryToGPUNodes; +hsaKmtUnmapMemoryToGPU; +hsaKmtDbgRegister; +hsaKmtDbgUnregister; +hsaKmtDbgWavefrontControl; +hsaKmtDbgAddressWatch; +hsaKmtDbgEnable; +hsaKmtDbgDisable; +hsaKmtDbgGetDeviceData; +hsaKmtDbgGetQueueData; +hsaKmtGetClockCounters; +hsaKmtPmcGetCounterProperties; +hsaKmtPmcRegisterTrace; +hsaKmtPmcUnregisterTrace; +hsaKmtPmcAcquireTraceAccess; +hsaKmtPmcReleaseTraceAccess; +hsaKmtPmcStartTrace; +hsaKmtPmcQueryTrace; +hsaKmtPmcStopTrace; +hsaKmtMapGraphicHandle; +hsaKmtUnmapGraphicHandle; +hsaKmtSetTrapHandler; +hsaKmtGetTileConfig; +hsaKmtQueryPointerInfo; +hsaKmtSetMemoryUserData; +hsaKmtGetQueueInfo; +hsaKmtAllocQueueGWS; +hsaKmtRuntimeEnable; +hsaKmtRuntimeDisable; +hsaKmtCheckRuntimeDebugSupport; +hsaKmtGetRuntimeCapabilities; +hsaKmtDebugTrapIoctl; +hsaKmtSPMAcquire; +hsaKmtSPMRelease; +hsaKmtSPMSetDestBuffer; +hsaKmtSVMSetAttr; +hsaKmtSVMGetAttr; +hsaKmtSetXNACKMode; +hsaKmtGetXNACKMode; +hsaKmtOpenSMI; +hsaKmtExportDMABufHandle; +hsaKmtGetMemoryHandle; +hsaKmtWaitOnEvent_Ext; +hsaKmtWaitOnMultipleEvents_Ext; +hsaKmtReplaceAsanHeaderPage; +hsaKmtReturnAsanHeaderPage; +hsaKmtGetAMDGPUDeviceHandle; +hsaKmtPcSamplingQueryCapabilities; +hsaKmtPcSamplingCreate; +hsaKmtPcSamplingDestroy; +hsaKmtPcSamplingStart; +hsaKmtPcSamplingStop; +hsaKmtPcSamplingSupport; +hsaKmtAisReadWriteFile; +hsaKmtModelEnabled; +hsaKmtQueueRingDoorbell; +amdgpu_device_initialize; +amdgpu_device_deinitialize; +amdgpu_query_gpu_info; +amdgpu_bo_import; +amdgpu_bo_va_op; +amdgpu_device_get_fd; +amdgpu_bo_cpu_map; +amdgpu_bo_free; +amdgpu_bo_export; +amdgpu_bo_query_info; +amdgpu_bo_set_metadata; +drmCommandWriteRead; + +local: *; +}; + diff --git 
a/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp new file mode 100644 index 0000000000..b6ef48cf29 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp @@ -0,0 +1,989 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "impl/wddm/gpu_memory.h" +#include "util/simple_heap.h" + +struct Allocation { + Allocation() + : handle(0), cpu_addr(0), gpu_addr(0), size(0), userptr(false), + user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0), + dmabuf_fd(-1), rocr_userdata(nullptr) {} + Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg, + uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false, + void *user_data_arg = nullptr, size_t user_size_arg = 0, + HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0) + : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg), + size(size_arg), userptr(userptr_arg), user_data(user_data_arg), + size_requested(user_size_arg), node_id(node_id_arg), + mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1), rocr_userdata(nullptr) {} + + wsl::thunk::GpuMemoryHandle handle; + void *cpu_addr; + uint64_t gpu_addr; + bool userptr; + size_t size; /* actual size = align_up(size_requested, granularity) */ + void *user_data; + size_t size_requested; /* size requested by user */ + HSAuint32 node_id; + HSAuint32 mem_flags_value; + int dmabuf_fd; + void *rocr_userdata; +}; + +static std::map* allocation_map_ = new std::map(); +static std::mutex* allocation_map_lock_ = new std::mutex(); + +void clear_allocation_map(void) +{ + //delete allocation_map_lock_; + allocation_map_lock_ = new std::mutex(); + std::lock_guard lock(*allocation_map_lock_); + delete allocation_map_; + allocation_map_ = new std::map(); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node, + HSAuint32 DefaultPolicy, + HSAuint32 AlternatePolicy, + void *MemoryAddressAlternate, + HSAuint64 MemorySizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) { + switch (pageSizeFlags) { + case 
HSA_PAGE_SIZE_4KB: + return 4 * 1024; + case HSA_PAGE_SIZE_64KB: + return 64 * 1024; + case HSA_PAGE_SIZE_2MB: + return 2 * 1024 * 1024; + case HSA_PAGE_SIZE_1GB: + return 1024 * 1024 * 1024; + default: + assert(false); + return 4 * 1024; + } +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HsaMemFlags MemFlags, + void **MemoryAddress) { + return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, + MemoryAddress); +} + +#define POWER_OF_2(x) ((x && (!(x & (x - 1)))) ? 1 : 0) + +bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) { + struct sysinfo info; + if (sysinfo(&info) != 0) + return false; + return SizeInBytes <= info.freeram; +} + +void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { + void *address; + HsaMemFlags MemFlags; + + MemFlags.Value = 0; + MemFlags.ui32.CoarseGrain = 1; + MemFlags.ui32.NoSubstitute = 1; + allocated_size = wsl::AlignUp(request_size, block_size()); + if (HSAKMT_STATUS_SUCCESS == hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, MemFlags, &address, true)) + return address; + + return nullptr; +} + +void BlockAllocator::free(void* ptr, size_t length) const { + if (HSAKMT_STATUS_SUCCESS != hsaKmtFreeMemoryInternal(ptr, length, true)) + pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length); +} + +static wsl::SimpleHeap fragment_allocator_; + +void reset_suballocator(void) { + fragment_allocator_.reset(); +} + +void trim_suballocator(void) { + fragment_allocator_.trim(); +} + +HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress, + bool SkipSubAlloc) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (MemFlags.ui32.FixedAddress) { + if (*MemoryAddress == nullptr) + return HSAKMT_STATUS_INVALID_PARAMETER; + } else + *MemoryAddress = nullptr; + + uint32_t node = 
(PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode; + wsl::thunk::WDDMDevice *dev = get_wddmdev(node); + if (!dev) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.size = SizeInBytes; + + /* If initialize scratch pool of GpuAgent, treat it as SVM reserve */ + if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000) + MemFlags.ui32.OnlyAddress = 1; + + create_info.alignment = Alignment; + create_info.va_hint = reinterpret_cast(*MemoryAddress); + if ((PreferredNode == 0 && MemFlags.ui32.HostAccess) + || dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) { + if (SizeInBytes > dxg_runtime->max_single_alloc_size) + return HSAKMT_STATUS_NO_MEMORY; + + if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes)) + return HSAKMT_STATUS_NO_MEMORY; + + /* If allocate VRAM under ZFB mode */ + if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1) + MemFlags.ui32.CoarseGrain = 1; + + // AllocateNonPaged == AllocateIPC + create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess); + + create_info.domain = thunk_proxy::AllocDomain::kSystem; + } else { + create_info.domain = thunk_proxy::AllocDomain::kLocal; + } + + if (!MemFlags.ui32.CoarseGrain) + create_info.mem_flags = thunk_proxy::kFineGrain; + + //In hsa-runtime, only kernarg region set Uncached. 
+ if (MemFlags.ui32.Uncached) + create_info.mem_flags |= thunk_proxy::kKernarg; + + create_info.flags.physical_only = MemFlags.ui32.NoAddress; + create_info.flags.alloc_va = !create_info.flags.physical_only; + create_info.flags.interprocess = MemFlags.ui32.NoAddress; + create_info.flags.interprocess |= MemFlags.ui32.Contiguous; + create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous; + create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned + create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress; + create_info.flags.blit_kernel_object = + (MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess && + (create_info.domain == thunk_proxy::AllocDomain::kSystem)); + /*when only alloc virtual or only physical, it's vmm allocation, force to local*/ + if (create_info.flags.virtual_alloc || create_info.flags.physical_only + || create_info.flags.physical_contiguous) { + create_info.domain = thunk_proxy::AllocDomain::kLocal; + SkipSubAlloc = true; + } + + /* Only allow using the suballocator for ordinary VRAM.*/ + bool trim_safe = false; + if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) { + /* just quickly skip SA if size is bigger than SA block size.*/ + gpusize real_size; + if (create_info.size > GPU_HUGE_PAGE_SIZE) + real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE); + else + real_size = wsl::AlignUp(create_info.size, getpagesize()); + + if (real_size < fragment_allocator_.default_block_size()) { + *MemoryAddress = fragment_allocator_.alloc(real_size); + if (*MemoryAddress) + return HSAKMT_STATUS_SUCCESS; + } + + /* SA might keep a lot of free blocks as *cache*. + * We can trim them if direct allocation fails at first time. 
+ */ + trim_safe = true; + } + +after_trim: + auto code = dev->CreateGpuMemory(create_info, &gpu_mem); + if (code == ErrorCode::Success) { + std::lock_guard gard(*allocation_map_lock_); + + /* For these physical allcations, use GpuMemory object's address as thunk handle*/ + if (create_info.flags.physical_only || create_info.dmabuf_fd > 0) + *MemoryAddress = reinterpret_cast(gpu_mem->HandleApeAddress()); + else + *MemoryAddress = reinterpret_cast(gpu_mem->GpuAddress()); + + (*allocation_map_)[*MemoryAddress] = Allocation( + gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress, + create_info.size, false, nullptr, SizeInBytes, + MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value); + return HSAKMT_STATUS_SUCCESS; + } else if (trim_safe) { + /* attempt to release memory from the block allocator and retry */ + fragment_allocator_.trim(); + trim_safe = false; + goto after_trim; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress) { + return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, + Alignment, MemFlags, + MemoryAddress, + !dxg_runtime->enable_thunk_sub_allocator); +} + +HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress, + HSAuint64 SizeInBytes, + bool SkipSubAlloc) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (!SkipSubAlloc) { + if (fragment_allocator_.free(MemoryAddress)) + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + if (gpu_mem->IsQueueReferenced()) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = 
gpu_mem->Flags(); + if (flags.is_imported_vram_ipc && + gpu_mem->DecSharedReference()) { + pr_info("memory is still referenced\n"); + return HSAKMT_STATUS_SUCCESS; + } + + if (it->second.dmabuf_fd >= 0) { + close(it->second.dmabuf_fd); + it->second.dmabuf_fd = -1; + } + allocation_map_->erase(it); + } + + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress, + HSAuint64 SizeInBytes) { + return hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes); +} + +bool queue_acquire_buffer(void *MemoryAddress) { + if (!MemoryAddress) + return false; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + gpu_mem->GetQueueReference(); + } + if (gpu_mem == nullptr) + return false; + + return true; +} + +bool queue_release_buffer(void *MemoryAddress) { + if (!MemoryAddress) + return false; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + gpu_mem->PutQueueReference(); + } + if (gpu_mem == nullptr) + return false; + + return true; +} + +wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return nullptr; + } + + return wsl::thunk::GpuMemory::Convert(it->second.handle); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, + HSAuint64 *AvailableBytes) { + CHECK_DXG_OPEN(); + + if (!AvailableBytes) + return HSAKMT_STATUS_INVALID_PARAMETER; + + wsl::thunk::WDDMDevice *dev = get_wddmdev(Node); + if (!dev) + return 
HSAKMT_STATUS_ERROR; + + *AvailableBytes = dev->VramAvail(); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, + HSAuint64 MemorySizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags( + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("address %p\n", MemoryAddress); + + if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain) + return HSAKMT_STATUS_INVALID_PARAMETER; + + // Registered memory should be ordinary paged host memory. + if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1)) + return HSAKMT_STATUS_NOT_SUPPORTED; + + if (!dxg_runtime->hsakmt_is_dgpu) + /* TODO: support mixed APU and dGPU configurations */ + return HSAKMT_STATUS_NOT_SUPPORTED; + + return HSAKMT_STATUS_SUCCESS; +} + +bool is_ipc_sysmemfd(int fd) { + std::string fdPath = "/proc/self/fd/" + std::to_string(fd); + char linkTarget[256]; + ssize_t bytes = readlink(fdPath.c_str(), linkTarget, sizeof(linkTarget) - 1); + if (bytes == -1) + return false; + linkTarget[bytes] = '\0'; + return strstr(linkTarget, "rocr4wsl_gtt") != nullptr; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) { + HSA_REGISTER_MEM_FLAGS regFlags; + regFlags.Value = 0; + + return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle, + GraphicsResourceInfo, + NumberOfNodes, + NodeArray, + regFlags); +} + +HSAKMT_STATUS 
HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray, + HSA_REGISTER_MEM_FLAGS RegisterFlags) { + CHECK_DXG_OPEN(); + uint32_t *gpu_id_array = NULL; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + if (is_ipc_sysmemfd(GraphicsResourceHandle)) { + GraphicsResourceInfo->NodeId = dxg_runtime->default_node; + pr_info("skip register sysmemfd. It would be released in next step\n"); + return HSAKMT_STATUS_SUCCESS; + } + + if (NumberOfNodes == 0) { + RegisterFlags.ui32.requiresVAddr = 0; + NumberOfNodes = 1; + NodeArray = (HSAuint32*)&(dxg_runtime->default_node); + } + + pr_debug("number of nodes %lu\n", NumberOfNodes); + wsl::thunk::GpuMemoryHandle mem_handle; + ret = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0], + RegisterFlags.ui32.requiresVAddr, + false, &mem_handle); + if (ret != HSAKMT_STATUS_SUCCESS) { + pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, " + "GraphicsResourceHandle: %lu, NodeId: %u\n", + GraphicsResourceHandle, NodeArray[0]); + return ret; + } + wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle); + GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId(); + GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize(); + GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ? 
+ reinterpret_cast(gpu_mem->GpuAddress()): + reinterpret_cast(gpu_mem->HandleApeAddress()); + + return ret; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + int *DMABufFd, + HSAuint64 *Offset) { + CHECK_DXG_OPEN(); + + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->upper_bound(MemoryAddress); + if (it != allocation_map_->begin()) { + --it; + if (it->second.dmabuf_fd == -1) { + auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + auto code = gpu_mem->ExportPhysicalHandle(DMABufFd); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + it->second.dmabuf_fd = *DMABufFd; + } + *DMABufFd = dup(it->second.dmabuf_fd); + *Offset = reinterpret_cast(MemoryAddress) - it->second.gpu_addr; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes, + uint64_t *SharedMemoryHandle) { + CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS import_dmabuf_fd(int DMABufFd, + uint32_t NodeId, + bool alloc_va, + bool is_ipc_memfd, + wsl::thunk::GpuMemoryHandle *GpuMemHandle) { + CHECK_DXG_OPEN(); + + *GpuMemHandle = nullptr; + wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId); + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.dmabuf_fd = DMABufFd; + create_info.flags.alloc_va = alloc_va; + + if (is_ipc_memfd) { + struct stat st; + fstat(DMABufFd, &st); + uint64_t sz = st.st_size; + if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) { + pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size); + create_info.flags.sysmem_ipc_sig_importer = 1; // set to 1 when backend is system memory + create_info.size = st.st_size; + } + } + + gpusize gpu_va = 0; + auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va); + if (code == 
ErrorCode::SameProcessSameDevice) { + /* Unit_hipMemPoolExportToShareableHandle_SameProc */ + pr_info("imported from same process, use the old one\n"); + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find((void*)gpu_va); + if (it == allocation_map_->end()) { + pr_err("where's the conflict buffer? va %#lx\n", create_info.va_hint); + return HSAKMT_STATUS_ERROR; + } + wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + conflict_mem->IncSharedReference(); + *GpuMemHandle = it->second.handle; + return HSAKMT_STATUS_SUCCESS; + } else if (code != ErrorCode::Success) { + pr_err("fail to import fd, ret %d\n", (int)code); + return HSAKMT_STATUS_ERROR; + } + + void *MemoryAddress; + if (alloc_va) + MemoryAddress = reinterpret_cast(gpu_mem->GpuAddress()); + else + MemoryAddress = reinterpret_cast(gpu_mem->HandleApeAddress()); + + *GpuMemHandle = gpu_mem->GetGpuMemoryHandle(); + + std::lock_guard gard(*allocation_map_lock_); + /* + * the gpu_mem->Flags() need convert back from GpuMemoryCreateFlags to + * HsaMemFlags, reference hsaKmtAllocMemoryAlign + * */ + (*allocation_map_)[MemoryAddress] = Allocation( + *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress, + gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(), + NodeId, gpu_mem->Flags()); + + return HSAKMT_STATUS_SUCCESS; + +} + + +HSAKMT_STATUS HSAKMTAPI +hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes, + HsaSharedMemoryHandle *SharedMemoryHandle) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle, + void **MemoryAddress, HSAuint64 *SizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes( + const HsaSharedMemoryHandle *SharedMemoryHandle, void 
**MemoryAddress, + HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid, + HsaMemoryRange *LocalMemoryArray, + HSAuint64 LocalMemoryArrayCount, + HsaMemoryRange *RemoteMemoryArray, + HSAuint64 RemoteMemoryArrayCount, + HSAuint64 *SizeCopied) { + CHECK_DXG_OPEN(); + pr_warn_once("has been deprecated\n"); + assert(false); + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid, + HsaMemoryRange *LocalMemoryArray, + HSAuint64 LocalMemoryArrayCount, + HsaMemoryRange *RemoteMemoryArray, + HSAuint64 RemoteMemoryArrayCount, + HSAuint64 *SizeCopied) { + CHECK_DXG_OPEN(); + pr_warn_once("has been deprecated\n"); + assert(false); + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("address %p\n", MemoryAddress); + + { + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_SUCCESS; + } + + auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + // IPC mem(vram) + if (flags.is_imported_vram_ipc && + gpu_mem->DecSharedReference() == 0) { + allocation_map_->erase(it); + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; + } + if (it->second.userptr) { + allocation_map_->erase(it); + allocation_map_->erase((void *)it->second.gpu_addr); + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; + } + } + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 *AlternateVAGPU) { + + HSAuint64 NumberOfNodes = 1; + 
HSAuint32 NodeArray[] = {dxg_runtime->default_node}; + HsaMemMapFlags MemMapFlags; + MemMapFlags.Value = 0; + + return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes, AlternateVAGPU, + MemMapFlags, NumberOfNodes, NodeArray); +} +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes( + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU, + HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress || !AlternateVAGPU) { + pr_err("FIXME: mapping NULL pointer\n"); + return HSAKMT_STATUS_ERROR; + } + + uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096); + uint64_t end = + wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096); + + void *aligned_ptr = (void *)start; + size_t aligned_size = end - start; + + { + if (nullptr != fragment_allocator_.block_base(aligned_ptr)) + return HSAKMT_STATUS_SUCCESS; + } + + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(aligned_ptr); + if (it != allocation_map_->end()) { + wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + // IPC mem + if (flags.is_imported_vram_ipc) { + + auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size()); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + + code = gpu_mem->MakeResident(); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice(); + if (!dev->WaitOnPagingFenceFromCpu()) + return HSAKMT_STATUS_ERROR; + + return HSAKMT_STATUS_SUCCESS; + } + + if (!it->second.userptr) { + // GTT/Local mem + if (it->second.size >= MemorySizeInBytes) { + *AlternateVAGPU = (uint64_t)MemoryAddress; + return HSAKMT_STATUS_SUCCESS; + } else { + return HSAKMT_STATUS_ERROR; + } + } + } + + // userptr mem + it = allocation_map_->find(MemoryAddress); + if (it != 
allocation_map_->end()) { + if (it->second.userptr && it->second.size >= MemorySizeInBytes) { + *AlternateVAGPU = + (uintptr_t)it->second.gpu_addr + + ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr); + return HSAKMT_STATUS_SUCCESS; + } + } + } + + // map userptr + wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]); + if (!dev) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryHandle handle = 0; + uint64_t addr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.domain = thunk_proxy::kUserMemory; + create_info.size = aligned_size; + create_info.user_ptr = aligned_ptr; + + auto code = dev->CreateGpuMemory(create_info, &gpu_mem); + if (code == ErrorCode::Success) { + addr = gpu_mem->GpuAddress(); + handle = gpu_mem->GetGpuMemoryHandle(); + } else { + return HSAKMT_STATUS_ERROR; + } + + { + std::lock_guard guard(*allocation_map_lock_); + (*allocation_map_)[MemoryAddress] = + Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress, + MemorySizeInBytes); + (*allocation_map_)[(void *)addr] = + Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr, + MemorySizeInBytes); + } + + *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) { + /* Workaround for runtime bug */ + pr_err("FIXME: Unmapping NULL pointer\n"); + return HSAKMT_STATUS_SUCCESS; + } + + pr_debug("address %p\n", MemoryAddress); + + { + if (nullptr != fragment_allocator_.block_base(MemoryAddress)) + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + if 
(gpu_mem->IsQueueReferenced()) + return HSAKMT_STATUS_ERROR; + + // IPC mem + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + if (flags.is_imported_vram_ipc && + !gpu_mem->IsSharedFromSameProcess()) { + auto code = gpu_mem->UnmapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size()); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + gpu_mem->Evict(); + + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId, + HSAuint64 GraphicDeviceHandle, + HSAuint64 GraphicResourceHandle, + HSAuint64 GraphicResourceOffset, + HSAuint64 GraphicResourceSize, + HSAuint64 *FlatMemoryAddress) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + /* This API was only ever implemented in KFD for Kaveri and + * was never upstreamed. There are no open-source users of + * this interface. It has been superseded by + * RegisterGraphicsHandleToNodes. + */ + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId, + HSAuint64 FlatMemoryAddress, + HSAuint64 SizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, + HsaGpuTileConfig *config) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer, + HsaPointerInfo *PointerInfo) { + CHECK_DXG_OPEN(); + + if (!Pointer || !PointerInfo) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("pointer %p\n", Pointer); + + memset(PointerInfo, 0, sizeof(HsaPointerInfo)); + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + Allocation allocation_info; + bool found = false; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->upper_bound(Pointer); + if (it != allocation_map_->begin()) { + 
--it; + if (Pointer >= it->first && + (Pointer < reinterpret_cast(it->first) + it->second.size_requested)) { + allocation_info = it->second; + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + found = true; + } + } + } + + if (!found) { + pr_debug("can't found allocation for %p\n", Pointer); + PointerInfo->Type = HSA_POINTER_UNKNOWN; + return HSAKMT_STATUS_ERROR; + } + + if (allocation_info.userptr) { + PointerInfo->Type = HSA_POINTER_REGISTERED_USER; + PointerInfo->SizeInBytes = allocation_info.size; + } else if (gpu_mem->IsVirtual()) { + PointerInfo->Type = HSA_POINTER_RESERVED_ADDR; + } else { + PointerInfo->Type = HSA_POINTER_ALLOCATED; + PointerInfo->SizeInBytes = allocation_info.size_requested; + } + + PointerInfo->Node = allocation_info.node_id; + PointerInfo->MemFlags.Value = allocation_info.mem_flags_value; + PointerInfo->CPUAddress = allocation_info.cpu_addr; + PointerInfo->GPUAddress = allocation_info.gpu_addr; + PointerInfo->UserData = allocation_info.rocr_userdata; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer, + void *UserData) { + CHECK_DXG_OPEN(); + + uint64_t aligned_ptr = wsl::AlignDown((uint64_t)Pointer, 4096); + + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find((void *)aligned_ptr); + if (it != allocation_map_->end()) { + it->second.rocr_userdata = UserData; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); +#ifdef SANITIZER_AMDGPU + pr_debug("address %p\n", addr); + CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +#else + return HSAKMT_STATUS_NOT_SUPPORTED; +#endif +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); +#ifdef SANITIZER_AMDGPU + pr_debug("address %p\n", addr); + 
CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +#else + return HSAKMT_STATUS_NOT_SUPPORTED; +#endif +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp new file mode 100644 index 0000000000..eb22a13aae --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp @@ -0,0 +1,626 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +hsakmtRuntime *dxg_runtime = new hsakmtRuntime(); + +void hsakmtRuntime::HeapInit() { + ReserveLocalHeapSpace(); + ReserveSystemHeapSpace(); + InitHandleApertureSpace(); + InitLocalHeapMgr(); + InitSystemHeapMgr(); + InitHandleApertureMgr(); +} + +void hsakmtRuntime::HeapFini() { + FreeSystemHeapSpace(); + FreeLocalHeapSpace(); +} + +bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) { + uint64_t sys_va[16] = {0}; + uint64_t local_va; + uint64_t sys_va_size; + int match_index = -1; + void* ptr = NULL; + + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + + base = 0; + sys_va_size = size + align; + + /* it will retry 16 times to find the avaliable range. */ + for (int i = 0; i < 16; i++) { + local_va = 0; + ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + pr_err("fail to reserve cpu va in %d time!\n", i); + break; + } + + sys_va[i] = (uint64_t)ptr; + + int match_cnt = 0; + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + uint64_t start = (base == 0) ? (uint64_t)ptr : base; + uint64_t end = start + ((base == 0) ? 
sys_va_size : size) + 1; + + if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress( + device->GetAdapter(), size, + start, + end, &local_va) == ErrorCode::Success) { + + match_cnt++; + base = local_va; + pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n", + local_va, ptr, i); + } else { + pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n", + __FUNCTION__, ptr, i); + } + } + + if (match_cnt == num_adapters) { + match_index = i; + break; + } + } + + if (match_index >= 0) { + /* release cpu unused ranges*/ + uint64_t left_size = local_va - sys_va[match_index]; + uint64_t right_size = align - left_size; + if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size)) + pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size); + if ((right_size > 0) && munmap((void*)(local_va + size), right_size)) + pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size); + } else { + pr_err("fail to reserve Local Heap Space!\n"); + base = 0; + size = 0; + } + + /* free match fail address for cpu va */ + int free = match_index >= 0 ? match_index : 16; + for (int j = 0; j < free; j++) { + if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) { + pr_err("fail to unmap %d %lx\n", j, sys_va[j]); + } + } + + return match_index >= 0; +} + +/* + * To find the avaliable same range for cpu + * virtual space and gpu virtual space. + * sys_va_size of cpu va range is larger 1G + * than gpu va range, otherwise ReserveGPUVirtualAddress + * will return error. 
+ */ +bool hsakmtRuntime::ReserveLocalHeapSpace() { + wsl::thunk::WDDMDevice* device; + uint64_t total_local_size = 0; + uint64_t align = 0x40000000; /* 1G */ + size_t num_adapters = get_num_wddmdev(); + + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + /* + * For APU, use non local memory(shared GPU memory) as GPU memory, + * because it has small local memory + */ + if (device->IsDgpu()) + total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size); + else + total_local_size = wsl::Max(device->LocalHeapSize(), device->NonLocalHeapSize(), total_local_size); + } + + total_local_size = wsl::AlignUp(total_local_size, align) * 4; + local_heap_space_start_ = 0; + local_heap_space_size_ = total_local_size; + + return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align); +} + +bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) { + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size); + } + + void *cpu = (void *)base; + auto r = (munmap(cpu, size) == 0); + base = 0; + size = 0; + return r; +} + +bool hsakmtRuntime::FreeLocalHeapSpace() { + return FreeSvmSpace(local_heap_space_start_, local_heap_space_size_); +} + +void hsakmtRuntime::InitLocalHeapMgr() { + local_heap_mgr_ = std::make_unique(local_heap_space_start_, + local_heap_space_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +bool hsakmtRuntime::ReserveSystemHeapSpace() { + struct sysinfo info; + int ret = sysinfo(&info); + uint64_t max_ram = 0x10000000000; + uint64_t alignment = 0x100000000; + assert(!ret); + + int32_t protFlags = PROT_NONE; + // minimum of reserve size is 8G, maximum of reserve size is 1T. 
+ system_heap_space_size_ = std::min(wsl::AlignUp(info.totalram, alignment) * 2, max_ram); + + return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment); +} + +bool hsakmtRuntime::FreeSystemHeapSpace(void) { + return FreeSvmSpace(system_heap_space_start_, system_heap_space_size_); +} + +bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) { + int32_t protFlags = PROT_READ | PROT_WRITE | PROT_EXEC; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + if (lock) + mapFlags |= MAP_LOCKED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr); + return false; + } + assert(addr == paddr); + + /*if (!Runtime::runtime_singleton_->PinWARequired()) + return true;*/ + + /* + * Do not make the pages in this range available to the child + * after a fork(2). This is useful to prevent copy-on-write + * semantics from changing the physical location of a page if + * the parent writes to it after a fork(2). (Such page + * relocations cause problems for hardware that DMAs into the + * page.) 
+ * + * https://man7.org/linux/man-pages/man2/madvise.2.html + */ + if (madvise(addr, size, MADV_DONTFORK)) + pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr); + + return true; +} + +bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) { + int32_t protFlags = PROT_NONE; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to decommit addr = %p, paddr = %p\n", addr, paddr); + return false; + } + assert(addr == paddr); + return true; +} + +void hsakmtRuntime::InitSystemHeapMgr() { + system_heap_mgr_ = std::make_unique(system_heap_space_start_, + system_heap_space_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize hit_base_addr, gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + + uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment + if (size >= GPU_HUGE_PAGE_SIZE) + align = GPU_HUGE_PAGE_SIZE; + + if (domain == thunk_proxy::kSystem) { + gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr); + if (gpu_addr == 0) + code = ErrorCode::OutOfMemory; + + if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) { + system_heap_mgr_->Free(gpu_addr); + code = ErrorCode::SyscallFail; + } + } else { + gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr); + if (gpu_addr == 0) + code = ErrorCode::OutOfGpuMemory; + } + + *out_gpu_virt_addr = (code == ErrorCode::Success) ? 
gpu_addr : 0; + return code; +} + +ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize gpu_addr, gpusize size) { + auto code = ErrorCode::Success; + + if (domain == thunk_proxy::kSystem) { + DecommitSystemHeapSpace((void *)gpu_addr, size); + system_heap_mgr_->Free(gpu_addr); + } else { + local_heap_mgr_->Free(gpu_addr); + } + + return code; +} + +bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) { + int fd = -1; + + if (memfd == -1) { + fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC); + if (fd < 0) { + pr_err("memfd_create failed\n"); + return false; + } + + ftruncate(fd, size); + } else { + fd = memfd; + } + + int32_t protFlags = PROT_READ | PROT_WRITE; + int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE | + MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0); + + void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? 
"locked" : ""), addr, paddr); + if (memfd == -1) + close(fd); + return false; + } + assert(addr == paddr); + + memfd = fd; + + if (madvise(addr, size, MADV_DONTFORK)) + pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr); + + return true; +} + +bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) { + if (munmap(addr, size) != 0) { + pr_err("fail to unmap = %p \n", addr); + return false; + } + close(memfd); + memfd = -1; + return true; +} + +ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, + int &memfd, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + gpu_addr = system_heap_mgr_->Alloc(size, alignment, 0); + if (gpu_addr == 0) + return ErrorCode::OutOfMemory; + + if (!CommitSystemHeapSpaceIPC((void*)gpu_addr, size, memfd, lock)) { + system_heap_mgr_->Free(gpu_addr); + code = ErrorCode::SyscallFail; + } + + *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0; + return code; +} + +ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) { + auto code = ErrorCode::Success; + + DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd); + + system_heap_mgr_->Free(gpu_addr); + return code; +} + +bool hsakmtRuntime::InitHandleApertureSpace() { + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + handle_aperture_start_ = START_NON_CANONICAL_ADDR; + handle_aperture_size_ = 1ULL << 47; + + while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) { + for (uint32_t j = 0; j < num_adapters;) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + + if (device->PrivateApertureBase() && + IS_OVERLAPPING(device->PrivateApertureBase(), + device->PrivateApertureSize(), + handle_aperture_start_, + handle_aperture_size_)) { + handle_aperture_start_ += (1ULL << 47); + continue; + } + + if (device->SharedApertureBase() && + IS_OVERLAPPING(device->SharedApertureBase(), + 
device->SharedApertureSize(), + handle_aperture_start_, + handle_aperture_size_)) { + handle_aperture_start_ += (1ULL << 47); + continue; + } + + j++; + } + pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_); + return true; + } + + handle_aperture_start_ = 0; + pr_err("fail\n"); + + return false; +} + +void hsakmtRuntime::InitHandleApertureMgr() { + handle_aperture_mgr_ = std::make_unique(handle_aperture_start_, + handle_aperture_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) { + uint64_t align = DEFAULT_GPU_PAGE_SIZE; + + if (size >= GPU_HUGE_PAGE_SIZE) + align = GPU_HUGE_PAGE_SIZE; + + *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, align); + if (*out_gpu_virt_addr == 0) + return ErrorCode::OutOfHandleApeMemory; + + return ErrorCode::Success; +} + +void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) { + handle_aperture_mgr_->Free(gpu_addr); +} + +/* is_forked_child detects when the process has forked since the last + * time this function was called. We cannot rely on pthread_atfork + * because the process can fork without calling the fork function in + * libc (using clone or calling the system call directly). + */ +bool is_forked_child(void) { + if (dxg_runtime->is_forked) + return true; + + pid_t cur_pid = getpid(); + if (dxg_runtime->parent_pid != cur_pid) { + dxg_runtime->is_forked = true; + dxg_runtime->parent_pid = cur_pid; + return true; + } + + return false; +} + +/* Callbacks from pthread_atfork */ +static void prepare_fork_handler(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); } +static void parent_fork_handler(void) { pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); } +static void child_fork_handler(void) { + pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL); + dxg_runtime->is_forked = true; +} + +/* Call this from the child process after fork. 
This will clear all + * data that is duplicated from the parent process, that is not valid + * in the child. + * The topology information is duplicated from the parent is valid + * in the child process so it is not cleared + */ +static void clear_after_fork(void) { + reset_suballocator(); + clear_allocation_map(); + + if (dxg_runtime->dxg_fd >= 0) { + close(dxg_runtime->dxg_fd); + dxg_runtime->dxg_fd = -1; + } + delete dxg_runtime; + dxg_runtime = new hsakmtRuntime(); + +} + +static inline void init_page_size(void) { + dxg_runtime->page_size = sysconf(_SC_PAGESIZE); + dxg_runtime->page_shift = ffs(dxg_runtime->page_size) - 1; +} + +static HSAKMT_STATUS init_vars_from_env(void) { + char *envvar; + int debug_level; + + /* Normally libraries don't print messages. For debugging purpose, we'll + * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set. + */ + envvar = getenv("HSAKMT_DEBUG_LEVEL"); + if (envvar) { + dxg_runtime->hsakmt_debug_level = atoi(envvar); + } + + /* Check whether to support Zero frame buffer */ + envvar = getenv("HSA_ZFB"); + if (envvar) + dxg_runtime->zfb_support = atoi(envvar); + + /* Check whether to handle vendor specific aql packet */ + envvar = getenv("WSLKMT_VENDOR_PACKET"); + if (envvar) + dxg_runtime->vendor_packet_process = atoi(envvar); + + /* Decide whether to check available system memory before allocation */ + envvar = getenv("WSL_CHECK_AVAIL_SYSRAM"); + if (envvar) + dxg_runtime->check_avail_sysram = !strcmp(envvar, "1"); + + envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR"); + if (envvar) + dxg_runtime->enable_thunk_sub_allocator = atoi(envvar); + + envvar = getenv("ROCR_VISIBLE_DEVICES"); + if (envvar) { + std::string devices(envvar); + size_t first_num_pos = devices.find_first_of("0123456789"); + if (first_num_pos != std::string::npos) + dxg_runtime->default_node = std::stoi(devices.substr(first_num_pos)) + 1; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) { + HSAKMT_STATUS 
result; + int fd = -1; + HsaSystemProperties sys_props; + char *error; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* If the process has forked, the child process must re-initialize + * it's connection to DXG. Any references tracked by dxg_open_count + * belong to the parent + */ + if (is_forked_child()) + clear_after_fork(); + + if (dxg_runtime->dxg_open_count == 0) { + static bool atfork_installed = false; + + result = init_vars_from_env(); + if (result != HSAKMT_STATUS_SUCCESS) + goto open_failed; + + if (dxg_runtime->dxg_fd < 0) { + fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC); + + if (fd == -1) { + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + goto open_failed; + } + + dxg_runtime->dxg_fd = fd; + } + if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) { + pr_err("Failed to load libdxcore.so\n"); + result = HSAKMT_STATUS_ERROR; + goto dxcore_loader_failed; + } + + hsakmt_hsa_loader_init(); + init_page_size(); + + char *useSvmStr = getenv("HSA_USE_SVM"); + dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false; + + dxg_runtime->dxg_open_count = 1; + + if (!atfork_installed) { + /* Atfork handlers cannot be uninstalled and + * must be installed only once. Otherwise + * prepare will deadlock when trying to take + * the same lock multiple times. 
+ */ + pthread_atfork(prepare_fork_handler, parent_fork_handler, + child_fork_handler); + atfork_installed = true; + } + } else { + dxg_runtime->dxg_open_count++; + result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED; + } + + reset_suballocator(); + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return result; +dxcore_loader_failed: + close(fd); +open_failed: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return result; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) { + HSAKMT_STATUS result; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + if (dxg_runtime->dxg_open_count > 0) { + if (--dxg_runtime->dxg_open_count == 0) { + close(dxg_runtime->dxg_fd); + dxg_runtime->dxg_fd = -1; + wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown(); + } + + result = HSAKMT_STATUS_SUCCESS; + } else + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return result; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp new file mode 100644 index 0000000000..6c6a9e2a04 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp @@ -0,0 +1,78 @@ +/* + * Copyright © 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info, + HSAuint32 sample_info_sz, HSAuint32 *size) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, + HsaPcSamplingInfo *sample_info, + HsaPcSamplingTraceId *traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git 
a/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp new file mode 100644 index 0000000000..9189d2dafa --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp @@ -0,0 +1,90 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */

/*
 * Performance-counter (PMC) entry points. None is implemented in this
 * backend: each one runs CHECK_DXG_OPEN(), emits the "not supported" warning
 * through pr_warn_once() and returns HSAKMT_STATUS_NOT_SUPPORTED. Arguments
 * are ignored and output arguments are never written.
 *
 * NOTE(review): CHECK_DXG_OPEN() is defined elsewhere; presumably it makes the
 * caller return early when the DXG device has not been opened - confirm.
 */

/* Counter-properties query; *CounterProperties is left untouched. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(
    HSAuint32 NodeId, HsaCounterProperties **CounterProperties) {
  (void)NodeId;
  (void)CounterProperties;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Registers a set of (HW) counters to be used for tracing/profiling. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
                                               HSAuint32 NumberOfCounters,
                                               HsaCounter *Counters,
                                               HsaPmcTraceRoot *TraceRoot) {
  (void)NodeId;
  (void)NumberOfCounters;
  (void)Counters;
  (void)TraceRoot;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Unregisters a set of (HW) counters used for tracing/profiling. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
                                                 HSATraceId TraceId) {
  (void)NodeId;
  (void)TraceId;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Trace-access acquisition: reports "not supported". */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
                                                    HSATraceId TraceId) {
  (void)NodeId;
  (void)TraceId;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Trace-access release: reports "not supported". */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
                                                    HSATraceId TraceId) {
  (void)NodeId;
  (void)TraceId;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Starts tracing operation on a previously established set of performance
 * counters. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
                                            void *TraceBuffer,
                                            HSAuint64 TraceBufferSizeBytes) {
  (void)TraceId;
  (void)TraceBuffer;
  (void)TraceBufferSizeBytes;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Forces an update of all the counters that a previously started trace
 * operation has registered. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) {
  (void)TraceId;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}

/* Stops tracing operation on a previously established set of performance
 * counters. */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) {
  (void)TraceId;

  CHECK_DXG_OPEN();

  pr_warn_once("not supported\n");
  return HSAKMT_STATUS_NOT_SUPPORTED;
}
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp new file mode 100644 index 0000000000..edaaea9d1a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp @@ -0,0 +1,216 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE.
+ */ +#include +#include "impl/wddm/device.h" +#include "impl/wddm/queue.h" +#include "impl/hsa/amd_hsa_signal.h" + +uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) { + uint32_t vgpr_size = 0x40000; + + uint32_t gfxv = HSA_GET_GFX_VERSION_FULL(id.ui32); + if( gfxv == 0x1100 || gfxv == 0x1101 || + gfxv == 0x1151 || + gfxv == 0x1200 || gfxv ==0x1201) { + vgpr_size = 0x60000; + } + + return vgpr_size; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) +{ + if (Type == HSA_QUEUE_SDMA_BY_ENG_ID) + return HSAKMT_STATUS_ERROR; + + return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0, + QueueAddress, QueueSizeInBytes, Event, + QueueResource); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + HSAuint32 SdmaEngineId, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) { + HSAKMT_STATUS result; + + CHECK_DXG_OPEN(); + assert(Event == nullptr); + + if (Priority < HSA_QUEUE_PRIORITY_MINIMUM || + Priority > HSA_QUEUE_PRIORITY_MAXIMUM) + return HSAKMT_STATUS_INVALID_PARAMETER; + + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + assert(device_); + + if (queue_acquire_buffer(QueueAddress) == false) + return HSAKMT_STATUS_INVALID_PARAMETER; + + switch (Type) { + case HSA_QUEUE_COMPUTE_AQL: { + assert(QueueResource->ErrorReason == nullptr); + uint64_t pkg_num = QueueSizeInBytes / 64; + uint32_t cmdbuf_size = device_->GetCmdbufSize(); + uint32_t queue_engine = device_->GetComputeEngine(); + bool use_hws = device_->IsHwsEnabled(queue_engine); + auto queue_ = new wsl::thunk::ComputeQueue( + device_, QueueAddress, pkg_num, + reinterpret_cast *>( + QueueResource->Queue_write_ptr_aql), + reinterpret_cast *>( 
+ QueueResource->Queue_read_ptr_aql), + QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws); + + QueueResource->QueueId = reinterpret_cast(queue_); + // for doorbell_signal.hardware_doorbell_ptr + QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); + } break; + case HSA_QUEUE_SDMA: + case HSA_QUEUE_SDMA_BY_ENG_ID: { + pr_debug("create sdma queue in engine %d\n", SdmaEngineId); + uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId + bool use_hws = device_->IsHwsEnabled(queue_engine); + auto queue_ = new wsl::thunk::SDMAQueue( + device_, QueueAddress, QueueSizeInBytes, + queue_engine, use_hws); + QueueResource->QueueId = reinterpret_cast(queue_); + QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); + QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr(); + QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr(); + } break; + default: + assert(false); + QueueResource->QueueId = 0; + QueueResource->Queue_DoorBell = nullptr; + break; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue( + HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (Priority < HSA_QUEUE_PRIORITY_MINIMUM || + Priority > HSA_QUEUE_PRIORITY_MAXIMUM) + return HSAKMT_STATUS_INVALID_PARAMETER; + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + void *QueueAddress = queue_->GetHsaQueueAddr(); + + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + delete queue_; + queue_release_buffer(QueueAddress); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, + HSAuint32 CUMaskCount, + HSAuint32 *QueueCUMask) { + CHECK_DXG_OPEN(); 
+ + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_warn_once("not implemented\n"); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId, + HsaQueueInfo *QueueInfo) { + CHECK_DXG_OPEN(); + + if (QueueInfo == NULL) + return HSAKMT_STATUS_INVALID_PARAMETER; + memset(QueueInfo, 0, sizeof(*QueueInfo)); + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, + void *TrapHandlerBaseAddress, + HSAuint64 TrapHandlerSizeInBytes, + void *TrapBufferBaseAddress, + HSAuint64 TrapBufferSizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS, + HSAuint32 *firstGWS) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + queue_->RingDoorbell(); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp new file mode 100644 index 0000000000..14b0faf1f8 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp @@ -0,0 +1,50 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer( + HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout, + HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp new file mode 100644 index 0000000000..f2f8a10f68 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp @@ -0,0 +1,55 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* Helper functions for calling KFD SVM ioctl */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + *enable = false; + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a new file mode 100644 index 0000000000..3b21eb936d Binary files /dev/null and b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a differ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp new file mode 100644 index 0000000000..a28bb29215 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp @@ -0,0 +1,49 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include "impl/wddm/device.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, + HsaClockCounters *Counters) { + HSAKMT_STATUS result = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + std::memset(Counters, 0, sizeof(*Counters)); + + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + assert(device_); + device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter); + + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0) + Counters->SystemClockCounter = ts.tv_sec * 1e9 + ts.tv_nsec; + Counters->SystemClockFrequencyHz = 1000000000; + + return result; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp new file mode 100644 index 0000000000..2db712e341 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp @@ -0,0 +1,1463 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "util/utils.h" + +/* Number of memory banks added by thunk on top of topology + * This only includes static heaps like LDS, scratch and SVM, + * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported + * dynamically based on whether mmio aperture was mapped + * successfully on this node. + */ +#define NUM_OF_IGPU_HEAPS 3 +#define NUM_OF_DGPU_HEAPS 3 + +typedef struct { + HsaNodeProperties node; + std::vector mem; /* node->NumBanks elements */ + std::vector cache; + std::vector link; +} node_props_t; + +struct _topology_props { + HsaSystemProperties *g_system = nullptr; + std::vector g_props; + std::vector wdevices_; + uint32_t wdevice_num_ = 0; + uint32_t num_sysfs_nodes = 0; + int processor_vendor = -1; + double freq_max_ = 0.0; +}; + +static _topology_props* dxg_topology = new _topology_props(); + +/* Supported System Vendors */ +enum SUPPORTED_PROCESSOR_VENDORS { + GENUINE_INTEL = 0, + AUTHENTIC_AMD, + IBM_POWER +}; +/* Adding newline to make the search easier */ +static const char *supported_processor_vendor_name[] = { + "GenuineIntel", + "AuthenticAMD", + "" // POWER requires a different search method +}; + +static HSAKMT_STATUS topology_take_snapshot(void); +static void topology_drop_snapshot(void); + +/* information from /proc/cpuinfo */ +struct proc_cpuinfo { + uint32_t proc_num; /* processor */ + uint32_t apicid; /* apicid */ + char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */ +}; + +/* CPU cache table for all CPUs on the system. 
Each entry has the relative CPU + * info and caches connected to that CPU. + */ +typedef struct cpu_cacheinfo { + int32_t proc_num; /* this cpu's processor number */ + uint32_t num_caches; /* number of caches reported by this cpu */ +} cpu_cacheinfo_t; + +/* num_subdirs - find the number of sub-directories in the specified path + * @dirpath - directory path to find sub-directories underneath + * @prefix - only count sub-directory names starting with prefix. + * Use blank string, "", to count all. + * Return - number of sub-directories + */ +static int num_subdirs(char *dirpath, const char *prefix) { + int count = 0; + DIR *dirp; + struct dirent *dir; + int prefix_len = strlen(prefix); + + dirp = opendir(dirpath); + if (dirp) { + while ((dir = readdir(dirp)) != 0) { + if ((strcmp(dir->d_name, ".") == 0) || (strcmp(dir->d_name, "..") == 0)) + continue; + if (prefix_len && strncmp(dir->d_name, prefix, prefix_len)) + continue; + count++; + } + closedir(dirp); + } + + return count; +} + +/* fscanf_dec - read a file whose content is a decimal number + * @file [IN ] file to read + * @num [OUT] number in the file + */ +static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%u", num) != 1) { + pr_err("Failed to parse %s as a decimal.\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_str - read a file whose content is a string + * @file [IN ] file to read + * @str [OUT] string in the file + */ +static HSAKMT_STATUS fscanf_str(char *file, char *str) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%s", str) != 1) { + pr_err("Failed to parse %s as a string.\n", file); + ret = 
HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_size - read a file whose content represents size as a string + * @file [IN ] file to read + * @bytes [OUT] sizes in bytes + */ +static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + char unit; + int n; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + n = fscanf(fd, "%u%c", bytes, &unit); + if (n < 1) { + pr_err("Failed to parse %s\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + if (n == 2) { + switch (unit) { + case 'K': + *bytes <<= 10; + break; + case 'M': + *bytes <<= 20; + break; + case 'G': + *bytes <<= 30; + break; + default: + ret = HSAKMT_STATUS_ERROR; + break; + } + } + + fclose(fd); + return ret; +} + +/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into + * SiblingMap in cache + * @shared_cpu_map [IN ] shared_cpu_map string + * @cpuinfo [IN ] cpuinfo to get apicid + * @this_cache [OUT] CPU cache to fill in SiblingMap + */ +static void cpumap_to_cpu_ci(char *shared_cpu_map, + const std::vector& cpuinfo, + HsaCacheProperties *this_cache) { + int num_hexs, bit; + uint32_t proc, apicid, mask; + char *ch_ptr; + + /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x + * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually + * procs), it's presented in X1. The next 32 is in X2, and so on. 
+ */ + num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */ + ch_ptr = strtok(shared_cpu_map, ","); + while (num_hexs-- > 0) { + mask = strtol(ch_ptr, NULL, 16); /* each X */ + for (bit = 0; bit < 32; bit++) { + if (!((1 << bit) & mask)) + continue; + proc = num_hexs * 32 + bit; + apicid = cpuinfo[proc].apicid; + if (apicid >= HSA_CPU_SIBLINGS) { + pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS); + continue; + } + this_cache->SiblingMap[apicid] = 1; + } + ch_ptr = strtok(NULL, ","); + } +} + +/* get_cpu_cache_info - get specified CPU's cache information from sysfs + * @prefix [IN] sysfs path for target cpu cache, + * /sys/devices/system/node/nodeX/cpuY/cache + * @cpuinfo [IN] /proc/cpuinfo data to get apicid + * @cpu_ci: CPU specified. This parameter is an input and also an output. + * [IN] cpu_ci->num_caches: number of index dirs + * [OUT] cpu_ci->cache_info: to store cache info collected + * [OUT] cpu_ci->num_caches: reduces when shared with other cpu(s) + * Return: number of cache reported from this cpu + */ +static int get_cpu_cache_info(const char *prefix, + const std::vector& cpuinfo, + std::vector& cache, + cpu_cacheinfo_t& cpu_ci) { + int n; + char path[256], str[256]; + bool is_power9 = false; + + if (dxg_topology->processor_vendor == IBM_POWER) { + if (strcmp(cpuinfo[0].model_name, "POWER9") == 0) { + is_power9 = true; + } + } + + HsaCacheProperties this_cache; + int num_idx = cpu_ci.num_caches; + for (int idx = 0; idx < num_idx; idx++) { + memset(&this_cache, 0, sizeof(this_cache)); + /* If this cache is shared by multiple CPUs, we only need + * to list it in the first CPU. + */ + if (is_power9) { + // POWER9 has SMT4 + if (cpu_ci.proc_num & 0x3) { + /* proc is not 0,4,8,etc. Skip and reduce the cache count. */ + --cpu_ci.num_caches; + continue; + } + } else { + snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx); + /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4... 
+ * For both cases, this cache is listed to proc n1 only. + */ + fscanf_dec(path, (uint32_t *)&n); + if (cpu_ci.proc_num != n) { + /* proc is not n1. Skip and reduce the cache count. */ + --cpu_ci.num_caches; + continue; + } + this_cache.ProcessorIdLow = cpuinfo[cpu_ci.proc_num].apicid; + } + + /* CacheLevel */ + snprintf(path, 256, "%s/index%d/level", prefix, idx); + fscanf_dec(path, &this_cache.CacheLevel); + /* CacheType */ + snprintf(path, 256, "%s/index%d/type", prefix, idx); + + memset(str, 0, sizeof(str)); + fscanf_str(path, str); + if (!strcmp(str, "Data")) + this_cache.CacheType.ui32.Data = 1; + if (!strcmp(str, "Instruction")) + this_cache.CacheType.ui32.Instruction = 1; + if (!strcmp(str, "Unified")) { + this_cache.CacheType.ui32.Data = 1; + this_cache.CacheType.ui32.Instruction = 1; + } + this_cache.CacheType.ui32.CPU = 1; + /* CacheSize */ + snprintf(path, 256, "%s/index%d/size", prefix, idx); + fscanf_size(path, &this_cache.CacheSize); + /* CacheLineSize */ + snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx); + fscanf_dec(path, &this_cache.CacheLineSize); + /* CacheAssociativity */ + snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx); + fscanf_dec(path, &this_cache.CacheAssociativity); + /* CacheLinesPerTag */ + snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx); + fscanf_dec(path, &this_cache.CacheLinesPerTag); + /* CacheSiblings */ + snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx); + fscanf_str(path, str); + cpumap_to_cpu_ci(str, cpuinfo, &this_cache); + + cache.push_back(this_cache); + } + + return cpu_ci.num_caches; +} + +static HSAKMT_STATUS topology_map_node_id(uint32_t node_id, + wsl::thunk::WDDMDevice *&device) { + uint32_t idx = node_id; + if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) { + device = nullptr; + return HSAKMT_STATUS_ERROR; + } + + device = dxg_topology->wdevices_[node_id - 1]; + return HSAKMT_STATUS_SUCCESS; 
+} + +HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + bool is_node_supported = true; + uint32_t num_supported_nodes = 0; + + std::memset(&props, 0, sizeof(props)); + + dxg_runtime->HeapFini(); + for (auto device : dxg_topology->wdevices_) + delete device; + dxg_topology->wdevices_.clear(); + + WDDMCreateDevices(dxg_topology->wdevices_); + int num_adapters = dxg_topology->wdevices_.size(); + if (num_adapters == 0) { + pr_err("No WDDM adapters found.\n"); + return HSAKMT_STATUS_ERROR; + } + + dxg_topology->num_sysfs_nodes = num_adapters + 1; + dxg_runtime->HeapInit(); + props.NumNodes = dxg_topology->num_sysfs_nodes; + if (dxg_runtime->default_node > num_adapters) + dxg_runtime->default_node = num_adapters; + + return ret; +} + +void topology_setup_is_dgpu_param(HsaNodeProperties *props) { + /* if we found a dGPU node, then treat the whole system as dGPU */ + /* noted that some APUs are also treated as dGPU in runtime */ + if (!props->NumCPUCores && props->NumFComputeCores) + dxg_runtime->hsakmt_is_dgpu = true; +} + +static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties& props, + const std::vector& cpuinfo) { + for (int i = 0; i < cpuinfo.size(); i++) { + if (props.CComputeIdLo == cpuinfo[i].apicid) { + if (!props.DeviceId) /* CPU-only node */ + strncpy((char *)props.AMDName, cpuinfo[i].model_name, + sizeof(props.AMDName)); + /* Convert from UTF8 to UTF16 */ + int j; + for (j = 0; + cpuinfo[i].model_name[j] != '\0' && j < HSA_PUBLIC_NAME_SIZE - 1; j++) + props.MarketingName[j] = cpuinfo[i].model_name[j]; + props.MarketingName[j] = '\0'; + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_ERROR; +} + +static int topology_search_processor_vendor(const std::string& processor_name) { + for (unsigned int i = 0; i < ARRAY_LEN(supported_processor_vendor_name); i++) { + if (processor_name == supported_processor_vendor_name[i]) + return i; + if (processor_name == "POWER9, 
altivec supported") + return IBM_POWER; + } + return -1; +} + +/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required + * topology information + * cpuinfo [OUT]: output buffer to hold cpu information + * num_procs: number of processors the output buffer can hold + */ +static HSAKMT_STATUS topology_parse_cpuinfo(std::vector& cpuinfo) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + uint32_t num_procs = cpuinfo.size(); + + std::ifstream cpuinfo_max_freq( + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"); + if (cpuinfo_max_freq) { + std::string line; + std::getline(cpuinfo_max_freq, line); + dxg_topology->freq_max_ = static_cast(std::stod(line) / 1000); + } + + std::ifstream cpuinfo_file("/proc/cpuinfo"); + if (!cpuinfo_file) { + pr_err("Failed to open /proc/cpuinfo. Unable to get CPU information"); + return HSAKMT_STATUS_ERROR; + } + + std::string line; + uint32_t proc = 0; + while (std::getline(cpuinfo_file, line)) { + if (line.substr(0, 9) == "processor") { + proc = std::stoi(line.substr(line.find(':') + 2)); + if (proc >= num_procs) { + pr_err("cpuinfo contains processor %d larger than %u\n", proc, num_procs); + return HSAKMT_STATUS_NO_MEMORY; + } + continue; + } + + if (line.substr(0, 9) == "vendor_id" && dxg_topology->processor_vendor == -1) { + std::string vendor = line.substr(line.find(':') + 2); + dxg_topology->processor_vendor = topology_search_processor_vendor(vendor.c_str()); + continue; + } + + if (line.substr(0, 10) == "model name") { + std::string model_name = line.substr(line.find(':') + 2); + if (model_name.size() > HSA_PUBLIC_NAME_SIZE) + model_name.resize(HSA_PUBLIC_NAME_SIZE); + std::strncpy(cpuinfo[proc].model_name, model_name.c_str(), HSA_PUBLIC_NAME_SIZE); + continue; + } + + if (line.substr(0, 6) == "apicid") { + cpuinfo[proc].apicid = std::stoi(line.substr(line.find(':') + 2)); + continue; + } + + if (!cpuinfo_max_freq) { + if (line.substr(0, 7) == "cpu MHz") { + double freq = std::stod(line.substr(line.find(':') + 2)); 
+ if (freq > dxg_topology->freq_max_) { + dxg_topology->freq_max_ = freq; + } + continue; + } + } + } + + if (dxg_topology->processor_vendor < 0) { + pr_err("Failed to get Processor Vendor. Setting to %s", supported_processor_vendor_name[GENUINE_INTEL]); + dxg_topology->processor_vendor = GENUINE_INTEL; + } + + return ret; +} + +static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, + HsaNodeProperties& props, + bool& p2p_links, + uint32_t& num_p2pLinks) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + memset(&props, 0, sizeof(props)); + p2p_links = false; + num_p2pLinks = 0; + + props.MaxEngineClockMhzCCompute = dxg_topology->freq_max_; + + if (node_id == 0) { + /* CPU node */ + props.NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN); + props.NumMemoryBanks = 1; + props.KFDGpuID = 0; + return HSAKMT_STATUS_SUCCESS; + } + + /* gpu node */ + wsl::thunk::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props.NumCPUCores = 0; + props.NumFComputeCores = device->SimdPerCu() * device->ComputeUnitCount(); + props.NumMemoryBanks = 1; + props.NumCaches = 3; + props.NumIOLinks = 1; + props.CComputeIdLo = 0; + props.FComputeIdLo = 0; + props.Capability.ui32.ASICRevision = device->AsicRevision(); + props.Capability.ui32.WatchPointsTotalBits = + std::log2(device->WatchPointsNum()); + props.MaxWavesPerSIMD = device->WavePerCu() / device->SimdPerCu(); + props.LDSSizeInKB = device->LdsSize() / 1024; + props.GDSSizeInKB = 0; + props.WaveFrontSize = device->WavefrontSize(); + props.NumShaderBanks = device->NumShaderEngine(); + props.NumArrays = device->ShaderArrayPerShaderEngine(); + props.NumCUPerArray = device->ComputeUnitCount() / props.NumArrays; + props.NumSIMDPerCU = device->SimdPerCu(); + props.MaxSlotsScratchCU = device->MaxScratchSlotsPerCu(); + props.VendorId = 0x1002; + props.DeviceId = device->DeviceId(); + props.LocationId = device->PciBusAddr(); + props.LocalMemSize = 0; + 
props.MaxEngineClockMhzFCompute = device->MaxEngineClockMhz(); + props.DrmRenderMinor = node_id; + + { + int i; + const char *name = device->ProductName(); + for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++) + props.MarketingName[i] = name[i]; + props.MarketingName[i] = '\0'; + } + props.uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion(); + props.DebugProperties.Value = 0; + props.HiveID = 0; + props.NumSdmaEngines = device->NumSdmaEngine(); + props.NumSdmaXgmiEngines = 0; + props.NumSdmaQueuesPerEngine = 6; // TODO + props.NumCpQueues = device->GetNumCpQueues(); + props.NumGws = 0; + /* + * In Native Linux, if the asic is APU, this value will be set to 1, + * if the asic is dGPU, this value will be set to 0. clr use this info + * to set hostUnifiedMemory_, but for now wsl does not support this feature. + * Therefore, fore vaule to 0 temporarily. + */ + props.Integrated = 0; + props.Domain = device->Domain(); + props.UniqueID = device->Uuid(); + props.NumXcc = 1; + props.KFDGpuID = device->DeviceId(); // TODO + props.FamilyID = device->GfxFamily(); + + props.EngineId.ui32.uCode = device->GetMecFwVersion(); + char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION"); + if (envvar) { + char dummy = '\0'; + uint32_t major = 0, minor = 0, step = 0; + /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping */ + if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) || + (major > 63 || minor > 255 || step > 255)) { + pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar); + return HSAKMT_STATUS_ERROR; + } + props.OverrideEngineId.ui32.Major = major & 0x3f; + props.OverrideEngineId.ui32.Minor = minor & 0xff; + props.OverrideEngineId.ui32.Stepping = step & 0xff; + } + props.EngineId.ui32.Major = device->Major(); + props.EngineId.ui32.Minor = device->Minor(); + props.EngineId.ui32.Stepping = device->Stepping(); + + snprintf((char *)props.AMDName, sizeof(props.AMDName) - 1, "GFX%06x", + HSA_GET_GFX_VERSION_FULL(props.EngineId.ui32)); + + if 
(!dxg_runtime->is_svm_api_supported) + props.Capability.ui32.SVMAPISupported = 0; + props.Capability.ui32.DoorbellType = 2; + + /* Get VGPR/SGPR size in byte per CU */ + props.SGPRSizePerCU = SGPR_SIZE_PER_CU; + props.VGPRSizePerCU = get_vgpr_size_per_cu(props.EngineId); + + if (props.NumFComputeCores) + assert(props.EngineId.ui32.Major && + "HSA_OVERRIDE_GFX_VERSION may be needed"); + + return ret; +} + +static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, + uint32_t mem_id, + HsaMemoryProperties& props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + std::memset(&props, 0, sizeof(props)); + if (node_id == 0) { + /* CPU node */ + props.HeapType = HSA_HEAPTYPE_SYSTEM; + + struct sysinfo info; + sysinfo(&info); + props.SizeInBytes = info.totalram; + + /* props.SizeInBytes is the actual physical system + * memory size. Reserve 1/16th for WSL system usage. + */ + dxg_runtime->max_single_alloc_size = info.totalram - (info.totalram >> 4); + + props.Flags.MemoryProperty = 0; + /* TODO: sudo dmidecode --type memory doesn't work on wsl */ + props.Width = 64; + props.MemoryClockMax = 2133; + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props.HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; + + if (device->IsDgpu()) + props.SizeInBytes = device->LocalHeapSize(); + else + props.SizeInBytes = device->NonLocalHeapSize(); + + props.Width = device->MemoryBusWidth(); + props.MemoryClockMax = device->MaxMemoryClockMhz(); + + return ret; +} + +/* topology_get_cpu_cache_props - Read CPU cache information from sysfs + * @node [IN] CPU node number + * @cpuinfo [IN] /proc/cpuinfo data + * @tbl [OUT] the node table to fill up + * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure + */ +static HSAKMT_STATUS topology_get_cpu_cache_props(int node, + const std::vector& cpuinfo, + node_props_t& tbl) { + HSAKMT_STATUS ret = 
HSAKMT_STATUS_SUCCESS; + + /* Get max path size from /sys/devices/system/node/node%d/%s/cache + * below, which will max out according to the largest filename, + * which can be present twice in the string above. 29 is for the prefix + * and the +6 is for the cache suffix + */ +#ifndef MAXNAMLEN +/* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but + * not musl */ +#define MAXNAMLEN NAME_MAX +#endif + constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6); + char path[MAXPATHSIZE], node_dir[MAXPATHSIZE]; + int max_cpus; + int cache_cnt = 0; + DIR *dirp = NULL; + struct dirent *dir; + char *p; + + /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */ + int node_real = node; + if (dxg_topology->processor_vendor == IBM_POWER) { + if (!strcmp(cpuinfo[0].model_name, "POWER9")) { + node_real = node * 8; + } + } + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real); + /* Other than cpuY folders, this dir also has cpulist and cpumap */ + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + /* If CONFIG_NUMA is not enabled in the kernel, + * /sys/devices/system/node doesn't exist. 
+ */ + if (node) { /* CPU node must be 0 or something is wrong */ + pr_err("Fail to get cpu* dirs under %s.", node_dir); + ret = HSAKMT_STATUS_ERROR; + goto exit; + } + /* Fall back to use /sys/devices/system/cpu */ + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu"); + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + pr_err("Fail to get cpu* dirs under %s\n", node_dir); + ret = HSAKMT_STATUS_ERROR; + goto exit; + } + } + + dirp = opendir(node_dir); + while ((dir = readdir(dirp)) != 0) { + if (strncmp(dir->d_name, "cpu", 3)) + continue; + if (!isdigit(dir->d_name[3])) /* ignore files like cpulist */ + continue; + if (strlen(node_dir) + strlen(dir->d_name) + strlen("/cache") + 2 < MAXPATHSIZE) { + std::string path_str = std::string(node_dir) + "/" + dir->d_name + "/cache"; + strncpy(path, path_str.c_str(), MAXPATHSIZE); + path[MAXPATHSIZE - 1] = '\0'; + } else { + pr_err("Path is too long and was truncated.\n"); + goto exit; + } + + cpu_cacheinfo_t cpu_ci; + cpu_ci.num_caches = num_subdirs(path, "index"); + cpu_ci.proc_num= atoi(dir->d_name+3); + + cache_cnt += get_cpu_cache_info(path, cpuinfo, tbl.cache, cpu_ci); + } + assert(cache_cnt == tbl.cache.size()); + tbl.node.NumCaches = cache_cnt; + +exit: + if (dirp) + closedir(dirp); + return ret; +} + +/* For a give Node @node_id the function gets @iolink_id information i.e. parses + * sysfs the following sysfs entry + * ./nodes/@node_id/io_links/@iolink_id/properties. @node_id has to be valid + * accessible node. + * + * If node_to specified by the @iolink_id is not accessible the function returns + * HSAKMT_STATUS_NOT_SUPPORTED. If node_to is accessible, then node_to is mapped + * from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS. 
+ */ +static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, + uint32_t iolink_id, + HsaIoLinkProperties& props, + bool p2pLink) { + wsl::thunk::WDDMDevice *device; + topology_map_node_id(node_id, device); + + std::memset(&props, 0, sizeof(props)); + props.IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS; + props.VersionMajor = props.VersionMinor = 0; + props.NodeFrom = node_id; + props.NodeTo = 0; + props.Weight = 20; + props.Flags.ui32.Override = 1; + props.Flags.ui32.NonCoherent = 1; + props.Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic()); + props.Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic()); + props.RecSdmaEngIdMask = 0; + + return HSAKMT_STATUS_SUCCESS; +} + +/* topology_get_free_io_link_slot_for_node - For the given node_id, find the + * next available free slot to add an io_link + */ +static HsaIoLinkProperties * +topology_get_free_io_link_slot_for_node(uint32_t node_id, + const HsaSystemProperties& sys_props, + std::vector& node_props) { + std::vector& props = node_props[node_id].link; + + if (node_id >= sys_props.NumNodes) { + pr_err("Invalid node [%d]\n", node_id); + return NULL; + } + + if (!props.size()) { + pr_err("No io_link reported for Node [%d]\n", node_id); + return NULL; + } + + if (node_props[node_id].node.NumIOLinks >= sys_props.NumNodes - 1) { + pr_err("No more space for io_link for Node [%d]\n", node_id); + return NULL; + } + + return &props[node_props[node_id].node.NumIOLinks]; +} + +/* topology_add_io_link_for_node - If a free slot is available, + * add io_link for the given Node. 
+ * TODO: Add other members of HsaIoLinkProperties + */ +static HSAKMT_STATUS topology_add_io_link_for_node( + uint32_t node_from, const HsaSystemProperties& sys_props, + std::vector& node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to, + uint32_t Weight) { + HsaIoLinkProperties *props; + + props = + topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props); + if (!props) + return HSAKMT_STATUS_NO_MEMORY; + + props->IoLinkType = IoLinkType; + props->NodeFrom = node_from; + props->NodeTo = node_to; + props->Weight = Weight; + node_props[node_from].node.NumIOLinks++; + + return HSAKMT_STATUS_SUCCESS; +} + +/* Find the CPU that this GPU (gpu_node) directly connects to */ +static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node, + const std::vector& node_props) { + const std::vector& props = node_props[gpu_node].link; + uint32_t i; + + if (!node_props[gpu_node].node.KFDGpuID || props.empty() || + node_props[gpu_node].node.NumIOLinks == 0) + return -1; + + for (i = 0; i < node_props[gpu_node].node.NumIOLinks; i++) + if (props[i].IoLinkType == HSA_IOLINKTYPE_PCIEXPRESS && + props[i].Weight <= 20) /* >20 is GPU->CPU->GPU */ + return props[i].NodeTo; + + return -1; +} + +/* Get node1->node2 IO link information. This should be a direct link that has + * been created in the kernel. 
+ */ +static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2, + const std::vector& node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + const std::vector& props = node_props[node1].link; + uint32_t i; + + if (!props.size()) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + for (i = 0; i < node_props[node1].node.NumIOLinks; i++) + if (props[i].NodeTo == node2) { + if (weight) + *weight = props[i].Weight; + if (type) + *type = props[i].IoLinkType; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_INVALID_PARAMETER; +} + +static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2, + const std::vector& node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + int32_t dir_cpu1 = -1, dir_cpu2 = -1; + HSAKMT_STATUS ret; + uint32_t i; + + *weight = 0; + *type = HSA_IOLINKTYPE_UNDEFINED; + + if (node1 == node2) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* CPU->CPU is not an indirect link */ + if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + if (node_props[node1].node.HiveID && node_props[node2].node.HiveID && + node_props[node1].node.HiveID == node_props[node2].node.HiveID) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (node_props[node1].node.KFDGpuID) + dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props); + if (node_props[node2].node.KFDGpuID) + dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props); + + if (dir_cpu1 < 0 && dir_cpu2 < 0) + return HSAKMT_STATUS_ERROR; + + /* if the node2(dst) is GPU , it need to be large bar for host access*/ + if (node_props[node2].node.KFDGpuID) { + for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i) + if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC) + break; + if (i >= node_props[node2].node.NumMemoryBanks) + return HSAKMT_STATUS_ERROR; + } + /* Possible topology: + * GPU --(weight1) -- CPU -- (weight2) -- GPU + * GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- 
GPU + * GPU --(weight1) -- CPU -- (weight2) -- CPU + * CPU -- (weight2) -- CPU -- (weight3) -- GPU + */ + HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0; + if (dir_cpu1 >= 0) { /* GPU->CPU ... */ + if (dir_cpu2 >= 0) { + if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = + get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } else /* GPU->CPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2, + type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + /* On QPI interconnection, GPUs can't access + * each other if they are attached to different + * CPU sockets. CPU<->CPU weight larger than 20 + * means the two CPUs are in different sockets. + */ + if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20) + return HSAKMT_STATUS_NOT_SUPPORTED; + ret = + get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + } else /* GPU->CPU->CPU */ { + ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } + } else { /* CPU->CPU->GPU */ + ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + *weight = weight1 + weight2 + weight3; + return HSAKMT_STATUS_SUCCESS; +} + +static void +topology_create_indirect_gpu_links(const HsaSystemProperties& sys_props, + std::vector& node_props) { + + uint32_t i, j; + HSAuint32 weight; + HSA_IOLINKTYPE type; + + for (i = 0; i < sys_props.NumNodes - 1; i++) { + for (j = 
i + 1; j < sys_props.NumNodes; j++) { + get_indirect_iolink_info(i, j, node_props, &weight, &type); + if (!weight) + goto try_alt_dir; + if (topology_add_io_link_for_node(i, sys_props, node_props, type, j, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", i, j); + try_alt_dir: + get_indirect_iolink_info(j, i, node_props, &weight, &type); + if (!weight) + continue; + if (topology_add_io_link_for_node(j, sys_props, node_props, type, i, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", j, i); + } + } +} + +HSAKMT_STATUS topology_take_snapshot(void) { + uint32_t i, mem_id, cache_id; + HsaSystemProperties sys_props; + std::vector& temp_props = dxg_topology->g_props; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN); + std::vector cpuinfo(num_procs); + uint32_t num_ioLinks; + bool p2p_links = false; + uint32_t num_p2pLinks = 0; + + topology_parse_cpuinfo(cpuinfo); + + ret = topology_sysfs_get_system_props(sys_props); + if (ret != HSAKMT_STATUS_SUCCESS) + goto err; + if (sys_props.NumNodes > 0) { + temp_props.resize(sys_props.NumNodes); + + for (i = 0; i < sys_props.NumNodes; i++) { + wsl::thunk::WDDMDevice *device_; + topology_map_node_id(i, device_); + + ret = topology_sysfs_get_node_props(i, temp_props[i].node, p2p_links, + num_p2pLinks); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + + topology_setup_is_dgpu_param(&temp_props[i].node); + + if (temp_props[i].node.NumCPUCores) + topology_get_cpu_model_name(temp_props[i].node, cpuinfo); + + if (temp_props[i].node.NumMemoryBanks) { + temp_props[i].mem.resize(temp_props[i].node.NumMemoryBanks); + + for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) { + ret = topology_sysfs_get_mem_props(i, mem_id, + temp_props[i].mem[mem_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + } + } + + if (temp_props[i].node.NumCaches) { + temp_props[i].cache.resize(temp_props[i].node.NumCaches); + 
for (int j = 0; j < 3; j++) { + temp_props[i].cache[j].CacheType.ui32.Data = 1; + temp_props[i].cache[j].CacheType.ui32.HSACU = 1; + temp_props[i].cache[j].CacheLevel = j + 1; + } + temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024; + temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024; + temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024; + } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */ + ret = topology_get_cpu_cache_props(i, cpuinfo, temp_props[i]); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + } + + /* To simplify, allocate maximum needed memory for io_links for each node. + * This removes the need for realloc when indirect and QPI links are added + * later + */ + temp_props[i].link.resize(sys_props.NumNodes - 1); + num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks; + uint32_t link_id = 0; + + if (num_ioLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified io links. Skip the ones where the + * remote node (node_to) is not accessible + */ + while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, temp_props[i].link[link_id], false); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + link_id++; + } + /* sysfs specifies all the io links. Limit the number to valid ones */ + temp_props[i].node.NumIOLinks = link_id; + } + + if (num_p2pLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified p2p links. 
+ */ + while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, temp_props[i].link[link_id], true); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + link_id++; + } + temp_props[i].node.NumIOLinks = link_id; + } + } + } + + if (!p2p_links) { + /* All direct IO links are created in the kernel. Here we need to + * connect GPU<->GPU or GPU<->CPU indirect IO links. + */ + topology_create_indirect_gpu_links(sys_props, temp_props); + } + + if (!dxg_topology->g_system) { + dxg_topology->g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties)); + if (!dxg_topology->g_system) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err; + } + } + + *dxg_topology->g_system = sys_props; +err: + return ret; +} + +/* Drop the Snashot of the HSA topology information. Assume lock is held. */ +void topology_drop_snapshot(void) { + if (!!dxg_topology->g_system != !!dxg_topology->g_props.size()) + pr_warn("Probably inconsistency?\n"); + + dxg_topology->g_props.clear(); + + free(dxg_topology->g_system); + dxg_topology->g_system = NULL; + + trim_suballocator(); + for (auto device : dxg_topology->wdevices_) + delete device; + dxg_topology->wdevices_.clear(); +} + +HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) { + if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= nodeid) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + if (gpu_id) + *gpu_id = dxg_topology->g_props[nodeid].node.KFDGpuID; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) { + uint64_t node_idx; + + for (node_idx = 0; node_idx < dxg_topology->g_system->NumNodes; node_idx++) { + if (dxg_topology->g_props[node_idx].node.KFDGpuID == gpu_id) { + *node_id = node_idx; + return HSAKMT_STATUS_SUCCESS; + } + } + + return 
HSAKMT_STATUS_INVALID_NODE_UNIT; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + if (!SystemProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* We already have a valid snapshot. Avoid double initialization that + * would leak memory. + */ + if (dxg_topology->g_system) { + *SystemProperties = *dxg_topology->g_system; + goto out; + } + + err = topology_take_snapshot(); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + assert(dxg_topology->g_system); + + // err = fmm_init_process_apertures(dxg_topology->g_system->NumNodes); + if (err != HSAKMT_STATUS_SUCCESS) + goto init_process_apertures_failed; + + // err = init_process_doorbells(dxg_topology->g_system->NumNodes); + if (err != HSAKMT_STATUS_SUCCESS) + goto init_doorbells_failed; + + *SystemProperties = *dxg_topology->g_system; + + goto out; + +init_doorbells_failed: + // fmm_destroy_process_apertures(); +init_process_apertures_failed: + topology_drop_snapshot(); + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) { + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + topology_drop_snapshot(); + + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, + HsaNodeProperties *NodeProperties) { + if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + *NodeProperties = dxg_topology->g_props[NodeId].node; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { + HSAKMT_STATUS err; + uint32_t gpu_id; + + if (!NodeProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + 
pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + err = validate_nodeid(NodeId, &gpu_id); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + err = topology_get_node_props(NodeId, NodeProperties); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + /* For CPU only node don't add any additional GPU memory banks. */ + if (gpu_id) { + uint64_t base, limit; + if (!(NodeProperties->Integrated)) + NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; + else + NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; + // TODO: for apu + /*if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base, + &limit) == HSAKMT_STATUS_SUCCESS) + NodeProperties->NumMemoryBanks += 1;*/ + } + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks, + HsaMemoryProperties *MemoryProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + uint32_t i; + + if (!MemoryProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties)); + for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { + assert(dxg_topology->g_props[NodeId].mem.size()); + MemoryProperties[i] = dxg_topology->g_props[NodeId].mem[i]; + } + + /* The following memory banks does not apply to CPU only node */ + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + if (device_ == nullptr) + goto out; + + /*Add LDS*/ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; + MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase(); + MemoryProperties[i].SizeInBytes = dxg_topology->g_props[NodeId].node.LDSSizeInKB * 1024; + i++; + } + + /* Add SCRATCH */ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH; + MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase(); + 
MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize(); + i++; + } + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties( + HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches, + HsaCacheProperties *CacheProperties) { + HSAKMT_STATUS err; + uint32_t i; + + if (!CacheProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumCaches > dxg_topology->g_props[NodeId].node.NumCaches) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumCaches, NumCaches); i++) { + assert(dxg_topology->g_props[NodeId].cache.size()); + CacheProperties[i] = dxg_topology->g_props[NodeId].cache[i]; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + memcpy(IoLinkProperties, dxg_topology->g_props[NodeId].link.data(), + NumIoLinks * sizeof(*IoLinkProperties)); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + HSAKMT_STATUS err; + + if (!IoLinkProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { + err = 
HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumIoLinks > dxg_topology->g_props[NodeId].node.NumIOLinks) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + assert(dxg_topology->g_props[NodeId].link.size()); + err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +uint16_t get_device_id_by_node_id(HSAuint32 node_id) { + if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= node_id) + return 0; + + return dxg_topology->g_props[node_id].node.DeviceId; +} + +bool prefer_ats(HSAuint32 node_id) { + return dxg_topology->g_props[node_id].node.Capability.ui32.HSAMMUPresent && + dxg_topology->g_props[node_id].node.NumCPUCores && + dxg_topology->g_props[node_id].node.NumFComputeCores; +} + +uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) { + unsigned int i; + + if (dxg_topology->g_props.empty() || !dxg_topology->g_system) + return 0; + + for (i = 0; i < dxg_topology->g_system->NumNodes; i++) { + if (dxg_topology->g_props[i].node.KFDGpuID == gpu_id) + return dxg_topology->g_props[i].node.DeviceId; + } + + return 0; +} + +uint32_t get_direct_link_cpu(uint32_t gpu_node) { + HSAuint64 size = 0; + int32_t cpu_id; + HSAuint32 i; + + cpu_id = gpu_get_direct_link_cpu(gpu_node, dxg_topology->g_props); + if (cpu_id == -1) + return INVALID_NODEID; + + assert(dxg_topology->g_props[cpu_id].mem.size()); + + for (i = 0; i < dxg_topology->g_props[cpu_id].node.NumMemoryBanks; i++) + size += dxg_topology->g_props[cpu_id].mem[i].SizeInBytes; + + return size ? 
(uint32_t)cpu_id : INVALID_NODEID; +} + +HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, + uint32_t NumberOfNodes, + uint32_t *NodeArray) { + HSAKMT_STATUS ret; + unsigned int i; + + if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* Translate Node IDs to gpu_ids */ + *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t)); + if (!(*gpu_id_array)) + return HSAKMT_STATUS_NO_MEMORY; + for (i = 0; i < NumberOfNodes; i++) { + ret = validate_nodeid(NodeArray[i], *gpu_id_array + i); + if (ret != HSAKMT_STATUS_SUCCESS) { + free(*gpu_id_array); + break; + } + } + + return ret; +} + +uint32_t get_num_sysfs_nodes(void) { return dxg_topology->num_sysfs_nodes; } + +wsl::thunk::WDDMDevice *get_wddmdev(uint32_t node_id) { + if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) + return nullptr; + + return dxg_topology->wdevices_[node_id - 1]; +} + +uint32_t get_num_wddmdev() { + return dxg_topology->wdevices_.size(); +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h new file mode 100644 index 0000000000..4b7f8b0362 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h @@ -0,0 +1,519 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/* + Helpers to use native types with C++11 atomic operations. + Fixes GCC builtin functionality for x86 with respect to WC and non-temporal + stores. +*/ +#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ +#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ + +#include +#include "utils.h" + +//ALWAYS_CONSERVATIVE will very likely overfence your code. 
+//For use as a debugging aid only. +#define ALWAYS_CONSERVATIVE 0 + +#if !ALWAYS_CONSERVATIVE +#if defined(__x86_64__) || defined(_M_X64) +#define X64_ORDER_WC 1 +#endif +#if X64_ORDER_WC +#include +#endif +#endif + +namespace wsl { +namespace atomic { + +static constexpr int c11ToBuiltInFlags(std::memory_order order) +{ +#if ALWAYS_CONSERVATIVE + return __ATOMIC_RELAXED; +#elif X64_ORDER_WC + return __ATOMIC_RELAXED; +#else + return (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED : + (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE : + (order == std::memory_order_release) ? __ATOMIC_RELEASE : + (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST : + (order == std::memory_order_consume) ? __ATOMIC_CONSUME : + (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL : + __ATOMIC_SEQ_CST; +#endif +} + +static __forceinline void PreFence(std::memory_order order) { +#if ALWAYS_CONSERVATIVE + switch (order) { + case std::memory_order_release: + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + __atomic_thread_fence(__ATOMIC_SEQ_CST); + default:; + } +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_release: + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + _mm_sfence(); + default:; + } +#endif +} + +static __forceinline void PostFence(std::memory_order order) { +#if ALWAYS_CONSERVATIVE + switch (order) { + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + case std::memory_order_acquire: + __atomic_thread_fence(__ATOMIC_SEQ_CST); + default:; + } +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_seq_cst: + return _mm_mfence(); + case std::memory_order_acq_rel: + case std::memory_order_acquire: + return _mm_lfence(); + default:; + } +#endif +} + +static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) { +#if ALWAYS_CONSERVATIVE + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_seq_cst: + 
case std::memory_order_acq_rel: + return _mm_mfence(); + case std::memory_order_acquire: + return _mm_lfence(); + case std::memory_order_release: + return _mm_sfence(); + default:; + } +#else + std::atomic_thread_fence(order); +#endif +} + +template +static __forceinline void BasicCheck(const T* ptr) { + constexpr bool value = __atomic_always_lock_free(sizeof(T), 0); + static_assert(value, "Atomic type may not be compatible with peripheral atomics."); +}; + +template +static __forceinline void BasicCheck(const volatile T* ptr) { + constexpr bool value = __atomic_always_lock_free(sizeof(T), 0); + static_assert(value, "Atomic type may not be compatible with peripheral atomics."); +}; + +/// @brief: Load value of type T atomically with specified memory order. +/// @param: ptr(Input), a pointer to type T. +/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_load(ptr, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile type T. +/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_load(ptr, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Store value of type T with specified memory order. +/// @param: ptr(Input), a pointer to instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. 
+template +static __forceinline void Store( + T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_store(ptr, &val, c11ToBuiltInFlags(order)); + PostFence(order); +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. +template +static __forceinline void Store( + volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_store(ptr, &val, c11ToBuiltInFlags(order)); + PostFence(order); +} + +/// @brief: Compare and swap value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order with atomic operation. +/// @return: T, observed value of type T. +template +static __forceinline T + Cas(T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED); + PostFence(order); + return expected; +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, observed value of type T. 
+template +static __forceinline T + Cas(volatile T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED); + PostFence(order); + return expected; +} + +/// @brief: Exchange the value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. +template +static __forceinline T + Exchange(T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. +template +static __forceinline T + Exchange(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. 
+template +static __forceinline T + Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. +template +static __forceinline T + Add(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + And(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T Or(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Increment(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Decrement(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} +} // namespace atomic +} // namespace wsl + +#ifdef X64_ORDER_WC +#undef X64_ORDER_WC +#endif + +#ifdef ALWAYS_CONSERVATIVE +#undef ALWAYS_CONSERVATIVE +#endif + +#endif // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h new file mode 100644 index 0000000000..b5817af40d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h @@ -0,0 +1,155 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/locks.h" +#include "core/util/utils.h" + +namespace wsl { + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use. 
+ */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { reset(Constructor); } + + lazy_ptr(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr& operator=(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr(lazy_ptr&) = delete; + lazy_ptr& operator=(lazy_ptr&) = delete; + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + assert(obj != nullptr && "Null dereference through lazy_ptr."); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early construction of the object is required. + */ + void touch() const { make(false); } + + // Tells if the lazy object has been constructed or not. + // Construction may fail silently (return nullptr). + bool created() const { + std::atomic_thread_fence(std::memory_order_acquire); + return func == nullptr; + } + + // Tells if the lazy object exists or not. + bool empty() const { + std::atomic_thread_fence(std::memory_order_acquire); + return obj == nullptr; + } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. 
+ void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (func == nullptr) return; + T* ptr = func(); + obj.reset(ptr); + std::atomic_thread_fence(std::memory_order_release); + func = nullptr; + } + + __forceinline void make(bool block) const { + if (!created()) { + make_body(block); + } + } + +}; + +} // namespace wsl + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp new file mode 100644 index 0000000000..020ca10b28 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp @@ -0,0 +1,769 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ +#include "core/util/os.h" +#include "core/util/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/inc/runtime.h" +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +namespace wsl { +namespace os { + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +void* __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* ar = (ThreadArgs*)arg; + ThreadEntry CallMe = ar->entry_function; + void* Data = ar->entry_args; + delete ar; + CallMe(Data); + return nullptr; +} + +// Thread container allows multiple waits and separate close (destroy). 
+class os_thread { + public: + explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize) + : thread(0), lock(nullptr), state(RUNNING) { + int err; + std::unique_ptr args(new ThreadArgs); + lock = CreateMutex(); + if (lock == nullptr) return; + + args->entry_args = threadArgument; + args->entry_function = function; + + pthread_attr_t attrib; + err = pthread_attr_init(&attrib); + if (err != 0) { + pr_err("pthread_attr_init failed: %s\n", strerror(err)); + return; + } + + if (stackSize != 0) { + stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize); + stackSize = AlignUp(stackSize, 4096); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err)); + err = pthread_attr_destroy(&attrib); + if (err != 0) { + pr_err("pthread_attr_destroy failed: %s\n", strerror(err)); + return; + } + } + } + + int cores = 0; + cpu_set_t* cpuset = nullptr; + + if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { + cores = get_nprocs_conf(); + cpuset = CPU_ALLOC(cores); + if (cpuset == nullptr) { + pr_err("CPU_ALLOC failed: %s\n", strerror(errno)); + return; + } + CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset); + for (int i = 0; i < cores; i++) { + CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset); + } + err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); + CPU_FREE(cpuset); + if (err != 0) { + pr_err("pthread_setaffinity_np failed: %s\n", strerror(err)); + return; + } + } + + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + + // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN + // Attempt to grow the stack within reason. 
+ if ((err == EINVAL) && stackSize != 0) { + while (stackSize < 20 * 1024 * 1024) { + stackSize *= 2; + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + if (err != EINVAL) break; + pr_debug("pthread_create returned EINVAL, doubling stack size\n"); + } + } + + if (err == 0) + args.release(); + else + thread = 0; + + err = pthread_attr_destroy(&attrib); + if (err != 0) { + pr_err("pthread_attr_destroy failed: %s\n", strerror(err)); + } + } + + os_thread(os_thread&& rhs) { + thread = rhs.thread; + lock = rhs.lock; + state = int(rhs.state); + rhs.thread = 0; + rhs.lock = nullptr; + } + + os_thread(os_thread&) = delete; + + ~os_thread() { + if (lock != nullptr) DestroyMutex(lock); + if ((state == RUNNING) && (thread != 0)) { + int err = pthread_detach(thread); + if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err)); + } + } + + bool Valid() { return (lock != nullptr) && (thread != 0); } + + bool Wait() { + if (state == FINISHED) return true; + AcquireMutex(lock); + if (state == FINISHED) { + ReleaseMutex(lock); + return true; + } + int err = pthread_join(thread, NULL); + bool success = (err == 0); + if (success) state = FINISHED; + ReleaseMutex(lock); + return success; + } + + private: + pthread_t thread; + Mutex lock; + std::atomic state; + enum { FINISHED = 0, RUNNING = 1 }; +}; + +static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch"); +static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch"); +static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + void* ret = 
dlopen(filename.c_str(), RTLD_LAZY); + if (ret == nullptr) pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + void* ret = dlsym(*(void**)&lib, export_name.c_str()); + + // dlsym searches the given library and all the library's load dependencies. + // Remaining code limits symbol lookup to only the library handle given. + // This lookup pattern matches Windows. + if (ret == NULL) return ret; + + link_map* map; + int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); + if (err == -1) { + pr_err("dlinfo failed: %s\n", dlerror()); + return nullptr; + } + + Dl_info info; + err = dladdr(ret, &info); + if (err == 0) { + pr_err("dladdr failed.\n"); + return nullptr; + } + + if (strcmp(info.dli_fname, map->l_name) == 0) return ret; + + return NULL; +} + +void CloseLib(LibHandle lib) { dlclose(*(void**)&lib); } + +/* + * @brief Look for a symbol called "HSA_AMD_TOOL_PRIORITY" across all loaded + * shared libraries, and if found, store the name of the library + * + * @param[in]: info A dl_phdr_info struct pointer, which contains information + * about library's load address, header, and name. + * + * @param[in]: size integer size of dl_phdr_info struct + * + * @param[out]: data copy of the data argument to dl_phdr_iterate call + * + * @retval:: Return 0 on Success. If callback returns a non-zero value, + * dl_iterate_phdr() will stop processing, even if there are unprocessed + * shared objects. + */ + +static int callback(struct dl_phdr_info* info, size_t size, void* data) { + std::vector* loadedToolsLib = (std::vector*)data; + assert(loadedToolsLib != nullptr); + /* + * Check if lib name is not empty and its not a "vdso.so" lib, + * The vDSO is a special shared object file that is built into the Linux kernel. + * It is not a regular shared library and thus does not have all the properties + * of regular shared libraries. 
The way the vDSO is loaded and organized in memory + * is different from regular shared libraries and it's not guaranteed that it + * will have a specific segment or section. Hence its skipped. + */ + + if ((info) && (info->dlpi_name[0] != '\0')) { + if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0; + + /* + * Iterate through the program headers of the loaded lib and check for PT_DYNAMIC program + * header. If the PT_DYNAMIC program header is found, use dlpi_addr and dlpi_phdr members + * of dl_phdr_info struct to get the address of the dynamic section of the loaded + * library in memory + */ + + for (int i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) { + Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr); + + char* strings = nullptr; + Elf64_Xword limit = 0; + + /* + * The dynamic section is searched for DT_STRTAB (address of string table), + * and DT_STRSZ (size of string table) + * DT_NULL - Marks the end of the _DYNAMIC array + */ + + for (int j = 0;; j++) { + if (dyn_section[j].d_tag == DT_NULL) break; + + if (dyn_section[j].d_tag == DT_STRTAB) strings = (char*)(dyn_section[j].d_un.d_ptr); + + if (dyn_section[j].d_tag == DT_STRSZ) limit = dyn_section[j].d_un.d_val; + } + + if (strings == nullptr) pr_debug("String table not found\n"); + + /* + * Hacky lookup, if string and symbol tables are found, + * iterate through the strings in string table and check if + * any string matches "HSA_AMD_TOOL_PRIORITY". 
+ * If yes, then add the name of the library to the vector of + * lib names + */ + if (strings != nullptr) { + char* end = strings + limit; + while (strings < end) { + if (strcmp(strings, "HSA_AMD_TOOL_PRIORITY") == 0) { + loadedToolsLib->push_back(info->dlpi_name); + return 0; + } + strings += (strlen(strings) + 1); + } + } + } + } + } + return 0; +} + +std::vector GetLoadedToolsLib() { + std::vector ret; + std::vector names; + + /* Iterate through all of the loaded shared libraries in the process */ + dl_iterate_phdr(callback, &names); + + if (!names.empty()) { + for (auto& name : names) ret.push_back(LoadLib(name)); + } + + return ret; +} + +std::string GetLibraryName(LibHandle lib) { + link_map *map; + if(dlinfo(lib, RTLD_DI_LINKMAP, &map)!=0) + return ""; + return map->l_name; +} + +Semaphore CreateSemaphore() { + sem_t *sem = new sem_t; + sem_init(sem, 0, 0); + return *(Semaphore*)&sem; +} + +bool WaitSemaphore(Semaphore sem) { + while(sem_wait(*(sem_t**)&sem)) + if (errno != EINTR) return false; + + return true; +} + +void PostSemaphore(Semaphore sem) { + if (sem_post(*(sem_t**)&sem)) + assert(false && "Failed to post semaphore"); +} + +void DestroySemaphore(Semaphore sem) { + sem_destroy(*(sem_t**)&sem); + delete *(sem_t**)&sem; +} + +Mutex CreateMutex() { + pthread_mutex_t* mutex = new pthread_mutex_t; + pthread_mutex_init(mutex, NULL); + return *(Mutex*)&mutex; +} + +bool TryAcquireMutex(Mutex lock) { + return pthread_mutex_trylock(*(pthread_mutex_t**)&lock) == 0; +} + +bool AcquireMutex(Mutex lock) { + return pthread_mutex_lock(*(pthread_mutex_t**)&lock) == 0; +} + +void ReleaseMutex(Mutex lock) { + pthread_mutex_unlock(*(pthread_mutex_t**)&lock); +} + +void DestroyMutex(Mutex lock) { + pthread_mutex_destroy(*(pthread_mutex_t**)&lock); + delete *(pthread_mutex_t**)&lock; +} + +void Sleep(int delay_in_millisec) { usleep(delay_in_millisec * 1000); } + +void uSleep(int delayInUs) { usleep(delayInUs); } + +void YieldThread() { sched_yield(); } + +Thread 
CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) {
+  os_thread* result = new os_thread(function, threadArgument, stackSize);
+  if (!result->Valid()) {
+    delete result;
+    return nullptr;
+  }
+
+  return reinterpret_cast<Thread>(result);
+}
+
+// Destroys the os_thread object behind the handle.
+void CloseThread(Thread thread) { delete reinterpret_cast<os_thread*>(thread); }
+
+// Forwards to os_thread::Wait() for the thread behind the handle.
+bool WaitForThread(Thread thread) { return reinterpret_cast<os_thread*>(thread)->Wait(); }
+
+// Waits on every handle in turn; individual results are ignored and the
+// function always reports true.
+bool WaitForAllThreads(Thread* threads, uint threadCount) {
+  for (uint i = 0; i < threadCount; i++) WaitForThread(threads[i]);
+  return true;
+}
+
+// True when the variable exists in the environment, even with an empty value.
+bool IsEnvVarSet(std::string env_var_name) {
+  char* buff = NULL;
+  buff = getenv(env_var_name.c_str());
+  return (buff != NULL);
+}
+
+// Sets the variable, overwriting any existing value.
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  setenv(env_var_name.c_str(), env_var_value.c_str(), 1);
+}
+
+int GetProcessId() {
+  return ::getpid();
+}
+
+// Returns the value of the variable, or an empty string when it is not set.
+std::string GetEnvVar(std::string env_var_name) {
+  char* buff;
+  buff = getenv(env_var_name.c_str());
+  std::string ret;
+  if (buff) {
+    ret = buff;
+  }
+  return ret;
+}
+
+size_t GetUserModeVirtualMemorySize() {
+#ifdef _LP64
+  // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt :
+  // user space is 0000000000000000 - 00007fffffffffff (=47 bits)
+  return (size_t)(0x800000000000);
+#else
+  return (size_t)(0xffffffff);  // ~4GB
+#endif
+}
+
+// Total RAM reported by sysinfo(), capped at the user-mode virtual address
+// space size. Returns 0 when sysinfo() fails.
+size_t GetUsablePhysicalHostMemorySize() {
+  struct sysinfo info = {0};
+  if (sysinfo(&info) != 0) {
+    return 0;
+  }
+
+  const size_t physical_size =
+      static_cast<size_t>(info.totalram * info.mem_unit);
+  return std::min(GetUserModeVirtualMemorySize(), physical_size);
+}
+
+uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; }
+
+// Os event implementation
+typedef struct EventDescriptor_ {
+  pthread_cond_t event;   // signaled by SetOsEvent
+  pthread_mutex_t mutex;  // guards state
+  bool state;             // true while the event is signaled
+  bool auto_reset;        // a successful wait clears state when true
+} EventDescriptor;
+
+// Allocates and initializes an event. Returns NULL when the allocation fails.
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  EventDescriptor* eventDescrp;
+  eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor));
+  if (eventDescrp == NULL) {
+    return NULL;
+  }
+
+  pthread_mutex_init(&eventDescrp->mutex, NULL);
+  pthread_cond_init(&eventDescrp->event, NULL);
+  eventDescrp->auto_reset = auto_reset;
+  eventDescrp->state = init_state;
+
+  EventHandle handle = reinterpret_cast<EventHandle>(eventDescrp);
+
+  return handle;
+}
+
+// Destroys the event. Returns -1 for a NULL handle, otherwise the OR of the
+// pthread destroy results (0 on success).
+int DestroyOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  int ret_code = pthread_cond_destroy(&eventDescrp->event);
+  ret_code |= pthread_mutex_destroy(&eventDescrp->mutex);
+  free(eventDescrp);
+  return ret_code;
+}
+
+// Waits for the event to become signaled.
+// Returns 0 when the event was signaled, -1 for a NULL handle, 1 when polling
+// (milli_seconds == 0) found the event unsignaled or its mutex busy, 0x14003
+// when a timed wait expired, and the pthread error code otherwise.
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  // A poll must never block. A successful trylock already owns the mutex, so
+  // it must not be locked a second time below (that would self-deadlock).
+  bool locked = false;
+  if (milli_seconds == 0) {
+    int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex);
+    if (tmp_ret == EBUSY) {
+      // Timeout
+      return 1;
+    }
+    locked = (tmp_ret == 0);
+  }
+  if (!locked) {
+    pthread_mutex_lock(&eventDescrp->mutex);
+  }
+
+  int ret_code = 0;
+  if (!eventDescrp->state) {
+    if (milli_seconds == 0) {
+      ret_code = 1;
+    } else {
+      struct timespec ts;
+      struct timeval tp;
+
+      // Absolute deadline = now + milli_seconds.
+      gettimeofday(&tp, NULL);
+      ts.tv_sec = tp.tv_sec;
+      ts.tv_nsec = tp.tv_usec * 1000;
+
+      unsigned int sec = milli_seconds / 1000;
+      unsigned int mSec = milli_seconds % 1000;
+
+      ts.tv_sec += sec;
+      ts.tv_nsec += mSec * 1000000;
+
+      // tv_nsec must stay below one second, otherwise the wait fails with
+      // EINVAL; carry the overflow into tv_sec.
+      if (ts.tv_nsec >= 1000000000) {
+        ts.tv_sec += 1;
+        ts.tv_nsec = ts.tv_nsec - 1000000000;
+      }
+
+      // Condition variables may wake spuriously: keep waiting until the event
+      // is really signaled or the wait reports an error / timeout.
+      while (!eventDescrp->state && ret_code == 0) {
+        ret_code =
+            pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts);
+      }
+      // Time out
+      if (ret_code == ETIMEDOUT) {
+        ret_code = 0x14003;  // value reported to callers for a timed-out wait
+      }
+
+      if (ret_code == 0 && eventDescrp->auto_reset) {
+        eventDescrp->state = false;
+      }
+    }
+  } else if (eventDescrp->auto_reset) {
+    eventDescrp->state = false;
+  }
+  pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Marks the event signaled and wakes one waiter. Returns -1 for a NULL handle,
+// otherwise the OR of the pthread results (0 on success).
+int SetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  int ret_code = pthread_mutex_lock(&eventDescrp->mutex);
+  eventDescrp->state = true;
+  ret_code |= pthread_mutex_unlock(&eventDescrp->mutex);
+  ret_code |= pthread_cond_signal(&eventDescrp->event);
+
+  return ret_code;
+}
+
+// Clears the signaled state. Returns -1 for a NULL handle, otherwise the OR of
+// the pthread results (0 on success).
+int ResetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+
+  EventDescriptor* eventDescrp = reinterpret_cast<EventDescriptor*>(event);
+  int ret_code = pthread_mutex_lock(&eventDescrp->mutex);
+  eventDescrp->state = false;
+  ret_code |= pthread_mutex_unlock(&eventDescrp->mutex);
+
+  return ret_code;
+}
+
+// Reciprocal of the clock resolution in ns; filled in by AccurateClockFrequency().
+static double invPeriod = 0.0;
+
+// Reads CLOCK_MONOTONIC_RAW and scales the nanosecond count by invPeriod.
+// Aborts when the clock cannot be read.
+uint64_t ReadAccurateClock() {
+  if (invPeriod == 0.0) AccurateClockFrequency();
+  timespec time;
+  int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time);
+  if (err != 0) {
+    pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno));
+    abort();
+  }
+  return (uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec)) * invPeriod;
+}
+
+// Returns 1e9 / clock resolution (ns) and initializes invPeriod on first use.
+// Aborts when the resolution cannot be queried or is one second or coarser.
+uint64_t AccurateClockFrequency() {
+  static clockid_t clock = CLOCK_MONOTONIC;
+  static std::atomic<bool> first(true);
+  // Check kernel version - not a concurrency concern.
+  // use non-RAW for getres due to bug in older 2.6.x kernels
+  if (first.load(std::memory_order_acquire)) {
+    utsname kernelInfo;
+    if (uname(&kernelInfo) == 0) {
+      try {
+        std::string ver = kernelInfo.release;
+        size_t idx;
+        int major = std::stoi(ver, &idx);
+        int minor = std::stoi(ver.substr(idx + 1));
+        // Kernel 4.4 or newer. Compare as a (major, minor) pair so that
+        // releases such as 5.0 or 6.1 are not rejected for a small minor.
+        if ((major > 4) || ((major == 4) && (minor >= 4))) {
+          clock = CLOCK_MONOTONIC_RAW;
+        }
+      } catch (...) {
+        // Kernel version string doesn't conform to the standard pattern.
+        // Keep using the "safe" (non-RAW) clock.
+      }
+    }
+    first.store(false, std::memory_order_release);
+  }
+  timespec time;
+  int err = clock_getres(clock, &time);
+  if (err != 0) {
+    pr_err("clock_getres failed %s\n", strerror(errno));
+    abort();
+  }
+  if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) {
+    pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n");
+    abort();
+  }
+  if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
+  return 1000000000ull / uint64_t(time.tv_nsec);
+}
+
+// Creates a writer-preferring pthread rwlock. Returns nullptr on failure; the
+// attribute object and the lock allocation are released on every path.
+SharedMutex CreateSharedMutex() {
+  pthread_rwlockattr_t attrib;
+  int err = pthread_rwlockattr_init(&attrib);
+  if (err != 0) {
+    pr_err("rw lock attribute init failed: %s\n", strerror(err));
+    return nullptr;
+  }
+
+#ifdef __GLIBC__
+  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#else
+  err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+#endif
+  if (err != 0) {
+    pr_err("Set rw lock attribute failure: %s\n", strerror(err));
+    pthread_rwlockattr_destroy(&attrib);
+    return nullptr;
+  }
+
+  pthread_rwlock_t* lock = new pthread_rwlock_t;
+  err = pthread_rwlock_init(lock, &attrib);
+  // The attribute object is no longer needed once the lock is initialized.
+  pthread_rwlockattr_destroy(&attrib);
+  if (err != 0) {
+    pr_err("rw lock init failed: %s\n", strerror(err));
+    delete lock;
+    return nullptr;
+  }
+
+  return lock;
+}
+
+// Exclusive (writer) try-lock; true when the lock was taken.
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_trywrlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+// Exclusive (writer) lock; true when the lock was taken.
+bool AcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_wrlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+// Releases an exclusive hold; aborts when the unlock fails.
+void ReleaseSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock);
+  if (err != 0) {
+    pr_err("SharedMutex unlock failed: %s\n", strerror(err));
+    abort();
+  }
+}
+
+// Shared (reader) try-lock; true when the lock was taken.
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_tryrdlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+// Shared (reader) lock; true when the lock was taken.
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_rdlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+// Releases a shared hold; aborts when the unlock fails.
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock);
+  if (err != 0) {
+    pr_err("SharedMutex unlock failed: %s\n", strerror(err));
+    abort();
+  }
+}
+
+// Destroys the rwlock and frees its storage.
+void DestroySharedMutex(SharedMutex lock) {
+  pthread_rwlock_destroy(*(pthread_rwlock_t**)&lock);
+  delete *(pthread_rwlock_t**)&lock;
+}
+
+// CLOCK_BOOTTIME resolution in ns; 0 until SystemClockFrequency() has run.
+static uint64_t sys_clock_period_ = 0;
+
+// Reads CLOCK_BOOTTIME in units of the clock period.
+uint64_t ReadSystemClock() {
+  // The period is only known after SystemClockFrequency() has run. Query it
+  // lazily so the division below can never be a division by zero.
+  if (sys_clock_period_ == 0) SystemClockFrequency();
+  struct timespec ts;
+  clock_gettime(CLOCK_BOOTTIME, &ts);
+  uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+  if (sys_clock_period_ != 1)
+    return time / sys_clock_period_;
+  else
+    return time;
+}
+
+// Returns 1e9 / CLOCK_BOOTTIME resolution (ns) and caches the resolution for
+// ReadSystemClock(). Falls back to a 1 ns period when the resolution cannot be
+// queried or is reported as zero.
+uint64_t SystemClockFrequency() {
+  struct timespec ts;
+  uint64_t period = 0;
+  if (clock_getres(CLOCK_BOOTTIME, &ts) == 0) {
+    period = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
+  }
+  sys_clock_period_ = (period != 0) ? period : 1;
+  return 1000000000 / sys_clock_period_;
+}
+
+// Fills cpuinfo with the CPU vendor string and, on AMD parts, the MWAITX
+// capability bit. Returns false on non-x86 builds or when CPUID is unusable.
+bool ParseCpuID(cpuid_t* cpuinfo) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t eax, ebx, ecx, edx, max_eax = 0;
+  memset(cpuinfo, 0, sizeof(*cpuinfo));
+
+  /* Make sure current CPU supports at least extended leaf 0x80000004.
+   * __get_cpuid_max() takes the range base (0x80000000) and returns the
+   * highest leaf of that range, or 0 when CPUID is unavailable. */
+  if (__get_cpuid_max(0x80000000, NULL) < 0x80000004) return false;
+
+  // Manufacturer ID is a twelve-character ASCII string stored in order EBX, EDX, ECX.
+  if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0],
+                   (uint32_t*)&cpuinfo->ManufacturerID[8],
+                   (uint32_t*)&cpuinfo->ManufacturerID[4])) {
+    return false;
+  }
+
+  if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) {
+    if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
+      // Bit 29 of ECX in leaf 0x80000001 is stored as the mwaitx flag.
+      cpuinfo->mwaitx = !!((ecx >> 29) & 0x1);
+    }
+  }
+  return true;
+#else
+  return false;
+#endif
+}
+
+}  // namespace os
+}  // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
new file mode 100644
index 0000000000..a17fa09593
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h
@@ -0,0 +1,290 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Library of synchronization primitives - to be added to as needed.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_
+
+#include "utils.h"
+#include "os.h"
+
+namespace wsl {
+
+class HybridMutex {  // Spins (pause, then yield) before blocking on a semaphore.
+ public:
+  HybridMutex():lock_(0) {
+    sem_ = os::CreateSemaphore();
+  }
+
+  ~HybridMutex() {
+    os::DestroySemaphore(sem_);
+  }
+
+  bool Try() {  // Single attempt; true when the lock was taken.
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+
+  bool Acquire() {  // Blocks until the lock is taken; always returns true.
+    int cnt = maxSpinIterPause + maxSpinIterYield;
+
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1)) {
+      cnt--;  // Spin budget: pause while above maxSpinIterYield, then yield, then block.
+      if (cnt > static_cast<int>(maxSpinIterYield)) {
+        _mm_pause();
+      } else if (cnt > 0) {
+        os::YieldThread();
+      } else {
+        os::WaitSemaphore(sem_);
+        cnt = maxSpinIterPause + maxSpinIterYield;
+      }
+      old = 0;  // compare_exchange_strong overwrote it with the observed value.
+    }
+    return true;
+  }
+
+  void Release() {  // Unlocks and posts the semaphore to wake a blocked waiter.
+    int old = 1;
+    if (lock_.compare_exchange_strong(old, 0))
+      os::PostSemaphore(sem_);
+  }
+
+ private:
+  std::atomic<int> lock_;  // 0 = free, 1 = held
+  os::Semaphore sem_;
+  const uint32_t maxSpinIterPause = 55;
+  const uint32_t maxSpinIterYield = 55;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(HybridMutex);
+};
+
+
+/// @brief: a class represents a kernel mutex.
+/// Uses the kernel's scheduler to keep the waiting thread from being scheduled
+/// until the lock is released (Best for long waits, though anything using
+/// a kernel object is a long wait).
+class KernelMutex {
+ public:
+  KernelMutex() { lock_ = os::CreateMutex(); }
+  ~KernelMutex() { os::DestroyMutex(lock_); }
+
+  bool Try() { return os::TryAcquireMutex(lock_); }
+  bool Acquire() { return os::AcquireMutex(lock_); }
+  void Release() { os::ReleaseMutex(lock_); }
+
+ private:
+  os::Mutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelMutex);
+};
+
+/// @brief: represents a spin lock.
+/// For very short hold durations on the order of the thread scheduling
+/// quanta or less.
+class SpinMutex {
+ public:
+  SpinMutex() { lock_ = 0; }
+
+  bool Try() {
+    int old = 0;
+    return lock_.compare_exchange_strong(old, 1);
+  }
+  bool Acquire() {
+    int old = 0;
+    while (!lock_.compare_exchange_strong(old, 1))
+    {
+      old = 0;  // compare_exchange_strong overwrote it with the observed value.
+      os::YieldThread();
+    }
+    return true;
+  }
+  void Release() { lock_ = 0; }
+
+ private:
+  std::atomic<int> lock_;  // 0 = free, 1 = held
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(SpinMutex);
+};
+
+class KernelEvent {  // Wrapper over an os event created auto-reset and initially set.
+ public:
+  KernelEvent() { evt_ = os::CreateOsEvent(true, true); }
+  ~KernelEvent() { os::DestroyOsEvent(evt_); }
+
+  bool IsSet() { return os::WaitForOsEvent(evt_, 0)==0; }
+  bool WaitForSet() { return os::WaitForOsEvent(evt_, 0xFFFFFFFF)==0; }
+  void Set() { os::SetOsEvent(evt_); }
+  void Reset() { os::ResetOsEvent(evt_); }
+
+ private:
+  os::EventHandle evt_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelEvent);
+};
+
+/// @brief: represents a yielding shared mutex.
+/// aka read/write mutex
+class KernelSharedMutex {
+ public:
+  /// @brief: Interfaces ScopedAcquire to shared operations.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+    bool Try() { return lock_->TryShared(); }
+    bool Acquire() { return lock_->AcquireShared(); }
+    void Release() { lock_->ReleaseShared(); }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
+  ~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
+
+  // Exclusive mode operations
+  bool Try() { return os::TryAcquireSharedMutex(lock_); }
+  bool Acquire() { return os::AcquireSharedMutex(lock_); }
+  void Release() { os::ReleaseSharedMutex(lock_); }
+
+  // Shared mode operations
+  bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
+  bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
+  void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
+
+  // Return shared operations interface
+  Shared shared() { return Shared(this); }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+/// @brief: Type trait to identify mutex types
+template <class T> class isMutex {
+ public:
+  enum { value = false };
+};
+template <> class isMutex<HybridMutex> {
+ public:
+  enum { value = true };
+};
+template <> class isMutex<KernelMutex> {
+ public:
+  enum { value = true };
+};
+template <> class isMutex<SpinMutex> {
+ public:
+  enum { value = true };
+};
+template <> class isMutex<KernelSharedMutex> {
+ public:
+  enum { value = true };
+};
+
+/// @brief: A class that behaves as a lock in a scope. When trying to enter the
+/// critical section, create an object of this class. After the control path
+/// goes out of the scope, it will release the lock automatically.
+template <class LockType> class ScopedAcquire {
+ public:
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), pointer to an existing lock.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: when destructing, release the lock.
+  ~ScopedAcquire() {
+    if (doRelease) lock_.Release();
+  }
+
+  /// @brief: Release the lock early. Avoid using when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
+  template <class T, bool B> class container {
+   public:
+    container(T* lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_->Acquire(); }
+    __forceinline void Release() { return lock_->Release(); }
+
+   private:
+    T* lock_;
+  };
+
+  /// @brief: Specialization for mutex pointer types.
+  template <class T> class container<T, false> {
+   public:
+    container(T lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_.Acquire(); }
+    __forceinline void Release() { return lock_.Release(); }
+
+   private:
+    T lock_;
+  };
+
+  container<LockType, isMutex<LockType>::value> lock_;
+  bool doRelease;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
+}  // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LOCKS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
new file mode 100644
index 0000000000..2f40cd1581
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h
@@ -0,0 +1,327 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Minimal operating system abstraction interfaces.
+
+#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_
+#define HSA_RUNTIME_CORE_UTIL_OS_H_
+
+#include <string>
+#include <vector>
+#include "utils.h"
+
+namespace wsl {
+namespace os {
+typedef void* LibHandle;
+typedef void* Semaphore;
+typedef void* Mutex;
+typedef void* SharedMutex;
+typedef void* Thread;
+typedef void* EventHandle;
+
+enum class os_t { OS_WIN = 0, OS_LINUX, COUNT };
+static __forceinline std::underlying_type<os_t>::type os_index(os_t val) {
+  return std::underlying_type<os_t>::type(val);
+}
+
+#ifdef _WIN32
+static const os_t current_os = os_t::OS_WIN;
+#elif __linux__
+static const os_t current_os = os_t::OS_LINUX;
+#else
+static_assert(false, "Operating System not detected!");
+#endif
+
+/// @brief: Loads dynamic library based on file name. Return value will be NULL
+/// if failed.
+/// @param: filename(Input), file name of the library.
+/// @return: LibHandle.
+LibHandle LoadLib(std::string filename);
+
+/// @brief: Gets the address of exported symbol. Returns NULL if failed.
+/// @param: lib(Input), library handle to export from.
+/// @param: export_name(Input), the name of the exported symbol.
+/// @return: void*.
+void* GetExportAddress(LibHandle lib, std::string export_name);
+
+/// @brief: Unloads the dynamic library.
+/// @param: lib(Input), library handle which will be unloaded.
+void CloseLib(LibHandle lib);
+
+/// @brief: Lists loaded tool libraries that contain
+/// symbol HSA_AMD_TOOL_PRIORITY
+/// @return: List of library handles
+std::vector<LibHandle> GetLoadedToolsLib();
+
+/// @brief: Returns the library's path name.
+/// @param: lib(Input), library handle
+/// @return: Path name of library
+std::string GetLibraryName(LibHandle lib);
+
+/// @brief: Creates a Semaphore, will return NULL if failed.
+/// @param: void.
+/// @return: Semaphore.
+Semaphore CreateSemaphore();
+
+/// @brief: Waits for the semaphore. This is a blocking wait.
+/// If the Semaphore is signalled, this function will return.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: bool. NOTE(review): presumably true on success - confirm against the implementation.
+bool WaitSemaphore(Semaphore sem);
+
+/// @brief: Post/Signal/Wake-up the semaphore
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void PostSemaphore(Semaphore sem);
+
+/// @brief: Destroys the semaphore.
+/// @param: sem(Input), handle to the semaphore.
+/// @return: void.
+void DestroySemaphore(Semaphore sem);
+
+/// @brief: Creates a mutex, will return NULL if failed.
+/// @param: void.
+/// @return: Mutex.
+Mutex CreateMutex();
+
+/// @brief: Tries to acquire the mutex once, if succeeded, return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TryAcquireMutex(Mutex lock);
+
+/// @brief: Acquires the mutex, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireMutex(Mutex lock);
+
+/// @brief: Releases the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseMutex(Mutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroyMutex(Mutex lock);
+
+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once, if succeeded, return true.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once, if succeeded, return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode, if the mutex is in exclusive mode, it will wait until it
+/// is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInMs(Input), time in millisecond for sleeping.
+/// @return: void.
+void Sleep(int delayInMs);
+
+/// @brief: Puts current thread to sleep.
+/// @param: delayInUs(Input), time in microseconds for sleeping (per the parameter name).
+/// @return: void.
+void uSleep(int delayInUs);
+
+/// @brief: Yields current thread.
+/// @param: void.
+/// @return: void.
+void YieldThread();
+
+typedef void (*ThreadEntry)(void*);
+
+/// @brief: Creates a thread, will return NULL if failed.
+/// @param: entry_function(Input), a pointer to the function which the thread
+/// starts from.
+/// @param: entry_argument(Input), a pointer to the argument of the thread
+/// function.
+/// @param: stack_size(Input), size of the thread's stack, 0 by default.
+/// @return: Thread, a handle to thread created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size = 0);
+
+/// @brief: Destroys the thread.
+/// @param: thread(Input), thread handle to what will be destroyed.
+/// @return: void.
+void CloseThread(Thread thread);
+
+/// @brief: Waits for specific thread to finish, if successful, return true.
+/// @param: thread(Input), handle to waiting thread.
+/// @return: bool.
+bool WaitForThread(Thread thread);
+
+/// @brief: Waits for multiple threads to finish, if successful, return true.
+/// @param: threads(Input), a pointer to a list of thread handles.
+/// @param: thread_count(Input), number of threads to be waited on.
+/// @return: bool.
+bool WaitForAllThreads(Thread* threads, uint thread_count);
+
+/// @brief: Determines if environment key is set.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: bool, true for binding any value to environment key,
+/// including an empty string. False otherwise
+bool IsEnvVarSet(std::string env_var_name);
+
+/// @brief: Sets the environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @param: env_var_value(Input), value of the environment value.
+/// @return: void.
+void SetEnvVar(std::string env_var_name, std::string env_var_value);
+
+/// @brief: Gets the value of environment value.
+/// @param: env_var_name(Input), name of the environment value.
+/// @return: std::string, value of the environment value, returned as string.
+std::string GetEnvVar(std::string env_var_name);
+
+/// @brief: Gets the process ID.
+/// @param: void
+/// @return: int, process ID returned as int.
+int GetProcessId();
+
+/// @brief: Gets the max virtual memory size accessible to the application.
+/// @param: void.
+/// @return: size_t, size of the accessible memory to the application.
+size_t GetUserModeVirtualMemorySize();
+
+/// @brief: Gets the max physical host system memory size.
+/// @param: void.
+/// @return: size_t, size of the physical host system memory.
+size_t GetUsablePhysicalHostMemorySize();
+
+/// @brief: Gets the virtual memory base address. It is hardcoded to 0.
+/// @param: void.
+/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase();
+
+/// @brief os event api, create an event
+/// @param: auto_reset whether an event can reset the status automatically
+/// @param: init_state initial state of the event
+/// @return: event handle
+EventHandle CreateOsEvent(bool auto_reset, bool init_state);
+
+/// @brief os event api, destroy an event
+/// @param: event Event handle
+/// @return: whether destroy is correct
+int DestroyOsEvent(EventHandle event);
+
+/// @brief os event api, wait on event
+/// @param: event Event handle
+/// @param: milli_seconds wait time
+/// @return: Indicate success or timeout
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds);
+
+/// @brief os event api, set event state
+/// @param: event Event handle
+/// @return: Whether event set is correct
+int SetOsEvent(EventHandle event);
+
+/// @brief os event api, reset event state
+/// @param: event Event handle
+/// @return: Whether event reset is correct
+int ResetOsEvent(EventHandle event);
+
+/// @brief reads a clock which is deemed to be accurate for elapsed time
+/// measurements, though not necessarily fast to query
+/// @return clock counter value
+uint64_t ReadAccurateClock();
+
+/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock.
+/// It does not necessarily reflect the resolution of the clock, but is the
+/// value needed to convert a difference in the clock's counter value to elapsed
+/// seconds. This frequency does not change at runtime.
+/// @return returns the frequency
+uint64_t AccurateClockFrequency();
+
+/// @brief read the system clock which serves as the HSA system clock
+/// counter in KFD.
+uint64_t ReadSystemClock();
+
+/// @brief read the system clock frequency
+uint64_t SystemClockFrequency();
+
+typedef struct cpuid_s {
+  char ManufacturerID[13];  // 12 char, NULL terminated
+  bool mwaitx;
+} cpuid_t;
+
+/// @brief parse CPUID
+/// @param: cpuinfo struct to be filled
+bool ParseCpuID(cpuid_t* cpuinfo);
+
+}  // namespace os
+}  // namespace wsl
+
+#endif  // HSA_RUNTIME_CORE_UTIL_OS_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
new file mode 100644
index 0000000000..1fb992eb63
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h
@@ -0,0 +1,394 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple best fit memory allocator with eager compaction. Manages block sub-allocation. +// For use when memory efficiency is more important than allocation speed. +// O(log n) time. + +#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ +#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ + +#include +#include +#include + + +namespace wsl { + +template class SimpleHeap { + private: + struct Fragment_T { + typedef std::multimap::iterator ptr_t; + ptr_t free_list_entry_; + struct { + size_t size : 62; + bool discard : 1; + bool free : 1; + }; + + Fragment_T(ptr_t Iterator, size_t Len, bool Free) + : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {} + Fragment_T() = default; + }; + + struct Block { + uintptr_t base_ptr_; + size_t length_; + + Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {} + Block() = default; + }; + + Allocator block_allocator_; + + std::multimap free_list_; + std::map> block_list_; + std::deque block_cache_; + + // Size of blocks that are at least partially in use. 
+ size_t in_use_size_; + // Total size of block cache + size_t cache_size_; + + __forceinline bool isFree(const Fragment_T& node) { return node.free; } + __forceinline void setUsed(Fragment_T& node) { + node.free = false; + node.free_list_entry_ = free_list_.end(); + } + __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) { + node.free_list_entry_ = Iterator; + node.free = true; + } + __forceinline Fragment_T makeFragment(size_t Len) { + return Fragment_T(free_list_.end(), Len, false); + } + __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) { + return Fragment_T(Iterator, Len, true); + } + __forceinline void removeFreeListEntry(Fragment_T& node) { + if (node.free_list_entry_ != free_list_.end()) { + free_list_.erase(node.free_list_entry_); + node.free_list_entry_ = free_list_.end(); + } + } + __forceinline void discard(Fragment_T& node) { + removeFreeListEntry(node); + node.discard = true; + } + + public: + explicit SimpleHeap(const Allocator& BlockAllocator = Allocator()) + : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {} + ~SimpleHeap() { + trim(); + // Leak here may be due to the user. Check is for debugging only. + // assert(in_use_size_ == 0 && "Leak in SimpleHeap."); + } + + SimpleHeap(const SimpleHeap& rhs) = delete; + SimpleHeap(SimpleHeap&& rhs) = delete; + SimpleHeap& operator=(const SimpleHeap& rhs) = delete; + SimpleHeap& operator=(SimpleHeap&& rhs) = delete; + + void* alloc(size_t bytes) { + // Find best fit. + uintptr_t base; + size_t size; + // For bytes >= 2MB, the requested mem should be aligned + size_t align_bytes = bytes; + const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0; + size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? 
GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE; + + for (int i = 0; i <= retry; i++) { + auto free_fragment = free_list_.lower_bound(align_bytes); + if (free_fragment == free_list_.end()) break; + + uintptr_t addr = free_fragment->second; + size = free_fragment->first; + + assert(size >= bytes && "SimpleHeap: map lower_bound failure."); + + // Find the containing block and fragment + auto it = block_list_.upper_bound(addr); + it--; + auto& frag_map = it->second; + const auto& fragment = frag_map.find(addr); + + assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap."); + assert(size == fragment->second.size && "Inconsistency in SimpleHeap."); + + size_t delta = addr & (align - 1); + if (!delta) { + // already find aligned address + base = addr; + free_list_.erase(free_fragment); + // Sub-allocate from fragment. + fragment->second.size = bytes; + setUsed(fragment->second); + // Record remaining free space. + if (size > bytes) { + free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + frag_map[base + bytes] = makeFragment(free_fragment, size - bytes); + } + } else { + // If this is the first request and the requested size is not enough for alignment, + // then request for a bigger hole and do trim. 
+ if (i == 0 && size < bytes + align - delta) { + align_bytes += align; + continue; + } + + uintptr_t aligned_base = addr + align - delta; + base = aligned_base; + + // Erase the old free list + free_list_.erase(free_fragment); + + // fragment 1 - free + free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr)); + frag_map[addr] = makeFragment(free_fragment, aligned_base - addr); + + //fragment 2 - used + frag_map[base] = makeFragment(bytes); + + // fragement 3 - free + if (size > aligned_base - addr + bytes) { + free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes)); + frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes); + } + } + return reinterpret_cast(base); + } + + // No usable fragment, check block cache + if (bytes < default_block_size() && !block_cache_.empty()) { + const auto& block = block_cache_.back(); + base = block.base_ptr_; + size = block.length_; + block_cache_.pop_back(); + cache_size_ -= size; + } else { // Alloc new block - new block may be larger than default. + void* ptr = block_allocator_.alloc(bytes, size); + if (ptr == nullptr) { + fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n"); + return nullptr; + } + base = reinterpret_cast(ptr); + } + + in_use_size_ += size; + assert(size >= bytes && "Alloc exceeds block size."); + // Sub alloc and insert free region. + if (size > bytes) { + auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes); + } + // Track used region + block_list_[base][base] = makeFragment(bytes); + + // Disallow multiple suballocation from large blocks. + // Prevents a small allocation from retaining a large block. 
+ if (bytes > default_block_size()) { + bool err = discardBlock(reinterpret_cast(base)); + assert(err && "Large block discard failed."); + } + + return reinterpret_cast(base); + } + + /* Return block-base the ptr belongs to if the ptr is a valid ptr which is allocated + * from this simpleheap and the block-base is allocated from block_allocator_*/ + void* block_base(void* ptr) { + if (ptr == nullptr) + return nullptr; + + uintptr_t base = reinterpret_cast(ptr); + + // Find fragment and validate. + auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) + return nullptr; + frag_map_it--; + auto& frag_map = frag_map_it->second; + auto fragment = frag_map.find(base); + if (fragment == frag_map.end() || isFree(fragment->second)) + return nullptr; + + return reinterpret_cast(frag_map_it->first); + } + + void reset() { + free_list_.clear(); + block_list_.clear(); + block_cache_.clear(); + in_use_size_ = 0; + cache_size_ = 0; + } + + bool free(void* ptr) { + if (ptr == nullptr) return true; + + uintptr_t base = reinterpret_cast(ptr); + + // Find fragment and validate. + auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) return false; + frag_map_it--; + auto& frag_map = frag_map_it->second; + auto fragment = frag_map.find(base); + if (fragment == frag_map.end() || isFree(fragment->second)) return false; + + bool discard = fragment->second.discard; + + // Merge lower + if (fragment != frag_map.begin()) { + auto lower = fragment; + lower--; + if (isFree(lower->second)) { + removeFreeListEntry(lower->second); + lower->second.size += fragment->second.size; + frag_map.erase(fragment); + fragment = lower; + } + } + + // Merge upper + { + auto upper = fragment; + upper++; + if ((upper != frag_map.end()) && isFree(upper->second)) { + removeFreeListEntry(upper->second); + fragment->second.size += upper->second.size; + frag_map.erase(upper); + } + } + + // Release whole free blocks. 
+ if (frag_map.size() == 1) { + Block block(fragment->first, fragment->second.size); + block_list_.erase(frag_map_it); + + // Discard or add to the block cache. + if (discard) { + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + } else { + block_cache_.push_back(block); + cache_size_ += block.length_; + in_use_size_ -= block.length_; + } + + balance(); + + // Don't publish free space since block was moved to the cache. + return true; + } + + // Don't report free memory if discarding the fragment. + if (discard) return true; + + // Report free fragment + const auto& freeEntry = + free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first)); + setFree(fragment->second, freeEntry); + + return true; + } + + void balance() { + // Release old blocks when over cache limit. + while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) { + const auto& block = block_cache_.front(); + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + cache_size_ -= block.length_; + block_cache_.pop_front(); + } + } + + void trim() { + for (const auto& block : block_cache_) + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + block_cache_.clear(); + cache_size_ = 0; + } + + size_t cache_size() const { return cache_size_; } + + size_t default_block_size() const { return block_allocator_.block_size(); } + + // Prevent reuse of the block containing ptr. No further fragments will be allocated from the + // block and the block will not be added to the block cache when it is free. + bool discardBlock(void* ptr) { + if (ptr == nullptr) return true; + + uintptr_t base = reinterpret_cast(ptr); + + // Find block validate. 
+ auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) return false; + frag_map_it--; + auto& frag_map = frag_map_it->second; + if ((base < frag_map.begin()->first) || + (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base)) + return false; + + // Is block already discarded? + if (frag_map.begin()->second.discard) return true; + + // Mark all fragments for discard and compute block size. Removes freelist records for all + // fragments in the block. + size_t size = 0; + for (auto& frag : frag_map) { + discard(frag.second); + size += frag.second.size; + } + + // Remove discarded block from in-use tracking and rebalance the block cache. + in_use_size_ -= size; + balance(); + + return true; + } +}; + +} // namespace wsl + +#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp new file mode 100644 index 0000000000..bcaef5dd87 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "small_heap.h" + +namespace wsl { + +// Inserts node into freelist after place. +// Assumes node will not be an end of the list (list has guard nodes). 
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) { + assert(place->first < node->first && "Order violation"); + assert(isfree(place->second) && "Freelist operation error."); + iterator_t next = place->second.next; + node->second.next = next; + node->second.prior = place; + place->second.next = node; + next->second.prior = node; +} + +// Removes node from freelist. +// Assumes node will not be an end of the list (list has guard nodes). +void SmallHeap::remove(SmallHeap::iterator_t node) { + assert(isfree(node->second) && "Freelist operation error."); + node->second.prior->second.next = node->second.next; + node->second.next->second.prior = node->second.prior; + setused(node->second); +} + +// Returns high if merge failed or the merged node. +SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low, + SmallHeap::memory_t::iterator high) { + assert(isfree(low->second) && "Merge with allocated block"); + assert(isfree(high->second) && "Merge with allocated block"); + + if ((char*)low->first + low->second.len != (char*)high->first) return high; + + assert(!islastfree(high->second) && "Illegal merge."); + + low->second.len += high->second.len; + low->second.next = high->second.next; + high->second.next->second.prior = low; + + memory.erase(high); + return low; +} + +void SmallHeap::free(void* ptr) { + if (ptr == nullptr) return; + + auto iterator = memory.find(ptr); + + // Check for illegal free + if (iterator == memory.end()) { + assert(false && "Illegal free."); + return; + } + + // Return memory to total and link node into free list + total_free += iterator->second.len; + + // Could also traverse the free list which might be faster in some cases. 
+ auto before = iterator; + before--; + while (!isfree(before->second)) before--; + assert(before->second.next->first > iterator->first && "Inconsistency in small heap."); + insertafter(before, iterator); + + // Attempt compaction + iterator = merge(before, iterator); + merge(iterator, iterator->second.next); + + // Update lowHighBondary + high.erase(ptr); +} + +void* SmallHeap::alloc(size_t bytes) { + // Is enough memory available? + if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = firstfree(); + while (!islastfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + // Split node + if (bytes != current->second.len) { + void* remaining = (char*)current->first + bytes; + Node& node = memory[remaining]; + node.len = current->second.len - bytes; + current->second.len = bytes; + insertafter(current, memory.find(remaining)); + } + + remove(current); + return current->first; + } + current = current->second.next; + } + assert(current->second.len == 0 && "Freelist corruption."); + + // Can't service the request due to fragmentation + return nullptr; +} + +void* SmallHeap::alloc_high(size_t bytes) { + // Is enough memory available? 
+ if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = lastfree(); + while (!isfirstfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + void* alloc; + // Split node + if (bytes != current->second.len) { + alloc = (char*)current->first + current->second.len - bytes; + current->second.len -= bytes; + Node& node = memory[alloc]; + node.len = bytes; + setused(node); + } else { + alloc = current->first; + remove(current); + } + + high.insert(alloc); + return alloc; + } + current = current->second.prior; + } + assert(current->second.len == 0 && "Freelist corruption."); + + // Can't service the request due to fragmentation + return nullptr; +} + +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h new file mode 100644 index 0000000000..f6e060cb09 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h @@ -0,0 +1,131 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple first fit memory allocator with eager compaction. For use with few +// items (where list iteration is faster than trees). +// Not thread safe! 
+
+#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_
+
+#include
+#include
+
+#include "utils.h"
+
+namespace wsl {
+
+// Allocator over one caller-supplied address range [base, base + length). Bookkeeping lives in
+// `memory`, an address-ordered map of nodes; free nodes are additionally chained through
+// Node::next / Node::prior between two zero-length guard nodes.
+class SmallHeap {
+ private:
+  struct Node;
+  typedef std::map memory_t;
+  typedef memory_t::iterator iterator_t;
+
+  // One tracked range. `next` and `prior` are free-list links; `next` also encodes the
+  // used/free state (see isfree / setused below).
+  struct Node {
+    size_t len;
+    iterator_t next;
+    iterator_t prior;
+  };
+
+  SmallHeap(const SmallHeap& rhs) = delete;
+  SmallHeap& operator=(const SmallHeap& rhs) = delete;
+
+  // Start address and byte length of the managed range, fixed at construction.
+  void* const pool;
+  const size_t length;
+
+  // Number of bytes currently not handed out.
+  size_t total_free;
+  memory_t memory;
+  // Addresses returned by alloc_high(), plus the upper sentinel inserted by the constructor.
+  std::set high;
+
+  // A node is "used" when its next link points at memory.begin() (see setused); anything else
+  // counts as free.
+  __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); }
+  // The end guard is the node whose next link is memory.end().
+  __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); }
+  // The start guard is the node whose prior link is memory.end().
+  __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); }
+  __forceinline void setlastfree(Node& node) { node.next = memory.end(); }
+  __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); }
+  __forceinline void setused(Node& node) { node.next = memory.begin(); }
+
+  // First real free node: the successor of the start guard (the lowest map entry).
+  __forceinline iterator_t firstfree() { return memory.begin()->second.next; }
+  // Last real free node: the predecessor of the end guard (the highest map entry).
+  __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; }
+  void insertafter(iterator_t place, iterator_t node);
+  void remove(iterator_t node);
+  iterator_t merge(iterator_t low, iterator_t high);
+
+ public:
+  // Creates an empty heap with no pool: `memory` and `high` stay empty and total_free is 0.
+  SmallHeap() : pool(nullptr), length(0), total_free(0) {}
+  // Creates a heap over [base, base + length) with the whole range free.
+  SmallHeap(void* base, size_t length)
+      : pool(base), length(length), total_free(length) {
+    assert(pool != nullptr && "Invalid base address.");
+    assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address.");
+    assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds.");
+
+    // Three initial nodes: a guard at address 0, the pool itself, and a guard at the top
+    // address. The asserts above keep the pool from colliding with either guard key.
+    Node& start = memory[0];
+    Node& node = memory[pool];
+    Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull];
+
+    start.len = 0;
+    start.next = memory.find(pool);
+    setfirstfree(start);
+
+    node.len = length;
+    node.prior = memory.begin();
+    // --memory.end() is the highest key, i.e. the end guard.
+    node.next = --memory.end();
+
+    end.len = 0;
+    end.prior = start.next;
+    setlastfree(end);
+
+    // Sentinel so that `high` is never empty on a heap built with this constructor.
+    high.insert((void*)0xFFFFFFFFFFFFFFFFull);
+  }
+
+  void* alloc(size_t bytes);
+  void* alloc_high(size_t bytes);
+  void free(void* ptr);
+
+  void* base() const { return pool; }
+  size_t size() const { return length; }
+  size_t remaining() const { return total_free; }
+  // Lowest address stored in `high`.
+  // NOTE(review): the default constructor leaves `high` empty, so calling this on a
+  // default-constructed heap dereferences begin() of an empty set - confirm callers never do.
+  void* high_split() const { return *high.begin(); }
+};
+
+}  // namespace wsl
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
new file mode 100644
index 0000000000..c5a2b57c64
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp
@@ -0,0 +1,111 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/util/timer.h"
+
+namespace wsl {
+namespace timer {
+
+// Reads the accurate clock's tick frequency from the OS layer and derives the length of one
+// tick in nanoseconds.
+accurate_clock::init::init() {
+  freq = os::AccurateClockFrequency();
+  accurate_clock::period_ns = 1e9 / double(freq);
+}
+
+// Calibrates the fast clock using the accurate clock.
+// Each attempt brackets a busy-wait of `delay` with fast-clock readings (r1, r2), each of which
+// is itself bracketed by accurate-clock readings (t0/t1 and t2/t3). The shortest acceptable
+// attempt out of ten is kept; if it spans fewer than 1000 fast-clock ticks the delay is doubled
+// and the whole measurement is repeated.
+fast_clock::init::init() {
+  typedef accurate_clock clock;
+  clock::duration delay(std::chrono::milliseconds(1));
+
+  // calibrate clock
+  fast_clock::raw_rep min = 0;
+  clock::duration elapsed;
+
+  do {
+    elapsed = clock::duration::max();
+
+    for (int t = 0; t < 10; t++) {
+      fast_clock::raw_rep r1, r2;
+      clock::time_point t0, t1, t2, t3;
+
+      // The signal fences separate the clock reads from one another in program order.
+      t0 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r1 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t1 = clock::now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+
+      // Busy-wait until at least `delay` has passed on the accurate clock.
+      do {
+        t2 = clock::now();
+      } while (t2 - t1 < delay);
+
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      r2 = fast_clock::raw_now();
+      std::atomic_signal_fence(std::memory_order_acq_rel);
+      t3 = clock::now();
+
+      // If elapsed time is shorter than last recorded time and both the start
+      // and end times are confirmed correlated then record the clock readings.
+      // This protects against inaccuracy due to thread switching
+      if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
+          ((t3 - t2) * 10 < (t2 - t1))) {
+        elapsed = t3 - t1;
+        min = r2 - r1;
+      }
+    }
+    // Double the wait for the next round.
+    delay += delay;
+  } while (min < 1000);
+
+  // Ticks per second of the fast clock, and the length of one tick in picoseconds.
+  fast_clock::freq = double(min) / duration_in_seconds(elapsed);
+  fast_clock::period_ps = 1e12 / fast_clock::freq;
+  // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
+  // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
+}
+
+// Static storage for both clocks. The *_init objects run the constructors above; the accurate
+// clock's objects are defined first in this file and the fast clock calibration relies on the
+// accurate clock.
+double accurate_clock::period_ns;
+accurate_clock::raw_frequency accurate_clock::freq;
+accurate_clock::init accurate_clock::accurate_clock_init;
+
+double fast_clock::period_ps;
+fast_clock::raw_frequency fast_clock::freq;
+fast_clock::init fast_clock::fast_clock_init;
+}  // namespace timer
+}  // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h
new file mode 100644
index 0000000000..3012685113
--- /dev/null
+++ 
b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h @@ -0,0 +1,173 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_ +#define HSA_RUNTIME_CORE_UTIL_TIMER_H_ + +#include "core/util/utils.h" +#include "core/util/os.h" +#include +#include +#include + +namespace wsl { +namespace timer { + +// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of +// VS 2013. +template +struct wide_type { + typedef double type; +}; +template <> +struct wide_type { + typedef uintmax_t type; +}; +template <> +struct wide_type { + typedef intmax_t type; +}; + +template +static __forceinline To + duration_cast(const std::chrono::duration& d) { + typedef typename wide_type::value, + std::is_signed::value>::type wide; + typedef std::chrono::duration unit_convert_t; + + unit_convert_t temp = std::chrono::duration_cast(d); + return To(static_cast(temp.count())); +} +// End patch + +template +static __forceinline double duration_in_seconds( + std::chrono::duration delta) { + typedef std::chrono::duration> seconds; + return seconds(delta).count(); +} + +template +static __forceinline rep duration_from_seconds(double delta) { + typedef std::chrono::duration> seconds; + return std::chrono::duration_cast(seconds(delta)); +} + +// Provices a C++11 standard clock interface to the os::AccurateClock functions +class accurate_clock { + public: + typedef double rep; + typedef std::nano period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ns)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef uint64_t raw_frequency; + + static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); } + static __forceinline raw_frequency raw_freq() { return freq; } + + private: + static double period_ns; + 
static raw_frequency freq; + + class init { + public: + init(); + }; + static init accurate_clock_init; +}; + +// Provices a C++11 standard clock interface to the lowest latency approximate +// clock +class fast_clock { + public: + typedef double rep; + typedef std::pico period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ps)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef double raw_frequency; + +#if defined(__x86_64__) || defined(_M_X64) + static __forceinline raw_rep raw_now() { return __rdtsc(); } + static __forceinline raw_frequency raw_freq() { return freq; } +#else + static __forceinline raw_rep raw_now() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec)); + } + static __forceinline raw_frequency raw_freq() { return 1.e-9; } +#endif + + private: + static double period_ps; + static raw_frequency freq; + + class init { + public: + init(); + }; + static init fast_clock_init; +}; +} // namespace timer +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h new file mode 100644 index 0000000000..15d61a87e1 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h @@ -0,0 +1,389 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
/// @brief: Allocates a block of memory whose address is a multiple of
/// alignment. The block must be released with _aligned_free.
/// @param: size(Input), number of bytes requested.
/// @param: alignment(Input), required alignment in bytes.
/// @return: void*, pointer to the block, or NULL if the allocation fails.
static __forceinline void* _aligned_malloc(size_t size, size_t alignment) {
#ifdef _ISOC11_SOURCE
  // C11 aligned_alloc is only specified for sizes that are an integral
  // multiple of the alignment, so pad the request up to the next multiple.
  if (alignment == 0) return NULL;
  const size_t remainder = size % alignment;
  if (remainder != 0) {
    const size_t padding = alignment - remainder;
    if (size > SIZE_MAX - padding) return NULL;  // padded size would wrap
    size += padding;
  }
  return aligned_alloc(alignment, size);
#else
  void *mem = NULL;
  if (0 != posix_memalign(&mem, alignment, size)) return NULL;
  return mem;
#endif
}
/// @brief: Runs a callable when the guard is destroyed, unless the guard has
/// been dismissed first. Copying or assigning from another guard transfers
/// the pending call: the source guard is dismissed afterwards.
template <class lambda>
class ScopeGuard {
 public:
  explicit __forceinline ScopeGuard(const lambda& release)
      : release_(release), dismiss_(false) {}

  // Transferring copy. Members are initialized directly instead of being
  // default-constructed and then assigned, so this also works for closure
  // types, which are neither default-constructible nor assignable.
  ScopeGuard(ScopeGuard& rhs)
      : release_(rhs.release_), dismiss_(rhs.dismiss_) {
    rhs.dismiss_ = true;
  }

  __forceinline ~ScopeGuard() {
    if (!dismiss_) release_();
  }
  // Transferring assignment: takes over rhs's callable and dismissed state
  // without running the callable this guard held before.
  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
    dismiss_ = rhs.dismiss_;
    release_ = rhs.release_;
    rhs.dismiss_ = true;
    return *this;
  }
  // Cancels the pending call.
  __forceinline void Dismiss() { dismiss_ = true; }

 private:
  lambda release_;
  bool dismiss_;
};
/// @brief: Checks if a value is power of two, if it is, return true. Be careful
/// when passing 0: the test clears the lowest set bit and compares the result
/// with zero, so 0 is reported as a power of two as well.
/// @param: val(Input), the data to be checked.
/// @return: bool.
template <typename T>
static __forceinline bool IsPowerOfTwo(T val) {
  const T lowest_bit_cleared = val & (val - 1);
  return lowest_bit_cleared == 0;
}
+/// @param: value(Input), value to be calculated. +/// @param: alignment(Input), alignment value. +/// @param: T. +template +static __forceinline T AlignUp(T value, size_t alignment) { + return AlignDown((T)(value + alignment - 1), alignment); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: T*, pointer to type T. +template +static __forceinline T* AlignUp(T* value, size_t alignment) { + return (T*)AlignDown((intptr_t)((uint8_t*)value + alignment - 1), alignment); +} + +/// @brief: Checks if the input value is at the boundary of alignment, if it is, +/// @return true. +/// @param: value(Input), value to be checked. +/// @param: alignment(Input), alignment value. +/// @return: bool. +template +static __forceinline bool IsMultipleOf(T value, size_t alignment) { + return (AlignUp(value, alignment) == value); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: bool. 
/// @brief: Rounds a 32-bit value up to the nearest power of two. A value that
/// is already a power of two is returned unchanged and 0 yields 1. Inputs
/// above 2^31 wrap around to 0.
/// @param: value(Input), value to be rounded.
/// @return: uint32_t.
static __forceinline uint32_t NextPow2(uint32_t value) {
  if (value == 0) return 1;
  // Smear the highest set bit of (value - 1) into every lower position.
  uint32_t smeared = value - 1;
  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}

/// @brief: 64-bit variant of the function above. Inputs above 2^63 wrap
/// around to 0.
/// @param: value(Input), value to be rounded.
/// @return: uint64_t.
static __forceinline uint64_t NextPow2(uint64_t value) {
  if (value == 0) return 1;
  uint64_t smeared = value - 1;
  for (uint32_t shift = 1; shift < 64; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}
/// @brief: Returns the upper 32 bits of a pointer value.
/// The upper half is extracted when the build defines HSA_LARGE_MODEL or when
/// uintptr_t is wider than 32 bits, so the result no longer depends on a
/// build flag being passed on 64-bit targets. Otherwise 0 is returned.
/// @param: p(Input), pointer to split.
/// @return: uint32_t.
inline uint32_t PtrHigh32(const void* p) {
#if defined(HSA_LARGE_MODEL) || (UINTPTR_MAX > 0xFFFFFFFFu)
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
#else
  (void)p;
  return 0;
#endif
}
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 // Are we compiling for windows? 
+#define NOMINMAX + +#include "core/util/os.h" + +#include +#include +#include +#include + +#include +#include +#include + +#undef Yield +#undef CreateMutex + +namespace wsl { +namespace os { + +static_assert(sizeof(LibHandle) == sizeof(HMODULE), + "OS abstraction size mismatch"); +static_assert(sizeof(LibHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Semaphore) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(EventHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + HMODULE ret = LoadLibrary(filename.c_str()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + return GetProcAddress(*(HMODULE*)&lib, export_name.c_str()); +} + +void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); } + +std::vector GetLoadedLibs() { + // Use EnumProcessModulesEx + static_assert(false, "Not implemented."); +} + +std::string GetLibraryName(LibHandle lib) { + static_assert(false, "Not implemented."); +} + +Semaphore CreateSemaphore() { + sem = static_cast(CreateSemaphore(NULL, 0, LONG_MAX, NULL)); + assert(sem != NULL && "CreateSemaphore failed"); + + return *(Semaphore*)&sem; +} + +bool WaitSemaphore(Semaphore sem) { + return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0; +} + +void PostSemaphore(Semaphore sem) { + ReleaseSemaphore(static_cast(*sem), 1, NULL); +} + +void DestroySemaphore(Semaphore sem) { + if (!CloseHandle(static_cast(*sem))) { + assert("CloseHandle() failed"); + } + *sem = NULL; +} + +Mutex CreateMutex() { return CreateEvent(NULL, false, true, NULL); } + +bool TryAcquireMutex(Mutex lock) { + return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0; +} + +bool AcquireMutex(Mutex lock) { + 
return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0; +} + +void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); } + +void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); } + +void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); } + +void uSleep(int delayInUs) { ::Sleep(delayInUs / 1000); } + +void YieldThread() { ::Sleep(0); } + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +unsigned __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* thread_args = (ThreadArgs*)arg; + ThreadEntry entry = thread_args->entry_function; + void* data = thread_args->entry_args; + delete thread_args; + entry(data); + _endthreadex(0); + return 0; +} + +Thread CreateThread(ThreadEntry entry_function, void* entry_argument, + uint stack_size) { + ThreadArgs* thread_args = new ThreadArgs(); + thread_args->entry_args = entry_argument; + thread_args->entry_function = entry_function; + uintptr_t ret = + _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL); + return *(Thread*)&ret; +} + +void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); } + +bool WaitForThread(Thread thread) { + return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0; +} + +bool WaitForAllThreads(Thread* threads, uint thread_count) { + return WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE) == + WAIT_OBJECT_0; +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str()); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0); + if (char_count == 0) return ""; + buff = (char*)alloca(sizeof(char) * char_count); + GetEnvironmentVariable(env_var_name.c_str(), buff, char_count); + buff[char_count - 1] = '\0'; + std::string ret = buff; + return ret; +} + +size_t GetUserModeVirtualMemorySize() { + SYSTEM_INFO 
system_info = {0}; + GetSystemInfo(&system_info); + return ((size_t)system_info.lpMaximumApplicationAddress + 1); +} + +size_t GetUsablePhysicalHostMemorySize() { + MEMORYSTATUSEX memory_status = {0}; + memory_status.dwLength = sizeof(memory_status); + if (GlobalMemoryStatusEx(&memory_status) == 0) { + return 0; + } + + const size_t physical_size = static_cast(memory_status.ullTotalPhys); + return std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event wrappers +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventHandle evt = reinterpret_cast( + CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL)); + return evt; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return CloseHandle(reinterpret_cast<::HANDLE>(event)); +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + int ret_code = + WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds); + if (ret_code == WAIT_TIMEOUT) { + ret_code = 0x14003; // 0x14003 indicates timeout + } + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return SetEvent(reinterpret_cast<::HANDLE>(event)); +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return ResetEvent(reinterpret_cast<::HANDLE>(event)); +} + +uint64_t ReadAccurateClock() { + uint64_t ret; + QueryPerformanceCounter((LARGE_INTEGER*)&ret); + return ret; +} + +uint64_t AccurateClockFrequency() { + uint64_t ret; + QueryPerformanceFrequency((LARGE_INTEGER*)&ret); + return ret; +} + +SharedMutex CreateSharedMutex() { + assert(false && "Not implemented."); + abort(); + return nullptr; +} + +bool TryAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +bool AcquireSharedMutex(SharedMutex lock) { + assert(false 
&& "Not implemented."); + abort(); + return false; +} + +void ReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +bool TrySharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +bool SharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +void SharedReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +void DestroySharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +uint64_t ReadSystemClock() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +uint64_t SystemClockFrequency() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +bool ParseCpuID(cpuid_t* cpuinfo) { + assert(false && "Not implemented."); + abort(); + return false; +} + +} // namespace os +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp new file mode 100644 index 0000000000..80dc67d44f --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp @@ -0,0 +1,36 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION); + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) { + CHECK_DXG_OPEN(); + + VersionInfo->KernelInterfaceMajorVersion = 1; + VersionInfo->KernelInterfaceMinorVersion = 17; + + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp new file mode 100644 index 0000000000..d650651e31 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp @@ -0,0 +1,320 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +/* + * Builds a COPY_DATA packet that copies data. 
+ */ +size_t CmdUtil::BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel, + uint32_t dstCachePolicy, + uint32_t srcSel, + uint32_t srcCachePolicy, + uint32_t countSel, + uint32_t wrConfirm) { + PM4MEC_COPY_DATA copy_data = {0}; + + GenerateCmdHeader(©_data, IT_COPY_DATA); + copy_data.bitfields2.dst_sel = dstSel; + copy_data.bitfields2.src_sel = srcSel; + copy_data.bitfields2.dst_cache_policy = dstCachePolicy; + copy_data.bitfields2.src_cache_policy = srcCachePolicy; + copy_data.bitfields2.count_sel = countSel; + copy_data.bitfields2.wr_confirm = wrConfirm; + copy_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3); + copy_data.dst_addr_hi = PtrHigh32(pDstAddr); + memcpy(pBuffer, ©_data, sizeof(copy_data)); + + return sizeof(copy_data); +} + +/* + * Builds a EVENT_WRITE packet. + * Applications can use Barrier command to ensure their + * command is executed only after all other commands have + * completed their execution. + */ +size_t CmdUtil::BuildBarrier( + void *pBuffer, + uint32_t eventIndex, + uint32_t eventType) { + BarrierTemplate barrier = {0}; + + GenerateCmdHeader(&barrier.event_write, IT_EVENT_WRITE); + barrier.event_write.bitfields2.event_index = eventIndex; + barrier.event_write.bitfields2.event_type = eventType; + memcpy(pBuffer, &barrier, sizeof(barrier)); + + return sizeof(barrier); +} + +/** + * Builds a WRITE_DATA packet. 
+ * Writes two DWORDs into the GPU memory address "write_addr" + */ + +size_t CmdUtil::BuildWriteData64Command( + void* pBuffer, + uint64_t* write_addr, + uint64_t write_value) { + WriteDataTemplate command = {0}; + GenerateCmdHeader(&command.write_data, IT_WRITE_DATA); + + // Encode the user specified address to write to + uint64_t addr = uintptr_t(write_addr); + assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned"); + + // Set the bit to confirm the write operation and cache policy + command.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation; + command.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass; + + // Specify the command to increment address if writing more than one DWord + command.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address; + // Specify the class to which the write destination belongs + command.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory; + + command.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2); + command.write_data.dst_mem_addr_hi = PtrHigh32(write_addr); + + // Specify the value to write + command.write_data.write_data_value = write_value; + + memcpy(pBuffer, &command, sizeof(command)); + return sizeof(command); +} + +/* + * Builds a ACQUIRE_MEM packet. + * Users can submit this command to + * invalidate Gpu caches - L1 and or L2. + */ +size_t CmdUtil::BuildAcquireMem( + uint8_t major, + void *pBuffer) { + size_t ret; + if (major == 9) { + gfx9::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + // Specify the size of memory to invalidate. Size is + // specified in terms of 256 byte chunks. A coher_size + // of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits) + // of memory. The field coher_size_hi specifies memory from + // bits 40-64 for a total of 256 TB. 
+ acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + // Specify the address of memory to invalidate. The + // address must be 256 byte aligned. + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + // Specify the poll interval for determing if operation is complete + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields2.coher_cntl = + (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK + (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK + (1 << 28); // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } else if (major >= 10) { + gfx10::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields8.gcr_cntl = + (1 << 16) | // SEQ = FORWARD + (1 << 15) | // GL2_WB + (1 << 14) | // GL2_INV + (1 << 9) | // GL1_INV + (1 << 8) | // GLV_INV + (1 << 7) | // GLK_INV + (1 << 6) | // GLK_WB + (1 << 5) | // GLM_INV + (1 << 4) | // GLM_WB + (1 << 0); // GLI_INV = ALL + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } + + return ret; +} + +/* + * Builds a scratch packet. 
+ */
+size_t CmdUtil::BuildScratch(
+    void *pScratchBase,
+    void *pBuffer) {
+  struct SetScratchTemplate scratch = {0};
+
+  GenerateSetShRegHeader(&scratch, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO);
+  scratch.scratch_lo = Ptr48Low32(pScratchBase);
+  scratch.scratch_hi = Ptr48High8(pScratchBase);
+  // pBuffer must have room for sizeof(SetScratchTemplate) bytes.
+  memcpy(pBuffer, &scratch, sizeof(scratch));
+
+  return sizeof(scratch);
+}
+
+/**
+ * Set Compute Shader parameter for gfx11 and above.
+ *
+ * Writes a DispatchProgramResourceRegs packet for mmCOMPUTE_PGM_RSRC3 into
+ * pBuffer and returns the number of bytes written.
+ */
+size_t CmdUtil::BuildComputeShaderParams(void *pBuffer) {
+  struct DispatchProgramResourceRegs compute_shader_params = {0};
+
+  GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3);
+  // IMAGE_OP: Indicates the compute program contains an image op
+  // instruction and should be stalled by its WAIT_SYNC fence.
+  // Unsigned literal: shifting a signed 1 into bit 31 leaves the range of int.
+  compute_shader_params.compute_pgm_rsrc3 = (1u << 31);
+
+  memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params));
+
+  return sizeof(compute_shader_params);
+}
+
+
+/*
+ * Builds a dispatch packet.
+ *
+ * Fills a DispatchTemplate from pInfo (packet dimensions, kernel entry,
+ * resource registers, user-data SGPRs and the DISPATCH_DIRECT command),
+ * copies it to pBuffer and returns sizeof(DispatchTemplate).
+ * For every user-data slot that holds a scratch address, the byte offset of
+ * that slot inside the template is appended to pInfo->scratchBaseOffset.
+ */
+size_t CmdUtil::BuildDispatch(
+    struct DispatchInfo *pInfo,
+    void *pBuffer) {
+  DispatchTemplate dispatch = {0};
+
+  GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X);
+  dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x;
+  dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y;
+  dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z;
+
+  // TODO: Add AQL packet index for debugger
+  // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO
+  GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO);
+  dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry);
+  dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry);
+
+  GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1);
+  dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1;
+  if (pInfo->major == 11) {
+    AMD_HSA_BITS_SET(dispatch.program_resource_regs.compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1);
+  }
+  dispatch.program_resource_regs.compute_pgm_rsrc2 =
+    (pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2;
+
+  GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS);
+  dispatch.resource_regs.compute_resource_limits = 0x3ff;
+  dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+  dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+
+  dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size;
+
+  GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0);
+
+  // sgpr_no is the next free user-data slot; each enabled property below
+  // consumes its slots in this fixed order.
+  uint32_t sgpr_no = 0;
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+    assert(pInfo->major < 11);
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
+      sgpr_no * sizeof(uint32_t);
+
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[0];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[1];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->pAmdQueue->scratch_resource_descriptor[2];
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->srd;
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket);
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue);
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrLow32(pInfo->pPacket->kernarg_address);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrHigh32(pInfo->pPacket->kernarg_address);
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) {
+    // This feature may be enabled as a side effect of indirect calls.
+    // However, the compiler team confirmed that the dispatch id itself is not used,
+    // so safe to send 0 for each dispatch.
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0;
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) {
+    assert(pInfo->major < 11);
+    pInfo->scratchBaseOffset[pInfo->offsetCnt++] =
+      offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) +
+      sgpr_no * sizeof(uint32_t);
+
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrLow32(pInfo->pScratchBase);
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      PtrHigh32(pInfo->pScratchBase);
+  }
+  if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties,
+        AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) {
+    // Per-wave scratch size divided by the wave width (32 or 64 lanes).
+    dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] =
+      pInfo->scratchSizePerWave / (pInfo->wave32 ? 32 : 64);
+  }
+
+  GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT);
+  dispatch.dispatch_direct.dispatch_initiator =
+    (1 << 0) | // COMPUTE_SHADER_EN
+    (1 << 2) | // FORCE_START_AT_000
+    (1 << 5); // USE_THREAD_DIMENSIONS
+  if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN
+  dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x;
+  dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y;
+  dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z;
+  memcpy(pBuffer, &dispatch, sizeof(dispatch));
+
+  return sizeof(dispatch);
+}
+
+/*
+ * Builds an ATOMIC_MEM packet.
+ * Users can submit this command
+ * to perform atomic operations.
+ *
+ * The packet targets pAddr with operation `atomic`, cache policy
+ * `cachePolicy` and 64-bit source operand `srcData`; it is copied to
+ * pBuffer and its size in bytes is returned.
+ */
+size_t CmdUtil::BuildAtomicMem(
+    uint64_t *pAddr,
+    uint32_t atomic,
+    void *pBuffer,
+    uint32_t cachePolicy,
+    uint64_t srcData) {
+  AtomicTemplate atom = {0};
+
+  GenerateCmdHeader(&atom.atomic, IT_ATOMIC_MEM);
+  atom.atomic.addr_lo = PtrLow32(pAddr);
+  atom.atomic.addr_hi = PtrHigh32(pAddr);
+  atom.atomic.bitfields2.atomic = atomic;
+  atom.atomic.bitfields2.cache_policy = cachePolicy;
+  atom.atomic.src_data_lo = LowPart(srcData);
+  atom.atomic.src_data_hi = HighPart(srcData);
+  memcpy(pBuffer, &atom, sizeof(atom));
+
+  return sizeof(atom);
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
new file mode 100644
index 0000000000..f51af85404
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp
@@ -0,0 +1,780 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "impl/wddm/status.h"
+#include "impl/wddm/types.h"
+#include "impl/wddm/device.h"
+#include "impl/wddm/queue.h"
+
+namespace wsl {
+namespace thunk {
+
+const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000;
+
+// Binds the object to an adapter handle, LUID and node id, then runs the
+// bring-up steps in the order below. The bool results of those steps are
+// not checked here.
+WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id)
+    : adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) {
+  memset(&device_info_, 0, sizeof(device_info_));
+
+  ParseDeviceInfo();
+  CreateDevice();
+  SetPowerOptimization(false);
+  CreatePagingQueue();
+  InitCmdbufInfo();
+  QuerySegmentInfo();
+}
+
+// Tears down the paging queue, restores power optimization, destroys the
+// device and finally releases the parsed adapter info.
+WDDMDevice::~WDDMDevice() {
+  DestroyPagingQueue();
+  SetPowerOptimization(true);
+  DestroyDevice();
+
+  DestroyDeviceInfo();
+}
+
+// Thin wrapper around D3DKMTQueryAdapterInfo: `data`/`size` describe the
+// caller-owned buffer for the requested info `type`.
+static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type,
+                                 void *data, int size)
+{
+  D3DKMT_QUERYADAPTERINFO args = {0};
+
+  args.hAdapter = adapter;
+  args.Type = type;
+  args.pPrivateDriverData = data;
+  args.PrivateDriverDataSize = size;
+
+  return DXCORE_CALL(D3DKMTQueryAdapterInfo(&args));
+}
+
+// Rebuilds segment_infos_ from D3DKMTQueryStatistics: one adapter query for
+// the segment count, then one query per segment. Returns false as soon as a
+// query fails; entries collected before the failure stay in segment_infos_.
+bool WDDMDevice::QuerySegmentInfo()
+{
+  uint32_t segmentCount = 0;
+  segment_infos_.clear();
+
+  // Get the number of segments
+  D3DKMT_QUERYSTATISTICS adapterQuery = {};
+  adapterQuery.Type = D3DKMT_QUERYSTATISTICS_ADAPTER;
+  adapterQuery.AdapterLuid = adapter_luid_;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTQueryStatistics(&adapterQuery));
+  if (ret == STATUS_SUCCESS) {
+    segmentCount = adapterQuery.QueryResult.AdapterInformation.NbSegments;
+    pr_debug("Total Segments: %u\n", segmentCount);
+  } else {
+    pr_err("Failed to query adapter info\n");
+    return false;
+  }
+
+  for (uint32_t i = 0; i < segmentCount; i++) {
+
+    D3DKMT_QUERYSTATISTICS segQuery = {};
+    segQuery.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    segQuery.AdapterLuid = adapter_luid_;
+    segQuery.QuerySegment.SegmentId = i;
+
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&segQuery));
+    if (ret != STATUS_SUCCESS) {
+      pr_err("Failed to query segment %u info\n", i);
+      return false;
+    }
+
+    auto& seg = segQuery.QueryResult.SegmentInformation;
+
+    SegmentInfo info;
+    info.segment_id = i;
+    info.segment_type = seg.SegmentProperties.SegmentType;
+    info.system_memory = seg.SegmentProperties.SystemMemory;
+    info.aperture = seg.Aperture;
+    info.commit_limit = seg.CommitLimit;
+
+    segment_infos_.push_back(info);
+  }
+
+  return true;
+}
+
+// Looks up the first cached segment of the given type and stores its id in
+// `segment_id`. Returns false (segment_id untouched) when none matches.
+bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type,
+                              uint32_t &segment_id)
+{
+  for (const auto& seg_info : segment_infos_) {
+    if (seg_info.segment_type == segment_type) {
+      segment_id = seg_info.segment_id;
+      return true;
+    }
+  }
+  pr_err("Failed to get segment id for type %u\n", segment_type);
+  return false;
+}
+
+/* Local heap (dedicated GPU memory) includes the visible heap and the invisible heap.
+ * Non-local heap refers to shared GPU memory, which is system memory.
+ *
+ * Returns LocalHeapSize() (dGPU) or NonLocalHeapSize() (APU) minus the
+ * BytesResident values reported by D3DKMTQueryStatistics. Returns 0 when the
+ * paging-fence wait or the segment lookup fails. A failed statistics query
+ * is treated as "nothing resident".
+ */
+uint64_t WDDMDevice::VramAvail(void) {
+  D3DKMT_QUERYSTATISTICS stats;
+  NTSTATUS ret;
+  uint64_t usedVis = 0;
+  uint64_t usedInv = 0;
+  uint64_t usedNonLocal = 0;
+  uint32_t segmentId = 0;
+
+  // wait fence complete
+  uint64_t value = page_fence_value_.load();
+  if(!CpuWait(&page_syncobj_, &value, 1, false))
+    return 0;  // the result is a byte count, so a status code must not leak into it
+
+  if (IsDgpu()) {
+    // local cpu-visible memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId))
+      return 0;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedVis = stats.QueryResult.SegmentInformation.BytesResident;
+
+    // local invisible memory
+    if (device_info_.local_invisible_heap_size) {
+      // The invisible segment is taken to be the one right after the visible
+      // segment, so query the id just computed instead of a literal 1.
+      segmentId++;
+      memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+      stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+      stats.AdapterLuid = adapter_luid_;
+      stats.QuerySegment.SegmentId = segmentId;
+
+      ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+      if (ret == 0)
+        usedInv = stats.QueryResult.SegmentInformation.BytesResident;
+    }
+
+    return LocalHeapSize() - usedVis - usedInv;
+  } else {
+    // APU - NonLocal memory
+    if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId))
+      return 0;
+
+    memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS));
+    stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT;
+    stats.AdapterLuid = adapter_luid_;
+    stats.QuerySegment.SegmentId = segmentId;
+    ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats));
+    if (ret == 0)
+      usedNonLocal = stats.QueryResult.SegmentInformation.BytesResident;
+
+    return NonLocalHeapSize() - usedNonLocal;
+  }
+}
+
+// Creates the kernel device object for adapter_ and stores its handle in
+// device_. Returns false (and logs the status) on failure.
+bool WDDMDevice::CreateDevice(void) {
+  D3DKMT_CREATEDEVICE args = {0};
+  args.hAdapter = adapter_;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateDevice(&args));
+  if (ret == STATUS_SUCCESS) {
+    device_ = args.hDevice;
+    return true;
+  }
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Destroys the device referenced by device_.
+bool WDDMDevice::DestroyDevice(void) {
+  D3DKMT_DESTROYDEVICE args = {0};
+  args.hDevice = device_;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyDevice(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Creates a normal-priority paging queue on device_. On success it records
+// the queue handle, its sync object and the fence CPU address, and resets
+// page_fence_value_ to 0.
+bool WDDMDevice::CreatePagingQueue(void) {
+  D3DKMT_CREATEPAGINGQUEUE args = {0};
+  args.hDevice = device_;
+  args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreatePagingQueue(&args));
+  if (ret == STATUS_SUCCESS) {
+    page_queue_ = args.hPagingQueue;
+    page_syncobj_ = args.hSyncObject;
+    page_fence_addr_ = (uint64_t *)args.FenceValueCPUVirtualAddress;
+    page_fence_value_ = 0;
+    return true;
+  }
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Destroys the paging queue referenced by page_queue_.
+bool WDDMDevice::DestroyPagingQueue(void) {
+  D3DDDI_DESTROYPAGINGQUEUE args = {0};
+  args.hPagingQueue = page_queue_;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyPagingQueue(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Sends the driver-private power-optimization escape built by
+// thunk_proxy::FillinPowerOptPrivData(); `restore` is forwarded unchanged.
+// The escape status is only logged.
+void WDDMDevice::SetPowerOptimization(bool restore) {
+  void *priv_data;
+  int priv_size;
+
+  priv_size = thunk_proxy::GetPowerOptPrivDataSize();
+  priv_data = malloc(priv_size);
+  // A runtime check instead of assert(): asserts vanish under NDEBUG and the
+  // memset below would then write through a null pointer.
+  if (!priv_data) {
+    pr_err("fail to allocate private data\n");
+    return;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinPowerOptPrivData(priv_data, restore);
+
+  D3DKMT_ESCAPE d3dkmt_escape;
+  memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape));
+
+  d3dkmt_escape.hAdapter = adapter_;
+  d3dkmt_escape.hDevice = device_;
+  d3dkmt_escape.hContext = 0; //KMD only use device to identify the process
+  d3dkmt_escape.Type = D3DKMT_ESCAPE_DRIVERPRIVATE;
+  d3dkmt_escape.pPrivateDriverData = priv_data;
+  d3dkmt_escape.PrivateDriverDataSize = priv_size;
+  d3dkmt_escape.Flags.HardwareAccess = true;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape));
+  pr_debug("status %d, restore %d\n", status, restore);
+  free(priv_data);
+}
+
+void
WDDMDevice::UpdatePageFence(uint64_t fence_value) {
+  // Raises page_fence_value_ to fence_value; never lowers it.
+  uint64_t current = page_fence_value_.load();
+
+  // atomically set fence value when target is bigger than current one
+  do {
+    if (current >= fence_value)
+      break;
+  } while (!page_fence_value_.compare_exchange_weak(current, fence_value));
+}
+
+// Allocates a GpuMemory for this device and initializes it, importing an
+// existing allocation when create_info.dmabuf_fd > 0 and creating a new one
+// otherwise. *gpu_mem receives the object on success and stays nullptr on
+// failure (the object is deleted again).
+ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info,
+                                      GpuMemory **gpu_mem, gpusize *gpu_va) {
+  ErrorCode ret;
+
+  *gpu_mem = nullptr;
+  auto mem = new GpuMemory(this);
+  if (create_info.dmabuf_fd > 0)
+    ret = mem->ImportPhysicalHandle(create_info, gpu_va);
+  else
+    ret = mem->Init(create_info);
+  if (ret == ErrorCode::Success)
+    *gpu_mem = mem;
+  else
+    delete mem;
+
+  return ret;
+}
+
+// Locks the allocation via D3DKMTLock2 and returns the pointer it reports,
+// or NULL on failure.
+void *WDDMDevice::Lock(D3DKMT_HANDLE handle) {
+  D3DKMT_LOCK2 args = {0};
+  args.hDevice = device_;
+  args.hAllocation = handle;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTLock2(&args));
+  if (ret == STATUS_SUCCESS)
+    return args.pData;
+
+  pr_err("fail %x\n", ret);
+  return NULL;
+}
+
+// Counterpart of Lock(): releases the lock on the allocation.
+bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) {
+  D3DKMT_UNLOCK2 args = {0};
+  args.hDevice = device_;
+  args.hAllocation = handle;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTUnlock2(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Creates a virtual context on the node that EngineOrdinal() maps `engine`
+// to and stores its handle in *handle. Returns false when the engine has no
+// ordinal, the private data cannot be allocated, or the kernel call fails.
+bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) {
+  void *priv_data;
+  int priv_size;
+
+  int ordinal = EngineOrdinal(engine, &device_info_);
+  if (ordinal < 0)
+    return false;
+
+  priv_size = thunk_proxy::GetContextPrivDataSize();
+  priv_data = malloc(priv_size);
+  // Runtime check instead of assert(): must also hold in NDEBUG builds.
+  if (!priv_data) {
+    pr_err("fail to allocate private data\n");
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw());
+
+  D3DKMT_CREATECONTEXTVIRTUAL args = {0};
+  args.hDevice = device_;
+  args.EngineAffinity = 1 << 0;
+  args.NodeOrdinal = ordinal;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+  args.ClientHint = D3DKMT_CLIENTHINT_OPENCL;
+
+  if (IsHwsEnabled(engine))
+    args.Flags.HwQueueSupported = 1;
+  else
+    args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_);
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args));
+  if (ret == STATUS_SUCCESS) {
+    *handle = args.hContext;
+    free(priv_data);
+    return true;
+  }
+
+  free(priv_data);
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Destroys a context created by CreateContext().
+bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYCONTEXT args = {0};
+  args.hContext = handle;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyContext(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Queues a GPU-side wait on queue->context for `count` sync objects to reach
+// the matching entries of `values`.
+bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs,
+                         uint64_t *values, int count) {
+
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = queue->context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = values;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Queues a GPU-side signal on `context` that sets `count` sync objects to
+// the matching entries of `value`.
+bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs,
+                           uint64_t *value, int count) {
+  D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU args = {0};
+  args.hContext = context;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.MonitoredFenceValueArray = value;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Waits from the CPU on `count` sync objects; `wait_any` is passed through
+// as Flags.WaitAny.
+bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value,
+                         int count, bool wait_any) {
+  D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU args = {0};
+  args.hDevice = device_;
+  args.ObjectCount = count;
+  args.ObjectHandleArray = syncobjs;
+  args.FenceValueArray = value;
+  args.Flags.WaitAny = wait_any;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&args));
+  if (ret == STATUS_SUCCESS)
+    return true;
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Waits from the CPU until the paging sync object reaches the value
+// currently stored in page_fence_value_.
+bool WDDMDevice::WaitOnPagingFenceFromCpu() {
+  uint64_t page_fence_value = 0;
+
+  page_fence_value = page_fence_value_.load();
+  if (CpuWait(&page_syncobj_, &page_fence_value, 1, false))
+    return true;
+
+  return false;
+}
+
+// Creates a monitored fence and returns its handle and the CPU address of
+// its fence value through the out-parameters.
+bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) {
+  D3DKMT_CREATESYNCHRONIZATIONOBJECT2 args = {0};
+  args.hDevice = device_;
+  args.Info.Type = D3DDDI_MONITORED_FENCE;
+  args.Info.MonitoredFence.EngineAffinity = 1 << 0;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&args));
+  if (ret == STATUS_SUCCESS) {
+    *handle = args.hSyncObject;
+    *addr = (uint64_t *)args.Info.MonitoredFence.FenceValueCPUVirtualAddress;
+    pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n",
+             args.Info.MonitoredFence.FenceValueCPUVirtualAddress,
+             args.Info.MonitoredFence.FenceValueGPUVirtualAddress);
+
+    return true;
+  }
+
+  pr_err("fail %x\n", ret);
+  return false;
+}
+
+// Destroys a sync object; a failure is only logged.
+void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) {
+  D3DKMT_DESTROYSYNCHRONIZATIONOBJECT args = {0};
+  args.hSyncObject = handle;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroySynchronizationObject(&args));
+  if (ret != STATUS_SUCCESS)
+    pr_err("fail %x\n", ret);
+}
+
+// Computes cmdbuf_aql_frame_size_ (bytes reserved per AQL frame, 16-byte
+// aligned) and cmdbuf_size_ (frame size times cmdbuf_aql_frame_num_, 4 KiB
+// aligned) from the template sizes for the detected gfx major version.
+// NOTE(review): for major < 9 nothing assigns cmdbuf_aql_frame_size_ before
+// the "+=" below; confirm the member has an initializer in the header.
+void WDDMDevice::InitCmdbufInfo(void) {
+  if (device_info_.major == 9) {
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate);
+  } else if (device_info_.major >= 10) {
+    cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate);
+  }
+
+  if (device_info_.major >= 11) {
+    cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate);
+    cmdbuf_aql_frame_size_ += sizeof(DispatchProgramResourceRegs); // BuildComputeShaderParams
+  }
+
+  cmdbuf_aql_frame_size_ +=
+    sizeof(PM4MEC_COPY_DATA) * 2 +
+    sizeof(BarrierTemplate) * 2 +
+    sizeof(DispatchTemplate) +
+    sizeof(AtomicTemplate) * 2;
+
+  // Add safety margin to account for alignment and future additions
+  cmdbuf_aql_frame_size_ += 128;
+
+  cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10);
+
+  cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000);
+}
+
+// Number of 512-byte blocks needed to hold the packet's group segment
+// (rounded up).
+uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) {
+  static const uint32_t blk_sz = 512;
+  uint32_t total_sz = pkt->group_segment_size;
+  uint32_t blk_num = (total_sz + blk_sz - 1) / blk_sz;
+  return blk_num;
+}
+
+// Enumerates the adapters and, for every adapter with vendor id 0x1002 that
+// thunk_proxy::QueryAdapterSupported() accepts, appends a new WDDMDevice to
+// `devices` (node ids start at devices.size() + 1).
+// Returns STATUS_SUCCESS when enumeration completed, including when no
+// adapter is present. On failure the devices created by this call are
+// deleted and removed from `devices` again, so the caller is never left
+// holding dangling pointers.
+NTSTATUS WDDMCreateDevices(std::vector &devices)
+{
+  // Entries at or beyond this index belong to this call.
+  const size_t first_new = devices.size();
+  D3DKMT_ENUMADAPTERS2 args = {0};
+  NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    return ret;
+
+  if (!args.NumAdapters) {
+    return STATUS_SUCCESS;
+  }
+
+  D3DKMT_ADAPTERINFO *info = new D3DKMT_ADAPTERINFO[args.NumAdapters];
+  if (!info)
+    return STATUS_NO_MEMORY;
+
+  args.pAdapters = info;
+  ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args));
+  if (ret != STATUS_SUCCESS)
+    goto err_out0;
+
+  for (uint32_t i = 0; i < args.NumAdapters; i++) {
+    D3DKMT_QUERY_DEVICE_IDS query = {0};
+
+    ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS,
+                           &query, sizeof(query));
+    if (ret != STATUS_SUCCESS)
+      goto err_out1;
+
+    if (query.DeviceIds.VendorID != 0x1002)
+      continue;
+
+    if (thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID)) {
+      auto device = new WDDMDevice(
+          info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1);
+      if (!device) {
+        // ret still holds STATUS_SUCCESS here; report the failure.
+        ret = STATUS_NO_MEMORY;
+        goto err_out1;
+      }
+      devices.push_back(device);
+    }
+  }
+
+  delete[] info;
+  return STATUS_SUCCESS;
+
+ err_out1:
+  for (size_t i = first_new; i < devices.size(); i++)
+    delete devices[i];
+  devices.resize(first_new);
+ err_out0:
+  delete[] info;
+  return ret;
+}
+
+// Zeroes device_info_ and lets thunk_proxy::ParseAdapterInfo() fill it for
+// adapter_. Returns that call's result.
+bool WDDMDevice::ParseDeviceInfo() {
+  bool ret;
+
+  memset(&device_info_, 0, sizeof(device_info_));
+  ret = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_);
+  if (!ret)
+    return false;
+
+  return true;
+}
+
+// Releases the adapter_info buffer held by device_info_.
+void WDDMDevice::DestroyDeviceInfo() {
+  free(device_info_.adapter_info);
+}
+
+void
WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) {
+  // Reads the GPU and CPU clock counters of the compute engine's node.
+  // Either output pointer may be null. On failure the outputs are left
+  // untouched and the status is only logged at debug level.
+
+  uint32_t engine = GetComputeEngine();
+  int ordinal = EngineOrdinal(engine, &device_info_);
+
+  D3DKMT_QUERYCLOCKCALIBRATION args = {0};
+
+  /* LDA(Linked Display Adapter)
+   * In the LDA design multiple physical GPUs are linked together to be controlled
+   * as a single object from the point of view of power manager, GPU scheduler and
+   * GPU memory manager. The physical GPUs are represented by a single logical adapter
+   * object. There is a single DXGADAPTER object and a single KMD adapter object.
+   *
+   * Set PhysicalAdapterIndex to 0 by default in non-LDA mode.
+   */
+  args.hAdapter = adapter_;
+  args.NodeOrdinal = ordinal;
+  args.PhysicalAdapterIndex = 0;
+
+  NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&args));
+  if (status) {
+    pr_debug("status %d \n", status);
+  } else {
+    if (gpu)
+      *gpu = args.ClockData.GpuClockCounter;
+
+    if (cpu)
+      *cpu = args.ClockData.CpuClockCounter;
+  }
+}
+
+// Creates the context for `queue`, allocates a system-memory command buffer
+// when the caller did not supply one (queue->cmdbuf_addr == 0), and runs
+// queue->Init(). On failure everything created here is released again and
+// the queue no longer references the freed command buffer.
+// NOTE(review): a non-zero result of queue->Init() is treated as failure
+// here - confirm against WDDMQueue::Init's return convention.
+bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
+  if (!CreateContext(queue->queue_engine, &queue->context))
+    return false;
+
+  GpuMemory *gpu_mem = nullptr;
+  if (queue->cmdbuf_addr == 0) {
+    GpuMemoryCreateInfo create_info{};
+    create_info.size = queue->cmdbuf_size;
+    create_info.domain = thunk_proxy::kSystem;
+
+    auto code = CreateGpuMemory(create_info, &gpu_mem);
+    if (code != ErrorCode::Success)
+      goto err_out0;
+
+    queue->cmdbuf = gpu_mem->GetGpuMemoryHandle();
+    queue->cmdbuf_addr = gpu_mem->GpuAddress();
+  }
+
+  if (queue->Init())
+    goto err_out1;
+
+  return true;
+
+err_out1:
+  if (gpu_mem != nullptr) {
+    // The buffer was allocated above; clear the queue's references before
+    // freeing it so a retry or DestroyQueue() cannot touch freed memory.
+    queue->cmdbuf = {};
+    queue->cmdbuf_addr = 0;
+    delete gpu_mem;
+  }
+err_out0:
+  DestroyContext(queue->context);
+
+  return false;
+}
+
+// Finalizes the queue, deletes the GpuMemory behind queue->cmdbuf and
+// destroys the queue's context.
+void WDDMDevice::DestroyQueue(WDDMQueue *queue) {
+
+  queue->Fini();
+
+  auto cmdbuf_mem = GpuMemory::Convert(queue->cmdbuf);
+  delete cmdbuf_mem;
+
+  DestroyContext(queue->context);
+}
+
+// Submits a command buffer through D3DKMTSubmitCommand on queue->context and
+// then queues a GPU signal of queue->syncobj to `fence_value`.
+bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                 uint64_t command_size, uint64_t fence_value) {
+  void *priv_data;
+  int priv_size;
+
+  priv_size = thunk_proxy::GetSubmitPrivDataSize();
+  priv_data = malloc(priv_size);
+  // Runtime check instead of assert(): must also hold in NDEBUG builds.
+  if (!priv_data) {
+    pr_err("fail to allocate private data\n");
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false);
+
+  D3DKMT_SUBMITCOMMAND args = {0};
+  args.Commands = command_addr;
+  args.CommandLength = command_size;
+  args.BroadcastContextCount = 1;
+  args.BroadcastContext[0] = queue->context;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args));
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    free(priv_data);
+    return false;
+  }
+
+  free(priv_data);
+
+  if (!GpuSignal(queue->context, &queue->syncobj, &fence_value, 1))
+    return false;
+
+  return true;
+}
+
+// Creates a hardware queue on queue->context and records the queue handle,
+// its progress fence and the fence's CPU address in `queue`.
+bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) {
+  void *priv_data;
+  int priv_size;
+
+  priv_size = thunk_proxy::GetHwQueuePrivDataSize();
+  priv_data = malloc(priv_size);
+  // Runtime check instead of assert(): must also hold in NDEBUG builds.
+  if (!priv_data) {
+    pr_err("fail to allocate private data\n");
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  bool FwManagedGfxState = SupportStateShadowingByCpFw();
+  thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio);
+
+  D3DKMT_CREATEHWQUEUE createHwQueue = {0};
+  createHwQueue.hHwContext = queue->context;
+  createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_);
+  createHwQueue.pPrivateDriverData = priv_data;
+  createHwQueue.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue));
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    free(priv_data);
+    return false;
+  }
+
+  free(priv_data);
+
+  queue->queue = createHwQueue.hHwQueue;
+  queue->syncobj = createHwQueue.hHwQueueProgressFence;
+  queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress;
+
+  return true;
+}
+
+// Destroys the hardware queue referenced by queue->queue.
+bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) {
+  D3DKMT_DESTROYHWQUEUE DestroyHwQueue = {
+    .hHwQueue = queue->queue,
+  };
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyHwQueue(&DestroyHwQueue));
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    return false;
+  }
+
+  return true;
+}
+
+// Submits a command buffer to the hardware queue; `fence_value` is passed as
+// the queue's progress fence id for this submission.
+bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr,
+                                 uint64_t command_size, uint64_t fence_value) {
+  void *priv_data;
+  int priv_size;
+
+  priv_size = thunk_proxy::GetSubmitPrivDataSize();
+  priv_data = malloc(priv_size);
+  // Runtime check instead of assert(): must also hold in NDEBUG builds.
+  if (!priv_data) {
+    pr_err("fail to allocate private data\n");
+    return false;
+  }
+  memset(priv_data, 0, priv_size);
+  thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true);
+
+  D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0};
+  args.hHwQueue = queue->queue;
+  args.HwQueueProgressFenceId = fence_value;
+  args.CommandBuffer = command_addr;
+  args.CommandLength = command_size;
+  args.pPrivateDriverData = priv_data;
+  args.PrivateDriverDataSize = priv_size;
+
+  NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args));
+  if (ret != STATUS_SUCCESS) {
+    pr_err("fail %x\n", ret);
+    free(priv_data);
+    return false;
+  }
+
+  free(priv_data);
+
+  return true;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
new file mode 100644
index 0000000000..e374be8867
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp
@@ -0,0 +1,594 @@
+#include
+#include
+#include
+#include "impl/wddm/gpu_memory.h"
+#include "impl/wddm/device.h"
+#include "util/utils.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Number of WDDMDevice::GpuMemoryChunkSize chunks needed to cover `size`
+// (rounded up).
+size_t GpuMemory::CalcChunkNumbers(gpusize size) {
+  const auto chunk_size = WDDMDevice::GpuMemoryChunkSize;
+  return (size + chunk_size - 1) / chunk_size;
+}
+
+// Rounds `size` up to the allocation granularity: the big-page alignment
+// chosen below for local memory when the device enables it, 4 KiB otherwise.
+gpusize GpuMemory::AdjustSize(gpusize size) const {
+  const auto &device_info = device_->DeviceInfo();
+
+  if (device_info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal) {
+    uint32_t alignment = device_info.big_page_alignment_size;
+    // BigPage is only supported for allocations > bigPageMinAlignment.
+    // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD.
+    // We do either LargePage or BigPage alignment, whichever has a higher value.
+    if ((device_info.hw_big_page_min_alignment_size > 0) && (size > device_info.hw_big_page_min_alignment_size)) {
+      alignment = std::max(alignment, device_info.hw_big_page_min_alignment_size);
+      if (size > device_info.hw_big_page_alignment_size)
+        alignment = std::max(alignment, device_info.hw_big_page_alignment_size);
+    }
+    if (alignment > 0)
+      size = AlignUp(size, alignment);
+  } else {
+    const size_t min_size = 4096;
+    size = AlignUp(size, min_size);
+  }
+  return size;
+}
+
+// Starts out with no allocations, no resource and no memory fd.
+GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
+  num_allocations_ = 0;
+  alloc_handles_ptr_ = nullptr;
+  alloc_handle_ = 0;
+  resource_ = 0;
+  mem_fd_ = -1;
+}
+
+// Releases the GPU virtual address range, the physical memory and, when one
+// was allocated, the handle-aperture address.
+GpuMemory::~GpuMemory() {
+  FreeGpuVirtualAddress(GpuAddress(), Size());
+  FreePhysicalMemory();
+  if (desc_.handle_ape_addr > 0)
+    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
+}
+
+// Copies the creation parameters into desc_, sizes the allocation-handle
+// array and then, depending on the flags, creates physical memory only,
+// reserves a virtual range only, or reserves + creates + maps + makes the
+// memory resident. If a step after the reservation fails, the scope guard
+// frees what was set up so far.
+ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
+  desc_.domain = create_info.domain;
+  desc_.adapter_luid = device_->GetLuid();
+  desc_.client_size = create_info.size;
+  desc_.alignment = create_info.alignment;
+  desc_.mem_flags = create_info.mem_flags;
+  desc_.engine_flag = create_info.engine_flag;
+  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
+  desc_.flags.is_physical_only = create_info.flags.physical_only;
+  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
+  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
+  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
+  desc_.flags.is_va_required = create_info.flags.alloc_va;
+  desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object;
+
+  /* we can't tell the allocation is regular vmm or ipc mem at creation stage,
+     they share same creation parameters, so forcing all vram allocations to
+     sharable to support IPC mem */
+  if (create_info.flags.interprocess ||
+      desc_.domain == thunk_proxy::AllocDomain::kLocal)
+    desc_.flags.is_shared = true;
+
+  desc_.flags.is_locked = create_info.flags.locked;
+  desc_.size = AdjustSize(desc_.client_size);
+
+  if (IsUserMemory() || IsSystem())
+    desc_.cpu_addr = create_info.user_ptr;
+
+  // A single chunk uses the inline handle; more chunks get a heap array.
+  num_allocations_ = CalcChunkNumbers(Size());
+  if (num_allocations_ == 1)
+    alloc_handles_ptr_ = &alloc_handle_;
+  else
+    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
+
+  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
+
+  auto code = ErrorCode::Success;
+
+  if (IsPhysicalOnly()) {
+    code = CreatePhysicalMemory();
+    if (code == ErrorCode::Success)
+      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
+    return code;
+  }
+
+  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
+  if (IsVirtual() || (code != ErrorCode::Success))
+    return code;
+
+  bool physical_created = false;
+
+  // Runs on every return below; it only acts when `code` holds an error.
+  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
+    if (code != ErrorCode::Success) {
+
+      if (physical_created) {
+        FreePhysicalMemory();
+      }
+      FreeGpuVirtualAddress(GpuAddress(), Size());
+    }
+  });
+  (void)guard;
+
+  code = CreatePhysicalMemory();
+  if (code != ErrorCode::Success)
+    return code;
+
+  physical_created = true;
+
+  code = MapGpuVirtualAddress(GpuAddress(), Size());
+  if (code != ErrorCode::Success)
+    return code;
+
+  code = MakeResident();
+  if (code != ErrorCode::Success)
+    return code;
+
+  if (!GetDevice()->WaitOnPagingFenceFromCpu())
+    code = ErrorCode::Unknown;
+
+  return code;
+}
+
+// Remaps [addr, addr + size) chunk by chunk with NoAccess protection,
+// starting at the chunk that contains `offset`.
+// NOTE(review): unlike MapGpuVirtualAddress(), a NotReady result for the
+// last chunk is returned as-is rather than turned into Success - confirm
+// callers expect that.
+ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+
+  // Skip whole chunks in front of `offset`.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.SizeInPages = block_size / 0x1000;
+    args.Protection.NoAccess = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code == ErrorCode::NotReady)
+      device_->UpdatePageFence(args.PagingFenceValue);
+    else if (code != ErrorCode::Success)
+      break;
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0; // reset second unmapped allocation offset to zero
+    i += 1;
+  }
+
+  return code;
+}
+
+// Maps [addr, addr + size) chunk by chunk with write access, starting
+// `offset` bytes into the allocation. A NotReady result records the paging
+// fence value and counts as success. If a chunk fails, the chunks mapped so
+// far are remapped with NoAccess and the failure code is returned.
+ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
+
+  auto code = ErrorCode::Success;
+  size_t i = 0;
+  auto map_addr = addr;
+  auto map_size = size;
+  const size_t _4K = 0x1000;
+
+  // Skip whole chunks in front of `offset`.
+  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
+    offset -= WDDMDevice::GpuMemoryChunkSize;
+    i += 1;
+  }
+  const size_t first_chunk = i;
+  const auto first_chunk_offset = offset;
+  /* Found two limitations for local vram:
+   * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail
+   * 2. visible vram can not be cpu mapped when command submission or after gpu mapped
+   */
+  while (map_size > 0) {
+    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+    D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+    args.hPagingQueue = device_->PagingQueue();
+    args.BaseAddress = map_addr;
+    args.hAllocation = GetAllocationHandle(i);
+    args.OffsetInPages = offset / _4K;
+    args.SizeInPages = block_size / _4K;
+    args.Protection.Write = 1;
+
+    code = d3dthunk::MapGpuVirtualAddress(&args);
+
+    if (code != ErrorCode::Success) {
+      if (code == ErrorCode::NotReady) {
+        const uint64_t fence_value = args.PagingFenceValue;
+        device_->UpdatePageFence(fence_value);
+        code = ErrorCode::Success;
+      } else
+        break;
+    }
+
+    map_addr += block_size;
+    map_size -= block_size;
+    offset = 0; // reset second mapped allocation offset to zero
+    i++;
+  }
+
+  if (code != ErrorCode::Success) {
+    // Map failed, unmap partial mapped block
+    offset = first_chunk_offset;
+    map_addr = addr;
+    map_size = size;
+    for (size_t j = first_chunk; j < i; j++) {
+      auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);
+
+      D3DDDI_MAPGPUVIRTUALADDRESS args{};
+
+      args.hPagingQueue = device_->PagingQueue();
+      args.BaseAddress = map_addr;
+      args.hAllocation = 0;
+      args.OffsetInPages = offset / _4K;
+      args.SizeInPages = block_size / _4K;
+      args.Protection.NoAccess = 1;
+
+      auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
+      if (unmap_code == ErrorCode::NotReady)
+        device_->UpdatePageFence(args.PagingFenceValue);
+
+      map_addr += block_size;
+      map_size -= block_size;
+    }
+  }
+
+  return code;
+}
+
+// Reserves a GPU virtual range and stores it in desc_.gpu_addr. System-domain
+// IPC memory (exporter or imported memfd) goes through ReserveIPCSysMem() and
+// keeps the resulting fd in mem_fd_; everything else uses the regular
+// reservation. For system memory the CPU address is set to the same value.
+ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
+  ErrorCode status;
+  gpusize gpu_virt_addr = 0;
+  if ((desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd)
+      && desc_.domain == thunk_proxy::AllocDomain::kSystem) {
+    int mfd = (mem_fd_ > -1)? mem_fd_ : -1;
+    status = dxg_runtime->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked);
+    if (status == ErrorCode::Success)
+      mem_fd_ = mfd;
+  } else {
+    status = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
+                                                   desc_.flags.is_locked);
+  }
+
+  if (status == ErrorCode::Success) {
+    desc_.gpu_addr = gpu_virt_addr;
+
+    if (IsSystem())
+      desc_.cpu_addr = reinterpret_cast(desc_.gpu_addr);
+  }
+  return status;
+}
+
+// Releases the virtual range. When a memory fd is held, the whole IPC range
+// (GpuAddress(), Size()) is freed and the arguments are ignored; otherwise
+// the given range is freed, and a zero base address is a no-op.
+ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
+  if (mem_fd_ > -1)
+    return dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
+
+  return base_addr != 0 ?
+      dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size) :
+      ErrorCode::Success;
+}
+
+// Creates one kernel allocation per chunk in a single CreateAllocation call
+// and stores the resulting handles in alloc_handles_ptr_ (and the resource
+// handle in resource_). Shared memory additionally carries a SharedHandleInfo
+// as private runtime data.
+ErrorCode GpuMemory::CreatePhysicalMemory() {
+
+  assert(!IsVirtual() && NumChunks() > 0);
+
+  const auto num_allocations = NumChunks();
+  void *priv_drv_data;
+  void *priv_alloc_data;
+  int priv_drv_data_size;
+  int priv_alloc_data_size;
+
+  // One buffer holds, in order: the driver private data, one private
+  // allocation blob per chunk, and one D3DDDI_ALLOCATIONINFO2 per chunk.
+  thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size);
+  int total_size = priv_drv_data_size +
+                   num_allocations * priv_alloc_data_size +
+                   num_allocations * sizeof(D3DDDI_ALLOCATIONINFO2);
+  priv_drv_data = malloc(total_size);
+  if (!priv_drv_data)
+    return ErrorCode::OutOfMemory;
+
+  memset(priv_drv_data, 0, total_size);
+  thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size);
+
+  priv_alloc_data = static_cast(priv_drv_data) + priv_drv_data_size;
+  auto alloc_info = reinterpret_cast(
+      static_cast(priv_alloc_data) + priv_alloc_data_size * num_allocations);
+
+  size_t size = desc_.size;
+  uint64_t addr = desc_.gpu_addr;
+  char *cpu_addr = static_cast(desc_.cpu_addr);
+  const auto &device_info = GetDevice()->DeviceInfo();
+
+  for (size_t i = 0; i < num_allocations; i++) {
+
+    void* priv_data = (void*)((char*)priv_alloc_data + priv_alloc_data_size * i);
+    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);
+
+    if (IsUserMemory() || IsSystem()) {
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
+      alloc_info[i].pSystemMem = static_cast(cpu_addr);
+      cpu_addr += block_size;
+    } else {
+      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
+    }
+
+    size -= block_size;
+    addr += block_size;
+
+    alloc_info[i].pPrivateDriverData = priv_data;
+    alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size;
+    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
+  }
+
+  D3DKMT_CREATEALLOCATION args = {};
+  args.hDevice = device_->DeviceHandle();
+  args.pPrivateDriverData = priv_drv_data;
+  args.PrivateDriverDataSize = priv_drv_data_size;
+  args.NumAllocations = num_allocations;
+  args.pAllocationInfo2 = alloc_info;
+
+  /* The PhysicallyContiguous flag causes allocation failure
+   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
+   */
+
+  SharedHandleInfo shared_info;
+  if (IsShared()) {
+    shared_info.size = desc_.size;
+    shared_info.client_size = desc_.client_size;
+    shared_info.domain = desc_.domain;
+    shared_info.adapter_luid = desc_.adapter_luid;
+    shared_info.flags = reinterpret_cast(desc_.flags.reserved);
+    shared_info.mem_flags = desc_.mem_flags;
+    shared_info.pid = dxg_runtime->parent_pid;
+    shared_info.gpu_addr = desc_.gpu_addr;
+    args.pPrivateRuntimeData = &shared_info;
+    args.PrivateRuntimeDataSize = sizeof(shared_info);
+    args.Flags.NtSecuritySharing = 1;
+    args.Flags.CreateShared = 1;
+    args.Flags.CreateResource = 1;
+  }
+
+  auto status = d3dthunk::CreateAllocation(&args);
+  if (status == ErrorCode::Success) {
+    for (size_t i = 0; i < num_allocations; i++)
+      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;
+
+    resource_ = args.hResource;
+  }
+  free(priv_drv_data);
+  return status;
+}
+
+ErrorCode GpuMemory::FreePhysicalMemory() {
+  auto code = ErrorCode::Success;
+
+  if
(alloc_handles_ptr_ == nullptr || (NumChunks() == 1 && *alloc_handles_ptr_ == 0)) + return code; + + code = d3dthunk::DestroyAllocation(device_->DeviceHandle(), + resource_, + NumChunks(), + alloc_handles_ptr_); + if (NumChunks() > 1) + delete[] alloc_handles_ptr_; + + alloc_handles_ptr_ = nullptr; + return code; +} + +ErrorCode GpuMemory::MakeResident() { + + D3DDDI_MAKERESIDENT args = {}; + args.hPagingQueue = device_->PagingQueue(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + args.Flags.CantTrimFurther = 1; + + auto code = d3dthunk::MakeResident(&args); + if (code == ErrorCode::NotReady) { + const auto fence_value = args.PagingFenceValue; + device_->UpdatePageFence(fence_value); + code = ErrorCode::Success; + } + return code; +} + +ErrorCode GpuMemory::Evict() { + + D3DKMT_EVICT args = {}; + args.hDevice = device_->DeviceHandle(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + + return d3dthunk::Evict(&args); +} + +ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) { + if (mem_fd_ > -1) { + *dmabuf_fd = mem_fd_; + return ErrorCode::Success; + } + + if (IsShared()) + return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd); + else + return ErrorCode::UnSupported; +} + + +ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) { + D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args; + int dmabuf_fd = create_info.dmabuf_fd; + + if (dmabuf_fd <= 0) + return ErrorCode::InvalidateParams; + + if(create_info.flags.sysmem_ipc_sig_importer) { + // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference + mem_fd_ = dup(dmabuf_fd); + desc_.client_size = create_info.size; + desc_.size = AdjustSize(desc_.client_size); + desc_.domain = thunk_proxy::AllocDomain::kSystem; + desc_.adapter_luid = device_->GetLuid(); + desc_.alignment = 0x1000; + desc_.mem_flags = create_info.mem_flags; + 
desc_.engine_flag = create_info.engine_flag; + desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer; + desc_.flags.is_va_required = create_info.flags.alloc_va; + desc_.flags.is_virtual = create_info.flags.virtual_alloc; + desc_.flags.is_physical_only = create_info.flags.physical_only; + desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous; + desc_.flags.is_locked = create_info.flags.locked; + + auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment); + if (code != ErrorCode::Success) + return code; + + bool physical_created = false; + auto guard = MakeScopeGuard([this, &physical_created, &code]() { + if (code != ErrorCode::Success) { + if (physical_created) + FreePhysicalMemory(); + FreeGpuVirtualAddress(GpuAddress(), Size()); + } + }); + (void)guard; + + num_allocations_ = CalcChunkNumbers(Size()); + if (num_allocations_ == 1) + alloc_handles_ptr_ = &alloc_handle_; + else + alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_]; + + memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle)); + + code = CreatePhysicalMemory(); + if (code != ErrorCode::Success) + return code; + + physical_created = true; + + code = MapGpuVirtualAddress(GpuAddress(), Size()); + if (code != ErrorCode::Success) + return code; + + code = MakeResident(); + if (code != ErrorCode::Success) + return code; + + if (!GetDevice()->WaitOnPagingFenceFromCpu()) + code = ErrorCode::Unknown; + + return code; + } else { + // vmem importer / ipc vram importer + memset(&query_args, 0, sizeof(query_args)); + query_args.hDevice = device_->DeviceHandle(); + query_args.hNtHandle = reinterpret_cast(dmabuf_fd); + auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args); + if (ret != ErrorCode::Success) { + pr_err("query resource info from nt handle failed %d\n", static_cast(ret)); + return ErrorCode::InvalidateParams; + } + pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d," 
+ " runtime data size %#x total driver data size %#x resource data size=%#x\n", + dmabuf_fd, + query_args.NumAllocations, + query_args.PrivateRuntimeDataSize, + query_args.TotalPrivateDriverDataSize, + query_args.ResourcePrivateDriverDataSize); + + SharedHandleInfo shared_info; + if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) { + pr_err("shared hanle info size mismatch:%d vs %ld\n", + query_args.PrivateRuntimeDataSize, sizeof(shared_info)); + return ErrorCode::UnSupported; + } + + uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) + + query_args.TotalPrivateDriverDataSize + + query_args.ResourcePrivateDriverDataSize; + D3DDDI_OPENALLOCATIONINFO2 *open_info = + reinterpret_cast (calloc(1, total_size)); + if (!open_info) { + pr_err("alloc open_info failed, NumAllocations:%d\n", + query_args.NumAllocations); + return ErrorCode::OutOfMemory; + } + + auto guard = MakeScopeGuard([&open_info]() { free(open_info); }); + + alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations]; + + D3DKMT_OPENRESOURCEFROMNTHANDLE open_args; + memset(&open_args, 0, sizeof(open_args)); + open_args.hDevice = query_args.hDevice; + open_args.hNtHandle = query_args.hNtHandle; + open_args.NumAllocations = query_args.NumAllocations; + open_args.pOpenAllocationInfo2 = open_info; + open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize; + open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast + (open_args.pOpenAllocationInfo2 + open_args.NumAllocations); + open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize; + open_args.pResourcePrivateDriverData = reinterpret_cast + (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) + + open_args.TotalPrivateDriverDataBufferSize); + open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize; + open_args.pPrivateRuntimeData = reinterpret_cast (&shared_info); + + ret = d3dthunk::OpenResourceFromNtHandle(&open_args); + if (ret != 
ErrorCode::Success) { + ret = ErrorCode::InvalidateParams; + pr_err("open resource failed %d\n", static_cast(ret)); + return ret; + } + if (shared_info.pid == dxg_runtime->parent_pid && + create_info.flags.alloc_va && + IsSameAdapter(shared_info.adapter_luid) && + shared_info.gpu_addr) { + pr_info("import from same device and samve process, va is required. " + "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n"); + if (gpu_addr) + *gpu_addr = shared_info.gpu_addr; + return ErrorCode::SameProcessSameDevice; + } + + desc_.size = shared_info.size; + desc_.client_size = shared_info.client_size; + desc_.domain = shared_info.domain; + desc_.flags.reserved = shared_info.flags; + desc_.mem_flags = shared_info.mem_flags; + desc_.adapter_luid = shared_info.adapter_luid; + resource_ = open_args.hResource; + num_allocations_ = open_args.NumAllocations; + for (int i = 0; i < num_allocations_; i++) + alloc_handles_ptr_[i] = open_info[i].hAllocation; + + desc_.flags.is_va_required = create_info.flags.alloc_va; + if (desc_.flags.is_va_required) { + desc_.flags.is_imported_vram_ipc = 1; + ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment); + if (ret != ErrorCode::Success) + pr_err("failed to allocate svm range, error:%d\n", static_cast(ret)); + + return ret; + } else { + desc_.flags.is_imported_vram_vmem = 1; + return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr); + } + } +} + +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp new file mode 100644 index 0000000000..44658819cb --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp @@ -0,0 +1,1210 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. 
All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include + +#include "impl/wddm/queue.h" +#include "impl/registers.h" + +#include "impl/hsa/hsa.h" +#include "impl/hsa/hsa_ven_amd_loader.h" +extern hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal); +extern hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); +extern void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value); +extern hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address); + +namespace wsl { +namespace thunk { + +hsa_status_t WDDMQueue::SwsInit(void) { + if (!device->CreateSyncobj(&syncobj, &sync_addr)) + return HSA_STATUS_ERROR; + + if (device->AllocUserQueueMemFromUMD()) { + + GpuMemory *gpu_mem = nullptr; + GpuMemoryCreateInfo create_info{}; + + create_info.domain = thunk_proxy::kUserQueue; + create_info.size = device->GetSwsQueueSize(); + create_info.engine_flag = thunk_proxy::QueueEngine2EngineFlag(queue_engine); + + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) { + device->DestroySyncobj(syncobj); + return HSA_STATUS_ERROR; + } + + queue_mem = gpu_mem->GetGpuMemoryHandle(); + queue = gpu_mem->GetAllocationHandle(0); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsFini(void) { + device->DestroySyncobj(syncobj); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToSwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsInit(void) { + if (!device->CreateHwQueue(this)) + return HSA_STATUS_ERROR; + + return 
HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsFini(void) { + if (!device->DestroyHwQueue(this)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToHwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) { + if (!use_hws) + return HSA_STATUS_SUCCESS; + + thunk_proxy::SchedLevel new_prio = ConvertSchedLevel(priority); + if (prio == new_prio) + return HSA_STATUS_SUCCESS; + + pr_debug("set prio %d -> %d\n", prio, new_prio); + device->DestroyHwQueue(this); + + prio = new_prio; + return HwsInit(); +} + +void ComputeQueue::HandleError(hsa_status_t status) { + hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal; + hsa_signal_value_t val = -1; + + struct queue_error_t { + uint32_t code; + hsa_status_t status; + }; + static const queue_error_t QueueErrors[] = { + {2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS}, + {4, HSA_STATUS_ERROR_INVALID_ALLOCATION}, + {8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT}, + //{16, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + {32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT}, + {64, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + //{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS}, + //{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION}, + //{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION}, + {0x80000000, HSA_STATUS_ERROR_EXCEPTION}, + }; + for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) { + if (QueueErrors[i].status == status) { + val = QueueErrors[i].code; + pr_err("error %d, sig_val %ld\n", status, val); + break; + } + } + + if (sig.handle) { + hsakmt_hsa_signal_store_screlease(sig, val); + } + if (error_code_) { + error_code_->store(val, std::memory_order_release); + } +} + +void ComputeQueue::AqlToPm4Thread(ComputeQueue *queue) { + + // This timing 
system is used for sleeping this Thread + // when one packet is invalid for about 2 seconds. + std::chrono::steady_clock::time_point start_time, time; + // Set the polling timeout value for 2 seconds + const std::chrono::milliseconds kMaxElapsed(2000); + uint64_t current_position = queue->GetAqlWriteIndex(); + bool sleep = false; + start_time = std::chrono::steady_clock::now(); + + while (true) { + if (!queue->IsInvalidPacket()) { + hsa_status_t status = queue->Process(); + if (status != HSA_STATUS_SUCCESS) { + pr_err("process compute queue fail status = %08x\n", status); + queue->HandleError(status); + break; + } + sleep = false; + } else { + if (current_position == queue->GetAqlWriteIndex()) { + time = std::chrono::steady_clock::now(); + if (time - start_time > kMaxElapsed) + sleep = true; + } else { + start_time = std::chrono::steady_clock::now(); + current_position = queue->GetAqlWriteIndex(); + sleep = false; + } + } + + if ((queue->GetRingWptr()->load() > queue->GetRingRptr()->load()) && !sleep) + continue; + + std::unique_lock lock(queue->thread_cond_lock_); + // CPU wait for valid packet + if (queue->GetRingWptr()->load() <= queue->GetRingRptr()->load() || + (sleep && queue->IsInvalidPacket())) { + if (queue->thread_stop_) + break; + pr_debug("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + queue->ring, queue->GetRingWptr()->load(), queue->GetRingRptr()->load()); + queue->thread_cond_.wait(lock); + } + } + + pr_debug("aql to pm4 thread %p exit\n", queue->ring); +} + +ComputeQueue::ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws) : + WDDMQueue(device, 0, cmdbuf_size, engine, use_hws), + ring(ring), + ring_size(ring_size), + ring_wptr(ring_wptr), + ring_rptr(ring_rptr), + error_code_(reinterpret_cast*>(error_addr)), + ib_start_addr(0), + ib_size(0), + sync_point(0), + 
cmdbuf_aql_frame_write_index(0), + cmdbuf_aql_frame_size(0), + needs_barrier(true), + ready_to_submit(false), + platform_atomic_support_(false), + signal_addr_(NULL), + thread_stop_(false), + max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()), + dispatch_waves_(0), + scratch_size_per_wave_(0), + scratch_size_(0), + total_scratch_size_(0), + scratch_base_(nullptr) { + bool ret = device->CreateQueue(this); + assert(ret); + + GpuMemoryCreateInfo create_info{}; + create_info.size = dxg_runtime->page_size; + create_info.domain = thunk_proxy::kSystem; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + assert(code == ErrorCode::Success); + amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle(); + amd_queue_ = reinterpret_cast(gpu_mem->GpuAddress()); + + amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id)); + aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this); + + if (device->Major() >= 11) + scratch_mem_alignment_size_ = 256; + else + scratch_mem_alignment_size_ = 1024; +} + +ComputeQueue::~ComputeQueue() { + thread_cond_lock_.lock(); + thread_stop_ = true; + thread_cond_lock_.unlock(); + thread_cond_.notify_one(); + aql_to_pm4_thread_.join(); + + //doorbell_signal_->Release(); + + device->DestroyQueue(this); + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + auto amd_queue_gpu_mem = GpuMemory::Convert(amd_queue_mem_); + delete amd_queue_gpu_mem; +} + +void ComputeQueue::InitScratchSRD() { + // Populate scratch resource descriptor + SQ_BUF_RSRC_WORD0 srd0; + + uintptr_t scratch_base = uintptr_t(scratch_base_); + srd0.bits.BASE_ADDRESS = scratch_base; + + uint32_t srd1_u32; + + if (device->Major() < 11) { + SQ_BUF_RSRC_WORD1 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.CACHE_SWIZZLE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = 
srd1.u32All; + } else { + SQ_BUF_RSRC_WORD1_GFX11 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = srd1.u32All; + } + + SQ_BUF_RSRC_WORD2 srd2; + + srd2.bits.NUM_RECORDS = scratch_size_; + + uint32_t srd3_u32; + + if (device->Major() < 10) { + SQ_BUF_RSRC_WORD3 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = 0; + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESOURCE_LEVEL = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = 
SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.WRITE_COMPRESS_ENABLE = 0; + srd3.bits.COMPRESSION_EN = 0; + srd3.bits.COMPRESSION_ACCESS_MODE = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } + + // Update Queue's Scratch descriptor's property + amd_queue_->scratch_resource_descriptor[0] = srd0.u32All; + amd_queue_->scratch_resource_descriptor[1] = srd1_u32; + amd_queue_->scratch_resource_descriptor[2] = srd2.u32All; + amd_queue_->scratch_resource_descriptor[3] = srd3_u32; + + // Populate flat scratch parameters in amd_queue_. + amd_queue_->scratch_backing_memory_location = scratch_base; + + // For backwards compatibility this field records the per-lane scratch + // for a 64 lane wavefront. If scratch was allocated for 32 lane waves + // then the effective size for a 64 lane wave is halved. 
+ amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64; + + uint64_t num_waves; + if (device->Major() < 11) { + COMPUTE_TMPRING_SIZE tmpring_size; + // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_ + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + num_waves = scratch_size_ / scratch_size_per_wave_; + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else if (device->Major() == 11) { + COMPUTE_TMPRING_SIZE_GFX11 tmpring_size; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX11 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else { + COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX12 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } + + return; +} + +uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet) +{ + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; + + uint64_t groups = ((uint64_t(packet->grid_size_x) + packet->workgroup_size_x - 1) / + packet->workgroup_size_x) * + ((uint64_t(packet->grid_size_y) + packet->workgroup_size_y - 1) / + packet->workgroup_size_y) * + ((uint64_t(packet->grid_size_z) + packet->workgroup_size_z - 1) / + packet->workgroup_size_z); + const uint32_t cu_count = device->ComputeUnitCount(); + const 
uint32_t engines = device->NumShaderEngine(); + + const uint32_t symmetric_cus = AlignDown(cu_count, engines); + const uint32_t asymmetryPerRound = cu_count - symmetric_cus; + const uint64_t rounds = groups / cu_count; + const uint64_t asymmetricGroups = rounds * asymmetryPerRound; + const uint64_t symmetricGroups = groups - asymmetricGroups; + + uint64_t maxGroupsPerEngine = + ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0); + + // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each + // engine. + if (device->Major() >= 10 && + maxGroupsPerEngine < 16 && + lanes_per_group * maxGroupsPerEngine < 256) { + uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group; + maxGroupsPerEngine = std::min(groups_per_interleave, uint64_t(16ul)); + } + + return maxGroupsPerEngine * engines; +} + +uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, + bool wave32) +{ + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; + + return (lanes_per_group + lanes_per_wave - 1) / lanes_per_wave; +} + +bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) { + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + const uint64_t size_per_thread = AlignUp(packet->private_segment_size, + scratch_mem_alignment_size_ / lanes_per_wave); + + uint64_t groups = CalcDispatchGroups(packet); + uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32); + + // For packet batching, the maximum value must be used to fit all packets. 
+ scratch_size_per_wave_ = std::max(size_per_thread * lanes_per_wave, scratch_size_per_wave_); + dispatch_waves_ = std::max(groups * waves_per_group, dispatch_waves_); + + const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_; + const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_; + + scratch_size_ = std::min(dispatch_size, max_scratch_size); + + if (total_scratch_size_ >= scratch_size_) + return true; + + pr_debug("need realloc scratch buffer, size %x -> %x\n", + total_scratch_size_, scratch_size_); + + GpuMemoryCreateInfo create_info{}; + create_info.size = scratch_size_; + create_info.domain = thunk_proxy::kLocal; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + return false; + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + total_scratch_size_ = scratch_size_; + scratch_base_ = reinterpret_cast(gpu_mem->GpuAddress()); + scratch_mem_ = gpu_mem->GetGpuMemoryHandle(); + + InitScratchSRD(); + return true; +} + +bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) { + if (scratch_base_offset_array_.empty()) + return true; + + for (size_t i = 0; i < scratch_base_offset_array_.size(); i++) { + uint32_t *p_compute_user_data = + reinterpret_cast(addr + scratch_base_offset_array_[i]); + if (device->Major() >= 11) { + p_compute_user_data[0] = Ptr48Low32(scratch_base_); + p_compute_user_data[1] = Ptr48High8(scratch_base_); + } else { + p_compute_user_data[0] = PtrLow32(scratch_base_); + p_compute_user_data[1] = (p_compute_user_data[1] & 0xffff0000) | PtrHigh32(scratch_base_); + } + } + scratch_base_offset_array_.clear(); + + return true; +} + +uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) { + + assert(device->Major() < 13); + + if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 
2 : 3; + + return srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } else if (device->Major() == 12) { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } + + return srd; +} + +uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const { + /* convert dev_addr to host_addr */ + auto code = get_gpu_mem((void*)addr); + if (code && code->IsBlitKernelObject()) { + return code->GpuAddress(); + } + + uint64_t host_addr = 0; + auto ret = hsakmt_hsa_ven_amd_loader_query_host_address(reinterpret_cast(addr), + reinterpret_cast(&host_addr)); + if (ret == HSA_STATUS_SUCCESS) { + return host_addr; + } + pr_err("failed to query host address for kernel object %p, ret=%d\n", (void*)addr, ret); + return 0; +} + +void ComputeQueue::RingDoorbell() { + thread_cond_lock_.lock(); + thread_cond_lock_.unlock(); + pr_debug("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, GetRingWptr()->load(), GetRingRptr()->load()); + thread_cond_.notify_one(); +} + +hsa_status_t ComputeQueue::Init(void) { + hsa_status_t ret = use_hws ? HwsInit() : SwsInit(); + if (ret) + return ret; + + ib_start_addr = cmdbuf_addr; + cmdbuf_aql_frame_size = device->GetAqlFrameSize(); + platform_atomic_support_ = device->SupportPlatformAtomic(); + + return ret; +} + +hsa_status_t ComputeQueue::Fini(void) { + return use_hws ? 
HwsFini() : SwsFini();
+}
+
+// Work done before an indirect buffer is handed to the scheduler: wait for the
+// paging fence, then call RelocateCmdbufScratchBase() on the current IB.
+// Returns HSA_STATUS_ERROR when WaitPagingFence() fails.
+hsa_status_t ComputeQueue::PreSubmit(void) {
+  if (!device->WaitPagingFence(this))
+    return HSA_STATUS_ERROR;
+
+  RelocateCmdbufScratchBase(ib_start_addr);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Bookkeeping after a submission: remember the submitted write index in
+// sync_point, point ib_start_addr at the command-buffer frame that belongs to
+// the current write index, and start a new, empty IB.
+hsa_status_t ComputeQueue::EndSubmit(void) {
+  // record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry
+  sync_point = cmdbuf_aql_frame_write_index;
+
+  // Frames are reused cyclically: slot = write index modulo the frame count.
+  ib_start_addr = cmdbuf_addr +
+    (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) *
+    cmdbuf_aql_frame_size;
+  ib_size = 0;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Submit the accumulated IB: PreSubmit(), then HwsSubmit()/SwsSubmit()
+// depending on use_hws, then EndSubmit(). Any non-zero status from a step is
+// reported as HSA_STATUS_ERROR; the specific code is not forwarded.
+hsa_status_t ComputeQueue::Submit(void) {
+  hsa_status_t ret = PreSubmit();
+  if (ret)
+    return HSA_STATUS_ERROR;
+
+  ret = use_hws ?
+    HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index) :
+    SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index);
+  if (ret)
+    return HSA_STATUS_ERROR;
+
+  ret = EndSubmit();
+  if (ret)
+    return HSA_STATUS_ERROR;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Translate one AQL kernel-dispatch packet into PM4 commands.
+//
+// cpu    - base of the command buffer being filled; commands are appended at
+//          cpu + ib_size.
+// packet - the AQL packet to translate.
+//
+// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a workgroup dimension above
+// 1024 or more than 128 LDS blocks, HSA_STATUS_ERROR_INVALID_CODE_OBJECT when
+// the kernel object cannot be resolved, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS
+// when packet->setup is not 1..3, and HSA_STATUS_ERROR_INVALID_ALLOCATION when
+// the requested group segment exceeds device->LdsSize().
+hsa_status_t
+ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) {
+  pr_debug("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x "
+           "gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n",
+           ring, packet->header,
+           packet->setup, packet->workgroup_size_x, packet->workgroup_size_y,
+           packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y,
+           packet->grid_size_z, packet->private_segment_size,
+           packet->group_segment_size, packet->kernel_object, packet->kernarg_address,
+           packet->completion_signal.handle);
+
+  if (packet->workgroup_size_x > 1024 ||
+      packet->workgroup_size_y > 1024 ||
+      packet->workgroup_size_z > 1024)
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  int major = device->Major();
+  // `i` is the running byte offset into `cpu`; it starts after what the
+  // current IB already contains.
+  int i = ib_size;
+
+  const amd_kernel_code_t* kernel_object =
+    (const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object);
+  if (kernel_object == NULL) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  // The entry point is computed from the address in the packet, while the
+  // offset is read through the pointer returned by GetKernelObjAddr().
+  void* entry = (void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset);
+  assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0);
+
+  pr_debug("kernel object property=%x entry=%p lds=%x+%x\n",
+           kernel_object->kernel_code_properties, entry,
+           kernel_object->workgroup_group_segment_byte_size,
+           packet->group_segment_size);
+
+  if (packet->setup == 0 || packet->setup > 3)
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  if (packet->group_segment_size > device->LdsSize())
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+
+  uint32_t lds_blks = device->LdsBlocks(packet);
+  if (lds_blks > 128)
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  const bool wave32 =
+    AMD_HSA_BITS_GET(kernel_object->kernel_code_properties,
+                     AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32);
+
+  assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size);
+
+  if (packet->private_segment_size != 0)
+    UpdateScratch(packet, wave32);
+
+  // A zero handle yields a null pointer, i.e. "no completion signal".
+  amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+
+  // Record start timestamp when enabling profiling
+  if (signal && EnableProfiling())
+    i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+  // Build a barrier packet if it is requested
+  const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1;
+  if (is_barrier_packet && needs_barrier)
+    i += cmd_util.BuildBarrier(cpu + i);
+
+  // flush cache
+  i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+  if (major >= 11) {
+    // Record where scratch_lo of the template built next will sit in the IB.
+    // NOTE(review): presumably consumed by RelocateCmdbufScratchBase() in
+    // PreSubmit() -- confirm.
+    AppendCmdbufSratchBaseOffset(
+        i + offsetof(struct SetScratchTemplate, scratch_lo));
+
+    i += cmd_util.BuildScratch(ScratchBase(), cpu + i);
+    i += cmd_util.BuildComputeShaderParams(cpu + i);
+  }
+
+  // Gather everything BuildDispatch() needs into one descriptor.
+  struct DispatchInfo info;
+  info.major = major;
+  info.pPacket = packet;
+  info.pEntry = entry;
+  info.pKernelObject = kernel_object;
+  info.ldsBlks = lds_blks;
+  info.pAmdQueue = amd_queue_;
+  info.wave32 = wave32;
+  info.srd = UpdateIndexStride(
+      info.pAmdQueue->scratch_resource_descriptor[3], wave32);
+  info.pScratchBase = ScratchBase();
+  info.scratchSizePerWave = ScratchSizePerWave();
+  memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset));
+  info.offsetCnt = 0;
+
+  size_t size;
+  size = cmd_util.BuildDispatch(&info, cpu + i);
+  // BuildDispatch() reports offsets relative to its own output; rebase them
+  // onto the IB by adding the offset at which the dispatch was written.
+  for (int j = 0; j < info.offsetCnt; j++)
+    AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]);
+  i += size;
+
+  // A dispatch without a completion signal leaves needs_barrier set; with a
+  // signal a barrier is emitted right below, so the flag is cleared.
+  needs_barrier = (packet->completion_signal.handle == 0);
+
+  if (signal) {
+    // wait cs done
+    i += cmd_util.BuildBarrier(cpu + i);
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+    // flush cache
+    i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+    assert(signal->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *signal_addr = (uint64_t *)&signal->value;
+    pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+    // With platform atomics an atomic-add command on the signal value is
+    // emitted (last argument -1); otherwise the address is stashed in
+    // signal_addr_ for later use outside this function.
+    if (platform_atomic_support_)
+      i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = signal_addr;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  // NOTE(review): this runs after the commands above were already written at
+  // cpu + i, so it reports an overrun rather than preventing one.
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in KernelDispatch: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // Commit: grow the IB, advance the write index and reset the slot's header
+  // to INVALID now that the packet has been translated.
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Handle an AQL barrier-AND or barrier-OR packet.
+//
+// The dependency wait is performed on the CPU, in the calling thread, before
+// any command is emitted:
+//   - is_or == true : poll every 20 microseconds until one of the non-null
+//                     dependency signals reads 0 (no wait when all are null);
+//   - is_or == false: block on each non-null dependency until it equals 0.
+// If the packet carries a completion signal, the commands that complete it are
+// then appended at cpu + ib_size.
+//
+// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES when the emitted commands exceed
+// cmdbuf_aql_frame_size, HSA_STATUS_SUCCESS otherwise.
+hsa_status_t
+ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) {
+  pr_debug("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64
+           " %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n",
+           ring, is_or ? "or" : "and",
+           packet->header, packet->dep_signal[0].handle,
+           packet->dep_signal[1].handle, packet->dep_signal[2].handle,
+           packet->dep_signal[3].handle, packet->dep_signal[4].handle,
+           packet->completion_signal.handle);
+  // fix me: can we use gpu packet?
+  if (is_or) {
+    // Despite its name, `unsignaled` flips to false as soon as any dependency
+    // reads 0, which ends the polling loop.
+    bool unsignaled = true;
+    hsa_signal_t sig[5];
+    int n = 0;
+    // Compact the non-null dependencies into sig[0..n).
+    for (int i = 0; i < 5; i++) {
+      if (packet->dep_signal[i].handle)
+        sig[n++] = packet->dep_signal[i];
+    }
+
+    // `n` never changes inside the loop; it only guards the "no dependency" case.
+    while (n) {
+      for (int i = 0; i < n; i++) {
+        if (!hsakmt_hsa_signal_load_relaxed(sig[i])) {
+          unsignaled = false;
+          break;
+        }
+      }
+      if (!unsignaled)
+        break;
+
+      std::this_thread::sleep_for(std::chrono::microseconds(20));
+    }
+  } else {
+    for (int i = 0; i < 5; i++) {
+      if (!packet->dep_signal[i].handle)
+        continue;
+
+      hsa_signal_value_t value =
+        hsakmt_hsa_signal_wait_relaxed(packet->dep_signal[i], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+      assert(value == 0);
+    }
+  }
+
+  int major = device->Major();
+  // Running byte offset into `cpu`, starting after the current IB content.
+  int i = ib_size;
+
+  if (packet->completion_signal.handle != 0) {
+    amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+    assert(signal->kind == AMD_SIGNAL_KIND_USER);
+    uint64_t *signal_addr = (uint64_t *)&signal->value;
+    pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+    // Record start timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+    // Emit a barrier only if an earlier packet left one outstanding.
+    if (needs_barrier)
+      i += cmd_util.BuildBarrier(cpu + i);
+
+    needs_barrier = false;
+
+    // Record end timestamp when enabling profiling
+    if (EnableProfiling())
+      i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+    // flush cache
+    i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+    // With platform atomics an atomic-add command on the signal value is
+    // emitted (last argument -1); otherwise the address is stashed in
+    // signal_addr_ for later use outside this function.
+    if (platform_atomic_support_)
+      i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+    else
+      signal_addr_ = signal_addr;
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in BarrierGeneric: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // Commit: grow the IB, advance the write index and reset the slot's header.
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Handle a vendor-specific AQL packet that wraps a PM4 indirect buffer.
+//
+// cpu    - base of the command buffer being filled; commands are appended at
+//          cpu + ib_size.
+// packet - the AQL packet; ib_jump_cmd[1..2] hold the IB address and the low
+//          20 bits of ib_jump_cmd[3] its size in dwords.
+//
+// When dxg_runtime->vendor_packet_process is set, the referenced PM4 stream is
+// copied into the command buffer, followed by the commands that complete the
+// packet's signal. Otherwise the PM4 stream is skipped and the completion
+// signal, if any, is set to 0 directly from the CPU.
+//
+// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES when the PM4 stream, or the total
+// amount of emitted commands, does not fit into cmdbuf_aql_frame_size.
+hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) {
+  constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1;
+  assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB);
+
+  uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff;
+  assert(op == IT_INDIRECT_BUFFER);
+  // 64-bit IB address: high dword from ib_jump_cmd[2], low dword from
+  // ib_jump_cmd[1] with its two lowest bits masked off.
+  uint32_t* pm4_addr = reinterpret_cast<uint32_t*>((static_cast<uint64_t>(packet->ib_jump_cmd[2]) << 32) | (static_cast<uint64_t>(packet->ib_jump_cmd[1]) & ~3ull));
+  uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff;
+  pr_debug("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %#x cs=%" PRIx64"\n",
+           ring, dxg_runtime->vendor_packet_process ? "process" : "skip", pm4_addr, pm4_size,
+           packet->completion_signal.handle);
+  // pm4_size is at most 0xfffff, so the conversion to int cannot overflow.
+  for (int idx = 0; idx < static_cast<int>(pm4_size); idx++) {
+    pr_debug("pm4_addr[%d]=%#x\n", idx, pm4_addr[idx]);
+  }
+
+  // Running byte offset into `cpu`, starting after the current IB content.
+  int i = ib_size;
+
+  if (dxg_runtime->vendor_packet_process) {
+    int major = device->Major();
+
+    // The IB size comes straight from the packet (up to 0xfffff dwords), so it
+    // must be validated BEFORE copying; the check at the end of this function
+    // would only notice the overrun after memory had been overwritten.
+    const size_t pm4_bytes = static_cast<size_t>(pm4_size) * sizeof(uint32_t);
+    if (pm4_bytes > static_cast<size_t>(cmdbuf_aql_frame_size)) {
+      pr_err("PM4 command buffer overflow in VendorSpecific: IB needs %zu bytes, limit %zu bytes\n",
+             pm4_bytes, static_cast<size_t>(cmdbuf_aql_frame_size));
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    memcpy(cpu+i, pm4_addr, pm4_bytes);
+    i += pm4_bytes;
+
+    if (packet->completion_signal.handle != 0) {
+      amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
+      assert(signal->kind == AMD_SIGNAL_KIND_USER);
+      uint64_t *signal_addr = (uint64_t *)&signal->value;
+      pr_debug("signal value=%" PRIx64 "\n", signal->value);
+
+      // Record start timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
+
+      // Unlike the barrier packets, the barrier here is unconditional and
+      // needs_barrier is left untouched.
+      //if (needs_barrier)
+      i += cmd_util.BuildBarrier(cpu + i);
+
+      //needs_barrier = false;
+
+      // Record end timestamp when enabling profiling
+      if (EnableProfiling())
+        i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
+
+      // flush cache
+      i += cmd_util.BuildAcquireMem(major, cpu + i);
+
+      if (platform_atomic_support_)
+        i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
+      else
+        signal_addr_ = signal_addr;
+    }
+  } else {
+    // Packet is skipped: complete its signal immediately from the CPU.
+    if (packet->completion_signal.handle != 0) {
+      hsakmt_hsa_signal_store_screlease(packet->completion_signal, 0);
+    }
+  }
+
+  // The ring_rptr is used to record pm4 queue rptr value,
+  // dispatch readptr position, this is used to share rptr with
+  // aql queue.
+  if (platform_atomic_support_)
+    i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
+  else
+    i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
+
+  // Check if we exceeded the frame size
+  if ((i - ib_size) > cmdbuf_aql_frame_size) {
+    pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // Commit: grow the IB, advance the write index and reset the slot's header.
+  ib_size = i;
+  cmdbuf_aql_frame_write_index++;
+  packet->header = HSA_PACKET_TYPE_INVALID;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Translate the AQL packet at the current write index into PM4 commands.
+//
+// The packet type is decoded from the 16-bit header and dispatched to the
+// matching translator. On success ready_to_submit is set, except for kernel
+// dispatches that can still be merged with following packets. A failure
+// reported by any translator is returned to the caller and leaves
+// ready_to_submit untouched.
+//
+// Returns HSA_STATUS_ERROR_INVALID_PACKET_FORMAT for an unknown packet type.
+hsa_status_t ComputeQueue::SwitchAql2PM4(void) {
+
+  // Each ring slot is 64 bytes; the slot index wraps at ring_size.
+  uint16_t *packet = (uint16_t *) ((char *)ring +
+                     (cmdbuf_aql_frame_write_index % ring_size) * 64);
+  uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE);
+  header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+  hsa_kernel_dispatch_packet_t *aql_packet =
+    (hsa_kernel_dispatch_packet_t *)packet;
+  hsa_status_t ret;
+
+  switch (header) {
+  case HSA_PACKET_TYPE_KERNEL_DISPATCH:
+    ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // Keep merging packets into the current IB until one of these holds:
+    // 1) The kernel has a completion signal;
+    // 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf;
+    // 3) The queue is empty now, so the IB should be submitted right away.
+    if (!(aql_packet->completion_signal.handle) &&
+        (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) &&
+        (*sync_addr != sync_point))
+      return HSA_STATUS_SUCCESS;
+
+    break;
+  case HSA_PACKET_TYPE_BARRIER_AND:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_BARRIER_OR:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
+    ret = VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_INVALID:
+    // When packets are submitted out of order, the format field of current AQL packet
+    // may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and
+    // do not process AQL packets before the packet format field is updated.
+    assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer");
+    return HSA_STATUS_SUCCESS;
+  default:
+    return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+  }
+
+  ready_to_submit = true;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Translate and submit every AQL packet between cmdbuf_aql_frame_write_index
+// and the ring write pointer, stopping early at a packet that
+// IsInvalidPacket() reports. The first failing step ends the loop and its
+// status is returned.
+hsa_status_t ComputeQueue::Process(void) {
+
+  while (cmdbuf_aql_frame_write_index < ring_wptr->load() &&
+         !IsInvalidPacket()) {
+    pr_debug("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+
+    hsa_status_t ret;
+
+    // wait for next few cmdbuf slots to be free
+    // If wptr catch up the rptr in the cmdbuf, this needs wait for the rptr to free the cmdbuf.
+    // Here the wptr comes from queue->cmdbuf_aql_frame_write_index, while rptr comes from *queue->sync_addr.
+    if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) {
+      // Wait until the sync object reaches the value that frees the frame
+      // slot about to be reused.
+      uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1;
+      if (!device->CpuWait(&syncobj, &value, 1, false))
+        return HSA_STATUS_ERROR;
+    }
+
+    ret = SwitchAql2PM4();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // The packet was merged into the current IB; keep collecting.
+    if (!ready_to_submit)
+      continue;
+
+    ret = Submit();
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // CPU wait for GPU fence, and cpu update the signal.
+    if (!platform_atomic_support_ && signal_addr_) {
+      // CPU wait for GPU fence
+      if (!device->CpuWait(&syncobj, &cmdbuf_aql_frame_write_index, 1, false))
+        return HSA_STATUS_ERROR;
+      //CPU update completional signal
+      atomic::Decrement(signal_addr_);
+      signal_addr_ = NULL;
+    }
+
+    ready_to_submit = false;
+
+    pr_debug("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
+             ring, ring_wptr->load(), ring_rptr->load());
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Worker loop of an SDMA queue.
+//
+// Sleeps on thread_cond_ until RingDoorbell() has queued at least one
+// [start, end) write-pointer range or thread_stop_ is set. For every range it
+// first resolves the leading poll packets on the CPU (waiting for the polled
+// signal to reach the expected value and zeroing the packet), then hands the
+// range to PreparePacket() and Submit().
+//
+// The loop exits as soon as thread_stop_ is observed; ranges still queued at
+// that point are not processed.
+void SDMAQueue::SdmaThread(SDMAQueue *queue) {
+
+  while (true) {
+    decltype(queue->wptr_queue_) pendings;
+    {
+      std::unique_lock lock(queue->thread_cond_lock_);
+      while (queue->wptr_queue_.empty() && !queue->thread_stop_)
+        queue->thread_cond_.wait(lock);
+
+      if (queue->thread_stop_)
+        break;
+
+      // Take the whole backlog in one go so the lock is not held while waiting
+      // on signals or submitting.
+      pendings.swap(queue->wptr_queue_);
+    }
+
+    for (const auto [start, end] : pendings) {
+      pr_debug("wptr %lx %lx\n", start, end);
+
+      SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start));
+      SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1;
+      // NOTE(review): the scan advances linearly and is not bounded by `end`
+      // or by the ring size -- confirm that poll packets can never straddle
+      // the end of the submitted range.
+      while (queue->IsPollPacket(poll_pkt)) {
+        uint64_t poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 |
+          (uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+        uint64_t poll_val = poll_pkt->VALUE_UNION.value;
+        uint32_t skip = 1;
+
+        if (queue->IsPollPacket(poll_next_pkt)) {
+          uint64_t poll_next_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 |
+            (uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32;
+
+          // Two polls on adjacent 32-bit words (this one on the upper word,
+          // the next one on the lower word) are merged into one 64-bit wait.
+          if (poll_next_addr + sizeof(uint32_t) == poll_addr) {
+            poll_addr = poll_next_addr;
+            poll_val = poll_next_pkt->VALUE_UNION.value |
+              (uint64_t)poll_pkt->VALUE_UNION.value << 32;
+            skip = 2;
+          }
+        }
+
+        // Step back from the polled address by offsetof(amd_signal_t, value)
+        // to obtain the signal object (assumes the poll targets that field).
+        amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value));
+        uint64_t signal_handle = reinterpret_cast(signal);
+        pr_debug("poll signal %#lx addr %#lx val %ld\n", signal_handle, poll_addr, poll_val);
+        hsa_signal_t hsa_signal = {signal_handle};
+        hsa_signal_value_t value =
+          hsakmt_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+        assert(value == poll_val);
+
+        // The wait was satisfied on the CPU: zero the consumed poll packet(s).
+        memset(poll_pkt, 0, skip * sizeof(*poll_pkt));
+        poll_pkt += skip;
+        poll_next_pkt += skip;
+      }
+      queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start);
+      std::atomic_thread_fence(std::memory_order_release);
+      // NOTE(review): the status returned by Submit() is discarded here.
+      queue->Submit();
+    }
+  }
+  pr_debug("sdma thread exit\n");
+}
+
+// Create the device-side queue and start the worker thread running
+// SdmaThread() for this object.
+SDMAQueue::SDMAQueue(WDDMDevice *device,
+                     void *ring,
+                     uint64_t cmdbuf_size,
+                     uint32_t engine,
+                     bool use_hws) :
+  WDDMQueue(device, reinterpret_cast(ring), cmdbuf_size, engine, use_hws),
+  wptr_next_(0),
+  wptr_pre_(0),
+  rptr_next(0),
+  thread_stop_(false),
+  ib_size(0),
+  ib_start_addr(0) {
+  // A CreateQueue() failure is only caught by the assert.
+  bool ret = device->CreateQueue(this);
+  assert(ret);
+
+  thread_ = std::thread(SdmaThread, this);
+}
+
+// Stop and join the worker thread, then destroy the device-side queue.
+SDMAQueue::~SDMAQueue() {
+  // thread_stop_ is set under the lock the worker holds while checking it.
+  thread_cond_lock_.lock();
+  thread_stop_ = true;
+  thread_cond_lock_.unlock();
+  thread_cond_.notify_one();
+  thread_.join();
+
+  device->DestroyQueue(this);
+}
+
+// Queue the range [wptr_pre_, wptr_next_) for the worker thread and wake it.
+void SDMAQueue::RingDoorbell() {
+  pr_debug("ringdoorbell %#lx %#lx\n", wptr_pre_, wptr_next_);
+  thread_cond_lock_.lock();
+
+  wptr_queue_.emplace_back(wptr_pre_, wptr_next_);
+  thread_cond_.notify_one();
+
+  thread_cond_lock_.unlock();
+  // The next range starts where this one ended. This update happens outside
+  // the lock.
+  wptr_pre_ = wptr_next_;
+}
+
+// Initialise the queue through the backend selected by use_hws.
+hsa_status_t SDMAQueue::Init(void) {
+  hsa_status_t ret = use_hws ?
HwsInit() : SwsInit();
+  if (ret)
+    return ret;
+
+  // Start from an all-zero command buffer.
+  std::memset((char *)cmdbuf_addr, 0, cmdbuf_size);
+
+  return ret;
+}
+
+// Tear the queue down through the backend selected by use_hws.
+hsa_status_t SDMAQueue::Fini(void) {
+  return use_hws ? HwsFini() : SwsFini();
+}
+
+// Describe the next indirect buffer: `size` bytes starting `offset` bytes into
+// the command buffer, and advance rptr_next by that size.
+// Always returns STATUS_SUCCESS.
+int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) {
+  ib_start_addr = cmdbuf_addr + offset;
+  ib_size = size;
+  rptr_next += ib_size;
+
+  return STATUS_SUCCESS;
+}
+
+// Submit the IB described by the last PreparePacket() call through
+// HwsSubmit()/SwsSubmit(). Returns HSA_STATUS_ERROR when the paging fence wait
+// or the submission fails.
+hsa_status_t SDMAQueue::Submit(void) {
+  if (!device->WaitPagingFence(this))
+    return HSA_STATUS_ERROR;
+
+  int ret = use_hws ?
+    HwsSubmit(ib_start_addr, ib_size, rptr_next) :
+    SwsSubmit(ib_start_addr, ib_size, rptr_next);
+  if (ret)
+    return HSA_STATUS_ERROR;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+} // namespace thunk
+} // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
new file mode 100644
index 0000000000..4ea93c70f2
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp
@@ -0,0 +1,165 @@
+#include <cassert>
+#include <mutex>
+#include <utility>
+#include "impl/wddm/va_mgr.h"
+
+using namespace std;
+
+namespace wsl {
+namespace thunk {
+
+// Manage the virtual address range [start, start + size).
+// The whole range starts out as a single free fragment.
+VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align) {
+  min_align_ = min_align;
+  auto free_it = free_list_.insert(make_pair(size, start));
+  frag_map_[start] = make_fragment(free_it, size);
+}
+
+// Warn when the bookkeeping has not collapsed back to one entry in each
+// container, then drop all bookkeeping.
+VaMgr::~VaMgr() {
+
+  if (free_list_.size() != 1)
+    pr_warn("free_list_ size:%ld which should be 1.\n", free_list_.size());
+  if (frag_map_.size() != 1)
+    pr_warn("frag_map_ size:%ld which should be 1.\n", frag_map_.size());
+
+  free_list_.clear();
+  frag_map_.clear();
+}
+
+// Allocate `bytes` of address space.
+//
+// bytes - size of the allocation.
+// align - required alignment, 0 for "no requirement".
+// addr  - preferred fixed address, 0 for "anywhere".
+//
+// A non-zero `addr` that satisfies `align` is honoured when the range
+// [addr, addr + bytes) lies completely inside one free fragment. In every
+// other case the request falls back to AllocImpl(), which picks the address.
+//
+// Returns the allocated address; the fallback returns 0 when nothing fits.
+uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
+
+  if (addr > 0 &&
+      (align == 0 || (addr % align) == 0)) {
+
+    // The guard is scoped to this block so that it is released before the
+    // AllocImpl() fallback below takes lock_ again.
+    lock_guard gard(lock_);
+
+    // Fragments are keyed by base address and never overlap, so the only one
+    // that can contain `addr` is the last fragment whose base is <= addr.
+    // That may be the very first fragment in the map.
+    auto frag_it = frag_map_.upper_bound(addr);
+    if (frag_it != frag_map_.begin()) {
+      --frag_it;
+
+      const uint64_t base = frag_it->first;
+      const uint64_t size = frag_it->second.size;
+
+      // addr >= base holds by construction. The containment test is written
+      // with subtractions so it cannot wrap around for large addr/bytes.
+      if (is_free(frag_it->second) &&
+          bytes <= size && addr - base <= size - bytes) {
+        auto free_it = frag_it->second.free_list_entry_;
+        assert(free_it != free_list_.end());
+
+        free_list_.erase(free_it);
+
+        if (addr == base) {
+          // The allocation starts the fragment: shrink it and mark it used.
+          frag_it->second.size = bytes;
+          set_used(frag_it->second);
+        } else {
+          // [addr, addr + bytes) is used and needs its own entry keyed by
+          // `addr`, otherwise Free(addr) could never find it.
+          add_used_fragment(bytes, addr);
+
+          // [base, addr) stays free.
+          add_free_fragment(addr - base, base);
+        }
+
+        // [addr + bytes, base + size) stays free.
+        if (base + size > addr + bytes) add_free_fragment(base + size - addr - bytes, addr + bytes);
+
+        return addr;
+      }
+    }
+  }
+
+  // Allocate not fixed address
+  return AllocImpl(bytes, align);
+}
+
+// Allocate `bytes` from the smallest free fragment that can hold them.
+//
+// With a non-zero `align`, a first attempt looks for a fragment of at least
+// `bytes` whose base is already aligned; if the candidate is not aligned, a
+// second attempt searches with the size padded by the effective alignment so
+// an aligned sub-range is guaranteed to fit.
+//
+// Returns the allocated address, or 0 when no free fragment is large enough.
+uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) {
+  uint64_t addr = 0;
+  uint64_t align_bytes = bytes;
+  const int retry = align == 0 ? 0 : 1;
+  const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_);
+
+  lock_guard gard(lock_);
+  for (int i = 0; i <= retry; i++) {
+    // free_list_ is keyed by fragment size: first entry that is big enough.
+    auto free_it = free_list_.lower_bound(align_bytes);
+    if (free_it == free_list_.end()) break;
+
+    uint64_t base = free_it->second;
+    uint64_t size = free_it->first;
+
+    assert(size >= align_bytes);
+
+    auto fragment = frag_map_.find(base);
+
+    assert(fragment != frag_map_.end());
+    assert(size == fragment->second.size);
+
+    uint64_t delta = align == 0 ?
0 : base % align;
+    if (delta == 0) {
+      // already find aligned address
+      addr = base;
+
+      // The fragment keeps its key and becomes the used allocation.
+      free_list_.erase(free_it);
+      fragment->second.size = bytes;
+      set_used(fragment->second);
+
+      // Whatever is left behind the allocation stays free.
+      if (size > bytes) add_free_fragment(size - bytes, base + bytes);
+
+      break;
+    } else if (i == 0) {
+      // First candidate is misaligned: search again with room for padding.
+      align_bytes += new_align;
+      continue;
+    } else {
+      // Carve the allocation out of the fragment at the next aligned address.
+      uint64_t aligned_base = base + align - delta;
+      addr = aligned_base;
+
+      free_list_.erase(free_it);
+
+      // [aligned_base, aligned_base + bytes) is used, [base, aligned_base)
+      // stays free.
+      add_used_fragment(bytes, aligned_base);
+      add_free_fragment(aligned_base - base, base);
+
+      // Tail behind the allocation, if any, stays free as well.
+      if (size > aligned_base - base + bytes)
+        add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes);
+
+      break;
+    }
+  }
+  return addr;
+}
+
+// Release the allocation that starts at `addr` and merge it with free
+// neighbours on either side.
+//
+// The call is a no-op when `addr` is 0, is not the base of a known fragment,
+// or refers to a fragment that is already free.
+void VaMgr::Free(uint64_t addr) {
+  if (addr == 0) return;
+
+  lock_guard gard(lock_);
+  auto frag_it = frag_map_.find(addr);
+  if (frag_it == frag_map_.end() || is_free(frag_it->second)) return;
+
+  // `base` tracks the start of the (possibly merged) free fragment.
+  uint64_t base = addr;
+  // Merge lower
+  if (frag_it != frag_map_.begin()) {
+    auto lower = frag_it;
+    --lower;
+    if (is_free(lower->second)) {
+      // The lower neighbour absorbs the freed fragment; its free-list entry
+      // is removed first because its size is about to change.
+      remove_free_list_entry(lower->second);
+      base -= lower->second.size;
+      lower->second.size += frag_it->second.size;
+      frag_map_.erase(frag_it);
+      frag_it = lower;
+    }
+  }
+  // Merge upper
+  {
+    auto upper = frag_it;
+    ++upper;
+    if (upper != frag_map_.end() && is_free(upper->second)) {
+      remove_free_list_entry(upper->second);
+      frag_it->second.size += upper->second.size;
+      frag_map_.erase(upper);
+    }
+  }
+  // Publish the resulting fragment in the free list under its final size.
+  uint64_t size = frag_it->second.size;
+  auto it = free_list_.insert(make_pair(size, base));
+  set_free(frag_it->second, it);
+}
+
+} // namespace thunk
+} // namespace wsl