// Source listing: rocm-systems/memory.cpp (C++)
// Snapshot metadata: 2025-12-24 13:30:50 +08:00, 990 lines, 33 KiB
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "impl/wddm/gpu_memory.h"
#include "util/simple_heap.h"
/*
 * Book-keeping record stored in allocation_map_ for every tracked buffer.
 *
 * The member-initializer lists below follow the declaration order of the
 * fields (userptr is declared before size), because that is the order in
 * which C++ actually initializes members; listing them differently only
 * produces -Wreorder warnings.
 */
struct Allocation {
  /* Empty record: no handle, no addresses, no cached dma-buf fd (-1). */
  Allocation()
      : handle(0), cpu_addr(0), gpu_addr(0), userptr(false), size(0),
        user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0),
        dmabuf_fd(-1), rocr_userdata(nullptr) {}

  /*
   * Fully specified record. dmabuf_fd always starts at -1 and rocr_userdata
   * at nullptr; both are filled in later by other entry points of this file.
   */
  Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg,
             uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false,
             void *user_data_arg = nullptr, size_t user_size_arg = 0,
             HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0)
      : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg),
        userptr(userptr_arg), size(size_arg), user_data(user_data_arg),
        size_requested(user_size_arg), node_id(node_id_arg),
        mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1),
        rocr_userdata(nullptr) {}

  wsl::thunk::GpuMemoryHandle handle; /* handle convertible to GpuMemory via GpuMemory::Convert() */
  void *cpu_addr;                     /* CPU-side address recorded for the buffer */
  uint64_t gpu_addr;                  /* GPU-side address recorded for the buffer */
  bool userptr;                       /* true for records created by the userptr mapping path */
  size_t size;                        /* actual size = align_up(size_requested, granularity) */
  void *user_data;                    /* for userptr records: the original (unaligned) pointer, else nullptr */
  size_t size_requested;              /* size requested by user */
  HSAuint32 node_id;                  /* node reported by hsaKmtQueryPointerInfo() */
  HSAuint32 mem_flags_value;          /* raw flags value reported by hsaKmtQueryPointerInfo() */
  int dmabuf_fd;                      /* cached exported dma-buf fd, -1 when none */
  void *rocr_userdata;                /* opaque pointer set through hsaKmtSetMemoryUserData() */
};
/*
 * Table of tracked allocations, keyed by address. It is reached through a
 * pointer so that clear_allocation_map() can replace the whole container.
 */
static std::map<const void *, Allocation>* allocation_map_ = new std::map<const void *, Allocation>();
/* Mutex taken (via std::lock_guard) around accesses to allocation_map_. */
static std::mutex* allocation_map_lock_ = new std::mutex();
/*
 * Drops every tracked allocation record and installs a brand-new lock.
 *
 * The previous mutex object is deliberately left alive (it is never deleted).
 * NOTE(review): presumably because it may still be held when this runs,
 * e.g. after fork() - confirm with the caller.
 * Only the map entries are discarded; the GpuMemory objects they refer to are
 * not touched here.
 */
void clear_allocation_map(void)
{
  allocation_map_lock_ = new std::mutex();
  std::lock_guard<std::mutex> fresh_guard(*allocation_map_lock_);

  std::map<const void *, Allocation> *stale_map = allocation_map_;
  allocation_map_ = new std::map<const void *, Allocation>();
  delete stale_map;
}
/*
 * Placeholder for the memory-policy API: none of the arguments are used.
 * Logs a one-time warning, trips an assert in debug builds and otherwise
 * reports success without doing anything.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
                                              HSAuint32 DefaultPolicy,
                                              HSAuint32 AlternatePolicy,
                                              void *MemoryAddressAlternate,
                                              HSAuint64 MemorySizeInBytes) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored by this stub. */
  (void)Node;
  (void)DefaultPolicy;
  (void)AlternatePolicy;
  (void)MemoryAddressAlternate;
  (void)MemorySizeInBytes;

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Translates an HSA_PAGE_SIZE_* selector into a page size in bytes.
 * Unknown selectors assert in debug builds and fall back to 4 KiB.
 */
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) {
  switch (pageSizeFlags) {
    case HSA_PAGE_SIZE_4KB:
      return 1u << 12; /* 4 KiB */
    case HSA_PAGE_SIZE_64KB:
      return 1u << 16; /* 64 KiB */
    case HSA_PAGE_SIZE_2MB:
      return 1u << 21; /* 2 MiB */
    case HSA_PAGE_SIZE_1GB:
      return 1u << 30; /* 1 GiB */
    default:
      break;
  }
  assert(false);
  return 1u << 12;
}
/*
 * Convenience entry point: same as hsaKmtAllocMemoryAlign() with an
 * alignment argument of 0.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
                                          HSAuint64 SizeInBytes,
                                          HsaMemFlags MemFlags,
                                          void **MemoryAddress) {
  const HSAuint64 no_alignment = 0;
  const HSAKMT_STATUS status = hsaKmtAllocMemoryAlign(
      PreferredNode, SizeInBytes, no_alignment, MemFlags, MemoryAddress);
  return status;
}
/*
 * Evaluates to 1 when x is a non-zero power of two and to 0 otherwise.
 * Every use of the argument is parenthesized so that compound expressions
 * such as POWER_OF_2(a | b) are tested as a whole. Note that x is evaluated
 * more than once, so it must not have side effects.
 */
#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 1 : 0)
/*
 * Reports whether SizeInBytes fits into the currently free system RAM.
 *
 * sysinfo() expresses freeram in units of mem_unit bytes, so the value is
 * scaled to bytes before it is compared with the request.
 *
 * Returns false when sysinfo() itself fails.
 */
bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) {
  struct sysinfo info;
  if (sysinfo(&info) != 0)
    return false;

  const uint64_t free_bytes =
      static_cast<uint64_t>(info.freeram) * static_cast<uint64_t>(info.mem_unit);
  return SizeInBytes <= free_bytes;
}
void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
void *address;
HsaMemFlags MemFlags;
MemFlags.Value = 0;
MemFlags.ui32.CoarseGrain = 1;
MemFlags.ui32.NoSubstitute = 1;
allocated_size = wsl::AlignUp(request_size, block_size());
if (HSAKMT_STATUS_SUCCESS == hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, MemFlags, &address, true))
return address;
return nullptr;
}
void BlockAllocator::free(void* ptr, size_t length) const {
if (HSAKMT_STATUS_SUCCESS != hsaKmtFreeMemoryInternal(ptr, length, true))
pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length);
}
/*
 * Sub-allocator used by the allocation/free paths of this file; its backing
 * blocks come from BlockAllocator::alloc()/free().
 */
static wsl::SimpleHeap<BlockAllocator> fragment_allocator_;
/*
 * Forwards to fragment_allocator_.reset().
 * NOTE(review): the exact effect of SimpleHeap::reset() is not visible in
 * this file - confirm against util/simple_heap.h.
 */
void reset_suballocator(void) {
fragment_allocator_.reset();
}
/*
 * Forwards to fragment_allocator_.trim(); the allocation path in this file
 * uses the same call to release blocks the sub-allocator keeps as a cache.
 */
void trim_suballocator(void) {
fragment_allocator_.trim();
}
/*
 * Core allocation routine behind hsaKmtAllocMemoryAlign() and
 * BlockAllocator::alloc().
 *
 * PreferredNode - node to allocate on; 0 selects dxg_runtime->default_node.
 * SizeInBytes   - requested size, copied into create_info.size.
 * Alignment     - forwarded as create_info.alignment.
 * MemFlags      - HSA flags, translated below into GpuMemoryCreateInfo fields.
 * MemoryAddress - in: VA hint when MemFlags.ui32.FixedAddress is set (must be
 *                 non-null then); out: address (or handle address for
 *                 physical-only memory) of the new allocation.
 * SkipSubAlloc  - true to bypass the fragment sub-allocator.
 *
 * Returns HSAKMT_STATUS_SUCCESS on success, HSAKMT_STATUS_INVALID_PARAMETER
 * for a null MemoryAddress or a missing fixed address, HSAKMT_STATUS_NO_MEMORY
 * when one of the system-memory limits checked below is exceeded, and
 * HSAKMT_STATUS_ERROR when the device lookup or CreateGpuMemory() fails.
 */
HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
HSAuint64 SizeInBytes,
HSAuint64 Alignment,
HsaMemFlags MemFlags,
void **MemoryAddress,
bool SkipSubAlloc) {
CHECK_DXG_OPEN();
if (!MemoryAddress)
return HSAKMT_STATUS_INVALID_PARAMETER;
/* A fixed-address request must supply the address; otherwise the output is cleared up front. */
if (MemFlags.ui32.FixedAddress) {
if (*MemoryAddress == nullptr)
return HSAKMT_STATUS_INVALID_PARAMETER;
} else
*MemoryAddress = nullptr;
uint32_t node = (PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode;
wsl::thunk::WDDMDevice *dev = get_wddmdev(node);
if (!dev)
return HSAKMT_STATUS_ERROR;
wsl::thunk::GpuMemory *gpu_mem = nullptr;
wsl::thunk::GpuMemoryCreateInfo create_info{};
create_info.size = SizeInBytes;
/* When initializing the scratch pool of GpuAgent, treat it as an SVM reserve (address only). */
if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000)
MemFlags.ui32.OnlyAddress = 1;
create_info.alignment = Alignment;
create_info.va_hint = reinterpret_cast<gpusize>(*MemoryAddress);
/* System-memory domain: host-accessible memory on the default node, ZFB mode, or GTT access. */
if ((PreferredNode == 0 && MemFlags.ui32.HostAccess)
|| dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) {
if (SizeInBytes > dxg_runtime->max_single_alloc_size)
return HSAKMT_STATUS_NO_MEMORY;
if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes))
return HSAKMT_STATUS_NO_MEMORY;
/* When allocating VRAM under ZFB mode, force coarse-grained memory. */
if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1)
MemFlags.ui32.CoarseGrain = 1;
// AllocateNonPaged == AllocateIPC
create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
create_info.domain = thunk_proxy::AllocDomain::kSystem;
} else {
create_info.domain = thunk_proxy::AllocDomain::kLocal;
}
if (!MemFlags.ui32.CoarseGrain)
create_info.mem_flags = thunk_proxy::kFineGrain;
// In hsa-runtime, only the kernarg region sets Uncached.
if (MemFlags.ui32.Uncached)
create_info.mem_flags |= thunk_proxy::kKernarg;
/* Straight translation of the remaining HSA flags into creation flags. */
create_info.flags.physical_only = MemFlags.ui32.NoAddress;
create_info.flags.alloc_va = !create_info.flags.physical_only;
create_info.flags.interprocess = MemFlags.ui32.NoAddress;
create_info.flags.interprocess |= MemFlags.ui32.Contiguous;
create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous;
create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned
create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress;
create_info.flags.blit_kernel_object =
(MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess &&
(create_info.domain == thunk_proxy::AllocDomain::kSystem));
/* Virtual-only, physical-only or contiguous requests are VMM allocations: force the local domain and skip the sub-allocator. */
if (create_info.flags.virtual_alloc || create_info.flags.physical_only
|| create_info.flags.physical_contiguous) {
create_info.domain = thunk_proxy::AllocDomain::kLocal;
SkipSubAlloc = true;
}
/* Only allow using the suballocator for ordinary VRAM. */
bool trim_safe = false;
if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) {
/* Quickly skip the sub-allocator if the size is not below its block size. */
gpusize real_size;
if (create_info.size > GPU_HUGE_PAGE_SIZE)
real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE);
else
real_size = wsl::AlignUp(create_info.size, getpagesize());
if (real_size < fragment_allocator_.default_block_size()) {
*MemoryAddress = fragment_allocator_.alloc(real_size);
/* Sub-allocated fragments are not recorded in allocation_map_ here. */
if (*MemoryAddress)
return HSAKMT_STATUS_SUCCESS;
}
/* The sub-allocator might keep a lot of free blocks as a *cache*.
* They can be trimmed if the direct allocation fails the first time.
*/
trim_safe = true;
}
/* Direct allocation; re-entered at most once after trimming the sub-allocator. */
after_trim:
auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
if (code == ErrorCode::Success) {
std::lock_guard<std::mutex> gard(*allocation_map_lock_);
/* For physical allocations, use the GpuMemory object's handle address as the thunk handle.
* NOTE(review): create_info.dmabuf_fd is never assigned in this function, so the second
* half of the condition depends on the value-initialized default - confirm. */
if (create_info.flags.physical_only || create_info.dmabuf_fd > 0)
*MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
else
*MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
/* Record the allocation; GTT allocations are recorded with node 0. */
(*allocation_map_)[*MemoryAddress] = Allocation(
gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress,
create_info.size, false, nullptr, SizeInBytes,
MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value);
return HSAKMT_STATUS_SUCCESS;
} else if (trim_safe) {
/* Attempt to release memory from the block allocator and retry once. */
fragment_allocator_.trim();
trim_safe = false;
goto after_trim;
}
return HSAKMT_STATUS_ERROR;
}
/*
 * Public aligned-allocation entry point. Delegates to
 * hsaKmtAllocMemoryAlignInternal(); the sub-allocator is skipped unless
 * dxg_runtime->enable_thunk_sub_allocator is set.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
                                               HSAuint64 SizeInBytes,
                                               HSAuint64 Alignment,
                                               HsaMemFlags MemFlags,
                                               void **MemoryAddress) {
  const bool skip_sub_alloc = !dxg_runtime->enable_thunk_sub_allocator;
  return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, Alignment,
                                        MemFlags, MemoryAddress,
                                        skip_sub_alloc);
}
/*
 * Releases memory obtained from hsaKmtAllocMemoryAlignInternal().
 *
 * MemoryAddress - address to free; null yields HSAKMT_STATUS_INVALID_PARAMETER.
 * SizeInBytes   - not used by this implementation.
 * SkipSubAlloc  - true to bypass the fragment sub-allocator.
 *
 * Returns HSAKMT_STATUS_ERROR when the address is not tracked or the buffer is
 * still referenced by a queue. An imported VRAM IPC buffer whose shared
 * reference count stays non-zero is kept alive and success is reported.
 */
HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
                                       HSAuint64 SizeInBytes,
                                       bool SkipSubAlloc) {
  CHECK_DXG_OPEN();
  if (MemoryAddress == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  /* Fragments handed out by the sub-allocator are returned to it directly. */
  if (!SkipSubAlloc && fragment_allocator_.free(MemoryAddress))
    return HSAKMT_STATUS_SUCCESS;

  wsl::thunk::GpuMemory *victim = nullptr;
  {
    std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
    auto entry = allocation_map_->find(MemoryAddress);
    if (entry == allocation_map_->end())
      return HSAKMT_STATUS_ERROR;

    Allocation &record = entry->second;
    victim = wsl::thunk::GpuMemory::Convert(record.handle);
    if (victim->IsQueueReferenced())
      return HSAKMT_STATUS_ERROR;

    wsl::thunk::GpuMemoryDescFlags desc_flags;
    desc_flags.reserved = victim->Flags();
    if (desc_flags.is_imported_vram_ipc && victim->DecSharedReference()) {
      pr_info("memory is still referenced\n");
      return HSAKMT_STATUS_SUCCESS;
    }

    /* Close the cached dma-buf fd, if one was exported for this buffer. */
    if (record.dmabuf_fd >= 0) {
      close(record.dmabuf_fd);
      record.dmabuf_fd = -1;
    }
    allocation_map_->erase(entry);
  }

  /* The GpuMemory object is destroyed outside the map lock. */
  delete victim;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Public free entry point; forwards to hsaKmtFreeMemoryInternal() using that
 * function's default for the sub-allocator switch.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
                                         HSAuint64 SizeInBytes) {
  const HSAKMT_STATUS status =
      hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes);
  return status;
}
/*
 * Takes a queue reference on the tracked buffer that starts at MemoryAddress.
 *
 * Returns true when the reference was taken, false when MemoryAddress is null,
 * is not a key of allocation_map_, or its handle does not convert to a
 * GpuMemory object.
 *
 * The lookup-miss path used to return HSAKMT_STATUS_ERROR, a non-zero status
 * code that converts to `true` in a bool function; it now returns false.
 */
bool queue_acquire_buffer(void *MemoryAddress) {
  if (!MemoryAddress)
    return false;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return false;

  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  /* Check before dereferencing (the original tested for null only afterwards). */
  if (gpu_mem == nullptr)
    return false;

  gpu_mem->GetQueueReference();
  return true;
}
/*
 * Drops a queue reference on the tracked buffer that starts at MemoryAddress.
 *
 * Returns true when the reference was released, false when MemoryAddress is
 * null, is not a key of allocation_map_, or its handle does not convert to a
 * GpuMemory object.
 *
 * The lookup-miss path used to return HSAKMT_STATUS_ERROR, a non-zero status
 * code that converts to `true` in a bool function; it now returns false.
 */
bool queue_release_buffer(void *MemoryAddress) {
  if (!MemoryAddress)
    return false;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return false;

  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  /* Check before dereferencing (the original tested for null only afterwards). */
  if (gpu_mem == nullptr)
    return false;

  gpu_mem->PutQueueReference();
  return true;
}
/*
 * Looks up the GpuMemory object tracked under exactly MemoryAddress.
 * Returns nullptr when the address is not a key of allocation_map_.
 */
wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) {
  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
  const auto entry = allocation_map_->find(MemoryAddress);
  return (entry != allocation_map_->end())
             ? wsl::thunk::GpuMemory::Convert(entry->second.handle)
             : nullptr;
}
/*
 * Stores the value of WDDMDevice::VramAvail() for Node in *AvailableBytes.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null output pointer and
 * HSAKMT_STATUS_ERROR when no device exists for Node.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
                                              HSAuint64 *AvailableBytes) {
  CHECK_DXG_OPEN();
  if (AvailableBytes == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  if (wsl::thunk::WDDMDevice *device = get_wddmdev(Node)) {
    *AvailableBytes = device->VramAvail();
    return HSAKMT_STATUS_SUCCESS;
  }
  return HSAKMT_STATUS_ERROR;
}
/*
 * Placeholder: memory registration without flags is not implemented.
 * Warns once, asserts in debug builds and otherwise reports success without
 * touching its arguments.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
                                             HSAuint64 MemorySizeInBytes) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored by this stub. */
  (void)MemoryAddress;
  (void)MemorySizeInBytes;

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Placeholder: per-node memory registration is not implemented.
 * Asserts in debug builds and otherwise reports success without touching its
 * arguments (unlike the sibling stubs it does not log a warning).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
                                                    HSAuint64 MemorySizeInBytes,
                                                    HSAuint64 NumberOfNodes,
                                                    HSAuint32 *NodeArray) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored by this stub. */
  (void)MemoryAddress;
  (void)MemorySizeInBytes;
  (void)NumberOfNodes;
  (void)NodeArray;

  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Validates a registration request; nothing is actually registered here.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address or for the
 * ExtendedCoherent + CoarseGrain combination, HSAKMT_STATUS_NOT_SUPPORTED when
 * the memory is not ordinary paged host memory or the runtime is not in dGPU
 * mode, and HSAKMT_STATUS_SUCCESS otherwise. MemorySizeInBytes is not used.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(
    void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) {
  CHECK_DXG_OPEN();
  if (MemoryAddress == nullptr)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  pr_debug("address %p\n", MemoryAddress);

  const bool coherent_and_coarse =
      MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain;
  if (coherent_and_coarse)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  /* Registered memory should be ordinary paged host memory. */
  const bool paged_host_memory =
      (MemFlags.ui32.HostAccess == 1) && (MemFlags.ui32.NonPaged != 1);
  if (!paged_host_memory)
    return HSAKMT_STATUS_NOT_SUPPORTED;

  /* TODO: support mixed APU and dGPU configurations */
  if (!dxg_runtime->hsakmt_is_dgpu)
    return HSAKMT_STATUS_NOT_SUPPORTED;

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Tells whether fd refers to a file whose /proc/self/fd link target contains
 * the marker "rocr4wsl_gtt".
 *
 * Returns false when the link cannot be read. Only the first 255 bytes of the
 * link target are examined.
 */
bool is_ipc_sysmemfd(int fd) {
  char target[256];
  const std::string proc_entry = std::string("/proc/self/fd/") + std::to_string(fd);

  const ssize_t length = readlink(proc_entry.c_str(), target, sizeof(target) - 1);
  if (length == -1)
    return false;

  /* readlink() does not NUL-terminate the result. */
  target[length] = '\0';
  const char *marker = strstr(target, "rocr4wsl_gtt");
  return marker != nullptr;
}
/*
 * Same as hsaKmtRegisterGraphicsHandleToNodesExt() with every register flag
 * cleared.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
                                                            HsaGraphicsResourceInfo *GraphicsResourceInfo,
                                                            HSAuint64 NumberOfNodes,
                                                            HSAuint32 *NodeArray) {
  HSA_REGISTER_MEM_FLAGS cleared_flags;
  cleared_flags.Value = 0;

  const HSAKMT_STATUS status = hsaKmtRegisterGraphicsHandleToNodesExt(
      GraphicsResourceHandle, GraphicsResourceInfo, NumberOfNodes, NodeArray,
      cleared_flags);
  return status;
}
/*
 * Imports the dma-buf fd passed as GraphicsResourceHandle and reports the
 * resulting buffer through GraphicsResourceInfo.
 *
 * GraphicsResourceHandle - dma-buf file descriptor (narrowed to int).
 * GraphicsResourceInfo   - out: node id, size and address of the import;
 *                          must not be null.
 * NumberOfNodes/NodeArray - only NodeArray[0] is used. With NumberOfNodes == 0
 *                          the default node is used and requiresVAddr is
 *                          cleared.
 * RegisterFlags          - requiresVAddr selects whether the GPU virtual
 *                          address or the handle address is reported.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null GraphicsResourceInfo or a
 * null NodeArray with a non-zero node count (both used to be dereferenced
 * unchecked); otherwise the status of import_dmabuf_fd().
 * A fd recognised by is_ipc_sysmemfd() is not imported: only NodeId is filled
 * in and success is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
                                                               HsaGraphicsResourceInfo *GraphicsResourceInfo,
                                                               HSAuint64 NumberOfNodes,
                                                               HSAuint32 *NodeArray,
                                                               HSA_REGISTER_MEM_FLAGS RegisterFlags) {
  CHECK_DXG_OPEN();
  if (!GraphicsResourceInfo)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  /* The handle is a file descriptor; make the narrowing explicit. */
  const int dmabuf_fd = static_cast<int>(GraphicsResourceHandle);

  if (is_ipc_sysmemfd(dmabuf_fd)) {
    GraphicsResourceInfo->NodeId = dxg_runtime->default_node;
    pr_info("skip register sysmemfd. It would be released in next step\n");
    return HSAKMT_STATUS_SUCCESS;
  }

  if (NumberOfNodes == 0) {
    RegisterFlags.ui32.requiresVAddr = 0;
    NumberOfNodes = 1;
    NodeArray = (HSAuint32*)&(dxg_runtime->default_node);
  } else if (!NodeArray) {
    return HSAKMT_STATUS_INVALID_PARAMETER;
  }
  pr_debug("number of nodes %lu\n", NumberOfNodes);

  wsl::thunk::GpuMemoryHandle mem_handle;
  const HSAKMT_STATUS ret = import_dmabuf_fd(dmabuf_fd, NodeArray[0],
                                             RegisterFlags.ui32.requiresVAddr,
                                             false, &mem_handle);
  if (ret != HSAKMT_STATUS_SUCCESS) {
    pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, "
           "GraphicsResourceHandle: %lu, NodeId: %u\n",
           GraphicsResourceHandle, NodeArray[0]);
    return ret;
  }

  wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle);
  GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId();
  GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize();
  GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ?
      reinterpret_cast<void *>(gpu_mem->GpuAddress()) :
      reinterpret_cast<void *>(gpu_mem->HandleApeAddress());
  return ret;
}
/*
 * Exports the tracked allocation containing MemoryAddress as a dma-buf fd.
 *
 * MemoryAddress     - address inside a tracked allocation.
 * MemorySizeInBytes - not used by this implementation.
 * DMABufFd          - out: a dup() of the allocation's cached dma-buf fd; the
 *                     caller owns it. Must not be null.
 * Offset            - out: MemoryAddress minus the allocation's gpu_addr.
 *                     Must not be null.
 *
 * The fd exported from the GpuMemory object is cached in the allocation record
 * and closed when the allocation is freed; callers always get a duplicate.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for null output pointers and
 * HSAKMT_STATUS_ERROR when no tracked allocation contains MemoryAddress, the
 * export fails or dup() fails. The outputs are only written on success.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
                                                 HSAuint64 MemorySizeInBytes,
                                                 int *DMABufFd,
                                                 HSAuint64 *Offset) {
  CHECK_DXG_OPEN();
  if (!DMABufFd || !Offset)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);

  /* Find the last record whose key is <= MemoryAddress. */
  auto it = allocation_map_->upper_bound(MemoryAddress);
  if (it == allocation_map_->begin())
    return HSAKMT_STATUS_ERROR;
  --it;

  /* Reject addresses that lie past the end of that allocation instead of
   * exporting an unrelated buffer with a bogus offset. */
  const uintptr_t addr = reinterpret_cast<uintptr_t>(MemoryAddress);
  const uintptr_t base = reinterpret_cast<uintptr_t>(it->first);
  if (addr - base >= it->second.size)
    return HSAKMT_STATUS_ERROR;

  if (it->second.dmabuf_fd == -1) {
    auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
    int exported_fd = -1;
    auto code = gpu_mem->ExportPhysicalHandle(&exported_fd);
    if (code != ErrorCode::Success)
      return HSAKMT_STATUS_ERROR;
    it->second.dmabuf_fd = exported_fd;
  }

  /* dup() returns -1 on failure; do not report that as a valid fd. */
  const int caller_fd = dup(it->second.dmabuf_fd);
  if (caller_fd < 0)
    return HSAKMT_STATUS_ERROR;

  *DMABufFd = caller_fd;
  *Offset = reinterpret_cast<uint64_t>(MemoryAddress) - it->second.gpu_addr;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not supported by this implementation: always returns
 * HSAKMT_STATUS_NOT_SUPPORTED and leaves *SharedMemoryHandle untouched.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes,
                      uint64_t *SharedMemoryHandle) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored. */
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)SharedMemoryHandle;

  return HSAKMT_STATUS_NOT_SUPPORTED;
}
/*
 * Imports a dma-buf (or IPC system-memory fd) on the given node and records
 * the result in allocation_map_.
 *
 * DMABufFd     - file descriptor to import.
 * NodeId       - node whose WDDM device performs the import.
 * alloc_va     - true: key the record by the GPU virtual address;
 *                false: key it by the handle address.
 * is_ipc_memfd - true: the fd may be a system-memory IPC-signal fd; its size
 *                is then read with fstat() and, if it is a page multiple of at
 *                least 4096 and below the system heap size, the import is
 *                flagged as sysmem_ipc_sig_importer.
 * GpuMemHandle - out: handle of the imported (or already existing) buffer;
 *                must not be null.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null GpuMemHandle and
 * HSAKMT_STATUS_ERROR when the node has no device, fstat() fails or the import
 * fails. If the buffer was exported by this very process, the existing record
 * is reused and its shared reference count is incremented.
 */
HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
                               uint32_t NodeId,
                               bool alloc_va,
                               bool is_ipc_memfd,
                               wsl::thunk::GpuMemoryHandle *GpuMemHandle) {
  CHECK_DXG_OPEN();
  if (!GpuMemHandle)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  *GpuMemHandle = nullptr;

  /* get_wddmdev() can return null (see the other callers in this file). */
  wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeId);
  if (!dev)
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemory *gpu_mem = nullptr;
  wsl::thunk::GpuMemoryCreateInfo create_info{};
  create_info.dmabuf_fd = DMABufFd;
  create_info.flags.alloc_va = alloc_va;

  if (is_ipc_memfd) {
    struct stat st;
    /* Do not inspect an uninitialised stat buffer when fstat() fails. */
    if (fstat(DMABufFd, &st) != 0) {
      pr_err("fail to stat DMABufFd %d\n", DMABufFd);
      return HSAKMT_STATUS_ERROR;
    }
    uint64_t sz = st.st_size;
    if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
      pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
      create_info.flags.sysmem_ipc_sig_importer = 1; // set to 1 when backend is system memory
      create_info.size = st.st_size;
    }
  }

  gpusize gpu_va = 0;
  auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va);
  if (code == ErrorCode::SameProcessSameDevice) {
    /* Unit_hipMemPoolExportToShareableHandle_SameProc */
    pr_info("imported from same process, use the old one\n");
    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
    auto it = allocation_map_->find((void*)gpu_va);
    if (it == allocation_map_->end()) {
      /* Log the address that was actually looked up (va_hint is never set here). */
      pr_err("where's the conflict buffer? va %#lx\n", gpu_va);
      return HSAKMT_STATUS_ERROR;
    }
    wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
    conflict_mem->IncSharedReference();
    *GpuMemHandle = it->second.handle;
    return HSAKMT_STATUS_SUCCESS;
  } else if (code != ErrorCode::Success) {
    pr_err("fail to import fd, ret %d\n", (int)code);
    return HSAKMT_STATUS_ERROR;
  }

  void *MemoryAddress;
  if (alloc_va)
    MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
  else
    MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
  *GpuMemHandle = gpu_mem->GetGpuMemoryHandle();

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  /*
   * gpu_mem->Flags() would need converting back from GpuMemoryCreateFlags to
   * HsaMemFlags, see hsaKmtAllocMemoryAlign; the raw value is stored as is.
   */
  (*allocation_map_)[MemoryAddress] = Allocation(
      *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress,
      gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(),
      NodeId, gpu_mem->Flags());
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented in this thunk.
 *
 * Warns once and asserts in debug builds. Because *SharedMemoryHandle is
 * never written, release builds now report HSAKMT_STATUS_NOT_IMPLEMENTED
 * instead of success, so callers cannot consume an uninitialised handle.
 * This matches the other unimplemented entry points in this file that return
 * HSAKMT_STATUS_NOT_IMPLEMENTED.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes,
                  HsaSharedMemoryHandle *SharedMemoryHandle) {
  CHECK_DXG_OPEN();
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)SharedMemoryHandle;
  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Not implemented in this thunk.
 *
 * Warns once and asserts in debug builds. Because *MemoryAddress and
 * *SizeInBytes are never written, release builds now report
 * HSAKMT_STATUS_NOT_IMPLEMENTED instead of success, so callers cannot consume
 * uninitialised outputs.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
                           void **MemoryAddress, HSAuint64 *SizeInBytes) {
  CHECK_DXG_OPEN();
  (void)SharedMemoryHandle;
  (void)MemoryAddress;
  (void)SizeInBytes;
  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Not implemented in this thunk.
 *
 * Warns once and asserts in debug builds. Because *MemoryAddress and
 * *SizeInBytes are never written, release builds now report
 * HSAKMT_STATUS_NOT_IMPLEMENTED instead of success, so callers cannot consume
 * uninitialised outputs.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(
    const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress,
    HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
  CHECK_DXG_OPEN();
  (void)SharedMemoryHandle;
  (void)MemoryAddress;
  (void)SizeInBytes;
  (void)NumberOfNodes;
  (void)NodeArray;
  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Deprecated cross-process read: no data is copied.
 * Warns once, asserts in debug builds and returns
 * HSAKMT_STATUS_NOT_IMPLEMENTED; *SizeCopied is left untouched.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
                                            HsaMemoryRange *LocalMemoryArray,
                                            HSAuint64 LocalMemoryArrayCount,
                                            HsaMemoryRange *RemoteMemoryArray,
                                            HSAuint64 RemoteMemoryArrayCount,
                                            HSAuint64 *SizeCopied) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored. */
  (void)Pid;
  (void)LocalMemoryArray;
  (void)LocalMemoryArrayCount;
  (void)RemoteMemoryArray;
  (void)RemoteMemoryArrayCount;
  (void)SizeCopied;

  pr_warn_once("has been deprecated\n");
  assert(false);
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Deprecated cross-process write: no data is copied.
 * Warns once, asserts in debug builds and returns
 * HSAKMT_STATUS_NOT_IMPLEMENTED; *SizeCopied is left untouched.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
                                             HsaMemoryRange *LocalMemoryArray,
                                             HSAuint64 LocalMemoryArrayCount,
                                             HsaMemoryRange *RemoteMemoryArray,
                                             HSAuint64 RemoteMemoryArrayCount,
                                             HSAuint64 *SizeCopied) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored. */
  (void)Pid;
  (void)LocalMemoryArray;
  (void)LocalMemoryArrayCount;
  (void)RemoteMemoryArray;
  (void)RemoteMemoryArrayCount;
  (void)SizeCopied;

  pr_warn_once("has been deprecated\n");
  assert(false);
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Undoes a registration/mapping recorded under exactly MemoryAddress.
 *
 * - Unknown addresses are ignored and success is returned.
 * - Imported VRAM IPC buffers are destroyed once their shared reference count
 *   drops to zero.
 * - Userptr mappings are destroyed together with the companion record keyed
 *   by their GPU address.
 * - Any other tracked allocation is left untouched.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a null address, otherwise
 * HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
  CHECK_DXG_OPEN();
  if (!MemoryAddress)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  pr_debug("address %p\n", MemoryAddress);

  std::lock_guard<std::mutex> gard(*allocation_map_lock_);
  auto it = allocation_map_->find(MemoryAddress);
  if (it == allocation_map_->end())
    return HSAKMT_STATUS_SUCCESS;

  auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
  wsl::thunk::GpuMemoryDescFlags flags;
  flags.reserved = gpu_mem->Flags();

  // IPC mem(vram)
  if (flags.is_imported_vram_ipc &&
      gpu_mem->DecSharedReference() == 0) {
    allocation_map_->erase(it);
    delete gpu_mem;
    return HSAKMT_STATUS_SUCCESS;
  }

  if (it->second.userptr) {
    /* Read the companion key BEFORE erasing: `it` is invalid afterwards.
     * NOTE(review): when called with the GPU-address key, the record keyed by
     * the original CPU pointer is not found here and stays in the map. */
    const void *gpu_key = reinterpret_cast<const void *>(it->second.gpu_addr);
    allocation_map_->erase(it);
    allocation_map_->erase(gpu_key);
    delete gpu_mem;
    return HSAKMT_STATUS_SUCCESS;
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Maps memory on the default node: forwards to hsaKmtMapMemoryToGPUNodes()
 * with cleared map flags and a one-element node list holding
 * dxg_runtime->default_node.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
                                             HSAuint64 MemorySizeInBytes,
                                             HSAuint64 *AlternateVAGPU) {
  HSAuint32 target_nodes[] = {dxg_runtime->default_node};
  const HSAuint64 node_count = 1;

  HsaMemMapFlags cleared_flags;
  cleared_flags.Value = 0;

  return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes,
                                   AlternateVAGPU, cleared_flags, node_count,
                                   target_nodes);
}
/*
 * Makes [MemoryAddress, MemoryAddress + MemorySizeInBytes) usable by the GPU.
 *
 * Paths, in the order they are tried:
 *  1. The page-aligned address lies in a sub-allocator block: nothing to do.
 *  2. The page-aligned address is a tracked allocation: imported VRAM IPC
 *     memory is mapped and made resident; other non-userptr memory is accepted
 *     if the recorded size covers the request.
 *  3. MemoryAddress is already tracked as a userptr mapping: its GPU address
 *     is returned.
 *  4. Otherwise a new userptr GpuMemory object is created on NodeArray[0] and
 *     recorded twice, keyed by MemoryAddress and by the GPU address.
 *
 * AlternateVAGPU receives the GPU virtual address on paths 2 (non-IPC), 3
 * and 4. MemMapFlags and NumberOfNodes are not read.
 *
 * Returns HSAKMT_STATUS_SUCCESS or HSAKMT_STATUS_ERROR (also for null
 * MemoryAddress/AlternateVAGPU).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU,
HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) {
CHECK_DXG_OPEN();
if (!MemoryAddress || !AlternateVAGPU) {
pr_err("FIXME: mapping NULL pointer\n");
return HSAKMT_STATUS_ERROR;
}
/* Expand the request to whole 4 KiB pages. */
uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096);
uint64_t end =
wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096);
void *aligned_ptr = (void *)start;
size_t aligned_size = end - start;
{
/* Path 1: address inside a sub-allocator block.
* NOTE(review): *AlternateVAGPU is not written on this path - confirm callers do not read it. */
if (nullptr != fragment_allocator_.block_base(aligned_ptr))
return HSAKMT_STATUS_SUCCESS;
}
{
std::lock_guard<std::mutex> gard(*allocation_map_lock_);
/* Path 2: exact match on the page-aligned address. */
auto it = allocation_map_->find(aligned_ptr);
if (it != allocation_map_->end()) {
wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle);
wsl::thunk::GpuMemoryDescFlags flags;
flags.reserved = gpu_mem->Flags();
// IPC mem: map the VA range, make it resident and wait for the paging fence.
// NOTE(review): *AlternateVAGPU is not written on this path either - confirm.
if (flags.is_imported_vram_ipc) {
auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size());
if (code != ErrorCode::Success)
return HSAKMT_STATUS_ERROR;
code = gpu_mem->MakeResident();
if (code != ErrorCode::Success)
return HSAKMT_STATUS_ERROR;
wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice();
if (!dev->WaitOnPagingFenceFromCpu())
return HSAKMT_STATUS_ERROR;
return HSAKMT_STATUS_SUCCESS;
}
if (!it->second.userptr) {
// GTT/Local mem: the GPU address equals the CPU address.
if (it->second.size >= MemorySizeInBytes) {
*AlternateVAGPU = (uint64_t)MemoryAddress;
return HSAKMT_STATUS_SUCCESS;
} else {
return HSAKMT_STATUS_ERROR;
}
}
}
// Path 3: userptr mem already mapped under the original (unaligned) pointer.
it = allocation_map_->find(MemoryAddress);
if (it != allocation_map_->end()) {
if (it->second.userptr && it->second.size >= MemorySizeInBytes) {
*AlternateVAGPU =
(uintptr_t)it->second.gpu_addr +
((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr);
return HSAKMT_STATUS_SUCCESS;
}
}
}
// Path 4: map userptr. The map lock is not held while the GpuMemory object is created.
// NOTE(review): NodeArray is dereferenced without a null check - confirm callers always pass one.
wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]);
if (!dev)
return HSAKMT_STATUS_ERROR;
wsl::thunk::GpuMemory *gpu_mem = nullptr;
wsl::thunk::GpuMemoryHandle handle = 0;
uint64_t addr;
wsl::thunk::GpuMemoryCreateInfo create_info{};
create_info.domain = thunk_proxy::kUserMemory;
create_info.size = aligned_size;
create_info.user_ptr = aligned_ptr;
auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
if (code == ErrorCode::Success) {
addr = gpu_mem->GpuAddress();
handle = gpu_mem->GetGpuMemoryHandle();
} else {
return HSAKMT_STATUS_ERROR;
}
{
std::lock_guard<std::mutex> guard(*allocation_map_lock_);
/* Two records share one handle: one keyed by the caller's pointer, one by the GPU address. */
(*allocation_map_)[MemoryAddress] =
Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress,
MemorySizeInBytes);
(*allocation_map_)[(void *)addr] =
Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr,
MemorySizeInBytes);
}
/* Re-apply the offset of MemoryAddress within its first page. */
*AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr);
return HSAKMT_STATUS_SUCCESS;
}
/*
 * Reverses hsaKmtMapMemoryToGPU() for the buffer tracked under MemoryAddress.
 *
 * Only imported VRAM IPC memory that was not shared from this same process is
 * actually unmapped and evicted; for sub-allocator fragments and every other
 * tracked buffer this is a no-op that reports success.
 *
 * Returns HSAKMT_STATUS_ERROR when the address is not tracked, the buffer is
 * still referenced by a queue, or the unmap fails. A null address is logged
 * and treated as success (workaround for a runtime bug).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) {
  CHECK_DXG_OPEN();
  if (MemoryAddress == nullptr) {
    /* Workaround for runtime bug */
    pr_err("FIXME: Unmapping NULL pointer\n");
    return HSAKMT_STATUS_SUCCESS;
  }
  pr_debug("address %p\n", MemoryAddress);

  if (fragment_allocator_.block_base(MemoryAddress) != nullptr)
    return HSAKMT_STATUS_SUCCESS;

  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
  auto entry = allocation_map_->find(MemoryAddress);
  if (entry == allocation_map_->end())
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemory *mem = wsl::thunk::GpuMemory::Convert(entry->second.handle);
  if (mem->IsQueueReferenced())
    return HSAKMT_STATUS_ERROR;

  wsl::thunk::GpuMemoryDescFlags desc_flags;
  desc_flags.reserved = mem->Flags();

  // IPC mem imported from another process is the only kind that needs work.
  const bool foreign_ipc =
      desc_flags.is_imported_vram_ipc && !mem->IsSharedFromSameProcess();
  if (!foreign_ipc)
    return HSAKMT_STATUS_SUCCESS;

  auto code = mem->UnmapGpuVirtualAddress(mem->GpuAddress(), mem->Size());
  if (code != ErrorCode::Success)
    return HSAKMT_STATUS_ERROR;
  mem->Evict();
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Not implemented: warns once and returns HSAKMT_STATUS_NOT_IMPLEMENTED.
 *
 * This API was only ever implemented in KFD for Kaveri and was never
 * upstreamed. There are no open-source users of this interface. It has been
 * superseded by RegisterGraphicsHandleToNodes.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
                                               HSAuint64 GraphicDeviceHandle,
                                               HSAuint64 GraphicResourceHandle,
                                               HSAuint64 GraphicResourceOffset,
                                               HSAuint64 GraphicResourceSize,
                                               HSAuint64 *FlatMemoryAddress) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored. */
  (void)NodeId;
  (void)GraphicDeviceHandle;
  (void)GraphicResourceHandle;
  (void)GraphicResourceOffset;
  (void)GraphicResourceSize;
  (void)FlatMemoryAddress;

  pr_warn_once("not implemented\n");
  return HSAKMT_STATUS_NOT_IMPLEMENTED;
}
/*
 * Placeholder: unmapping a graphics handle is not implemented.
 * Warns once, asserts in debug builds and otherwise reports success without
 * touching its arguments.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
                                                 HSAuint64 FlatMemoryAddress,
                                                 HSAuint64 SizeInBytes) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored by this stub. */
  (void)NodeId;
  (void)FlatMemoryAddress;
  (void)SizeInBytes;

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Placeholder: the tile configuration query is not implemented.
 * Warns once, asserts in debug builds and otherwise reports success.
 * NOTE(review): *config is never written even though success is returned -
 * confirm that no release-build caller reads it.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId,
                                            HsaGpuTileConfig *config) {
  CHECK_DXG_OPEN();

  /* Parameters are intentionally ignored by this stub. */
  (void)NodeId;
  (void)config;

  pr_warn_once("not implemented\n");
  assert(false);
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Describes the tracked allocation that contains Pointer.
 *
 * Pointer     - address to look up; it matches a record when it lies in
 *               [key, key + size_requested).
 * PointerInfo - out: zeroed first, then filled with type, node, flags,
 *               CPU/GPU address and user data of the matching record.
 *
 * Types reported: HSA_POINTER_REGISTERED_USER for userptr records (size =
 * aligned size), HSA_POINTER_RESERVED_ADDR for virtual-only GpuMemory objects
 * (size left at 0) and HSA_POINTER_ALLOCATED otherwise (size = requested
 * size).
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for null arguments and
 * HSAKMT_STATUS_ERROR (with Type = HSA_POINTER_UNKNOWN) when nothing matches.
 *
 * Everything needed from the record and from the GpuMemory object is
 * snapshotted while the map lock is held, so the object is never dereferenced
 * after the lock is dropped (it could be freed concurrently).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
                                               HsaPointerInfo *PointerInfo) {
  CHECK_DXG_OPEN();
  if (!Pointer || !PointerInfo)
    return HSAKMT_STATUS_INVALID_PARAMETER;
  pr_debug("pointer %p\n", Pointer);
  memset(PointerInfo, 0, sizeof(HsaPointerInfo));

  Allocation allocation_info;
  bool found = false;
  bool is_virtual = false;
  {
    std::lock_guard<std::mutex> gard(*allocation_map_lock_);
    /* Last record whose key is <= Pointer, if any. */
    auto it = allocation_map_->upper_bound(Pointer);
    if (it != allocation_map_->begin()) {
      --it;
      if (Pointer >= it->first &&
          (Pointer < reinterpret_cast<const uint8_t*>(it->first) + it->second.size_requested)) {
        allocation_info = it->second;
        /* Only non-userptr records need the IsVirtual() answer. */
        if (!allocation_info.userptr)
          is_virtual = wsl::thunk::GpuMemory::Convert(it->second.handle)->IsVirtual();
        found = true;
      }
    }
  }

  if (!found) {
    pr_debug("can't found allocation for %p\n", Pointer);
    PointerInfo->Type = HSA_POINTER_UNKNOWN;
    return HSAKMT_STATUS_ERROR;
  }

  if (allocation_info.userptr) {
    PointerInfo->Type = HSA_POINTER_REGISTERED_USER;
    PointerInfo->SizeInBytes = allocation_info.size;
  } else if (is_virtual) {
    PointerInfo->Type = HSA_POINTER_RESERVED_ADDR;
  } else {
    PointerInfo->Type = HSA_POINTER_ALLOCATED;
    PointerInfo->SizeInBytes = allocation_info.size_requested;
  }
  PointerInfo->Node = allocation_info.node_id;
  PointerInfo->MemFlags.Value = allocation_info.mem_flags_value;
  PointerInfo->CPUAddress = allocation_info.cpu_addr;
  PointerInfo->GPUAddress = allocation_info.gpu_addr;
  PointerInfo->UserData = allocation_info.rocr_userdata;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Attaches an opaque UserData pointer to a tracked allocation; it is reported
 * back by hsaKmtQueryPointerInfo().
 *
 * Pointer is rounded down to a 4 KiB boundary and must then match a map key
 * exactly; otherwise HSAKMT_STATUS_ERROR is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
                                                void *UserData) {
  CHECK_DXG_OPEN();

  const uint64_t page_base = wsl::AlignDown((uint64_t)Pointer, 4096);

  std::lock_guard<std::mutex> map_guard(*allocation_map_lock_);
  auto entry = allocation_map_->find((void *)page_base);
  if (entry == allocation_map_->end())
    return HSAKMT_STATUS_ERROR;

  entry->second.rocr_userdata = UserData;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * ASan header-page replacement is not supported here: warns once and asserts
 * in debug builds. With assertions disabled the result is
 * HSAKMT_STATUS_SUCCESS when SANITIZER_AMDGPU is defined and
 * HSAKMT_STATUS_NOT_SUPPORTED otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) {
  CHECK_DXG_OPEN();
  pr_warn_once("not supported\n");
  assert(false);
#if defined(SANITIZER_AMDGPU)
  pr_debug("address %p\n", addr);
  CHECK_DXG_OPEN();
  return HSAKMT_STATUS_SUCCESS;
#else
  (void)addr; /* unused without sanitizer support */
  return HSAKMT_STATUS_NOT_SUPPORTED;
#endif
}
/*
 * ASan header-page return is not supported here: warns once and asserts in
 * debug builds. With assertions disabled the result is HSAKMT_STATUS_SUCCESS
 * when SANITIZER_AMDGPU is defined and HSAKMT_STATUS_NOT_SUPPORTED otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) {
  CHECK_DXG_OPEN();
  pr_warn_once("not supported\n");
  assert(false);
#if defined(SANITIZER_AMDGPU)
  pr_debug("address %p\n", addr);
  CHECK_DXG_OPEN();
  return HSAKMT_STATUS_SUCCESS;
#else
  (void)addr; /* unused without sanitizer support */
  return HSAKMT_STATUS_NOT_SUPPORTED;
#endif
}