wsl/hsakmt: implement ipc signal

IPC signals only support a system-RAM backend and are accessible to both the CPU and the GPU;
IPC memory only supports a VRAM backend and is accessible to the GPU only.

Reviewed-by: Flora Cui <flora.cui@amd.com>
Signed-off-by: tiancyin <tianci.yin@amd.com>
Этот коммит содержится в:
tiancyin
2025-02-24 16:17:58 +08:00
коммит произвёл Frank Min
родитель 29ed03ab0a
Коммит f4390637fb
5 изменённых файлов: 202 добавлений и 18 удалений
+8 -4
Просмотреть файл
@@ -82,6 +82,9 @@ HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) {
}
/**
 * Report the CPU address of a buffer object.
 *
 * @param bo   Buffer object handle; resolved to a GpuMemory via get_gpu_mem().
 * @param cpu  Out: receives gpu_mem->CpuAddress() when the buffer is backed by a
 *             system-memory fd (IsSysMemFd()). For every other buffer it is
 *             left untouched.
 * @return Always 0.
 */
HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) {
  wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
  // Same invariant amdgpu_bo_va_op() asserts before dereferencing the handle.
  assert(gpu_mem != nullptr);

  if (gpu_mem->IsSysMemFd())
    *cpu = gpu_mem->CpuAddress();

  return 0;
}
@@ -116,11 +119,14 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
uint64_t addr,
uint64_t flags,
uint32_t ops) {
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
assert(gpu_mem != nullptr);
if (gpu_mem->IsSysMemFd())
return 0;
switch(ops) {
case AMDGPU_VA_OP_MAP:
{
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
assert(gpu_mem != nullptr);
auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
if (code != ErrorCode::Success)
return -1;
@@ -132,8 +138,6 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
break;
case AMDGPU_VA_OP_UNMAP:
{
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
assert(gpu_mem != nullptr);
auto code = gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
if (code != ErrorCode::Success)
return -1;
+1 -1
Просмотреть файл
@@ -202,6 +202,6 @@ uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd,
HsaGraphicsResourceInfo *GraphicsResourceInfo,
bool requiresVAddr = false);
HSA_REGISTER_MEM_FLAGS RegisterFlags = {0});
#endif
+29 -10
Просмотреть файл
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "impl/wddm/gpu_memory.h"
#include "util/simple_heap.h"
@@ -191,6 +192,9 @@ HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
if (zfb_support && MemFlags.ui32.NonPaged == 1)
MemFlags.ui32.CoarseGrain = 1;
// AllocateNonPaged == AllocateIPC
create_info.flags.imported_sys_memfd = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
create_info.domain = thunk_proxy::AllocDomain::kSystem;
} else {
create_info.domain = thunk_proxy::AllocDomain::kLocal;
@@ -454,8 +458,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic
GraphicsResourceInfo->NodeId = 1;
return hsaKmtImportDMABufHandle(GraphicsResourceHandle,
GraphicsResourceInfo,
!!RegisterFlags.ui32.requiresVAddr);
GraphicsResourceInfo,
RegisterFlags);
}
HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
@@ -477,7 +481,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
it->second.dmabuf_fd = *DMABufFd;
}
*DMABufFd = dup(it->second.dmabuf_fd);
*Offset = reinterpret_cast<uint64_t>(MemoryAddress) - it->second.gpu_addr;
return HSAKMT_STATUS_SUCCESS;
}
@@ -488,21 +491,37 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd,
HsaGraphicsResourceInfo *GraphicsResourceInfo,
bool requiresVAddr) {
HSA_REGISTER_MEM_FLAGS RegisterFlags) {
CHECK_DXG_OPEN();
wsl::thunk::WDDMDevice* dev = get_wddmdev(1);
wsl::thunk::GpuMemory *gpu_mem = nullptr;
wsl::thunk::GpuMemoryCreateInfo create_info{};
create_info.dmabuf_fd = DMABufFd;
create_info.flags.imported_vram_alloc_va = requiresVAddr;
create_info.flags.imported_vram_alloc_va = RegisterFlags.ui32.requiresVAddr;
std::string fdPath = "/proc/self/fd/" + std::to_string(DMABufFd);
char linkTarget[256];
ssize_t bytes = readlink(fdPath.c_str(), linkTarget, sizeof(linkTarget) - 1);
if (bytes == -1)
pr_err("Error reading link\n");
linkTarget[bytes] = '\0';
if (strstr(linkTarget, "rocr4wsl_gtt") != nullptr) {
struct stat st;
fstat(DMABufFd, &st);
uint64_t sz = st.st_size;
if (4096 <= sz && sz < dev->SystemHeapSize() && (sz & 0xfff) == 0) {
pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
create_info.flags.imported_sys_memfd = 1; // set to 1 when backend is system memory
create_info.flags.imported_vram_alloc_va = 0; // set to 1 when backend is vram
create_info.size = st.st_size;
}
}
auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
if (code == ErrorCode::Success) {
void *MemoryAddress;
if (requiresVAddr)
if (create_info.flags.imported_sys_memfd || create_info.flags.imported_vram_alloc_va)
MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
else
MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
@@ -589,12 +608,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
{
std::lock_guard<std::mutex> gard(*allocation_map_lock_);
// IPC mem
// IPC mem(vram) and IPC signal(sys mem)
auto it_ipc = allocation_map_.find(MemoryAddress);
if (it_ipc != allocation_map_.end()) {
wsl::thunk::GpuMemoryDescFlags flags;
flags.reserved = it_ipc->second.mem_flags_value;
if (flags.is_imported_vram_alloc_va) {
if (flags.is_imported_vram_alloc_va || flags.is_imported_sys_memfd) {
wsl::thunk::GpuMemory *gpu_mem;
gpu_mem = wsl::thunk::GpuMemory::Convert(it_ipc->second.handle);
allocation_map_.erase(it_ipc);
+84
Просмотреть файл
@@ -45,6 +45,7 @@
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <linux/mman.h>
#include <fcntl.h>
#include <unistd.h>
@@ -231,6 +232,52 @@ bool WDDMDevice::DecommitSystemHeapSpace(void* addr, int64_t size) {
return true;
}
/**
 * Map a memfd over a range of the system heap so the range can be shared
 * with another process.
 *
 * @param addr   Fixed address to map at (MAP_FIXED is used).
 * @param size   Length of the mapping in bytes; a memfd created here is also
 *               truncated to this size.
 * @param memfd  In/out. When -1 on entry a new memfd named "rocr4wsl_gtt" is
 *               created and, on success, stored back here. Any other value is
 *               treated as an existing descriptor and mapped as-is.
 * @param lock   When true, MAP_LOCKED is added to the mapping flags.
 * @return true on success. false when the memfd cannot be created or sized,
 *         or when mmap fails; in those cases a descriptor created here is
 *         closed and `memfd` is left unchanged.
 */
bool WDDMDevice::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
  const bool created_here = (memfd == -1);
  int fd = memfd;

  if (created_here) {
    // The import path recognises IPC-signal descriptors by this name
    // (it looks for "rocr4wsl_gtt" in the /proc/self/fd link target).
    fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
    if (fd < 0) {
      pr_err("memfd_create failed\n");
      return false;
    }
    // A memfd starts out empty; if it cannot be grown to `size`, touching the
    // mapping below would fault, so fail here instead of ignoring the error.
    if (ftruncate(fd, size) != 0) {
      pr_err("ftruncate failed\n");
      close(fd);
      return false;
    }
  }

  int32_t protFlags = PROT_READ | PROT_WRITE;
  int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
                     MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);

  void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
  if (paddr == MAP_FAILED) {
    pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
    // Only close descriptors this call created; a caller-supplied fd stays open.
    if (created_here)
      close(fd);
    return false;
  }
  assert(addr == paddr);

  // Hand the descriptor back only once the mapping is in place.
  memfd = fd;

  // Failure to set MADV_DONTFORK is logged but does not fail the commit.
  if (madvise(addr, size, MADV_DONTFORK))
    pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);

  return true;
}
/**
 * Undo CommitSystemHeapSpaceIPC: unmap the range and release its memfd.
 *
 * @param addr   Start of the mapping to remove.
 * @param size   Length of the mapping in bytes.
 * @param memfd  In/out. Closed and reset to -1 once the unmap succeeds;
 *               left untouched when munmap fails.
 * @return true when the range was unmapped, false otherwise.
 */
bool WDDMDevice::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
  const bool unmapped = (munmap(addr, size) == 0);

  if (unmapped) {
    close(memfd);
    memfd = -1;
  } else {
    pr_err("fail to unmap = %p \n", addr);
  }

  return unmapped;
}
bool WDDMDevice::ReserveSystemHeapSpace() {
struct sysinfo info;
int ret = sysinfo(&info);
@@ -458,6 +505,43 @@ ErrorCode WDDMDevice::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domai
return code;
}
/**
 * Reserve a GPU virtual address range inside the system heap window and back
 * it with a shareable memfd mapping.
 *
 * @param size               Number of bytes to reserve and commit.
 * @param out_gpu_virt_addr  Out: the reserved address on success, 0 when the
 *                           commit step fails. Not written when the
 *                           reservation itself fails.
 * @param alignment          Not used by this implementation.
 * @param memfd              In/out descriptor forwarded to
 *                           CommitSystemHeapSpaceIPC.
 * @param lock               Forwarded to CommitSystemHeapSpaceIPC.
 * @return ErrorCode::Success, the reservation's error code, or
 *         ErrorCode::SyscallFail when the commit step fails.
 */
ErrorCode WDDMDevice::ReserveIPCSysMem(gpusize size,
                                       gpusize *out_gpu_virt_addr, gpusize alignment,
                                       int &memfd, bool lock) {
  gpusize reserved_va = 0;

  ErrorCode result = d3dthunk::ReserveGpuVirtualAddress(
      adapter_, size,
      system_heap_space_start_,
      system_heap_space_start_ + system_heap_space_size_,
      &reserved_va);
  if (result != ErrorCode::Success)
    return result;

  if (CommitSystemHeapSpaceIPC(reinterpret_cast<void*>(reserved_va), size, memfd, lock)) {
    *out_gpu_virt_addr = reserved_va;
    return ErrorCode::Success;
  }

  // Commit failed: give the reservation back before reporting the error.
  d3dthunk::FreeGpuVirtualAddress(adapter_, reserved_va, size);
  *out_gpu_virt_addr = 0;
  return ErrorCode::SyscallFail;
}
/**
 * Release an IPC system-memory range: drop the memfd mapping, then free the
 * GPU virtual address reservation.
 *
 * @param gpu_addr  Base address of the range.
 * @param size      Length of the range in bytes.
 * @param memfd     In/out descriptor handed to DecommitSystemHeapSpaceIPC.
 * @return The result of d3dthunk::FreeGpuVirtualAddress. The outcome of the
 *         decommit step is not reflected in the return value.
 */
ErrorCode WDDMDevice::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
  // The decommit result is deliberately not inspected; the VA range is freed regardless.
  DecommitSystemHeapSpaceIPC(reinterpret_cast<void *>(gpu_addr), size, memfd);

  d3dthunk::FreeGpuVirtualAddressArgs va_release{};
  va_release.hAdapter = adapter_;
  va_release.BaseAddress = gpu_addr;
  va_release.Size = size;

  return d3dthunk::FreeGpuVirtualAddress(&va_release);
}
ErrorCode WDDMDevice::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
uint64_t align = DEFAULT_GPU_PAGE_SIZE;
+80 -3
Просмотреть файл
@@ -1,3 +1,4 @@
#include <sys/stat.h>
#include <cinttypes>
#include <cassert>
#include "impl/wddm/gpu_memory.h"
@@ -41,6 +42,7 @@ GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
alloc_handles_ptr_ = nullptr;
alloc_handle_ = 0;
resource_ = 0;
mem_fd_ = -1;
}
GpuMemory::~GpuMemory() {
@@ -60,6 +62,7 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
desc_.flags.is_virtual = create_info.flags.virtual_alloc;
desc_.flags.is_physical_only = create_info.flags.physical_only;
desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd;
/* we can't tell the allocation is regular vmm or ipc mem at creation stage,
they share same creation parameters, so forcing all vram allocations to
@@ -69,7 +72,6 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
desc_.flags.is_shared = true;
desc_.flags.is_locked = create_info.flags.locked;
desc_.size = AdjustSize(desc_.client_size);
if (IsUserMemory() || IsSystem())
@@ -243,9 +245,18 @@ ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size
}
ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
ErrorCode status;
gpusize gpu_virt_addr = 0;
auto status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
desc_.flags.is_locked);
if (desc_.flags.is_imported_sys_memfd && desc_.domain == thunk_proxy::AllocDomain::kSystem) {
int mfd = (mem_fd_ > -1)? mem_fd_ : -1;
status = device_->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked);
if (status == ErrorCode::Success)
mem_fd_ = mfd;
} else {
status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
desc_.flags.is_locked);
}
if (status == ErrorCode::Success) {
desc_.gpu_addr = gpu_virt_addr;
@@ -256,6 +267,9 @@ ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize si
}
ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
if (mem_fd_ > -1)
return device_->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
return base_addr != 0 ?
device_->FreeGpuVirtualAddress(desc_.domain, base_addr, size) :
ErrorCode::Success;
@@ -386,6 +400,11 @@ ErrorCode GpuMemory::Evict() {
}
ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
if (mem_fd_ > -1) {
*dmabuf_fd = mem_fd_;
return ErrorCode::Success;
}
if (IsShared())
return d3dthunk::ShareObjects(num_allocations_, resource_, flags, dmabuf_fd);
else
@@ -400,6 +419,64 @@ ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info
if (dmabuf_fd <= 0)
return ErrorCode::InvalidateParams;
if(create_info.flags.imported_sys_memfd) {
// the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
mem_fd_ = dup(dmabuf_fd);
desc_.client_size = create_info.size;
desc_.size = AdjustSize(desc_.client_size);
desc_.domain = thunk_proxy::AllocDomain::kSystem;
desc_.adapter_luid = device_->GetLuid();
desc_.alignment = 0x1000;
desc_.mem_flags = create_info.mem_flags;
desc_.engine_flag = create_info.engine_flag;
desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd;
desc_.flags.is_virtual = create_info.flags.virtual_alloc;
desc_.flags.is_physical_only = create_info.flags.physical_only;
desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
desc_.flags.is_locked = create_info.flags.locked;
auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
if (code != ErrorCode::Success)
return code;
bool physical_created = false;
auto guard = MakeScopeGuard([this, &physical_created, &code]() {
if (code != ErrorCode::Success) {
if (physical_created)
FreePhysicalMemory();
FreeGpuVirtualAddress(GpuAddress(), Size());
}
});
(void)guard;
num_allocations_ = CalcChunkNumbers(Size());
if (num_allocations_ == 1)
alloc_handles_ptr_ = &alloc_handle_;
else
alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
code = CreatePhysicalMemory();
if (code != ErrorCode::Success)
return code;
physical_created = true;
code = MapGpuVirtualAddress(GpuAddress(), Size());
if (code != ErrorCode::Success)
return code;
code = MakeResident();
if (code != ErrorCode::Success)
return code;
if (!GetDevice()->WaitOnPagingFenceFromCpu())
code = ErrorCode::Unknown;
return code;
}
memset(&query_args, 0, sizeof(query_args));
query_args.hDevice = device_->DeviceHandle();
query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);