From f4390637fb280f9e95c3c335668e1f992148ab21 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Mon, 24 Feb 2025 16:17:58 +0800 Subject: [PATCH] wsl/hsakmt: implement ipc signal IPC Signal only supports the sys ram backend and is accessible to both CPU and GPU; IPC Memory only supports the vram backend and is accessible only to the GPU. Reviewed-by: Flora Cui Signed-off-by: tiancyin --- libdrm.cpp | 12 ++++--- libhsakmt.h | 2 +- memory.cpp | 39 +++++++++++++++------ wddm/device.cpp | 84 +++++++++++++++++++++++++++++++++++++++++++++ wddm/gpu_memory.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 202 insertions(+), 18 deletions(-) diff --git a/libdrm.cpp b/libdrm.cpp index 7d366916bd..4bdbe21699 100644 --- a/libdrm.cpp +++ b/libdrm.cpp @@ -82,6 +82,9 @@ HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) { } HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) { + wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo); + if (gpu_mem->IsSysMemFd()) + *cpu = gpu_mem->CpuAddress(); return 0; } @@ -116,11 +119,14 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo, uint64_t addr, uint64_t flags, uint32_t ops) { + wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo); + assert(gpu_mem != nullptr); + if (gpu_mem->IsSysMemFd()) + return 0; + switch(ops) { case AMDGPU_VA_OP_MAP: { - wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo); - assert(gpu_mem != nullptr); auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast(addr), size, offset); if (code != ErrorCode::Success) return -1; @@ -132,8 +138,6 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo, break; case AMDGPU_VA_OP_UNMAP: { - wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo); - assert(gpu_mem != nullptr); auto code = gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast(addr), size, offset); if (code != ErrorCode::Success) return -1; diff --git a/libhsakmt.h b/libhsakmt.h index 202a5f4752..392db59afa 100644 --- a/libhsakmt.h +++ b/libhsakmt.h @@ -202,6 +202,6 @@ uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id); 
HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd, HsaGraphicsResourceInfo *GraphicsResourceInfo, - bool requiresVAddr = false); + HSA_REGISTER_MEM_FLAGS RegisterFlags = {0}); #endif diff --git a/memory.cpp b/memory.cpp index 6300439567..dfc5a1f2f1 100644 --- a/memory.cpp +++ b/memory.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "impl/wddm/gpu_memory.h" #include "util/simple_heap.h" @@ -191,6 +192,9 @@ HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode, if (zfb_support && MemFlags.ui32.NonPaged == 1) MemFlags.ui32.CoarseGrain = 1; + // AllocateNonPaged == AllocateIPC + create_info.flags.imported_sys_memfd = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess); + create_info.domain = thunk_proxy::AllocDomain::kSystem; } else { create_info.domain = thunk_proxy::AllocDomain::kLocal; @@ -454,8 +458,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic GraphicsResourceInfo->NodeId = 1; return hsaKmtImportDMABufHandle(GraphicsResourceHandle, - GraphicsResourceInfo, - !!RegisterFlags.ui32.requiresVAddr); + GraphicsResourceInfo, + RegisterFlags); } HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, @@ -477,7 +481,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, it->second.dmabuf_fd = *DMABufFd; } *DMABufFd = dup(it->second.dmabuf_fd); - *Offset = reinterpret_cast(MemoryAddress) - it->second.gpu_addr; return HSAKMT_STATUS_SUCCESS; } @@ -488,21 +491,37 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd, HsaGraphicsResourceInfo *GraphicsResourceInfo, - bool requiresVAddr) { - - + HSA_REGISTER_MEM_FLAGS RegisterFlags) { CHECK_DXG_OPEN(); wsl::thunk::WDDMDevice* dev = get_wddmdev(1); wsl::thunk::GpuMemory *gpu_mem = nullptr; wsl::thunk::GpuMemoryCreateInfo create_info{}; create_info.dmabuf_fd = DMABufFd; - create_info.flags.imported_vram_alloc_va = requiresVAddr; + 
create_info.flags.imported_vram_alloc_va = RegisterFlags.ui32.requiresVAddr; + + std::string fdPath = "/proc/self/fd/" + std::to_string(DMABufFd); + char linkTarget[256]; + ssize_t bytes = readlink(fdPath.c_str(), linkTarget, sizeof(linkTarget) - 1); + if (bytes == -1) + pr_err("Error reading link\n"); + linkTarget[bytes] = '\0'; + if (strstr(linkTarget, "rocr4wsl_gtt") != nullptr) { + struct stat st; + fstat(DMABufFd, &st); + uint64_t sz = st.st_size; + if (4096 <= sz && sz < dev->SystemHeapSize() && (sz & 0xfff) == 0) { + pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size); + create_info.flags.imported_sys_memfd = 1; // set to 1 when backend is system memory + create_info.flags.imported_vram_alloc_va = 0; // set to 1 when backend is vram + create_info.size = st.st_size; + } + } auto code = dev->CreateGpuMemory(create_info, &gpu_mem); if (code == ErrorCode::Success) { void *MemoryAddress; - if (requiresVAddr) + if (create_info.flags.imported_sys_memfd || create_info.flags.imported_vram_alloc_va) MemoryAddress = reinterpret_cast(gpu_mem->GpuAddress()); else MemoryAddress = reinterpret_cast(gpu_mem->HandleApeAddress()); @@ -589,12 +608,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) { { std::lock_guard gard(*allocation_map_lock_); - // IPC mem + // IPC mem(vram) and IPC signal(sys mem) auto it_ipc = allocation_map_.find(MemoryAddress); if (it_ipc != allocation_map_.end()) { wsl::thunk::GpuMemoryDescFlags flags; flags.reserved = it_ipc->second.mem_flags_value; - if (flags.is_imported_vram_alloc_va) { + if (flags.is_imported_vram_alloc_va || flags.is_imported_sys_memfd) { wsl::thunk::GpuMemory *gpu_mem; gpu_mem = wsl::thunk::GpuMemory::Convert(it_ipc->second.handle); allocation_map_.erase(it_ipc); diff --git a/wddm/device.cpp b/wddm/device.cpp index c7c97793e7..0a0de0f7d9 100644 --- a/wddm/device.cpp +++ b/wddm/device.cpp @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ 
-231,6 +232,52 @@ bool WDDMDevice::DecommitSystemHeapSpace(void* addr, int64_t size) { return true; } +bool WDDMDevice::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) { + int fd = -1; + + if (memfd == -1) { + fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC); + if (fd < 0) { + pr_err("memfd_create failed\n"); + return false; + } + + ftruncate(fd, size); + } else { + fd = memfd; + } + + int32_t protFlags = PROT_READ | PROT_WRITE; + int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE | + MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0); + + void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr); + if (memfd == -1) + close(fd); + return false; + } + assert(addr == paddr); + + memfd = fd; + + if (madvise(addr, size, MADV_DONTFORK)) + pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr); + + return true; +} + +bool WDDMDevice::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) { + if (munmap(addr, size) != 0) { + pr_err("fail to unmap = %p \n", addr); + return false; + } + close(memfd); + memfd = -1; + return true; +} + bool WDDMDevice::ReserveSystemHeapSpace() { struct sysinfo info; int ret = sysinfo(&info); @@ -458,6 +505,43 @@ ErrorCode WDDMDevice::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domai return code; } +ErrorCode WDDMDevice::ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, + int &memfd, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + + code = d3dthunk::ReserveGpuVirtualAddress(adapter_, size, + system_heap_space_start_, + system_heap_space_start_ + system_heap_space_size_, + &gpu_addr); + if (code != ErrorCode::Success) + return code; + + if (!CommitSystemHeapSpaceIPC((void*)gpu_addr, size, memfd, lock)) { + d3dthunk::FreeGpuVirtualAddress(adapter_, gpu_addr, size); + code = ErrorCode::SyscallFail; + } + + 
*out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0; + return code; +} + +ErrorCode WDDMDevice::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) { + auto code = ErrorCode::Success; + + DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd); + + d3dthunk::FreeGpuVirtualAddressArgs free_args{}; + free_args.hAdapter = adapter_; + free_args.BaseAddress = gpu_addr; + free_args.Size = size; + + code = d3dthunk::FreeGpuVirtualAddress(&free_args); + + return code; +} + ErrorCode WDDMDevice::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) { uint64_t align = DEFAULT_GPU_PAGE_SIZE; diff --git a/wddm/gpu_memory.cpp b/wddm/gpu_memory.cpp index 3d3b41a39d..6677ea1fda 100644 --- a/wddm/gpu_memory.cpp +++ b/wddm/gpu_memory.cpp @@ -1,3 +1,4 @@ +#include #include #include #include "impl/wddm/gpu_memory.h" @@ -41,6 +42,7 @@ GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) { alloc_handles_ptr_ = nullptr; alloc_handle_ = 0; resource_ = 0; + mem_fd_ = -1; } GpuMemory::~GpuMemory() { @@ -60,6 +62,7 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) { desc_.flags.is_virtual = create_info.flags.virtual_alloc; desc_.flags.is_physical_only = create_info.flags.physical_only; desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous; + desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd; /* we can't tell the allocation is regular vmm or ipc mem at creation stage, they share same creation parameters, so forcing all vram allocations to @@ -69,7 +72,6 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) { desc_.flags.is_shared = true; desc_.flags.is_locked = create_info.flags.locked; - desc_.size = AdjustSize(desc_.client_size); if (IsUserMemory() || IsSystem()) @@ -243,9 +245,18 @@ ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size } ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) { + 
ErrorCode status; gpusize gpu_virt_addr = 0; - auto status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment, - desc_.flags.is_locked); + if (desc_.flags.is_imported_sys_memfd && desc_.domain == thunk_proxy::AllocDomain::kSystem) { + int mfd = (mem_fd_ > -1)? mem_fd_ : -1; + status = device_->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked); + if (status == ErrorCode::Success) + mem_fd_ = mfd; + } else { + status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment, + desc_.flags.is_locked); + } + if (status == ErrorCode::Success) { desc_.gpu_addr = gpu_virt_addr; @@ -256,6 +267,9 @@ ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize si } ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) { + if (mem_fd_ > -1) + return device_->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_); + return base_addr != 0 ? device_->FreeGpuVirtualAddress(desc_.domain, base_addr, size) : ErrorCode::Success; @@ -386,6 +400,11 @@ ErrorCode GpuMemory::Evict() { } ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) { + if (mem_fd_ > -1) { + *dmabuf_fd = mem_fd_; + return ErrorCode::Success; + } + if (IsShared()) return d3dthunk::ShareObjects(num_allocations_, resource_, flags, dmabuf_fd); else @@ -400,6 +419,64 @@ ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info if (dmabuf_fd <= 0) return ErrorCode::InvalidateParams; + if(create_info.flags.imported_sys_memfd) { + // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference + mem_fd_ = dup(dmabuf_fd); + desc_.client_size = create_info.size; + desc_.size = AdjustSize(desc_.client_size); + desc_.domain = thunk_proxy::AllocDomain::kSystem; + desc_.adapter_luid = device_->GetLuid(); + desc_.alignment = 0x1000; + desc_.mem_flags = create_info.mem_flags; + desc_.engine_flag = 
create_info.engine_flag; + desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd; + desc_.flags.is_virtual = create_info.flags.virtual_alloc; + desc_.flags.is_physical_only = create_info.flags.physical_only; + desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous; + desc_.flags.is_locked = create_info.flags.locked; + + auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment); + if (code != ErrorCode::Success) + return code; + + bool physical_created = false; + auto guard = MakeScopeGuard([this, &physical_created, &code]() { + if (code != ErrorCode::Success) { + if (physical_created) + FreePhysicalMemory(); + FreeGpuVirtualAddress(GpuAddress(), Size()); + } + }); + (void)guard; + + num_allocations_ = CalcChunkNumbers(Size()); + if (num_allocations_ == 1) + alloc_handles_ptr_ = &alloc_handle_; + else + alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_]; + + memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle)); + + code = CreatePhysicalMemory(); + if (code != ErrorCode::Success) + return code; + + physical_created = true; + + code = MapGpuVirtualAddress(GpuAddress(), Size()); + if (code != ErrorCode::Success) + return code; + + code = MakeResident(); + if (code != ErrorCode::Success) + return code; + + if (!GetDevice()->WaitOnPagingFenceFromCpu()) + code = ErrorCode::Unknown; + + return code; + } + memset(&query_args, 0, sizeof(query_args)); query_args.hDevice = device_->DeviceHandle(); query_args.hNtHandle = reinterpret_cast(dmabuf_fd);