wsl/hsakmt: implement ipc signal
IPC Signal only supports the system-RAM backend and is accessible by both the CPU and the GPU; IPC Memory only supports the VRAM backend and is accessible only by the GPU. Reviewed-by: Flora Cui <flora.cui@amd.com> Signed-off-by: tiancyin <tianci.yin@amd.com>
Этот коммит содержится в:
+8
-4
@@ -82,6 +82,9 @@ HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) {
|
||||
}
|
||||
|
||||
/*
 * Report the CPU address of a buffer object.
 *
 * bo  - buffer object handle; resolved to a GpuMemory via get_gpu_mem().
 * cpu - out: receives gpu_mem->CpuAddress() when the object reports
 *       IsSysMemFd(). For every other object *cpu is left untouched.
 *
 * Always returns 0.
 */
HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) {
  wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
  // Same precondition check amdgpu_bo_va_op applies before using the handle.
  assert(gpu_mem != nullptr);

  if (gpu_mem->IsSysMemFd())
    *cpu = gpu_mem->CpuAddress();

  return 0;
}
|
||||
|
||||
@@ -116,11 +119,14 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
|
||||
uint64_t addr,
|
||||
uint64_t flags,
|
||||
uint32_t ops) {
|
||||
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
|
||||
assert(gpu_mem != nullptr);
|
||||
if (gpu_mem->IsSysMemFd())
|
||||
return 0;
|
||||
|
||||
switch(ops) {
|
||||
case AMDGPU_VA_OP_MAP:
|
||||
{
|
||||
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
|
||||
assert(gpu_mem != nullptr);
|
||||
auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
|
||||
if (code != ErrorCode::Success)
|
||||
return -1;
|
||||
@@ -132,8 +138,6 @@ HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo,
|
||||
break;
|
||||
case AMDGPU_VA_OP_UNMAP:
|
||||
{
|
||||
wsl::thunk::GpuMemory *gpu_mem = get_gpu_mem(bo);
|
||||
assert(gpu_mem != nullptr);
|
||||
auto code = gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast<gpusize>(addr), size, offset);
|
||||
if (code != ErrorCode::Success)
|
||||
return -1;
|
||||
|
||||
+1
-1
@@ -202,6 +202,6 @@ uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
|
||||
|
||||
HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd,
|
||||
HsaGraphicsResourceInfo *GraphicsResourceInfo,
|
||||
bool requiresVAddr = false);
|
||||
HSA_REGISTER_MEM_FLAGS RegisterFlags = {0});
|
||||
|
||||
#endif
|
||||
|
||||
+29
-10
@@ -31,6 +31,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "impl/wddm/gpu_memory.h"
|
||||
#include "util/simple_heap.h"
|
||||
@@ -191,6 +192,9 @@ HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
|
||||
if (zfb_support && MemFlags.ui32.NonPaged == 1)
|
||||
MemFlags.ui32.CoarseGrain = 1;
|
||||
|
||||
// AllocateNonPaged == AllocateIPC
|
||||
create_info.flags.imported_sys_memfd = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess);
|
||||
|
||||
create_info.domain = thunk_proxy::AllocDomain::kSystem;
|
||||
} else {
|
||||
create_info.domain = thunk_proxy::AllocDomain::kLocal;
|
||||
@@ -454,8 +458,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic
|
||||
|
||||
GraphicsResourceInfo->NodeId = 1;
|
||||
return hsaKmtImportDMABufHandle(GraphicsResourceHandle,
|
||||
GraphicsResourceInfo,
|
||||
!!RegisterFlags.ui32.requiresVAddr);
|
||||
GraphicsResourceInfo,
|
||||
RegisterFlags);
|
||||
}
|
||||
|
||||
HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
|
||||
@@ -477,7 +481,6 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
|
||||
it->second.dmabuf_fd = *DMABufFd;
|
||||
}
|
||||
*DMABufFd = dup(it->second.dmabuf_fd);
|
||||
|
||||
*Offset = reinterpret_cast<uint64_t>(MemoryAddress) - it->second.gpu_addr;
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -488,21 +491,37 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
|
||||
|
||||
HSAKMT_STATUS hsaKmtImportDMABufHandle(int DMABufFd,
|
||||
HsaGraphicsResourceInfo *GraphicsResourceInfo,
|
||||
bool requiresVAddr) {
|
||||
|
||||
|
||||
HSA_REGISTER_MEM_FLAGS RegisterFlags) {
|
||||
CHECK_DXG_OPEN();
|
||||
|
||||
wsl::thunk::WDDMDevice* dev = get_wddmdev(1);
|
||||
wsl::thunk::GpuMemory *gpu_mem = nullptr;
|
||||
wsl::thunk::GpuMemoryCreateInfo create_info{};
|
||||
create_info.dmabuf_fd = DMABufFd;
|
||||
create_info.flags.imported_vram_alloc_va = requiresVAddr;
|
||||
create_info.flags.imported_vram_alloc_va = RegisterFlags.ui32.requiresVAddr;
|
||||
|
||||
std::string fdPath = "/proc/self/fd/" + std::to_string(DMABufFd);
|
||||
char linkTarget[256];
|
||||
ssize_t bytes = readlink(fdPath.c_str(), linkTarget, sizeof(linkTarget) - 1);
|
||||
if (bytes == -1)
|
||||
pr_err("Error reading link\n");
|
||||
linkTarget[bytes] = '\0';
|
||||
if (strstr(linkTarget, "rocr4wsl_gtt") != nullptr) {
|
||||
struct stat st;
|
||||
fstat(DMABufFd, &st);
|
||||
uint64_t sz = st.st_size;
|
||||
if (4096 <= sz && sz < dev->SystemHeapSize() && (sz & 0xfff) == 0) {
|
||||
pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
|
||||
create_info.flags.imported_sys_memfd = 1; // set to 1 when backend is system memory
|
||||
create_info.flags.imported_vram_alloc_va = 0; // set to 1 when backend is vram
|
||||
create_info.size = st.st_size;
|
||||
}
|
||||
}
|
||||
|
||||
auto code = dev->CreateGpuMemory(create_info, &gpu_mem);
|
||||
if (code == ErrorCode::Success) {
|
||||
void *MemoryAddress;
|
||||
if (requiresVAddr)
|
||||
if (create_info.flags.imported_sys_memfd || create_info.flags.imported_vram_alloc_va)
|
||||
MemoryAddress = reinterpret_cast<void *>(gpu_mem->GpuAddress());
|
||||
else
|
||||
MemoryAddress = reinterpret_cast<void*>(gpu_mem->HandleApeAddress());
|
||||
@@ -589,12 +608,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) {
|
||||
{
|
||||
std::lock_guard<std::mutex> gard(*allocation_map_lock_);
|
||||
|
||||
// IPC mem
|
||||
// IPC mem(vram) and IPC signal(sys mem)
|
||||
auto it_ipc = allocation_map_.find(MemoryAddress);
|
||||
if (it_ipc != allocation_map_.end()) {
|
||||
wsl::thunk::GpuMemoryDescFlags flags;
|
||||
flags.reserved = it_ipc->second.mem_flags_value;
|
||||
if (flags.is_imported_vram_alloc_va) {
|
||||
if (flags.is_imported_vram_alloc_va || flags.is_imported_sys_memfd) {
|
||||
wsl::thunk::GpuMemory *gpu_mem;
|
||||
gpu_mem = wsl::thunk::GpuMemory::Convert(it_ipc->second.handle);
|
||||
allocation_map_.erase(it_ipc);
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sys/stat.h>
|
||||
#include <linux/mman.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
@@ -231,6 +232,52 @@ bool WDDMDevice::DecommitSystemHeapSpace(void* addr, int64_t size) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool WDDMDevice::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) {
|
||||
int fd = -1;
|
||||
|
||||
if (memfd == -1) {
|
||||
fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
pr_err("memfd_create failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
ftruncate(fd, size);
|
||||
} else {
|
||||
fd = memfd;
|
||||
}
|
||||
|
||||
int32_t protFlags = PROT_READ | PROT_WRITE;
|
||||
int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE |
|
||||
MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0);
|
||||
|
||||
void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0);
|
||||
if (paddr == MAP_FAILED) {
|
||||
pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr);
|
||||
if (memfd == -1)
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
assert(addr == paddr);
|
||||
|
||||
memfd = fd;
|
||||
|
||||
if (madvise(addr, size, MADV_DONTFORK))
|
||||
pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool WDDMDevice::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) {
|
||||
if (munmap(addr, size) != 0) {
|
||||
pr_err("fail to unmap = %p \n", addr);
|
||||
return false;
|
||||
}
|
||||
close(memfd);
|
||||
memfd = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool WDDMDevice::ReserveSystemHeapSpace() {
|
||||
struct sysinfo info;
|
||||
int ret = sysinfo(&info);
|
||||
@@ -458,6 +505,43 @@ ErrorCode WDDMDevice::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domai
|
||||
return code;
|
||||
}
|
||||
|
||||
/*
 * Reserve a GPU virtual address range inside the system heap window and back
 * it with a memfd mapping through CommitSystemHeapSpaceIPC.
 *
 * size              - bytes to reserve and map.
 * out_gpu_virt_addr - out: the reserved address on success, 0 when the commit
 *                     step fails; not written when the reservation itself fails.
 * alignment         - accepted but not used by this function.
 * memfd             - in/out, forwarded unchanged to CommitSystemHeapSpaceIPC.
 * lock              - forwarded unchanged to CommitSystemHeapSpaceIPC.
 *
 * Returns the reservation error if reserving fails, ErrorCode::SyscallFail if
 * the commit fails (the reservation is released first), otherwise Success.
 */
ErrorCode WDDMDevice::ReserveIPCSysMem(gpusize size,
                                       gpusize *out_gpu_virt_addr, gpusize alignment,
                                       int &memfd, bool lock) {
  gpusize reserved_va = 0;

  ErrorCode status = d3dthunk::ReserveGpuVirtualAddress(
      adapter_, size,
      system_heap_space_start_,
      system_heap_space_start_ + system_heap_space_size_,
      &reserved_va);
  if (status != ErrorCode::Success)
    return status;

  if (CommitSystemHeapSpaceIPC(reinterpret_cast<void*>(reserved_va), size, memfd, lock)) {
    *out_gpu_virt_addr = reserved_va;
    return ErrorCode::Success;
  }

  // Commit failed: give the address range back before reporting the error.
  d3dthunk::FreeGpuVirtualAddress(adapter_, reserved_va, size);
  *out_gpu_virt_addr = 0;
  return ErrorCode::SyscallFail;
}
|
||||
|
||||
/*
 * Release an IPC system-memory range: drop the CPU mapping and memfd, then
 * free the GPU virtual address reservation.
 *
 * gpu_addr - start of the range; used both as the CPU mapping address and as
 *            the base address of the reservation.
 * size     - length of the range.
 * memfd    - in/out, forwarded to DecommitSystemHeapSpaceIPC.
 *
 * The result of the decommit step is not inspected; the return value is
 * whatever d3dthunk::FreeGpuVirtualAddress reports.
 */
ErrorCode WDDMDevice::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) {
  DecommitSystemHeapSpaceIPC(reinterpret_cast<void *>(gpu_addr), size, memfd);

  d3dthunk::FreeGpuVirtualAddressArgs release_args{};
  release_args.Size = size;
  release_args.BaseAddress = gpu_addr;
  release_args.hAdapter = adapter_;

  return d3dthunk::FreeGpuVirtualAddress(&release_args);
}
|
||||
|
||||
ErrorCode WDDMDevice::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) {
|
||||
uint64_t align = DEFAULT_GPU_PAGE_SIZE;
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include <sys/stat.h>
|
||||
#include <cinttypes>
|
||||
#include <cassert>
|
||||
#include "impl/wddm/gpu_memory.h"
|
||||
@@ -41,6 +42,7 @@ GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
|
||||
alloc_handles_ptr_ = nullptr;
|
||||
alloc_handle_ = 0;
|
||||
resource_ = 0;
|
||||
mem_fd_ = -1;
|
||||
}
|
||||
|
||||
GpuMemory::~GpuMemory() {
|
||||
@@ -60,6 +62,7 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
|
||||
desc_.flags.is_virtual = create_info.flags.virtual_alloc;
|
||||
desc_.flags.is_physical_only = create_info.flags.physical_only;
|
||||
desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
|
||||
desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd;
|
||||
|
||||
/* we can't tell the allocation is regular vmm or ipc mem at creation stage,
|
||||
they share same creation parameters, so forcing all vram allocations to
|
||||
@@ -69,7 +72,6 @@ ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
|
||||
desc_.flags.is_shared = true;
|
||||
|
||||
desc_.flags.is_locked = create_info.flags.locked;
|
||||
|
||||
desc_.size = AdjustSize(desc_.client_size);
|
||||
|
||||
if (IsUserMemory() || IsSystem())
|
||||
@@ -243,9 +245,18 @@ ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size
|
||||
}
|
||||
|
||||
ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
|
||||
ErrorCode status;
|
||||
gpusize gpu_virt_addr = 0;
|
||||
auto status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
|
||||
desc_.flags.is_locked);
|
||||
if (desc_.flags.is_imported_sys_memfd && desc_.domain == thunk_proxy::AllocDomain::kSystem) {
|
||||
int mfd = (mem_fd_ > -1)? mem_fd_ : -1;
|
||||
status = device_->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked);
|
||||
if (status == ErrorCode::Success)
|
||||
mem_fd_ = mfd;
|
||||
} else {
|
||||
status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
|
||||
desc_.flags.is_locked);
|
||||
}
|
||||
|
||||
if (status == ErrorCode::Success) {
|
||||
desc_.gpu_addr = gpu_virt_addr;
|
||||
|
||||
@@ -256,6 +267,9 @@ ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize si
|
||||
}
|
||||
|
||||
ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
|
||||
if (mem_fd_ > -1)
|
||||
return device_->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);
|
||||
|
||||
return base_addr != 0 ?
|
||||
device_->FreeGpuVirtualAddress(desc_.domain, base_addr, size) :
|
||||
ErrorCode::Success;
|
||||
@@ -386,6 +400,11 @@ ErrorCode GpuMemory::Evict() {
|
||||
}
|
||||
|
||||
ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
|
||||
if (mem_fd_ > -1) {
|
||||
*dmabuf_fd = mem_fd_;
|
||||
return ErrorCode::Success;
|
||||
}
|
||||
|
||||
if (IsShared())
|
||||
return d3dthunk::ShareObjects(num_allocations_, resource_, flags, dmabuf_fd);
|
||||
else
|
||||
@@ -400,6 +419,64 @@ ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info
|
||||
if (dmabuf_fd <= 0)
|
||||
return ErrorCode::InvalidateParams;
|
||||
|
||||
if(create_info.flags.imported_sys_memfd) {
|
||||
// the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
|
||||
mem_fd_ = dup(dmabuf_fd);
|
||||
desc_.client_size = create_info.size;
|
||||
desc_.size = AdjustSize(desc_.client_size);
|
||||
desc_.domain = thunk_proxy::AllocDomain::kSystem;
|
||||
desc_.adapter_luid = device_->GetLuid();
|
||||
desc_.alignment = 0x1000;
|
||||
desc_.mem_flags = create_info.mem_flags;
|
||||
desc_.engine_flag = create_info.engine_flag;
|
||||
desc_.flags.is_imported_sys_memfd = create_info.flags.imported_sys_memfd;
|
||||
desc_.flags.is_virtual = create_info.flags.virtual_alloc;
|
||||
desc_.flags.is_physical_only = create_info.flags.physical_only;
|
||||
desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
|
||||
desc_.flags.is_locked = create_info.flags.locked;
|
||||
|
||||
auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
|
||||
if (code != ErrorCode::Success)
|
||||
return code;
|
||||
|
||||
bool physical_created = false;
|
||||
auto guard = MakeScopeGuard([this, &physical_created, &code]() {
|
||||
if (code != ErrorCode::Success) {
|
||||
if (physical_created)
|
||||
FreePhysicalMemory();
|
||||
FreeGpuVirtualAddress(GpuAddress(), Size());
|
||||
}
|
||||
});
|
||||
(void)guard;
|
||||
|
||||
num_allocations_ = CalcChunkNumbers(Size());
|
||||
if (num_allocations_ == 1)
|
||||
alloc_handles_ptr_ = &alloc_handle_;
|
||||
else
|
||||
alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];
|
||||
|
||||
memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));
|
||||
|
||||
code = CreatePhysicalMemory();
|
||||
if (code != ErrorCode::Success)
|
||||
return code;
|
||||
|
||||
physical_created = true;
|
||||
|
||||
code = MapGpuVirtualAddress(GpuAddress(), Size());
|
||||
if (code != ErrorCode::Success)
|
||||
return code;
|
||||
|
||||
code = MakeResident();
|
||||
if (code != ErrorCode::Success)
|
||||
return code;
|
||||
|
||||
if (!GetDevice()->WaitOnPagingFenceFromCpu())
|
||||
code = ErrorCode::Unknown;
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
memset(&query_args, 0, sizeof(query_args));
|
||||
query_args.hDevice = device_->DeviceHandle();
|
||||
query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
|
||||
|
||||
Ссылка в новой задаче
Block a user