wsl/libhsakmt: move system heap from device to thunk runtime

In a multi-GPU setup, the system heap space is shared between all GPUs and
does not belong to one specific GPU, so move it from the wddm device (which
represents a specific GPU) to the thunk runtime, which has a global view and
can manage the system heap for all GPUs.

Introduce a new VaMgr instance to manage the system heap. The local heap
and the system heap both comply with SVM (Shared Virtual Memory), so
without this new manager every allocation has to call the KMD at least
once (one call per GPU) to allocate GPU VA. The new manager manages the
space itself and no longer calls the KMD.

Reviewed-by: Flora Cui <flora.cui@amd.com>
Signed-off-by: tiancyin <tianci.yin@amd.com>
Этот коммит содержится в:
tiancyin
2025-06-27 16:47:51 +08:00
коммит произвёл Frank Min
родитель d5eb871bbb
Коммит 8e07aca2ae
4 изменённых файлов: 102 добавлений и 85 удалений
+15
Просмотреть файл
@@ -63,13 +63,21 @@ struct hsakmtRuntime {
enable_thunk_sub_allocator(0),
local_heap_space_start_(0),
local_heap_space_size_(0),
system_heap_space_start_(0),
system_heap_space_size_(0),
default_node(1) {}
void HeapInit();
void HeapFini();
bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align);
bool FreeSvmSpace(uint64_t &base, uint64_t &size);
bool ReserveLocalHeapSpace();
bool FreeLocalHeapSpace();
void InitLocalHeapMgr();
bool ReserveSystemHeapSpace();
uint64_t SystemHeapSize() { return system_heap_space_size_; }
bool FreeSystemHeapSpace();
void InitSystemHeapMgr();
pthread_mutex_t hsakmt_mutex;
const char *dxg_device_name = "/dev/dxg";
@@ -95,6 +103,13 @@ struct hsakmtRuntime {
/* manage the reserved local heap space which shared by CPU and GPUs */
std::unique_ptr<wsl::thunk::VaMgr> local_heap_mgr_;
/* system heap means bo's backend is system ram */
uint64_t system_heap_space_start_;
uint64_t system_heap_space_size_;
/* manage the reserved system heap space which shared by CPU and GPUs */
std::unique_ptr<wsl::thunk::VaMgr> system_heap_mgr_;
};
extern hsakmtRuntime *dxg_runtime;
+1 -1
Просмотреть файл
@@ -547,7 +547,7 @@ HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
struct stat st;
fstat(DMABufFd, &st);
uint64_t sz = st.st_size;
if (4096 <= sz && sz < dev->SystemHeapSize() && (sz & 0xfff) == 0) {
if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) {
pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size);
create_info.flags.sysmem_ipc_sig_importer = 1; // set to 1 when backend is system memory
create_info.size = st.st_size;
+77 -31
Просмотреть файл
@@ -27,6 +27,8 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/sysinfo.h>
#include <linux/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
@@ -39,41 +41,28 @@ hsakmtRuntime *dxg_runtime = new hsakmtRuntime();
/*
 * Set up the shared heaps: reserve the local heap range and the system heap
 * range first, then initialize the manager for each of them.
 */
void hsakmtRuntime::HeapInit() {
ReserveLocalHeapSpace();
ReserveSystemHeapSpace();
/* NOTE(review): the bool results of the two reservations above are not checked here. */
InitLocalHeapMgr();
InitSystemHeapMgr();
}
/*
 * Release the reserved heap ranges: system heap first, then local heap
 * (the reverse of the order in which HeapInit() reserves them).
 */
void hsakmtRuntime::HeapFini() {
FreeSystemHeapSpace();
FreeLocalHeapSpace();
}
/*
 * Find a range that is available in both the CPU
 * virtual address space and the GPU virtual address space.
 * The CPU VA range (sys_va_size) is 1G larger
 * than the GPU VA range; otherwise ReserveGpuVirtualAddress
 * will return an error.
 */
bool hsakmtRuntime::ReserveLocalHeapSpace() {
bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) {
uint64_t sys_va[16] = {0};
uint64_t local_va;
uint64_t sys_va_size;
int match_index = -1;
uint64_t align = 0x40000000; /* 1G */
void* ptr = NULL;
wsl::thunk::WDDMDevice* device;
uint64_t total_local_size = 0;
size_t num_adapters = get_num_wddmdev();
for (uint32_t j = 0; j < num_adapters; j++) {
device = get_wddmdev(j+1);
if (device == nullptr)
return -1;
total_local_size += wsl::AlignUp(device->LocalHeapSize(), align) * 4;
}
local_heap_space_start_ = 0;
local_heap_space_size_ = total_local_size;
sys_va_size = local_heap_space_size_ + align;
base = 0;
sys_va_size = size + align;
/* it will retry 16 times to find the avaliable range. */
for (int i = 0; i < 16; i++) {
@@ -89,16 +78,16 @@ bool hsakmtRuntime::ReserveLocalHeapSpace() {
int match_cnt = 0;
for (uint32_t j = 0; j < num_adapters; j++) {
device = get_wddmdev(j+1);
uint64_t start = (local_heap_space_start_ == 0) ? (uint64_t)ptr : local_heap_space_start_;
uint64_t end = start + ((local_heap_space_start_ == 0) ? sys_va_size : local_heap_space_size_) + 1;
uint64_t start = (base == 0) ? (uint64_t)ptr : base;
uint64_t end = start + ((base == 0) ? sys_va_size : size) + 1;
if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress(
device->GetAdapter(), local_heap_space_size_,
device->GetAdapter(), size,
start,
end, &local_va) == ErrorCode::Success) {
match_cnt++;
local_heap_space_start_ = local_va;
base = local_va;
pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n",
local_va, ptr, i);
} else {
@@ -119,12 +108,12 @@ bool hsakmtRuntime::ReserveLocalHeapSpace() {
uint64_t right_size = align - left_size;
if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size))
pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size);
if ((right_size > 0) && munmap((void*)(local_va + local_heap_space_size_), right_size))
pr_err("fail to unmap right %lx with size %lx\n", (local_va + local_heap_space_size_), right_size);
if ((right_size > 0) && munmap((void*)(local_va + size), right_size))
pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size);
} else {
pr_err("fail to reserve Local Heap Space!\n");
local_heap_space_start_ = 0;
local_heap_space_size_ = 0;
base = 0;
size = 0;
}
/* free match fail address for cpu va */
@@ -138,18 +127,51 @@ bool hsakmtRuntime::ReserveLocalHeapSpace() {
return match_index >= 0;
}
bool hsakmtRuntime::FreeLocalHeapSpace() {
/*
* To find the avaliable same range for cpu
* virtual space and gpu virtual space.
* sys_va_size of cpu va range is larger 1G
* than gpu va range, otherwise ReserveGPUVirtualAddress
* will return error.
*/
bool hsakmtRuntime::ReserveLocalHeapSpace() {
wsl::thunk::WDDMDevice* device;
uint64_t total_local_size = 0;
uint64_t align = 0x40000000; /* 1G */
size_t num_adapters = get_num_wddmdev();
for (uint32_t j = 0; j < num_adapters; j++) {
device = get_wddmdev(j+1);
if (device == nullptr)
return -1;
total_local_size += wsl::AlignUp(device->LocalHeapSize(), align) * 4;
}
local_heap_space_start_ = 0;
local_heap_space_size_ = total_local_size;
return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align);
}
/*
 * Release an SVM range: free it from the GPU virtual address space of every
 * adapter, then munmap the CPU side. The result reflects the munmap call.
 *
 * NOTE(review): this span appears to carry both the pre-change and the
 * post-change lines of the diff (two FreeGpuVirtualAddress calls, two `cpu`
 * declarations, two return paths) — confirm which lines are live.
 * NOTE(review): `return -1` in a bool function converts to true, so a missing
 * device is reported as success — presumably this should be `return false`.
 */
bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) {
wsl::thunk::WDDMDevice* device;
size_t num_adapters = get_num_wddmdev();
for (uint32_t j = 0; j < num_adapters; j++) {
/* device ids passed to get_wddmdev() start at 1 */
device = get_wddmdev(j+1);
if (device == nullptr)
return -1;
wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), local_heap_space_start_, local_heap_space_size_);
wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size);
}
void *cpu = (void *)local_heap_space_start_;
return munmap(cpu, local_heap_space_size_) == 0;
void *cpu = (void *)base;
auto r = (munmap(cpu, size) == 0);
/* base and size are references: clear the caller's bookkeeping after the unmap */
base = 0;
size = 0;
return r;
}
/* Release the local heap range through FreeSvmSpace() and report its result. */
bool hsakmtRuntime::FreeLocalHeapSpace() {
  const bool released = FreeSvmSpace(local_heap_space_start_,
                                     local_heap_space_size_);
  return released;
}
void hsakmtRuntime::InitLocalHeapMgr() {
@@ -158,6 +180,30 @@ void hsakmtRuntime::InitLocalHeapMgr() {
DEFAULT_GPU_PAGE_SIZE);
}
bool hsakmtRuntime::ReserveSystemHeapSpace() {
struct sysinfo info;
int ret = sysinfo(&info);
uint64_t max_ram = 0x10000000000;
uint64_t alignment = 0x100000000;
assert(!ret);
int32_t protFlags = PROT_NONE;
// minimum of reserve size is 8G, maximum of reserve size is 1T.
system_heap_space_size_ = std::min(wsl::AlignUp(info.totalram, alignment) * 2, max_ram);
return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment);
}
/* Release the system heap range through FreeSvmSpace() and report its result. */
bool hsakmtRuntime::FreeSystemHeapSpace(void) {
  const bool released = FreeSvmSpace(system_heap_space_start_,
                                     system_heap_space_size_);
  return released;
}
/*
 * Create the VaMgr that manages the system heap, constructed from the
 * system heap start, its size and DEFAULT_GPU_PAGE_SIZE.
 */
void hsakmtRuntime::InitSystemHeapMgr() {
  uint64_t heap_base = system_heap_space_start_;
  uint64_t heap_size = system_heap_space_size_;

  system_heap_mgr_ = std::make_unique<wsl::thunk::VaMgr>(
      heap_base, heap_size, DEFAULT_GPU_PAGE_SIZE);
}
/* is_forked_child detects when the process has forked since the last
* time this function was called. We cannot rely on pthread_atfork
* because the process can fork without calling the fork function in
+9 -53
Просмотреть файл
@@ -67,14 +67,12 @@ WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_i
CreateDevice();
SetPowerOptimization(false);
CreatePagingQueue();
ReserveSystemHeapSpace();
InitHandleApertureSpace();
InitHandleApertureMgr();
InitCmdbufInfo();
}
WDDMDevice::~WDDMDevice() {
FreeSystemHeapSpace();
DestroyPagingQueue();
SetPowerOptimization(true);
DestroyDevice();
@@ -275,36 +273,6 @@ bool WDDMDevice::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd
return true;
}
/*
 * Reserve a CPU virtual address range for the system heap.
 *
 * The size is twice the total RAM (aligned up to 4G), capped at 1T, so the
 * minimum reserve size is 8G and the maximum is 1T. The range is mapped
 * PROT_NONE as a private anonymous mapping.
 *
 * Returns true and records the start address on success, false when the
 * memory size cannot be queried or the mapping fails.
 */
bool WDDMDevice::ReserveSystemHeapSpace() {
  const uint64_t max_ram = 0x10000000000;   /* 1T */
  const uint64_t alignment = 0x100000000;   /* 4G */

  struct sysinfo info;
  const int ret = sysinfo(&info);
  assert(!ret);
  if (ret != 0) {
    /* assert() is compiled out with NDEBUG; never read an unfilled info. */
    pr_err("fail to query system memory info\n");
    return false;
  }

  /* sysinfo reports memory sizes in units of mem_unit bytes. */
  const uint64_t total_ram_bytes =
      static_cast<uint64_t>(info.totalram) * info.mem_unit;

  // minimum of reserve size is 8G, maximum of reserve size is 1T.
  system_heap_space_size_ =
      std::min<uint64_t>(AlignUp(total_ram_bytes, alignment) * 2, max_ram);

  void* cpu = mmap(NULL, system_heap_space_size_, PROT_NONE,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
  if (cpu == MAP_FAILED) {
    pr_err("fail to reserve system_heap_space_size_ = %lx \n", system_heap_space_size_);
    return false;
  }

  system_heap_space_start_ = (uint64_t)cpu;
  return true;
}
/*
 * Unmap the system heap range recorded in system_heap_space_start_ /
 * system_heap_space_size_. Logs an error and returns false when munmap
 * fails, true otherwise.
 */
bool WDDMDevice::FreeSystemHeapSpace(void) {
  void *heap_base = (void *)system_heap_space_start_;
  const bool unmapped = (munmap(heap_base, system_heap_space_size_) == 0);

  if (!unmapped)
    pr_err("fail to unmap = %p \n", heap_base);

  return unmapped;
}
void WDDMDevice::InitHandleApertureMgr() {
handle_aperture_mgr_ = std::make_unique<VaMgr>(handle_aperture_start_,
handle_aperture_size_,
@@ -372,28 +340,23 @@ ErrorCode WDDMDevice::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain do
gpusize gpu_addr = 0;
ErrorCode code = ErrorCode::Success;
if (domain == thunk_proxy::kSystem) {
uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment
if (size >= GPU_HUGE_PAGE_SIZE)
align = GPU_HUGE_PAGE_SIZE;
code = d3dthunk::ReserveGpuVirtualAddress(adapter_, size,
system_heap_space_start_,
system_heap_space_start_ + system_heap_space_size_,
&gpu_addr);
if (code != ErrorCode::Success)
return code;
if (domain == thunk_proxy::kSystem) {
gpu_addr = dxg_runtime->system_heap_mgr_->Alloc(size, align, hit_base_addr);
if (gpu_addr == 0)
code = ErrorCode::OutOfMemory;
if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) {
d3dthunk::FreeGpuVirtualAddress(adapter_, gpu_addr, size);
dxg_runtime->system_heap_mgr_->Free(gpu_addr);
code = ErrorCode::SyscallFail;
}
} else {
uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment
if (domain == thunk_proxy::kLocal && size >= GPU_HUGE_PAGE_SIZE)
align = GPU_HUGE_PAGE_SIZE;
gpu_addr = dxg_runtime->local_heap_mgr_->Alloc(size, align, hit_base_addr);
if (gpu_addr == 0)
code = ErrorCode::OutOfGpuMemory;
}
*out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0;
@@ -405,15 +368,8 @@ ErrorCode WDDMDevice::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domai
auto code = ErrorCode::Success;
if (domain == thunk_proxy::kSystem) {
DecommitSystemHeapSpace((void *)gpu_addr, size);
d3dthunk::FreeGpuVirtualAddressArgs free_args{};
free_args.hAdapter = adapter_;
free_args.BaseAddress = gpu_addr;
free_args.Size = size;
code = d3dthunk::FreeGpuVirtualAddress(&free_args);
dxg_runtime->system_heap_mgr_->Free(gpu_addr);
} else {
dxg_runtime->local_heap_mgr_->Free(gpu_addr);
}