libhsakmt: Refactor and clean up CPU mappings

Use a common helper for CPU mappings to reduce duplicate code.
Consistently use MAP_SHARED for all render_fd mappings.
Remove double-mapping for AQL queue buffers on the CPU. This workaround
is only needed on the GPU.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Change-Id: Iff86c8cc9f1e5c982614b3f11129bc2cf8cbba02


[ROCm/ROCR-Runtime commit: 73b0fb3d7c]
Этот коммит содержится в:
Felix Kuehling
2022-09-07 15:32:56 -04:00
родитель eac689291a
Коммит b3db03a7d5
+44 -69
Просмотреть файл
@@ -1417,6 +1417,23 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo
return mem;
}
static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access,
int fd, uint64_t mmap_offset) {
int flag = MAP_SHARED | MAP_FIXED;
int prot = host_access ? PROT_READ | PROT_WRITE : PROT_NONE;
void *ret = mmap(mem, size, prot, flag, fd, mmap_offset);
if (ret != MAP_FAILED)
/* This madvise() call is needed to avoid additional references
* to mapped BOs in child processes that can prevent freeing
* memory in the parent process and lead to out-of-memory
* conditions.
*/
madvise(mem, size, MADV_DONTFORK);
return ret;
}
void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, HsaMemFlags mflags)
{
@@ -1465,25 +1482,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
}
if (mem) {
int map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
int prot = mflags.ui32.HostAccess ? PROT_READ | PROT_WRITE :
PROT_NONE;
int flag = mflags.ui32.HostAccess ? MAP_SHARED | MAP_FIXED :
MAP_PRIVATE|MAP_FIXED;
void *ret = mmap(mem, MemorySizeInBytes, prot, flag,
map_fd, mmap_offset);
void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
mflags.ui32.HostAccess,
gpu_mem[gpu_mem_id].drm_render_fd,
mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(vm_obj, aperture);
return NULL;
}
/*
* This madvise() call is needed to avoid additional references
* to mapped BOs in child processes that can prevent freeing
* memory in the parent process and lead to out-of-memory
* conditions.
*/
madvise(mem, MemorySizeInBytes, MADV_DONTFORK);
}
return mem;
@@ -1718,29 +1725,14 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address,
&mmap_offset, ioc_flags, &vm_obj);
if (mem && mflags.ui32.HostAccess) {
int map_fd = gpu_drm_fd;
void *ret = mmap(mem, MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
mflags.ui32.HostAccess,
gpu_drm_fd, mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(vm_obj, aperture);
return NULL;
}
madvise(ret, MemorySizeInBytes, MADV_DONTFORK);
if (mflags.ui32.AQLQueueMemory) {
uint64_t my_buf_size = size / 2;
memset(ret, 0, MemorySizeInBytes);
mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
madvise(VOID_PTR_ADD(mem, my_buf_size),
MemorySizeInBytes,
MADV_DONTFORK);
}
}
}
@@ -2822,9 +2814,9 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
int32_t gpu_mem_id;
int ret;
bool is_debugger = 0;
uint32_t flags;
void *mmap_ret = NULL;
uint64_t mmap_offset = 0;
int map_fd;
vm_object_t *obj;
/* Retrieve gpu_mem id according to gpu_id */
@@ -2841,37 +2833,22 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
return -1;
is_debugger = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id);
flags = is_debugger ? KFD_IOC_ALLOC_MEM_FLAGS_GTT :
KFD_IOC_ALLOC_MEM_FLAGS_VRAM;
flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
/* allocate object within the scratch backing aperture */
if (!is_debugger) {
obj = fmm_allocate_memory_object(
gpu_id, address, size, aperture, &mmap_offset,
KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
if (!obj)
return -1;
/* Create a CPU mapping for the debugger */
map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
mmap_ret = mmap(address, size, PROT_NONE,
MAP_PRIVATE | MAP_FIXED, map_fd, mmap_offset);
if (mmap_ret == MAP_FAILED) {
__fmm_release(obj, aperture);
return -1;
}
} else {
obj = fmm_allocate_memory_object(
gpu_id, address, size, aperture, &mmap_offset,
KFD_IOC_ALLOC_MEM_FLAGS_GTT |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
mmap_ret = mmap(address, size,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
if (mmap_ret == MAP_FAILED) {
__fmm_release(obj, aperture);
return -1;
}
obj = fmm_allocate_memory_object(gpu_id, address, size,
aperture, &mmap_offset, flags);
if (!obj)
return -1;
/* Create a CPU mapping for the debugger */
mmap_ret = fmm_map_to_cpu(address, size, is_debugger,
gpu_mem[gpu_mem_id].drm_render_fd,
mmap_offset);
if (mmap_ret == MAP_FAILED) {
__fmm_release(obj, aperture);
return -1;
}
madvise(mmap_ret, size, MADV_DONTFORK);
/* map to GPU */
ret = _fmm_map_to_gpu(aperture, address, size, NULL, &gpu_id, sizeof(uint32_t));
@@ -3537,7 +3514,6 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo
if (importArgs.mmap_offset) {
int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(importArgs.gpu_id);
int map_fd;
void *ret;
if (gpu_mem_id < 0) {
@@ -3545,15 +3521,14 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo
goto err_free_obj;
}
obj->node_id = gpu_mem[gpu_mem_id].node_id;
map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
ret = mmap(reservedMem, (SizeInPages << PAGE_SHIFT),
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, importArgs.mmap_offset);
ret = fmm_map_to_cpu(reservedMem, (SizeInPages << PAGE_SHIFT),
true, gpu_mem[gpu_mem_id].drm_render_fd,
importArgs.mmap_offset);
if (ret == MAP_FAILED) {
err = HSAKMT_STATUS_ERROR;
goto err_free_obj;
}
madvise(ret, (SizeInPages << PAGE_SHIFT), MADV_DONTFORK);
}
*MemoryAddress = reservedMem;