From e305dc9c82216f4faa78cd08ca02a7454669e4da Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Fri, 27 Oct 2017 16:17:51 -0400 Subject: [PATCH] Use drm render device to map kfd BOs Previously the kfd device was used to map memory for CPU access. However, this is not compatible with how TTM handles CPU mapping on eviction - memory won't be unmapped and remapped on restore. This fixes the issue by mmapping memory using the DRM render device. This patch requires a coordinated kernel driver change to work. To keep it compatible with old kernel drivers, some temporary code is included. Once the coordinated kernel driver change is checked in, the temporary code can be removed. Change-Id: Ie7b304c4a82b7e8d5ab703acb81d66430af4f0bc Signed-off-by: Oak Zeng [ROCm/ROCR-Runtime commit: 68a2d286cadae6623cb505e2d624b43dee86779b] --- projects/rocr-runtime/include/hsakmttypes.h | 2 +- projects/rocr-runtime/src/fmm.c | 21 +++++++++----- projects/rocr-runtime/src/libhsakmt.h | 1 + projects/rocr-runtime/src/topology.c | 32 ++++++++++++++++++++- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/projects/rocr-runtime/include/hsakmttypes.h b/projects/rocr-runtime/include/hsakmttypes.h index eb640d7014..6fc65df068 100644 --- a/projects/rocr-runtime/include/hsakmttypes.h +++ b/projects/rocr-runtime/include/hsakmttypes.h @@ -247,7 +247,7 @@ typedef struct _HsaNodeProperties HSAuint64 LocalMemSize; // Local memory size HSAuint32 MaxEngineClockMhzFCompute; // maximum engine clocks for CPU and HSAuint32 MaxEngineClockMhzCCompute; // GPU function, including any boost caopabilities, - + HSAint32 DrmRenderMinor; // DRM render device minor device number HSAuint16 MarketingName[HSA_PUBLIC_NAME_SIZE]; // Public name of the "device" on the node (board or APU name). 
// Unicode string HSAuint8 AMDName[HSA_PUBLIC_NAME_SIZE]; //CAL Name of the "device", ASCII diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index add26aa4b1..67cea1c834 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -1056,9 +1056,12 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla } if (mem && flags.ui32.HostAccess) { + int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd : + get_drm_render_fd_by_gpu_id(gpu_id); void *ret = mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset); + MAP_SHARED | MAP_FIXED, + map_fd, mmap_offset); if (ret == MAP_FAILED) { __fmm_release(mem, aperture); return NULL; @@ -1245,9 +1248,11 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes, ioc_flags, &vm_obj); if (mem && flags.ui32.HostAccess) { + int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd : + get_drm_render_fd_by_gpu_id(gpu_id); void *ret = mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset); + MAP_SHARED | MAP_FIXED, map_fd, mmap_offset); if (ret == MAP_FAILED) { __fmm_release(mem, aperture); return NULL; @@ -1259,7 +1264,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes, memset(ret, 0, MemorySizeInBytes); mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset); + MAP_SHARED | MAP_FIXED, map_fd, mmap_offset); } } } @@ -1827,6 +1832,8 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert if (!obj) return -1; } else { + int map_fd = mmap_offset >= (1ULL<<40) ? 
kfd_fd : + get_drm_render_fd_by_gpu_id(gpu_id); fmm_allocate_memory_in_device(gpu_id, address, size, @@ -1835,8 +1842,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert KFD_IOC_ALLOC_MEM_FLAGS_GTT); mmap_ret = mmap(address, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - kfd_fd, mmap_offset); + MAP_SHARED | MAP_FIXED, map_fd, mmap_offset); if (mmap_ret == MAP_FAILED) { __fmm_release(mem, aperture); return -1; @@ -2753,10 +2759,11 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo pthread_mutex_unlock(&aperture->fmm_mutex); if (importArgs.mmap_offset) { + int map_fd = importArgs.mmap_offset >= (1ULL<<40) ? kfd_fd : + get_drm_render_fd_by_gpu_id(importArgs.gpu_id); void *ret = mmap(reservedMem, (SharedMemoryStruct->SizeInPages << PAGE_SHIFT), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, kfd_fd, - importArgs.mmap_offset); + MAP_SHARED | MAP_FIXED, map_fd, importArgs.mmap_offset); if (ret == MAP_FAILED) { err = HSAKMT_STATUS_ERROR; goto err_free_obj; diff --git a/projects/rocr-runtime/src/libhsakmt.h b/projects/rocr-runtime/src/libhsakmt.h index 142379010e..07402ffef5 100644 --- a/projects/rocr-runtime/src/libhsakmt.h +++ b/projects/rocr-runtime/src/libhsakmt.h @@ -107,6 +107,7 @@ HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); uint16_t get_device_id_by_node(HSAuint32 node_id); uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id); +int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id); HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray); diff --git a/projects/rocr-runtime/src/topology.c b/projects/rocr-runtime/src/topology.c index 44b6fe4e64..2cc2ebdba6 100644 --- a/projects/rocr-runtime/src/topology.c +++ b/projects/rocr-runtime/src/topology.c @@ -54,6 +54,7 @@ typedef struct { HsaMemoryProperties *mem; /* node->NumBanks elements */ HsaCacheProperties 
*cache; HsaIoLinkProperties *link; + int drm_render_fd; } node_t; static HsaSystemProperties *_system = NULL; @@ -239,6 +240,8 @@ free_node(node_t *n) free((n)->cache); if ((n)->link) free((n)->link); + if ((n)->drm_render_fd > 0) + close((n)->drm_render_fd); } static void free_nodes(node_t *temp_nodes, int size) @@ -825,6 +828,8 @@ HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, props->MaxEngineClockMhzCCompute = (uint32_t)prop_val; else if (strcmp(prop_name, "local_mem_size") == 0) props->LocalMemSize = prop_val; + else if (strcmp(prop_name, "drm_render_minor") == 0) + props->DrmRenderMinor = (int32_t)prop_val; } @@ -1512,6 +1517,16 @@ static void topology_create_indirect_gpu_links(const HsaSystemProperties *sys_pr } } + +static void open_drm_render_device(node_t *n) +{ + int minor = n->node.DrmRenderMinor; + char path[128]; + + sprintf(path, "/dev/dri/renderD%d", minor); + n->drm_render_fd = open(path, O_RDWR | O_CLOEXEC); +} + HSAKMT_STATUS topology_take_snapshot(void) { uint32_t gen_start, gen_end, i, mem_id, cache_id, link_id; @@ -1609,7 +1624,7 @@ retry: } } } - + open_drm_render_device(&temp_nodes[i]); } pci_cleanup(pacc); } @@ -1970,6 +1985,21 @@ uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) return 0; } +int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id) +{ + unsigned int i; + + if (!node || !_system) + return 0; + + for (i = 0; i < _system->NumNodes; i++) { + if (node[i].gpu_id == gpu_id) + return node[i].drm_render_fd; + } + + return -1; +} + HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray) {