Use drm render device to map kfd BOs

Previously, the kfd device was used to map memory for CPU access. However, this is not compatible with how TTM handles CPU mappings on eviction: memory won't be unmapped on eviction and remapped on restore. This patch fixes the issue by mmapping memory using the DRM render device instead. This patch requires a coordinated kernel driver change to work. To keep it compatible with older kernel drivers, some temporary code is included. Once the coordinated kernel driver change is checked in, the temporary code can be removed. Change-Id: Ie7b304c4a82b7e8d5ab703acb81d66430af4f0bc Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> [ROCm/ROCR-Runtime commit: 68a2d286ca]
2017-10-27 16:17:51 -04:00
commit e305dc9c82
@@ -247,7 +247,7 @@ typedef struct _HsaNodeProperties
    HSAuint64       LocalMemSize;       // Local memory size
    HSAuint32       MaxEngineClockMhzFCompute;  // maximum engine clocks for CPU and
    HSAuint32       MaxEngineClockMhzCCompute;  // GPU function, including any boost capabilities,
-
+    HSAint32        DrmRenderMinor;             // DRM render device minor device number
    HSAuint16       MarketingName[HSA_PUBLIC_NAME_SIZE];   // Public name of the "device" on the node (board or APU name).
                                       // Unicode string
    HSAuint8        AMDName[HSA_PUBLIC_NAME_SIZE];   //CAL Name of the "device", ASCII
@@ -1056,9 +1056,12 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
 	}

 	if (mem && flags.ui32.HostAccess) {
+		int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
+					get_drm_render_fd_by_gpu_id(gpu_id);
 		void *ret = mmap(mem, MemorySizeInBytes,
 				 PROT_READ | PROT_WRITE,
-				 MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
+				 MAP_SHARED | MAP_FIXED,
+				 map_fd, mmap_offset);
 		if (ret == MAP_FAILED) {
 			__fmm_release(mem, aperture);
 			return NULL;
@@ -1245,9 +1248,11 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
 					     ioc_flags, &vm_obj);

 		if (mem && flags.ui32.HostAccess) {
+			int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
+						get_drm_render_fd_by_gpu_id(gpu_id);
 			void *ret = mmap(mem, MemorySizeInBytes,
 					 PROT_READ | PROT_WRITE,
-					 MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
+					 MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
 			if (ret == MAP_FAILED) {
 				__fmm_release(mem, aperture);
 				return NULL;
@@ -1259,7 +1264,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
 				memset(ret, 0, MemorySizeInBytes);
 				mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
 				     PROT_READ | PROT_WRITE,
-				     MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
+				     MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
 			}
 		}
 	}
@@ -1827,6 +1832,8 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
 		if (!obj)
 			return -1;
 	} else {
+		int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
+					get_drm_render_fd_by_gpu_id(gpu_id);
 		fmm_allocate_memory_in_device(gpu_id,
 					address,
 					size,
@@ -1835,8 +1842,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
 					KFD_IOC_ALLOC_MEM_FLAGS_GTT);
 		mmap_ret = mmap(address, size,
 				PROT_READ | PROT_WRITE,
-				MAP_SHARED | MAP_FIXED,
-				kfd_fd, mmap_offset);
+				MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
 		if (mmap_ret == MAP_FAILED) {
 			__fmm_release(mem, aperture);
 			return -1;
@@ -2753,10 +2759,11 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo
 	pthread_mutex_unlock(&aperture->fmm_mutex);

 	if (importArgs.mmap_offset) {
+		int map_fd = importArgs.mmap_offset >= (1ULL<<40) ? kfd_fd :
+					get_drm_render_fd_by_gpu_id(importArgs.gpu_id);
 		void *ret = mmap(reservedMem, (SharedMemoryStruct->SizeInPages << PAGE_SHIFT),
 				 PROT_READ | PROT_WRITE,
-				 MAP_SHARED | MAP_FIXED, kfd_fd,
-				 importArgs.mmap_offset);
+				 MAP_SHARED | MAP_FIXED, map_fd, importArgs.mmap_offset);
 		if (ret == MAP_FAILED) {
 			err = HSAKMT_STATUS_ERROR;
 			goto err_free_obj;
@@ -107,6 +107,7 @@ HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
 HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
 uint16_t get_device_id_by_node(HSAuint32 node_id);
 uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id);
 HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray);

@@ -54,6 +54,7 @@ typedef struct {
 	HsaMemoryProperties *mem;     /* node->NumBanks elements */
 	HsaCacheProperties *cache;
 	HsaIoLinkProperties *link;
+	int drm_render_fd;
 } node_t;

 static HsaSystemProperties *_system = NULL;
@@ -239,6 +240,8 @@ free_node(node_t *n)
 		free((n)->cache);
 	if ((n)->link)
 		free((n)->link);
+	if ((n)->drm_render_fd > 0)
+		close((n)->drm_render_fd);
 }

 static void free_nodes(node_t *temp_nodes, int size)
@@ -825,6 +828,8 @@ HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
 			props->MaxEngineClockMhzCCompute = (uint32_t)prop_val;
 		else if (strcmp(prop_name, "local_mem_size") == 0)
 			props->LocalMemSize = prop_val;
+		else if (strcmp(prop_name, "drm_render_minor") == 0)
+			props->DrmRenderMinor = (int32_t)prop_val;

 	}

@@ -1512,6 +1517,16 @@ static void topology_create_indirect_gpu_links(const HsaSystemProperties *sys_pr
 	}
 }

+
+/*
+ * Open the DRM render device that belongs to a topology node.
+ *
+ * The device path is built from the node's DrmRenderMinor property as
+ * /dev/dri/renderD<minor>. The descriptor returned by open() is stored
+ * in n->drm_render_fd; it is -1 when the path could not be formatted
+ * or when open() itself failed.
+ */
+static void open_drm_render_device(node_t *n)
+{
+	char path[128];
+	int len;
+
+	len = snprintf(path, sizeof(path), "/dev/dri/renderD%d",
+		       n->node.DrmRenderMinor);
+	if (len < 0 || (size_t)len >= sizeof(path)) {
+		/* Never hand a malformed or truncated path to open() */
+		n->drm_render_fd = -1;
+		return;
+	}
+
+	n->drm_render_fd = open(path, O_RDWR | O_CLOEXEC);
+}
+
 HSAKMT_STATUS topology_take_snapshot(void)
 {
 	uint32_t gen_start, gen_end, i, mem_id, cache_id, link_id;
@@ -1609,7 +1624,7 @@ retry:
 					}
 				}
 			}
-
+			open_drm_render_device(&temp_nodes[i]);
 		}
 		pci_cleanup(pacc);
 	}
@@ -1970,6 +1985,21 @@ uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id)
 	return 0;
 }

+/*
+ * Look up the DRM render device file descriptor stored for the node
+ * whose gpu_id matches the argument.
+ *
+ * Returns the node's drm_render_fd on a match. Returns -1 when the
+ * topology tables have not been set up or when no node has the given
+ * gpu_id. Both failures use a negative value because 0 is a legitimate
+ * file descriptor and callers pass the result directly to mmap().
+ */
+int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id)
+{
+	unsigned int i;
+
+	if (!node || !_system)
+		return -1;
+
+	for (i = 0; i < _system->NumNodes; i++) {
+		if (node[i].gpu_id == gpu_id)
+			return node[i].drm_render_fd;
+	}
+
+	return -1;
+}
+
 HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray)
 {