Use drm render device to map kfd BOs

Previously, the kfd device was used to map memory for CPU access.
However, this is not compatible with how TTM handles CPU mappings
on eviction: the memory won't be unmapped and then remapped on restore.
This patch fixes the issue by mmapping memory through the DRM render device.

This patch requires a coordinated kernel driver change to work.
To keep it compatible with older kernel drivers, some temporary code
is included. Once the coordinated kernel driver change is checked in,
the temporary code can be removed.



Change-Id: Ie7b304c4a82b7e8d5ab703acb81d66430af4f0bc
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>


[ROCm/ROCR-Runtime commit: 68a2d286ca]
This commit is contained in:
Oak Zeng
2017-10-27 16:17:51 -04:00
zatwierdzone przez Felix Kuehling
rodzic a068301408
commit e305dc9c82
4 zmienionych plików z 47 dodań i 9 usunięć
@@ -247,7 +247,7 @@ typedef struct _HsaNodeProperties
HSAuint64 LocalMemSize; // Local memory size
HSAuint32 MaxEngineClockMhzFCompute; // maximum engine clocks for CPU and
HSAuint32 MaxEngineClockMhzCCompute; // GPU function, including any boost caopabilities,
HSAint32 DrmRenderMinor; // DRM render device minor device number
HSAuint16 MarketingName[HSA_PUBLIC_NAME_SIZE]; // Public name of the "device" on the node (board or APU name).
// Unicode string
HSAuint8 AMDName[HSA_PUBLIC_NAME_SIZE]; //CAL Name of the "device", ASCII
+14 -7
Wyświetl plik
@@ -1056,9 +1056,12 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
}
if (mem && flags.ui32.HostAccess) {
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
get_drm_render_fd_by_gpu_id(gpu_id);
void *ret = mmap(mem, MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
MAP_SHARED | MAP_FIXED,
map_fd, mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(mem, aperture);
return NULL;
@@ -1245,9 +1248,11 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
ioc_flags, &vm_obj);
if (mem && flags.ui32.HostAccess) {
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
get_drm_render_fd_by_gpu_id(gpu_id);
void *ret = mmap(mem, MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(mem, aperture);
return NULL;
@@ -1259,7 +1264,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
memset(ret, 0, MemorySizeInBytes);
mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, kfd_fd, mmap_offset);
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
}
}
}
@@ -1827,6 +1832,8 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
if (!obj)
return -1;
} else {
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
get_drm_render_fd_by_gpu_id(gpu_id);
fmm_allocate_memory_in_device(gpu_id,
address,
size,
@@ -1835,8 +1842,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
KFD_IOC_ALLOC_MEM_FLAGS_GTT);
mmap_ret = mmap(address, size,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED,
kfd_fd, mmap_offset);
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
if (mmap_ret == MAP_FAILED) {
__fmm_release(mem, aperture);
return -1;
@@ -2753,10 +2759,11 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo
pthread_mutex_unlock(&aperture->fmm_mutex);
if (importArgs.mmap_offset) {
int map_fd = importArgs.mmap_offset >= (1ULL<<40) ? kfd_fd :
get_drm_render_fd_by_gpu_id(importArgs.gpu_id);
void *ret = mmap(reservedMem, (SharedMemoryStruct->SizeInPages << PAGE_SHIFT),
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, kfd_fd,
importArgs.mmap_offset);
MAP_SHARED | MAP_FIXED, map_fd, importArgs.mmap_offset);
if (ret == MAP_FAILED) {
err = HSAKMT_STATUS_ERROR;
goto err_free_obj;
@@ -107,6 +107,7 @@ HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
uint16_t get_device_id_by_node(HSAuint32 node_id);
uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id);
HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
uint32_t NumberOfNodes, uint32_t *NodeArray);
+31 -1
Wyświetl plik
@@ -54,6 +54,7 @@ typedef struct {
HsaMemoryProperties *mem; /* node->NumBanks elements */
HsaCacheProperties *cache;
HsaIoLinkProperties *link;
int drm_render_fd;
} node_t;
static HsaSystemProperties *_system = NULL;
@@ -239,6 +240,8 @@ free_node(node_t *n)
free((n)->cache);
if ((n)->link)
free((n)->link);
if ((n)->drm_render_fd > 0)
close((n)->drm_render_fd);
}
static void free_nodes(node_t *temp_nodes, int size)
@@ -825,6 +828,8 @@ HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
props->MaxEngineClockMhzCCompute = (uint32_t)prop_val;
else if (strcmp(prop_name, "local_mem_size") == 0)
props->LocalMemSize = prop_val;
else if (strcmp(prop_name, "drm_render_minor") == 0)
props->DrmRenderMinor = (int32_t)prop_val;
}
@@ -1512,6 +1517,16 @@ static void topology_create_indirect_gpu_links(const HsaSystemProperties *sys_pr
}
}
/*
 * Open the DRM render device node that belongs to a topology node.
 *
 * The path /dev/dri/renderD<minor> is built from n->node.DrmRenderMinor and
 * the resulting file descriptor is stored in n->drm_render_fd. On any
 * failure (negative minor, path formatting error/truncation, or open()
 * failing) n->drm_render_fd is left at -1.
 */
static void open_drm_render_device(node_t *n)
{
	int minor = n->node.DrmRenderMinor;
	char path[128];
	int len;

	/* Default to "no device" so every early exit leaves a defined value. */
	n->drm_render_fd = -1;

	/* A negative minor cannot name a render node; skip the open(). */
	if (minor < 0)
		return;

	/* Bounded formatting; treat an error or truncation as failure. */
	len = snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);
	if (len < 0 || (size_t)len >= sizeof(path))
		return;

	/* open() returns -1 on failure, which matches the default above. */
	n->drm_render_fd = open(path, O_RDWR | O_CLOEXEC);
}
HSAKMT_STATUS topology_take_snapshot(void)
{
uint32_t gen_start, gen_end, i, mem_id, cache_id, link_id;
@@ -1609,7 +1624,7 @@ retry:
}
}
}
open_drm_render_device(&temp_nodes[i]);
}
pci_cleanup(pacc);
}
@@ -1970,6 +1985,21 @@ uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id)
return 0;
}
/*
 * Look up the DRM render file descriptor of the node with the given gpu_id.
 *
 * Walks the first _system->NumNodes entries of the node array and returns
 * the drm_render_fd of the first entry whose gpu_id matches.
 *
 * Returns -1 when the topology (node / _system) is not set up or when no
 * node matches gpu_id. -1 is used for every failure case because 0 is a
 * valid file descriptor and callers pass the result directly to mmap().
 */
int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id)
{
	unsigned int i;

	/* Topology not available: report "no descriptor", never fd 0. */
	if (!node || !_system)
		return -1;

	for (i = 0; i < _system->NumNodes; i++) {
		if (node[i].gpu_id == gpu_id)
			return node[i].drm_render_fd;
	}

	/* No node carries this gpu_id. */
	return -1;
}
HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
uint32_t NumberOfNodes, uint32_t *NodeArray)
{