From fee7a91fb9c167227e67fdd2290b7e6e006d61ff Mon Sep 17 00:00:00 2001
From: Felix Kuehling
Date: Fri, 16 Sep 2016 15:56:57 -0400
Subject: [PATCH] Allocate and map doorbells in SVM for discrete GPUs

Allocate doorbells for dGPUs in the SVM aperture and map them for GPU
access. This is necessary to allow GPU-initiated submissions to user
mode queues.

Depends on new doorbell BO allocation flag in KFD.

Change-Id: I0737bef4a4764bb4a66c43846707ead2108f6601

[ROCm/ROCR-Runtime commit: 2e0a6eb3716b34fec1c32a235a5f786f2bacc29b]
---
/*
 * NOTE(review): reviewer annotations. This text sits between the "---"
 * separator and the first "diff --git" line, so it is commentary only and
 * belongs to neither the commit message nor the diff.
 *
 * Summary of the change, per file
 *
 * kfd_ioctl.h
 *   Adds KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 7) after the existing
 *   KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 6).
 *
 * fmm.c, fmm.h -- new fmm_allocate_doorbell(gpu_id, MemorySizeInBytes,
 * doorbell_offset)
 *   - Returns NULL when gpu_mem_find_by_gpu_id(gpu_id) is negative.
 *   - Allocates from &svm.dgpu_alt_aperture via __fmm_allocate_device()
 *     with ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL.
 *   - When both the address and the vm_object are non-NULL, stores
 *     flags (NonPaged = 1, HostAccess = 1, Reserved = 0xBe11) and the node
 *     id in the vm_object while holding aperture->fmm_mutex.
 *   - mmap()s kfd_fd at doorbell_offset onto the returned address using
 *     MAP_SHARED | MAP_FIXED; on MAP_FAILED it calls __fmm_release() and
 *     returns NULL.
 *   - Otherwise returns whatever __fmm_allocate_device() returned (NULL if
 *     the allocation itself failed).
 *
 * queues.c
 *   - init_process_doorbells() now records NumNodes in the new file-scope
 *     variable num_doorbells.
 *   - destroy_process_doorbells() walks all num_doorbells entries, skips
 *     those whose need_mmap is still set, and for the rest calls
 *     fmm_unmap_from_gpu() + fmm_release() when
 *     topology_is_dgpu(get_device_id_by_node(i)) is true, or munmap() of
 *     DOORBELLS_PAGE_SIZE otherwise. It then frees the array and resets
 *     doorbells and num_doorbells.
 *   - map_doorbell_apu() is the mmap() of DOORBELLS_PAGE_SIZE from kfd_fd
 *     that hsaKmtCreateQueue() used to do inline.
 *   - map_doorbell_dgpu() calls fmm_allocate_doorbell() and then
 *     fmm_map_to_gpu(); a non-zero fmm_map_to_gpu() result releases the
 *     allocation with fmm_release() and yields HSAKMT_STATUS_ERROR.
 *   - map_doorbell() takes doorbells[NodeId].doorbells_mutex, returns
 *     HSAKMT_STATUS_SUCCESS at once if need_mmap is already false, and
 *     otherwise dispatches to the dGPU or APU helper according to
 *     topology_is_dgpu(get_device_id_by_node(NodeId)).
 *   - hsaKmtCreateQueue() calls map_doorbell(); on failure it calls
 *     hsaKmtDestroyQueue(q->queue_id) and free_queue(q) and returns
 *     HSAKMT_STATUS_ERROR.
 *
 * Points to confirm
 *   - map_doorbell_apu() accepts gpu_id but never uses it.
 *   - hsaKmtCreateQueue() stores the HSAKMT_STATUS returned by
 *     map_doorbell() in "err", which the hunk context declares as int.
 *   - destroy_process_doorbells() discards any status from
 *     fmm_unmap_from_gpu() and munmap(), and does not destroy the
 *     doorbells_mutex objects set up with pthread_mutex_init().
 *   - hsaKmtCreateQueue() reads doorbells[NodeId].doorbells after
 *     map_doorbell() has already released the mutex; presumably safe
 *     because the pointer is only written while need_mmap is set -- verify.
 *   - This copy of the patch has lost text: the author address on the
 *     "From:" line and the header names of two "#include" context lines in
 *     the first queues.c hunk are missing. Line breaks and indentation were
 *     reconstructed from the hunk counts; the blank context lines (notably
 *     in the -432,7 and -495,25 hunks) are inferred. Restore from the
 *     original commit before applying.
 */
 .../rocr-runtime/include/linux/kfd_ioctl.h    |   2 +
 projects/rocr-runtime/src/fmm.c               |  51 ++++++++
 projects/rocr-runtime/src/fmm.h               |   1 +
 projects/rocr-runtime/src/queues.c            | 110 ++++++++++++++----
 4 files changed, 143 insertions(+), 21 deletions(-)

diff --git a/projects/rocr-runtime/include/linux/kfd_ioctl.h b/projects/rocr-runtime/include/linux/kfd_ioctl.h
index a484756ef7..613a2c7b8c 100644
--- a/projects/rocr-runtime/include/linux/kfd_ioctl.h
+++ b/projects/rocr-runtime/include/linux/kfd_ioctl.h
@@ -324,6 +324,8 @@ enum evict_type {
 
 #define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR	(1 << 6)
 
+#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL	(1 << 7)
+
 struct kfd_ioctl_alloc_memory_of_gpu_new_args {
 	uint64_t va_addr;	/* to KFD */
 	uint64_t size;		/* to KFD */
diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c
index 87894c21fc..8cf6553833 100644
--- a/projects/rocr-runtime/src/fmm.c
+++ b/projects/rocr-runtime/src/fmm.c
@@ -905,6 +905,57 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
 	return mem;
 }
 
+void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
+			    uint64_t doorbell_offset)
+{
+	manageble_aperture_t *aperture;
+	int32_t gpu_mem_id;
+	uint32_t ioc_flags;
+	void *mem;
+	vm_object_t *vm_obj = NULL;
+
+	/* Retrieve gpu_mem id according to gpu_id */
+	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	if (gpu_mem_id < 0)
+		return NULL;
+
+	/* Use fine-grained aperture */
+	aperture = &svm.dgpu_alt_aperture;
+	ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL;
+
+	mem = __fmm_allocate_device(gpu_id, MemorySizeInBytes,
+				    aperture, 0, NULL,
+				    ioc_flags, &vm_obj);
+
+	if (mem && vm_obj) {
+		HsaMemFlags flags;
+
+		/* Cook up some flags for storing in the VM object */
+		flags.Value = 0;
+		flags.ui32.NonPaged = 1;
+		flags.ui32.HostAccess = 1;
+		flags.ui32.Reserved = 0xBe11;
+
+		pthread_mutex_lock(&aperture->fmm_mutex);
+		vm_obj->flags = flags.Value;
+		gpuid_to_nodeid(gpu_id, &vm_obj->node_id);
+		pthread_mutex_unlock(&aperture->fmm_mutex);
+	}
+
+	if (mem) {
+		void *ret = mmap(mem, MemorySizeInBytes,
+				 PROT_READ | PROT_WRITE,
+				 MAP_SHARED | MAP_FIXED, kfd_fd,
+				 doorbell_offset);
+		if (ret == MAP_FAILED) {
+			__fmm_release(mem, aperture);
+			return NULL;
+		}
+	}
+
+	return mem;
+}
+
 static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
 				HsaMemFlags flags)
 {
diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h
index 96c37f2b08..6fc8c56928 100644
--- a/projects/rocr-runtime/src/fmm.h
+++ b/projects/rocr-runtime/src/fmm.h
@@ -52,6 +52,7 @@ void fmm_destroy_process_apertures(void);
  */
 void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags);
+void* fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
 void* fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes, HsaMemFlags flags);
 
 void* fmm_open_graphic_handle(uint32_t gpu_id,
diff --git a/projects/rocr-runtime/src/queues.c b/projects/rocr-runtime/src/queues.c
index 2f53085d4b..3bd9b2827d 100644
--- a/projects/rocr-runtime/src/queues.c
+++ b/projects/rocr-runtime/src/queues.c
@@ -24,6 +24,7 @@
  */
 
 #include "libhsakmt.h"
+#include "fmm.h"
 #include "linux/kfd_ioctl.h"
 #include
 #include
@@ -181,6 +182,7 @@ struct process_doorbells
 	pthread_mutex_t doorbells_mutex;
 };
 
+static unsigned int num_doorbells;
 static struct process_doorbells *doorbells;
 
 HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
@@ -199,15 +201,94 @@ HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
 		doorbells[i].doorbells = NULL;
 		pthread_mutex_init(&doorbells[i].doorbells_mutex, NULL);
 	}
+
+	num_doorbells = NumNodes;
+
 	return ret;
 }
 
 void destroy_process_doorbells(void)
 {
-	if (doorbells) {
-		free(doorbells);
-		doorbells = NULL;
+	unsigned int i;
+
+	if (!doorbells)
+		return;
+
+	for (i = 0; i < num_doorbells; i++) {
+		if (doorbells[i].need_mmap)
+			continue;
+
+		if (topology_is_dgpu(get_device_id_by_node(i))) {
+			fmm_unmap_from_gpu(doorbells[i].doorbells);
+			fmm_release(doorbells[i].doorbells);
+		} else
+			munmap(doorbells[i].doorbells, DOORBELLS_PAGE_SIZE);
 	}
+
+	free(doorbells);
+	doorbells = NULL;
+	num_doorbells = 0;
+}
+
+static HSAKMT_STATUS map_doorbell_apu(HSAuint32 NodeId, HSAuint32 gpu_id,
+				      HSAuint64 doorbell_offset)
+{
+	void *ptr;
+
+	ptr = mmap(0, DOORBELLS_PAGE_SIZE, PROT_READ|PROT_WRITE,
+		   MAP_SHARED, kfd_fd, doorbell_offset);
+
+	if (ptr == MAP_FAILED)
+		return HSAKMT_STATUS_ERROR;
+
+	doorbells[NodeId].need_mmap = false;
+	doorbells[NodeId].doorbells = ptr;
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+static HSAKMT_STATUS map_doorbell_dgpu(HSAuint32 NodeId, HSAuint32 gpu_id,
+				       HSAuint64 doorbell_offset)
+{
+	void *ptr;
+
+	ptr = fmm_allocate_doorbell(gpu_id, DOORBELLS_PAGE_SIZE,
+				    doorbell_offset);
+
+	if (ptr == NULL)
+		return HSAKMT_STATUS_ERROR;
+
+	/* map for GPU access */
+	if (fmm_map_to_gpu(ptr, DOORBELLS_PAGE_SIZE, NULL)) {
+		fmm_release(ptr);
+		return HSAKMT_STATUS_ERROR;
+	}
+
+	doorbells[NodeId].need_mmap = false;
+	doorbells[NodeId].doorbells = ptr;
+
+	return HSAKMT_STATUS_SUCCESS;
+}
+
+static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id,
+				  HSAuint64 doorbell_offset)
+{
+	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
+
+	pthread_mutex_lock(&doorbells[NodeId].doorbells_mutex);
+	if (!doorbells[NodeId].need_mmap) {
+		pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
+		return HSAKMT_STATUS_SUCCESS;
+	}
+
+	if (topology_is_dgpu(get_device_id_by_node(NodeId)))
+		status = map_doorbell_dgpu(NodeId, gpu_id, doorbell_offset);
+	else
+		status = map_doorbell_apu(NodeId, gpu_id, doorbell_offset);
+
+	pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
+
+	return status;
 }
 
 static struct device_info *get_device_info_by_dev_id(uint16_t dev_id)
@@ -432,7 +513,6 @@ hsaKmtCreateQueue(
 	uint16_t dev_id;
 	struct device_info *dev_info;
 	int err;
-	void* ptr;
 	CHECK_KFD_OPEN();
 
 	result = validate_nodeid(NodeId, &gpu_id);
@@ -495,25 +575,13 @@ hsaKmtCreateQueue(
 
 	q->queue_id = args.queue_id;
 
-	pthread_mutex_lock(&doorbells[NodeId].doorbells_mutex);
-
-	if (doorbells[NodeId].need_mmap) {
-		ptr = mmap(0, DOORBELLS_PAGE_SIZE, PROT_READ|PROT_WRITE,
-				MAP_SHARED, kfd_fd, args.doorbell_offset);
-
-		if (ptr == MAP_FAILED) {
-			pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
-			hsaKmtDestroyQueue(q->queue_id);
-			free_queue(q);
-			return HSAKMT_STATUS_ERROR;
-		}
-
-		doorbells[NodeId].need_mmap = false;
-		doorbells[NodeId].doorbells = ptr;
+	err = map_doorbell(NodeId, gpu_id, args.doorbell_offset);
+	if (err != HSAKMT_STATUS_SUCCESS) {
+		hsaKmtDestroyQueue(q->queue_id);
+		free_queue(q);
+		return HSAKMT_STATUS_ERROR;
 	}
 
-	pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
-
 	QueueResource->QueueId = PORT_VPTR_TO_UINT64(q);
 	QueueResource->Queue_DoorBell = VOID_PTR_ADD32(doorbells[NodeId].doorbells, q->queue_id);
 