Allocate and map doorbells in SVM for discrete GPUs

Allocate doorbells for dGPUs in the SVM aperture and map them for
GPU access. This is necessary to allow GPU-initiated submissions to
user mode queues.

Depends on the new doorbell BO allocation flag in KFD
(KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL).

Change-Id: I0737bef4a4764bb4a66c43846707ead2108f6601


[ROCm/ROCR-Runtime commit: 2e0a6eb371]
이 커밋은 다음에 포함됨:
Felix Kuehling
2016-09-16 15:56:57 -04:00
부모 6b33ada07b
커밋 fee7a91fb9
4개의 변경된 파일, 143개의 추가 및 21개의 삭제
+2
파일 보기
@@ -324,6 +324,8 @@ enum evict_type {
#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 6)
#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 7)
struct kfd_ioctl_alloc_memory_of_gpu_new_args {
uint64_t va_addr; /* to KFD */
uint64_t size; /* to KFD */
+51
파일 보기
@@ -905,6 +905,57 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
return mem;
}
/*
 * Allocate a doorbell range for a GPU and mmap the KFD doorbells onto it.
 *
 * gpu_id            - GPU the doorbells belong to
 * MemorySizeInBytes - size handed to the allocator and to mmap
 * doorbell_offset   - offset into kfd_fd handed to mmap
 *
 * Returns the allocated address, or NULL when the GPU is not found, the
 * allocation fails, or the mmap fails (the allocation is released again
 * in that last case).
 */
void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
			    uint64_t doorbell_offset)
{
	/* Use fine-grained aperture */
	manageble_aperture_t *ap = &svm.dgpu_alt_aperture;
	vm_object_t *obj = NULL;
	int32_t mem_idx;
	void *addr;
	void *mapped;

	/* The GPU must have a gpu_mem entry; the index itself is not needed */
	mem_idx = gpu_mem_find_by_gpu_id(gpu_id);
	if (mem_idx < 0)
		return NULL;

	addr = __fmm_allocate_device(gpu_id, MemorySizeInBytes, ap, 0, NULL,
				     KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL, &obj);
	if (!addr)
		return NULL;

	if (obj) {
		HsaMemFlags obj_flags;

		/* Synthetic flags, only recorded in the VM object */
		obj_flags.Value = 0;
		obj_flags.ui32.NonPaged = 1;
		obj_flags.ui32.HostAccess = 1;
		obj_flags.ui32.Reserved = 0xBe11;

		/* The VM object is updated under the aperture lock */
		pthread_mutex_lock(&ap->fmm_mutex);
		obj->flags = obj_flags.Value;
		gpuid_to_nodeid(gpu_id, &obj->node_id);
		pthread_mutex_unlock(&ap->fmm_mutex);
	}

	/* MAP_FIXED places the doorbell mapping exactly at the allocated address */
	mapped = mmap(addr, MemorySizeInBytes, PROT_READ | PROT_WRITE,
		      MAP_SHARED | MAP_FIXED, kfd_fd, doorbell_offset);
	if (mapped == MAP_FAILED) {
		__fmm_release(addr, ap);
		return NULL;
	}

	return addr;
}
static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
HsaMemFlags flags)
{
+1
파일 보기
@@ -52,6 +52,7 @@ void fmm_destroy_process_apertures(void);
*/
void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags);
/* Allocate a doorbell range for gpu_id and mmap doorbell_offset of the KFD file descriptor onto it; returns NULL on failure. */
void* fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
void* fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes,
HsaMemFlags flags);
void* fmm_open_graphic_handle(uint32_t gpu_id,
+89 -21
파일 보기
@@ -24,6 +24,7 @@
*/
#include "libhsakmt.h"
#include "fmm.h"
#include "linux/kfd_ioctl.h"
#include <stdlib.h>
#include <string.h>
@@ -181,6 +182,7 @@ struct process_doorbells
pthread_mutex_t doorbells_mutex;
};
static unsigned int num_doorbells;
static struct process_doorbells *doorbells;
HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
@@ -199,15 +201,94 @@ HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes)
doorbells[i].doorbells = NULL;
pthread_mutex_init(&doorbells[i].doorbells_mutex, NULL);
}
num_doorbells = NumNodes;
return ret;
}
/*
 * Tear down the per-node doorbell table: undo each node's doorbell
 * mapping, free the table and reset the bookkeeping.
 *
 * NOTE(review): this listing is a diff whose +/- markers were stripped,
 * so removed and added lines appear interleaved and the braces do not
 * balance as shown. Presumably the first three body lines are the
 * pre-change body and everything after them is its replacement --
 * confirm against the real file before editing.
 */
void destroy_process_doorbells(void)
{
/* Presumably the pre-change body (removed by this commit) -- TODO confirm */
if (doorbells) {
free(doorbells);
doorbells = NULL;
/* Presumably the post-change body starts here -- TODO confirm */
unsigned int i;
/* Nothing to tear down when the table was never allocated */
if (!doorbells)
return;
for (i = 0; i < num_doorbells; i++) {
/* need_mmap is cleared only once a node has been mapped; skip unmapped nodes */
if (doorbells[i].need_mmap)
continue;
if (topology_is_dgpu(get_device_id_by_node(i))) {
/* dGPU: reverse fmm_map_to_gpu, then release the fmm allocation */
fmm_unmap_from_gpu(doorbells[i].doorbells);
fmm_release(doorbells[i].doorbells);
} else
/* non-dGPU: the doorbell page came from a plain mmap */
munmap(doorbells[i].doorbells, DOORBELLS_PAGE_SIZE);
}
free(doorbells);
doorbells = NULL;
num_doorbells = 0;
}
/*
 * Map the doorbell page for a non-dGPU node with a plain mmap of kfd_fd.
 *
 * On success the mapping is stored in doorbells[NodeId] and need_mmap is
 * cleared. gpu_id is accepted for symmetry with map_doorbell_dgpu and is
 * not used here.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR if mmap fails.
 */
static HSAKMT_STATUS map_doorbell_apu(HSAuint32 NodeId, HSAuint32 gpu_id,
				      HSAuint64 doorbell_offset)
{
	void *page = mmap(0, DOORBELLS_PAGE_SIZE, PROT_READ|PROT_WRITE,
			  MAP_SHARED, kfd_fd, doorbell_offset);

	if (page != MAP_FAILED) {
		doorbells[NodeId].need_mmap = false;
		doorbells[NodeId].doorbells = page;
		return HSAKMT_STATUS_SUCCESS;
	}

	return HSAKMT_STATUS_ERROR;
}
/*
 * Map the doorbell page for a dGPU node: allocate it through
 * fmm_allocate_doorbell and then map it for GPU access.
 *
 * On success the address is stored in doorbells[NodeId] and need_mmap is
 * cleared. If the GPU mapping fails, the allocation is released again.
 *
 * Returns HSAKMT_STATUS_SUCCESS or HSAKMT_STATUS_ERROR.
 */
static HSAKMT_STATUS map_doorbell_dgpu(HSAuint32 NodeId, HSAuint32 gpu_id,
				       HSAuint64 doorbell_offset)
{
	void *db = fmm_allocate_doorbell(gpu_id, DOORBELLS_PAGE_SIZE,
					 doorbell_offset);

	if (!db)
		return HSAKMT_STATUS_ERROR;

	/* map for GPU access */
	if (fmm_map_to_gpu(db, DOORBELLS_PAGE_SIZE, NULL) != 0) {
		fmm_release(db);
		return HSAKMT_STATUS_ERROR;
	}

	doorbells[NodeId].need_mmap = false;
	doorbells[NodeId].doorbells = db;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Ensure the doorbell page of a node is mapped.
 *
 * Takes the node's doorbells_mutex, and if need_mmap is still set,
 * dispatches to the dGPU or APU mapping helper depending on
 * topology_is_dgpu(). A node that is already mapped is left untouched.
 *
 * Returns HSAKMT_STATUS_SUCCESS if the node was already mapped, otherwise
 * the status reported by the mapping helper.
 */
static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id,
				  HSAuint64 doorbell_offset)
{
	HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS;

	pthread_mutex_lock(&doorbells[NodeId].doorbells_mutex);

	if (doorbells[NodeId].need_mmap) {
		if (topology_is_dgpu(get_device_id_by_node(NodeId)))
			rc = map_doorbell_dgpu(NodeId, gpu_id, doorbell_offset);
		else
			rc = map_doorbell_apu(NodeId, gpu_id, doorbell_offset);
	}

	pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);

	return rc;
}
static struct device_info *get_device_info_by_dev_id(uint16_t dev_id)
@@ -432,7 +513,6 @@ hsaKmtCreateQueue(
uint16_t dev_id;
struct device_info *dev_info;
int err;
void* ptr;
CHECK_KFD_OPEN();
result = validate_nodeid(NodeId, &gpu_id);
@@ -495,25 +575,13 @@ hsaKmtCreateQueue(
q->queue_id = args.queue_id;
pthread_mutex_lock(&doorbells[NodeId].doorbells_mutex);
if (doorbells[NodeId].need_mmap) {
ptr = mmap(0, DOORBELLS_PAGE_SIZE, PROT_READ|PROT_WRITE,
MAP_SHARED, kfd_fd, args.doorbell_offset);
if (ptr == MAP_FAILED) {
pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
hsaKmtDestroyQueue(q->queue_id);
free_queue(q);
return HSAKMT_STATUS_ERROR;
}
doorbells[NodeId].need_mmap = false;
doorbells[NodeId].doorbells = ptr;
err = map_doorbell(NodeId, gpu_id, args.doorbell_offset);
if (err != HSAKMT_STATUS_SUCCESS) {
hsaKmtDestroyQueue(q->queue_id);
free_queue(q);
return HSAKMT_STATUS_ERROR;
}
pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex);
QueueResource->QueueId = PORT_VPTR_TO_UINT64(q);
QueueResource->Queue_DoorBell = VOID_PTR_ADD32(doorbells[NodeId].doorbells, q->queue_id);