libhsakmt: Try to use CPU addr as GPU addr for userptrs

If the CPU addr of a userptr is accessible by the GPU, try to use it
instead of allocating a different GPU address. If something else is
already registered with an overlapping address range, we still need to
allocate a GPU address, because KFD does not support overlapping GPUVM
mappings.

Change-Id: I452963ee45a454f735755a0b43122b9aee5d55be
Signed-off-by: Felix Kuehling <felix.kuehling@gmail.com>


[ROCm/ROCR-Runtime commit: ab181c46c0]
Этот коммит содержится в:
Felix Kuehling
2018-08-05 13:56:27 -04:00
коммит произвёл Felix Kuehling
родитель 3dfb956bd5
Коммит 84bb9072c0
2 изменённых файлов: 66 добавлений и 7 удалений
+63 -6
Просмотреть файл
@@ -502,6 +502,38 @@ loop:
return cur; /* NULL if not found */
}
/* Tell whether any object registered in the GPU VM address space of
 * @aper intersects the range [addr, addr + size).
 *
 * Two lookups are made in the aperture's object tree: one from the start
 * of the range in the RIGHT direction and one from the end of the range
 * in the LEFT direction. The function returns true as soon as one of the
 * objects found intersects the range, and false otherwise.
 */
static bool vm_exists_overlapping_object(manageable_aperture_t *aper,
					 const void *addr, uint64_t size)
{
	unsigned long range_lo = (unsigned long)addr;
	unsigned long range_hi = range_lo + size;
	rbtree_t *objects = vm_object_tree(aper, 0);
	rbtree_key_t key;
	rbtree_node_t *node;
	vm_object_t *obj;

	/* Object found looking RIGHT from the range start: it overlaps
	 * if it begins before the range ends.
	 */
	key = rbtree_key(range_lo, 0);
	node = rbtree_lookup_nearest(objects, &key, LKP_ALL, RIGHT);
	if (node) {
		obj = vm_object_entry(node, 0);
		if ((unsigned long)obj->start < range_hi)
			return true;
	}

	/* Object found looking LEFT from the range end: it overlaps if
	 * it extends past the range start.
	 */
	key = rbtree_key(range_hi, 0);
	node = rbtree_lookup_nearest(objects, &key, LKP_ALL, LEFT);
	if (!node)
		return false;

	obj = vm_object_entry(node, 0);
	return (unsigned long)obj->start + obj->size > range_lo;
}
static vm_object_t *vm_find_object_by_address(manageable_aperture_t *app,
const void *address, uint64_t size)
{
@@ -1136,7 +1168,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
manageable_aperture_t *aperture, uint64_t *mmap_offset,
uint32_t flags, vm_object_t **vm_obj)
{
void *mem = NULL;
void *mem = NULL, *userpage = NULL;
vm_object_t *obj;
/* Check that aperture is properly initialized/supported */
@@ -1145,9 +1177,27 @@ static void *__fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
/* Allocate address space */
pthread_mutex_lock(&aperture->fmm_mutex);
mem = aperture_allocate_area(aperture, MemorySizeInBytes);
/* If it's a userptr within the GPU-addressable address space,
* and no object is registered yet in the same address range,
* then use the CPU address as GPU address
*/
if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) && mmap_offset) {
userpage = (void *)*mmap_offset;
if (userpage >= aperture->base &&
VOID_PTR_ADD(userpage, MemorySizeInBytes - 1) <= aperture->limit &&
!vm_exists_overlapping_object(aperture, userpage, MemorySizeInBytes))
mem = userpage;
}
if (!mem)
mem = aperture_allocate_area(aperture, MemorySizeInBytes);
pthread_mutex_unlock(&aperture->fmm_mutex);
if (!mem) {
pr_err("Failed to allocate %ld bytes virtual address space\n",
MemorySizeInBytes);
return NULL;
}
/*
* Now that we have the area reserved, allocate memory in the device
* itself
@@ -1159,9 +1209,11 @@ static void *__fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes,
* allocation of memory in device failed.
* Release region in aperture
*/
pthread_mutex_lock(&aperture->fmm_mutex);
aperture_release_area(aperture, mem, MemorySizeInBytes);
pthread_mutex_unlock(&aperture->fmm_mutex);
if (mem != userpage) {
pthread_mutex_lock(&aperture->fmm_mutex);
aperture_release_area(aperture, mem, MemorySizeInBytes);
pthread_mutex_unlock(&aperture->fmm_mutex);
}
/* Assign NULL to mem to indicate failure to calling function */
mem = NULL;
@@ -1465,7 +1517,12 @@ static void __fmm_release(vm_object_t *object, manageable_aperture_t *aperture)
args.handle = object->handle;
kmtIoctl(kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args);
aperture_release_area(aperture, object->start, object->size);
/* Userptrs with CPU addr == GPU addr don't have aperture
* space allocated to them
*/
if (!object->userptr ||
(void *)PAGE_ALIGN_DOWN(object->userptr) != object->start)
aperture_release_area(aperture, object->start, object->size);
vm_remove_object(aperture, object);
pthread_mutex_unlock(&aperture->fmm_mutex);
+3 -1
Просмотреть файл
@@ -65,8 +65,10 @@ extern int PAGE_SHIFT;
do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
/* Round x up to the next multiple of align, computed as uint64_t.
 * The mask arithmetic is only correct when align is a power of two.
 */
#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
/* Round x down to a multiple of align, computed as uint64_t
 * (same power-of-two requirement on align).
 */
#define ALIGN_DOWN(x,align) ((uint64_t)(x) & ~(uint64_t)((align)-1))
/* Variant of ALIGN_UP that casts x and the mask to uint32_t. */
#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
/* Round x up to a multiple of PAGE_SIZE.
 * NOTE(review): PAGE_ALIGN_UP appears twice below; this looks like the
 * old and new side of a diff hunk -- confirm that only the second form
 * (with the parenthesized argument) exists in the real header.
 */
#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
#define PAGE_ALIGN_UP(x) ALIGN_UP((x),PAGE_SIZE)
/* Round x down to a multiple of PAGE_SIZE. */
#define PAGE_ALIGN_DOWN(x) ALIGN_DOWN((x),PAGE_SIZE)
/* Mask with the low n bits set. When n is at least the bit width of
 * unsigned long long the shift is skipped and the result is all ones
 * (0 - 1ULL), which avoids an out-of-range shift.
 */
#define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL)
/* Number of elements in array; only meaningful for a true array, not a
 * pointer.
 */
#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))