THUNK: fix deregister memory issues

__fmm_release actually fails to find the object if address is not
pagesize aligned.  And the caller did not notice this as __fmm_release
has no err code return.

So to fix this, move the object lookup in caller, and use vm-object
instead. Also fmm_release will pass up the error code.

Change-Id: Ib8ea1ea5ae844844fd20e8e01f0fdb841d218f2c
Signed-off-by: xinhui pan <xinhui.pan@amd.com>


[ROCm/ROCR-Runtime commit: 8ee5647814]
This commit is contained in:
xinhui pan
2018-06-20 13:30:06 +08:00
کامیت شده توسط Felix Kuehling
والد 146d8ec61b
کامیت 78664b43e7
3فایلهای تغییر یافته به همراه43 افزوده شده و 49 حذف شده
+41 -46
مشاهده پرونده
@@ -215,7 +215,7 @@ static inline HsaSharedMemoryHandle *to_hsa_shared_memory_handle(
}
extern int debug_get_reg_status(uint32_t node_id, bool *is_debugged);
static void __fmm_release(void *address, manageable_aperture_t *aperture);
static void __fmm_release(vm_object_t *object, manageable_aperture_t *aperture);
static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
manageable_aperture_t *aperture,
void *address);
@@ -1055,7 +1055,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFla
map_fd, mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(mem, aperture);
__fmm_release(vm_obj, aperture);
return NULL;
}
}
@@ -1108,7 +1108,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
MAP_SHARED | MAP_FIXED, kfd_fd,
doorbell_offset);
if (ret == MAP_FAILED) {
__fmm_release(mem, aperture);
__fmm_release(vm_obj, aperture);
return NULL;
}
}
@@ -1262,7 +1262,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
if (ret == MAP_FAILED) {
__fmm_release(mem, aperture);
__fmm_release(vm_obj, aperture);
return NULL;
}
@@ -1296,23 +1296,13 @@ void *fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes,
return fmm_allocate_host_cpu(MemorySizeInBytes, flags);
}
static void __fmm_release(void *address, manageable_aperture_t *aperture)
static void __fmm_release(vm_object_t *object, manageable_aperture_t *aperture)
{
struct kfd_ioctl_free_memory_of_gpu_args args = {0};
vm_object_t *object;
void *address;
if (!address)
if (!object)
return;
pthread_mutex_lock(&aperture->fmm_mutex);
/* Find the object to retrieve the handle */
object = vm_find_object_by_address(aperture, address, 0);
if (!object) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return;
}
/* If memory is user memory and it's still GPU mapped, munmap
* would cause an eviction. If the restore happens quickly
* enough, restore would also fail with an error message. So
@@ -1321,6 +1311,8 @@ static void __fmm_release(void *address, manageable_aperture_t *aperture)
args.handle = object->handle;
kmtIoctl(kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args);
address = object->start;
if (address >= dgpu_shared_aperture_base &&
address <= dgpu_shared_aperture_limit) {
/* Reset NUMA policy */
@@ -1334,48 +1326,52 @@ static void __fmm_release(void *address, manageable_aperture_t *aperture)
pthread_mutex_unlock(&aperture->fmm_mutex);
}
void fmm_release(void *address)
HSAKMT_STATUS fmm_release(void *address)
{
uint32_t i;
bool found = false;
vm_object_t *object;
manageable_aperture_t *aperture = NULL;
for (i = 0; i < gpu_mem_count && !found; i++) {
for (i = 0; i < gpu_mem_count; i++) {
if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
continue;
if (address >= gpu_mem[i].scratch_physical.base &&
address <= gpu_mem[i].scratch_physical.limit) {
fmm_release_scratch(gpu_mem[i].gpu_id);
return;
return HSAKMT_STATUS_SUCCESS;
}
if (address >= gpu_mem[i].gpuvm_aperture.base &&
address <= gpu_mem[i].gpuvm_aperture.limit) {
found = true;
__fmm_release(address, &gpu_mem[i].gpuvm_aperture);
fmm_print(gpu_mem[i].gpu_id);
aperture = &gpu_mem[i].gpuvm_aperture;
break;
}
}
if (!found) {
if (!aperture) {
if (address >= svm.dgpu_aperture.base &&
address <= svm.dgpu_aperture.limit) {
found = true;
__fmm_release(address, &svm.dgpu_aperture);
fmm_print(gpu_mem[i].gpu_id);
aperture = &svm.dgpu_aperture;
} else if (address >= svm.dgpu_alt_aperture.base &&
address <= svm.dgpu_alt_aperture.limit) {
found = true;
__fmm_release(address, &svm.dgpu_alt_aperture);
fmm_print(gpu_mem[i].gpu_id);
aperture = &svm.dgpu_alt_aperture;
}
}
/*
* If memory address isn't inside of any defined GPU aperture - it
* refers to the system memory
*/
if (!found) {
if (aperture) {
pthread_mutex_lock(&aperture->fmm_mutex);
object = vm_find_object_by_address(aperture, address, 0);
pthread_mutex_unlock(&aperture->fmm_mutex);
if (!object)
return HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
__fmm_release(object, aperture);
if (i < gpu_mem_count)
fmm_print(gpu_mem[i].gpu_id);
} else {
/*
* If memory address isn't inside of any defined GPU aperture - it
* refers to the system memory
*/
uint64_t size = 0;
/* Release the vm object in CPUVM */
pthread_mutex_lock(&cpuvm_aperture.fmm_mutex);
@@ -1389,6 +1385,7 @@ void fmm_release(void *address)
if (size)
munmap(address, size);
}
return HSAKMT_STATUS_SUCCESS;
}
static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_policy,
@@ -2064,11 +2061,11 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
void *address, uint64_t size)
{
int32_t gpu_mem_id;
void *mem = NULL;
int ret;
bool is_debugger = 0;
void *mmap_ret = NULL;
uint64_t mmap_offset = 0;
vm_object_t *obj;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
@@ -2086,7 +2083,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
ret = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger);
/* allocate object within the scratch backing aperture */
if (!ret && !is_debugger) {
vm_object_t *obj = fmm_allocate_memory_object(
obj = fmm_allocate_memory_object(
gpu_id, address, size, aperture, NULL,
KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
@@ -2095,7 +2092,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
} else {
int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
gpu_mem[gpu_mem_id].drm_render_fd;
fmm_allocate_memory_object(
obj = fmm_allocate_memory_object(
gpu_id, address, size, aperture, &mmap_offset,
KFD_IOC_ALLOC_MEM_FLAGS_GTT |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
@@ -2103,7 +2100,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
if (mmap_ret == MAP_FAILED) {
__fmm_release(mem, aperture);
__fmm_release(obj, aperture);
return -1;
}
}
@@ -2112,7 +2109,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
/* map to GPU */
ret = _fmm_map_to_gpu(aperture, address, size, NULL, NULL, 0);
if (ret != 0)
__fmm_release(mem, aperture);
__fmm_release(obj, aperture);
return ret;
}
@@ -2378,7 +2375,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
pthread_mutex_unlock(&aperture->fmm_mutex);
/* free object in scratch backing aperture */
__fmm_release(address, aperture);
__fmm_release(object, aperture);
return 0;
@@ -2904,7 +2901,6 @@ static HSAKMT_STATUS fmm_deregister_user_memory(void *addr)
{
manageable_aperture_t *aperture;
vm_object_t *obj;
void *svm_addr;
aperture = &svm.dgpu_aperture;
@@ -2915,11 +2911,10 @@ static HSAKMT_STATUS fmm_deregister_user_memory(void *addr)
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_ERROR;
}
svm_addr = obj->start;
pthread_mutex_unlock(&aperture->fmm_mutex);
/* Destroy BO */
__fmm_release(svm_addr, aperture);
__fmm_release(obj, aperture);
return HSAKMT_STATUS_SUCCESS;
}
@@ -2984,7 +2979,7 @@ HSAKMT_STATUS fmm_deregister_memory(void *address)
* userptrs means releasing the BO.
*/
pthread_mutex_unlock(&aperture->fmm_mutex);
__fmm_release(address, aperture);
__fmm_release(object, aperture);
return HSAKMT_STATUS_SUCCESS;
}
@@ -54,7 +54,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_
void *fmm_allocate_host(uint32_t node_id, uint64_t MemorySizeInBytes,
HsaMemFlags flags);
void fmm_print(uint32_t node);
void fmm_release(void *address);
HSAKMT_STATUS fmm_release(void *address);
int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
int fmm_unmap_from_gpu(void *address);
bool fmm_get_handle(void *address, uint64_t *handle);
@@ -195,8 +195,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
return HSAKMT_STATUS_ERROR;
}
fmm_release(MemoryAddress);
return HSAKMT_STATUS_SUCCESS;
return fmm_release(MemoryAddress);
}
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,