From f94be2078c6b70ec5ce7154c38fa8748a09ed3fb Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Thu, 17 Jun 2021 09:31:17 -0400 Subject: [PATCH] libhsakmt: fix multiple threads register userptr race Aperture locking is too fine-grained, it has race between find userptr and allocate userptr object. Change _fmm_allocate_device and fmm_allocate_memory_object to not take the aperture lock, the callers take it, this implements an atomic find userptr or allocate a new one. Change-Id: I6773404e22c1f4382a211c5a9817df23c5534a2a Signed-off-by: Philip Yang [ROCm/ROCR-Runtime commit: c4d5ee28f09c0471346cedc5f26d2c401f8d32f3] --- projects/rocr-runtime/src/fmm.c | 88 +++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index fa874156b0..132df4f7f8 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -941,12 +941,10 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem, return NULL; /* Allocate object */ - pthread_mutex_lock(&aperture->fmm_mutex); vm_obj = aperture_allocate_object(aperture, mem, args.handle, MemorySizeInBytes, flags); if (!vm_obj) goto err_object_allocation_failed; - pthread_mutex_unlock(&aperture->fmm_mutex); if (mmap_offset) *mmap_offset = args.mmap_offset; @@ -954,7 +952,6 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem, return vm_obj; err_object_allocation_failed: - pthread_mutex_unlock(&aperture->fmm_mutex); free_args.handle = args.handle; kmtIoctl(kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args); @@ -1278,9 +1275,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo return NULL; /* Allocate address space */ - pthread_mutex_lock(&aperture->fmm_mutex); mem = aperture_allocate_area(aperture, address, MemorySizeInBytes); - pthread_mutex_unlock(&aperture->fmm_mutex); /* * Now that we have the area reserved, allocate memory in the device @@ -1293,9 +1288,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo * allocation of memory in device failed. * Release region in aperture */ - pthread_mutex_lock(&aperture->fmm_mutex); aperture_release_area(aperture, mem, MemorySizeInBytes); - pthread_mutex_unlock(&aperture->fmm_mutex); /* Assign NULL to mem to indicate failure to calling function */ mem = NULL; @@ -1341,17 +1334,19 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB if (flags.ui32.Uncached || svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; + pthread_mutex_lock(&aperture->fmm_mutex); + mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, ioc_flags, &vm_obj); if (mem && vm_obj) { - pthread_mutex_lock(&aperture->fmm_mutex); /* Store memory allocation flags, not ioc flags */ vm_obj->flags = flags.Value; gpuid_to_nodeid(gpu_id, &vm_obj->node_id); - pthread_mutex_unlock(&aperture->fmm_mutex); } + pthread_mutex_unlock(&aperture->fmm_mutex); + if (mem) { int map_fd = gpu_mem[gpu_mem_id].drm_render_fd; int prot = flags.ui32.HostAccess ? PROT_READ | PROT_WRITE : @@ -1397,6 +1392,8 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + pthread_mutex_lock(&aperture->fmm_mutex); + mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL, ioc_flags, &vm_obj); @@ -1409,12 +1406,12 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, flags.ui32.HostAccess = 1; flags.ui32.Reserved = 0xBe1; - pthread_mutex_lock(&aperture->fmm_mutex); vm_obj->flags = flags.Value; gpuid_to_nodeid(gpu_id, &vm_obj->node_id); - pthread_mutex_unlock(&aperture->fmm_mutex); } + pthread_mutex_unlock(&aperture->fmm_mutex); + if (mem) { void *ret = mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE, @@ -1564,13 +1561,13 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, /* Paged memory is allocated as a userptr mapping, non-paged * memory is allocated from KFD */ + pthread_mutex_lock(&aperture->fmm_mutex); + if (!flags.ui32.NonPaged && svm.userptr_for_paged_mem) { /* Allocate address space */ - pthread_mutex_lock(&aperture->fmm_mutex); mem = aperture_allocate_area(aperture, address, size); - pthread_mutex_unlock(&aperture->fmm_mutex); if (!mem) - return NULL; + goto out_unlock; /* Map anonymous pages */ if (mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE, @@ -1608,7 +1605,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, MAP_SHARED | MAP_FIXED, map_fd, mmap_offset); if (ret == MAP_FAILED) { __fmm_release(vm_obj, aperture); - return NULL; + goto out_unlock; } if (flags.ui32.AQLQueueMemory) { @@ -1624,18 +1621,17 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address, if (mem && vm_obj) { /* Store memory allocation flags, not ioc flags */ - pthread_mutex_lock(&aperture->fmm_mutex); vm_obj->flags = flags.Value; vm_obj->node_id = node_id; - pthread_mutex_unlock(&aperture->fmm_mutex); } + pthread_mutex_unlock(&aperture->fmm_mutex); return mem; out_release_area: /* Release address space */ - pthread_mutex_lock(&aperture->fmm_mutex); aperture_release_area(aperture, mem, size); +out_unlock: pthread_mutex_unlock(&aperture->fmm_mutex); return NULL; @@ -2075,6 +2071,9 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; + + pthread_mutex_lock(&aperture->fmm_mutex); + mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture, &mmap_offset, ioc_flags, &vm_obj); @@ -2085,7 +2084,6 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) flags.ui32.NonPaged = 1; flags.ui32.HostAccess = 1; flags.ui32.Reserved = 0; - pthread_mutex_lock(&aperture->fmm_mutex); vm_obj->flags = flags.Value; vm_obj->node_id = node_id; pthread_mutex_unlock(&aperture->fmm_mutex); @@ -2550,9 +2548,6 @@ static int _fmm_map_to_gpu(manageable_aperture_t *aperture, vm_object_t *object; int ret = 0; - if (!obj) - pthread_mutex_lock(&aperture->fmm_mutex); - object = obj; if (!object) { /* Find the object to retrieve the handle */ @@ -2615,9 +2610,6 @@ static int _fmm_map_to_gpu(manageable_aperture_t *aperture, exit_ok: err_object_not_found: err_map_failed: - if (!obj) - pthread_mutex_unlock(&aperture->fmm_mutex); - return ret; } @@ -2646,22 +2638,23 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert return -1; ret = debug_get_reg_status(gpu_mem[gpu_mem_id].node_id, &is_debugger); + /* allocate object within the scratch backing aperture */ + pthread_mutex_lock(&aperture->fmm_mutex); + if (!ret && !is_debugger) { obj = fmm_allocate_memory_object( gpu_id, address, size, aperture, &mmap_offset, KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE); if (!obj) - return -1; + goto out_unlock; /* Create a CPU mapping for the debugger */ map_fd = gpu_mem[gpu_mem_id].drm_render_fd; mmap_ret = mmap(address, size, PROT_NONE, MAP_PRIVATE | MAP_FIXED, map_fd, mmap_offset); - if (mmap_ret == MAP_FAILED) { - __fmm_release(obj, aperture); - return -1; - } + if (mmap_ret == MAP_FAILED) + goto out_release; } else { obj = fmm_allocate_memory_object( gpu_id, address, size, aperture, &mmap_offset, @@ -2671,10 +2664,8 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert mmap_ret = mmap(address, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, map_fd, mmap_offset); - if (mmap_ret == MAP_FAILED) { - __fmm_release(obj, aperture); - return -1; - } + if (mmap_ret == MAP_FAILED) + goto out_release; } @@ -2683,7 +2674,14 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert if (ret != 0) __fmm_release(obj, aperture); + pthread_mutex_unlock(&aperture->fmm_mutex); return ret; + +out_release: + __fmm_release(obj, aperture); +out_unlock: + pthread_mutex_unlock(&aperture->fmm_mutex); + return -1; } static int _fmm_map_to_gpu_userptr(void *addr, uint64_t size, @@ -3003,29 +3001,45 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, if (svm.check_userptr) fmm_check_user_memory(addr, size); + pthread_mutex_lock(&aperture->fmm_mutex); + + /* catch the race condition where some other thread added the userptr + * object already after the vm_find_object. + */ + obj = vm_find_object_by_userptr(aperture, addr, size); + if (obj) { + ++obj->registration_count; + goto out_found; + } + /* Allocate BO, userptr address is passed in mmap_offset */ svm_addr = __fmm_allocate_device(gpu_id, NULL, aligned_size, aperture, &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT), &obj); - if (!svm_addr) + if (!svm_addr) { + pthread_mutex_unlock(&aperture->fmm_mutex); return HSAKMT_STATUS_ERROR; + } if (obj) { - pthread_mutex_lock(&aperture->fmm_mutex); obj->userptr = addr; gpuid_to_nodeid(gpu_id, &obj->node_id); obj->userptr_size = size; obj->registration_count = 1; obj->user_node.key = rbtree_key((unsigned long)addr, size); rbtree_insert(&aperture->user_tree, &obj->user_node); + } else { pthread_mutex_unlock(&aperture->fmm_mutex); - } else return HSAKMT_STATUS_ERROR; + } +out_found: if (obj_ret) *obj_ret = obj; + + pthread_mutex_unlock(&aperture->fmm_mutex); return HSAKMT_STATUS_SUCCESS; }