diff --git a/projects/rocr-runtime/include/linux/kfd_ioctl.h b/projects/rocr-runtime/include/linux/kfd_ioctl.h index c08cea8810..8b41710aa2 100644 --- a/projects/rocr-runtime/include/linux/kfd_ioctl.h +++ b/projects/rocr-runtime/include/linux/kfd_ioctl.h @@ -265,13 +265,27 @@ struct kfd_ioctl_free_memory_of_gpu_args { }; struct kfd_ioctl_map_memory_to_gpu_args { - uint64_t handle; /* to KFD */ + uint64_t handle; /* to KFD */ +}; + +struct kfd_ioctl_map_memory_to_gpu_new_args { + uint64_t handle; /* to KFD */ + uint32_t *device_ids_array; /* to KFD */ + uint32_t device_ids_array_size; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_unmap_memory_from_gpu_args { uint64_t handle; /* to KFD */ }; +struct kfd_ioctl_unmap_memory_from_gpu_new_args { + uint64_t handle; /* to KFD */ + uint32_t *device_ids_array; /* to KFD */ + uint32_t device_ids_array_size; /* to KFD */ + uint32_t pad; +}; + struct kfd_ioctl_open_graphic_handle_args { uint64_t va_addr; /* to KFD */ uint64_t handle; /* from KFD */ @@ -392,7 +406,12 @@ struct kfd_ioctl_alloc_memory_of_gpu_new_args { #define AMDKFD_IOC_SET_TRAP_HANDLER \ AMDKFD_IOW(0x1a, struct kfd_ioctl_set_trap_handler_args) +#define AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW \ + AMDKFD_IOWR(0x1b, struct kfd_ioctl_map_memory_to_gpu_new_args) +#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW \ + AMDKFD_IOWR(0x1c, struct kfd_ioctl_unmap_memory_from_gpu_new_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1b +#define AMDKFD_COMMAND_END 0x1d #endif diff --git a/projects/rocr-runtime/src/events.c b/projects/rocr-runtime/src/events.c index 046d02765e..e8024b95ad 100644 --- a/projects/rocr-runtime/src/events.c +++ b/projects/rocr-runtime/src/events.c @@ -77,7 +77,7 @@ hsaKmtCreateEvent( if (is_dgpu && events_page == NULL) { events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, TONGA_PAGE_SIZE, - args.node_id); + args.node_id, true); if (!events_page) { return HSAKMT_STATUS_ERROR; } @@ -143,7 +143,6 @@ hsaKmtDestroyEvent( } free(Event); - return HSAKMT_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index ab500d075c..2e593de2a2 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -66,6 +66,11 @@ struct vm_object { uint64_t handle; /* opaque */ struct vm_object *next; struct vm_object *prev; + /* + * Nodes to map on SVM mGPU + */ + uint32_t *device_ids_array; + uint32_t device_ids_array_size; }; typedef struct vm_object vm_object_t; @@ -165,6 +170,7 @@ static vm_object_t *vm_create_and_init_object(void *start, uint64_t size, object->size = size; object->handle = handle; object->next = object->prev = NULL; + object->device_ids_array_size = 0; } return object; @@ -698,7 +704,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) { manageble_aperture_t *aperture; int32_t gpu_mem_id; - uint32_t flags; + uint32_t flags, offset; /* Retrieve gpu_mem id according to gpu_id */ gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); @@ -712,13 +718,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) * In that way the host access range won't be used for local memory */ aperture = &svm.dgpu_aperture; + offset = 0; } else { flags = KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE; aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; + offset = GPUVM_APP_OFFSET; } return __fmm_allocate_device(gpu_id, MemorySizeInBytes, - aperture, GPUVM_APP_OFFSET, NULL, + aperture, offset, NULL, flags); } @@ -812,7 +820,7 @@ void *fmm_open_graphic_handle(uint32_t gpu_id, void *mem = NULL; int32_t i = gpu_mem_find_by_gpu_id(gpu_id); struct kfd_ioctl_open_graphic_handle_args open_graphic_handle_args; - struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args; + struct kfd_ioctl_unmap_memory_from_gpu_new_args unmap_args; /* If not found or aperture isn't properly initialized/supported */ if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, @@ -850,7 +858,9 @@ void *fmm_open_graphic_handle(uint32_t gpu_id, release_mem: unmap_args.handle = open_graphic_handle_args.handle; - kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args); + unmap_args.device_ids_array = NULL; + unmap_args.device_ids_array_size = 0; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &unmap_args); release_area: aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem, MemorySizeInBytes); @@ -1113,7 +1123,7 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture, void *address, uint64_t size) { - struct kfd_ioctl_map_memory_to_gpu_args args; + struct kfd_ioctl_map_memory_to_gpu_new_args args; vm_object_t *object; pthread_mutex_lock(&aperture->fmm_mutex); @@ -1125,7 +1135,9 @@ static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture, } args.handle = object->handle; - if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) + args.device_ids_array = object->device_ids_array; + args.device_ids_array_size = object->device_ids_array_size; + if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args)) goto err_map_ioctl_failed; pthread_mutex_unlock(&aperture->fmm_mutex); @@ -1207,7 +1219,7 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture, void *address, uint64_t size, uint64_t *gpuvm_address) { - struct kfd_ioctl_map_memory_to_gpu_args args; + struct kfd_ioctl_map_memory_to_gpu_new_args args; vm_object_t *object; /* Check that address space was previously reserved */ @@ -1222,7 +1234,9 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture, goto err_object_not_found; args.handle = object->handle; - if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) + args.device_ids_array = object->device_ids_array; + args.device_ids_array_size = object->device_ids_array_size; + if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args)) goto err_map_ioctl_failed; pthread_mutex_unlock(&aperture->fmm_mutex); @@ -1291,7 +1305,7 @@ int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address) static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address) { vm_object_t *object; - struct kfd_ioctl_unmap_memory_from_gpu_args args; + struct kfd_ioctl_unmap_memory_from_gpu_new_args args; pthread_mutex_lock(&aperture->fmm_mutex); @@ -1301,7 +1315,9 @@ static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address) goto err; args.handle = object->handle; - kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); + args.device_ids_array = object->device_ids_array; + args.device_ids_array_size = object->device_ids_array_size; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args); pthread_mutex_unlock(&aperture->fmm_mutex); @@ -1318,7 +1334,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, int32_t gpu_mem_id; vm_object_t *object; uint64_t size; - struct kfd_ioctl_unmap_memory_from_gpu_args args; + struct kfd_ioctl_unmap_memory_from_gpu_new_args args; /* Retrieve gpu_mem id according to gpu_id */ gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); @@ -1339,7 +1355,9 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, /* unmap from GPU */ args.handle = object->handle; - kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); + args.device_ids_array = object->device_ids_array; + args.device_ids_array_size = object->device_ids_array_size; + kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args); pthread_mutex_unlock(&aperture->fmm_mutex); @@ -1590,3 +1608,108 @@ void fmm_release_global_resources(void) dgpu_shared_aperture_base = NULL; dgpu_shared_aperture_limit = NULL; } + +int fmm_register_memory(void *address, uint32_t size_in_bytes, + uint32_t *nodes_arr, uint32_t nodes_arr_size) +{ + bool found = false; + manageble_aperture_t *aperture; + vm_object_t *object; + + /* + * Object can be found only on SVM aperture as you can't map + * non SVM object on different device. + */ + aperture = &svm.dgpu_aperture; + pthread_mutex_lock(&aperture->fmm_mutex); + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (object) + found = true; + pthread_mutex_unlock(&aperture->fmm_mutex); + + if (!found) { + aperture = &svm.dgpu_alt_aperture; + + pthread_mutex_lock(&aperture->fmm_mutex); + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (object) + found = true; + pthread_mutex_unlock(&aperture->fmm_mutex); + } + + if (!object) + return 1; + + object->device_ids_array = nodes_arr; + object->device_ids_array_size = nodes_arr_size * sizeof(uint32_t); + + return 0; +} + +void fmm_deregister_memory(void *address) +{ + bool found = false; + manageble_aperture_t *aperture; + vm_object_t *object; + + /* + * Object can be found only on SVM aperture as you can't map + * non SVM object on different device. + */ + aperture = &svm.dgpu_aperture; + pthread_mutex_lock(&aperture->fmm_mutex); + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (object) + found = true; + pthread_mutex_unlock(&aperture->fmm_mutex); + + if (!found) { + aperture = &svm.dgpu_alt_aperture; + pthread_mutex_lock(&aperture->fmm_mutex); + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (object) + found = true; + pthread_mutex_unlock(&aperture->fmm_mutex); + } + + if (!object || object->device_ids_array_size <= 0) + return; + + free(object->device_ids_array); + object->device_ids_array = NULL; + object->device_ids_array_size = 0; +} + +int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num) +{ + uint32_t i, *arr; + if (!nodes) { + nodes_num = 0; + for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS; i++) { + if (gpu_mem[i].gpu_id == 0) + continue; + nodes_num++; + } + } + + arr = (uint32_t *)malloc(sizeof(uint32_t) * nodes_num); + if (!array) + return 1; + + memset(arr, 0, sizeof(uint32_t) * nodes_num); + + nodes_num = 0; + for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS; i++) { + if (gpu_mem[i].gpu_id == 0) + continue; + arr[nodes_num] = gpu_mem[i].gpu_id; + nodes_num++; + } + + *array = arr; + return nodes_num; +} diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index 10a56ff54a..37bdcb3b65 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -67,4 +67,9 @@ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id); HSAKMT_STATUS fmm_node_removed(HSAuint32 gpu_id); HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id, HSAuint64 *aperture_base, HSAuint64 *aperture_limit); + +int fmm_register_memory(void *address, uint32_t size_in_bytes, + uint32_t *nodes_arr, uint32_t nodes_arr_size); +void fmm_deregister_memory(void *address); +int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num); #endif /* FMM_H_ */ diff --git a/projects/rocr-runtime/src/libhsakmt.h b/projects/rocr-runtime/src/libhsakmt.h index 4ec1acafaa..c44bb528d0 100644 --- a/projects/rocr-runtime/src/libhsakmt.h +++ b/projects/rocr-runtime/src/libhsakmt.h @@ -77,7 +77,8 @@ bool topology_is_dgpu(uint16_t device_id); HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); -void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId); +void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, + uint32_t NodeId, bool peer_to_peer); void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); extern int kmtIoctl(int fd, unsigned long request, void *arg); diff --git a/projects/rocr-runtime/src/memory.c b/projects/rocr-runtime/src/memory.c index 8052fcd3bd..1b043b60c2 100644 --- a/projects/rocr-runtime/src/memory.c +++ b/projects/rocr-runtime/src/memory.c @@ -195,8 +195,21 @@ hsaKmtRegisterMemory( HSAuint64 MemorySizeInBytes /* IN (page-aligned) */ ) { + uint32_t *NodesArray; + uint32_t NodesArraySize; + CHECK_KFD_OPEN(); + /* + * Build NodesArray from all dGPU nodes. + */ + NodesArraySize = fmm_build_nodes_array(&NodesArray, NULL, 0); + if (!NodesArray) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (fmm_register_memory(MemoryAddress, MemorySizeInBytes, + NodesArray, NodesArraySize) != 0) + return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } @@ -208,6 +221,8 @@ hsaKmtDeregisterMemory( { CHECK_KFD_OPEN(); + fmm_deregister_memory(MemoryAddress); + return HSAKMT_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/src/queues.c b/projects/rocr-runtime/src/queues.c index a6025bb5cc..10604befa7 100644 --- a/projects/rocr-runtime/src/queues.c +++ b/projects/rocr-runtime/src/queues.c @@ -187,7 +187,8 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align) return ptr; } -void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId) +void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, + uint32_t NodeId, bool peer_to_peer) { void *mem; HSAuint64 gpu_va; @@ -205,6 +206,14 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t N if (ret != HSAKMT_STATUS_SUCCESS) { return NULL; } + + if (peer_to_peer) { + if (hsaKmtRegisterMemory(mem, size) != HSAKMT_STATUS_SUCCESS) { + hsaKmtFreeMemory(mem, size); + return NULL; + } + } + if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) { hsaKmtFreeMemory(mem, size); return NULL; @@ -220,6 +229,7 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) { hsaKmtFreeMemory(addr, size); } + hsaKmtDeregisterMemory(addr); } static void* allocate_exec_aligned_memory(uint32_t size, @@ -228,7 +238,7 @@ static void* allocate_exec_aligned_memory(uint32_t size, uint32_t NodeId) { if (IS_DGPU(type)) - return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId); + return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId, false); return allocate_exec_aligned_memory_cpu(size, align); } @@ -236,6 +246,7 @@ static void release_exec_aligned_memory_gpu(void *addr, uint32_t size) { if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) hsaKmtFreeMemory(addr, (HSAuint64)size); + hsaKmtDeregisterMemory(addr); } static void release_exec_aligned_memory(void *addr, uint32_t size, enum asic_family_type type)