Adding support for mGPU
Change-Id: I5ed184e6a58b38d9dde48867f14513d161cf41a9
Signed-off-by: Ben Goz <ben.goz@amd.com>
[ROCm/ROCR-Runtime commit: ea0f9d2a0b]
Cette révision appartient à :
@@ -265,13 +265,27 @@ struct kfd_ioctl_free_memory_of_gpu_args {
|
||||
};
|
||||
|
||||
struct kfd_ioctl_map_memory_to_gpu_args {
|
||||
uint64_t handle; /* to KFD */
|
||||
uint64_t handle; /* to KFD */
|
||||
};
|
||||
|
||||
/*
 * Argument block for AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW.
 *
 * handle                - buffer handle passed to KFD.
 * device_ids_array      - user pointer to an array of uint32_t device ids;
 *                         the callers visible in this change pass either the
 *                         array stored in the vm_object or NULL.
 * device_ids_array_size - the callers visible in this change fill this with a
 *                         byte count (entries * sizeof(uint32_t)), not an
 *                         entry count.
 * pad                   - not written by the callers visible in this change;
 *                         presumably explicit padding - TODO confirm.
 *
 * NOTE(review): a raw pointer member makes the struct layout depend on the
 * pointer width of the calling process - confirm this is acceptable for the
 * ioctl ABI.
 */
struct kfd_ioctl_map_memory_to_gpu_new_args {
|
||||
uint64_t handle; /* to KFD */
|
||||
uint32_t *device_ids_array; /* to KFD */
|
||||
uint32_t device_ids_array_size; /* to KFD */
|
||||
uint32_t pad;
|
||||
};
|
||||
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_args {
|
||||
uint64_t handle; /* to KFD */
|
||||
};
|
||||
|
||||
/*
 * Argument block for AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW.
 *
 * handle                - buffer handle passed to KFD.
 * device_ids_array      - user pointer to an array of uint32_t device ids;
 *                         the callers visible in this change pass either the
 *                         array stored in the vm_object or NULL.
 * device_ids_array_size - the callers visible in this change fill this with a
 *                         byte count (entries * sizeof(uint32_t)), or 0 when
 *                         device_ids_array is NULL.
 * pad                   - not written by the callers visible in this change;
 *                         presumably explicit padding - TODO confirm.
 *
 * NOTE(review): a raw pointer member makes the struct layout depend on the
 * pointer width of the calling process - confirm this is acceptable for the
 * ioctl ABI.
 */
struct kfd_ioctl_unmap_memory_from_gpu_new_args {
|
||||
uint64_t handle; /* to KFD */
|
||||
uint32_t *device_ids_array; /* to KFD */
|
||||
uint32_t device_ids_array_size; /* to KFD */
|
||||
uint32_t pad;
|
||||
};
|
||||
|
||||
struct kfd_ioctl_open_graphic_handle_args {
|
||||
uint64_t va_addr; /* to KFD */
|
||||
uint64_t handle; /* from KFD */
|
||||
@@ -392,7 +406,12 @@ struct kfd_ioctl_alloc_memory_of_gpu_new_args {
|
||||
#define AMDKFD_IOC_SET_TRAP_HANDLER \
|
||||
AMDKFD_IOW(0x1a, struct kfd_ioctl_set_trap_handler_args)
|
||||
|
||||
#define AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW \
|
||||
AMDKFD_IOWR(0x1b, struct kfd_ioctl_map_memory_to_gpu_new_args)
|
||||
#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW \
|
||||
AMDKFD_IOWR(0x1c, struct kfd_ioctl_unmap_memory_from_gpu_new_args)
|
||||
|
||||
#define AMDKFD_COMMAND_START 0x01
|
||||
#define AMDKFD_COMMAND_END 0x1b
|
||||
#define AMDKFD_COMMAND_END 0x1d
|
||||
|
||||
#endif
|
||||
|
||||
@@ -77,7 +77,7 @@ hsaKmtCreateEvent(
|
||||
if (is_dgpu && events_page == NULL) {
|
||||
events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8,
|
||||
TONGA_PAGE_SIZE,
|
||||
args.node_id);
|
||||
args.node_id, true);
|
||||
if (!events_page) {
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
}
|
||||
@@ -143,7 +143,6 @@ hsaKmtDestroyEvent(
|
||||
}
|
||||
|
||||
free(Event);
|
||||
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +66,11 @@ struct vm_object {
|
||||
uint64_t handle; /* opaque */
|
||||
struct vm_object *next;
|
||||
struct vm_object *prev;
|
||||
/*
|
||||
* Nodes to map on SVM mGPU
|
||||
*/
|
||||
uint32_t *device_ids_array;
|
||||
uint32_t device_ids_array_size;
|
||||
};
|
||||
typedef struct vm_object vm_object_t;
|
||||
|
||||
@@ -165,6 +170,7 @@ static vm_object_t *vm_create_and_init_object(void *start, uint64_t size,
|
||||
object->size = size;
|
||||
object->handle = handle;
|
||||
object->next = object->prev = NULL;
|
||||
object->device_ids_array_size = 0;
|
||||
}
|
||||
|
||||
return object;
|
||||
@@ -698,7 +704,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
|
||||
{
|
||||
manageble_aperture_t *aperture;
|
||||
int32_t gpu_mem_id;
|
||||
uint32_t flags;
|
||||
uint32_t flags, offset;
|
||||
|
||||
/* Retrieve gpu_mem id according to gpu_id */
|
||||
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
|
||||
@@ -712,13 +718,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
|
||||
* In that way the host access range won't be used for local memory
|
||||
*/
|
||||
aperture = &svm.dgpu_aperture;
|
||||
offset = 0;
|
||||
} else {
|
||||
flags = KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE;
|
||||
aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
|
||||
offset = GPUVM_APP_OFFSET;
|
||||
}
|
||||
|
||||
return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
|
||||
aperture, GPUVM_APP_OFFSET, NULL,
|
||||
aperture, offset, NULL,
|
||||
flags);
|
||||
}
|
||||
|
||||
@@ -812,7 +820,7 @@ void *fmm_open_graphic_handle(uint32_t gpu_id,
|
||||
void *mem = NULL;
|
||||
int32_t i = gpu_mem_find_by_gpu_id(gpu_id);
|
||||
struct kfd_ioctl_open_graphic_handle_args open_graphic_handle_args;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_new_args unmap_args;
|
||||
|
||||
/* If not found or aperture isn't properly initialized/supported */
|
||||
if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base,
|
||||
@@ -850,7 +858,9 @@ void *fmm_open_graphic_handle(uint32_t gpu_id,
|
||||
|
||||
release_mem:
|
||||
unmap_args.handle = open_graphic_handle_args.handle;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args);
|
||||
unmap_args.device_ids_array = NULL;
|
||||
unmap_args.device_ids_array_size = 0;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &unmap_args);
|
||||
release_area:
|
||||
aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem,
|
||||
MemorySizeInBytes);
|
||||
@@ -1113,7 +1123,7 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA
|
||||
static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture,
|
||||
void *address, uint64_t size)
|
||||
{
|
||||
struct kfd_ioctl_map_memory_to_gpu_args args;
|
||||
struct kfd_ioctl_map_memory_to_gpu_new_args args;
|
||||
vm_object_t *object;
|
||||
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
@@ -1125,7 +1135,9 @@ static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture,
|
||||
}
|
||||
|
||||
args.handle = object->handle;
|
||||
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args))
|
||||
args.device_ids_array = object->device_ids_array;
|
||||
args.device_ids_array_size = object->device_ids_array_size;
|
||||
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args))
|
||||
goto err_map_ioctl_failed;
|
||||
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
@@ -1207,7 +1219,7 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
|
||||
void *address, uint64_t size,
|
||||
uint64_t *gpuvm_address)
|
||||
{
|
||||
struct kfd_ioctl_map_memory_to_gpu_args args;
|
||||
struct kfd_ioctl_map_memory_to_gpu_new_args args;
|
||||
vm_object_t *object;
|
||||
|
||||
/* Check that address space was previously reserved */
|
||||
@@ -1222,7 +1234,9 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
|
||||
goto err_object_not_found;
|
||||
|
||||
args.handle = object->handle;
|
||||
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args))
|
||||
args.device_ids_array = object->device_ids_array;
|
||||
args.device_ids_array_size = object->device_ids_array_size;
|
||||
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args))
|
||||
goto err_map_ioctl_failed;
|
||||
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
@@ -1291,7 +1305,7 @@ int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address)
|
||||
static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address)
|
||||
{
|
||||
vm_object_t *object;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_args args;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_new_args args;
|
||||
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
|
||||
@@ -1301,7 +1315,9 @@ static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address)
|
||||
goto err;
|
||||
|
||||
args.handle = object->handle;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
|
||||
args.device_ids_array = object->device_ids_array;
|
||||
args.device_ids_array_size = object->device_ids_array_size;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args);
|
||||
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
|
||||
@@ -1318,7 +1334,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
|
||||
int32_t gpu_mem_id;
|
||||
vm_object_t *object;
|
||||
uint64_t size;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_args args;
|
||||
struct kfd_ioctl_unmap_memory_from_gpu_new_args args;
|
||||
|
||||
/* Retrieve gpu_mem id according to gpu_id */
|
||||
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
|
||||
@@ -1339,7 +1355,9 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
|
||||
|
||||
/* unmap from GPU */
|
||||
args.handle = object->handle;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
|
||||
args.device_ids_array = object->device_ids_array;
|
||||
args.device_ids_array_size = object->device_ids_array_size;
|
||||
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args);
|
||||
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
|
||||
@@ -1590,3 +1608,108 @@ void fmm_release_global_resources(void)
|
||||
dgpu_shared_aperture_base = NULL;
|
||||
dgpu_shared_aperture_limit = NULL;
|
||||
}
|
||||
|
||||
int fmm_register_memory(void *address, uint32_t size_in_bytes,
|
||||
uint32_t *nodes_arr, uint32_t nodes_arr_size)
|
||||
{
|
||||
bool found = false;
|
||||
manageble_aperture_t *aperture;
|
||||
vm_object_t *object;
|
||||
|
||||
/*
|
||||
* Object can be found only on SVM aperture as you can't map
|
||||
* non SVM object on different device.
|
||||
*/
|
||||
aperture = &svm.dgpu_aperture;
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
/* Find the object to retrieve the handle */
|
||||
object = vm_find_object_by_address(aperture, address, 0);
|
||||
if (object)
|
||||
found = true;
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
|
||||
if (!found) {
|
||||
aperture = &svm.dgpu_alt_aperture;
|
||||
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
/* Find the object to retrieve the handle */
|
||||
object = vm_find_object_by_address(aperture, address, 0);
|
||||
if (object)
|
||||
found = true;
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
}
|
||||
|
||||
if (!object)
|
||||
return 1;
|
||||
|
||||
object->device_ids_array = nodes_arr;
|
||||
object->device_ids_array_size = nodes_arr_size * sizeof(uint32_t);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fmm_deregister_memory(void *address)
|
||||
{
|
||||
bool found = false;
|
||||
manageble_aperture_t *aperture;
|
||||
vm_object_t *object;
|
||||
|
||||
/*
|
||||
* Object can be found only on SVM aperture as you can't map
|
||||
* non SVM object on different device.
|
||||
*/
|
||||
aperture = &svm.dgpu_aperture;
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
/* Find the object to retrieve the handle */
|
||||
object = vm_find_object_by_address(aperture, address, 0);
|
||||
if (object)
|
||||
found = true;
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
|
||||
if (!found) {
|
||||
aperture = &svm.dgpu_alt_aperture;
|
||||
pthread_mutex_lock(&aperture->fmm_mutex);
|
||||
/* Find the object to retrieve the handle */
|
||||
object = vm_find_object_by_address(aperture, address, 0);
|
||||
if (object)
|
||||
found = true;
|
||||
pthread_mutex_unlock(&aperture->fmm_mutex);
|
||||
}
|
||||
|
||||
if (!object || object->device_ids_array_size <= 0)
|
||||
return;
|
||||
|
||||
free(object->device_ids_array);
|
||||
object->device_ids_array = NULL;
|
||||
object->device_ids_array_size = 0;
|
||||
}
|
||||
|
||||
int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num)
|
||||
{
|
||||
uint32_t i, *arr;
|
||||
if (!nodes) {
|
||||
nodes_num = 0;
|
||||
for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS; i++) {
|
||||
if (gpu_mem[i].gpu_id == 0)
|
||||
continue;
|
||||
nodes_num++;
|
||||
}
|
||||
}
|
||||
|
||||
arr = (uint32_t *)malloc(sizeof(uint32_t) * nodes_num);
|
||||
if (!array)
|
||||
return 1;
|
||||
|
||||
memset(arr, 0, sizeof(uint32_t) * nodes_num);
|
||||
|
||||
nodes_num = 0;
|
||||
for (i = 0 ; i < NUM_OF_SUPPORTED_GPUS; i++) {
|
||||
if (gpu_mem[i].gpu_id == 0)
|
||||
continue;
|
||||
arr[nodes_num] = gpu_mem[i].gpu_id;
|
||||
nodes_num++;
|
||||
}
|
||||
|
||||
*array = arr;
|
||||
return nodes_num;
|
||||
}
|
||||
|
||||
@@ -67,4 +67,9 @@ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id);
|
||||
HSAKMT_STATUS fmm_node_removed(HSAuint32 gpu_id);
|
||||
HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
|
||||
HSAuint64 *aperture_base, HSAuint64 *aperture_limit);
|
||||
|
||||
int fmm_register_memory(void *address, uint32_t size_in_bytes,
|
||||
uint32_t *nodes_arr, uint32_t nodes_arr_size);
|
||||
void fmm_deregister_memory(void *address);
|
||||
int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num);
|
||||
#endif /* FMM_H_ */
|
||||
|
||||
@@ -77,7 +77,8 @@ bool topology_is_dgpu(uint16_t device_id);
|
||||
|
||||
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
|
||||
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId);
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
uint32_t NodeId, bool peer_to_peer);
|
||||
void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
|
||||
|
||||
extern int kmtIoctl(int fd, unsigned long request, void *arg);
|
||||
|
||||
@@ -195,8 +195,21 @@ hsaKmtRegisterMemory(
|
||||
HSAuint64 MemorySizeInBytes /* IN (page-aligned) */
|
||||
)
|
||||
{
|
||||
uint32_t *NodesArray;
|
||||
uint32_t NodesArraySize;
|
||||
|
||||
CHECK_KFD_OPEN();
|
||||
|
||||
/*
|
||||
* Build NodesArray from all dGPU nodes.
|
||||
*/
|
||||
NodesArraySize = fmm_build_nodes_array(&NodesArray, NULL, 0);
|
||||
if (!NodesArray)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
NodesArray, NodesArraySize) != 0)
|
||||
return HSAKMT_STATUS_ERROR;
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -208,6 +221,8 @@ hsaKmtDeregisterMemory(
|
||||
{
|
||||
CHECK_KFD_OPEN();
|
||||
|
||||
fmm_deregister_memory(MemoryAddress);
|
||||
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -187,7 +187,8 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId)
|
||||
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
|
||||
uint32_t NodeId, bool peer_to_peer)
|
||||
{
|
||||
void *mem;
|
||||
HSAuint64 gpu_va;
|
||||
@@ -205,6 +206,14 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t N
|
||||
if (ret != HSAKMT_STATUS_SUCCESS) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (peer_to_peer) {
|
||||
if (hsaKmtRegisterMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
|
||||
hsaKmtFreeMemory(mem, size);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) {
|
||||
hsaKmtFreeMemory(mem, size);
|
||||
return NULL;
|
||||
@@ -220,6 +229,7 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
|
||||
if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) {
|
||||
hsaKmtFreeMemory(addr, size);
|
||||
}
|
||||
hsaKmtDeregisterMemory(addr);
|
||||
}
|
||||
|
||||
static void* allocate_exec_aligned_memory(uint32_t size,
|
||||
@@ -228,7 +238,7 @@ static void* allocate_exec_aligned_memory(uint32_t size,
|
||||
uint32_t NodeId)
|
||||
{
|
||||
if (IS_DGPU(type))
|
||||
return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId);
|
||||
return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId, false);
|
||||
return allocate_exec_aligned_memory_cpu(size, align);
|
||||
}
|
||||
|
||||
@@ -236,6 +246,7 @@ static void release_exec_aligned_memory_gpu(void *addr, uint32_t size)
|
||||
{
|
||||
if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS)
|
||||
hsaKmtFreeMemory(addr, (HSAuint64)size);
|
||||
hsaKmtDeregisterMemory(addr);
|
||||
}
|
||||
|
||||
static void release_exec_aligned_memory(void *addr, uint32_t size, enum asic_family_type type)
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur