Add multi-GPU (mGPU) support: map/unmap ioctls that carry a device ID array, plus memory registration

Change-Id: I5ed184e6a58b38d9dde48867f14513d161cf41a9
Signed-off-by: Ben Goz <ben.goz@amd.com>


[ROCm/ROCR-Runtime commit: ea0f9d2a0b]
Cette révision appartient à :
Ben Goz
2015-12-23 17:23:25 +02:00
Parent d874bcd8b3
révision 2fa7eef572
7 fichiers modifiés avec 192 ajouts et 19 suppressions
+21 -2
Voir le fichier
@@ -265,13 +265,27 @@ struct kfd_ioctl_free_memory_of_gpu_args {
};
struct kfd_ioctl_map_memory_to_gpu_args {
uint64_t handle; /* to KFD */
uint64_t handle; /* to KFD */
};
/*
 * Argument block for AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW.
 *
 * Same as kfd_ioctl_map_memory_to_gpu_args, with a device ID array added.
 *
 * NOTE(review): the struct embeds a raw user-space pointer, so its layout
 * depends on the pointer width of the calling process - confirm how 32-bit
 * callers on a 64-bit kernel are expected to be handled.
 */
struct kfd_ioctl_map_memory_to_gpu_new_args {
/* Buffer handle; user space fills this from vm_object.handle. */
uint64_t handle; /* to KFD */
/* Array of uint32_t device IDs; user space passes vm_object.device_ids_array. */
uint32_t *device_ids_array; /* to KFD */
/*
 * Size of device_ids_array. fmm_register_memory() stores this as a byte
 * count (entries * sizeof(uint32_t)), not as an element count.
 */
uint32_t device_ids_array_size; /* to KFD */
/* Presumably explicit tail padding; not written by the callers in this change - TODO confirm. */
uint32_t pad;
};
/*
 * Argument block for AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU: the buffer to unmap
 * is identified by its handle only.
 */
struct kfd_ioctl_unmap_memory_from_gpu_args {
/* Buffer handle; user space fills this from vm_object.handle. */
uint64_t handle; /* to KFD */
};
/*
 * Argument block for AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW.
 *
 * Same as kfd_ioctl_unmap_memory_from_gpu_args, with a device ID array added.
 *
 * NOTE(review): the struct embeds a raw user-space pointer, so its layout
 * depends on the pointer width of the calling process - confirm how 32-bit
 * callers on a 64-bit kernel are expected to be handled.
 */
struct kfd_ioctl_unmap_memory_from_gpu_new_args {
/* Buffer handle; user space fills this from vm_object.handle. */
uint64_t handle; /* to KFD */
/*
 * Array of uint32_t device IDs. fmm_open_graphic_handle() passes NULL here
 * together with a size of 0 on its error path.
 */
uint32_t *device_ids_array; /* to KFD */
/*
 * Size of device_ids_array. fmm_register_memory() stores this as a byte
 * count (entries * sizeof(uint32_t)), not as an element count.
 */
uint32_t device_ids_array_size; /* to KFD */
/* Presumably explicit tail padding; not written by the callers in this change - TODO confirm. */
uint32_t pad;
};
struct kfd_ioctl_open_graphic_handle_args {
uint64_t va_addr; /* to KFD */
uint64_t handle; /* from KFD */
@@ -392,7 +406,12 @@ struct kfd_ioctl_alloc_memory_of_gpu_new_args {
#define AMDKFD_IOC_SET_TRAP_HANDLER \
AMDKFD_IOW(0x1a, struct kfd_ioctl_set_trap_handler_args)
/*
 * Variant of the map request whose argument block also carries a device ID
 * array (struct kfd_ioctl_map_memory_to_gpu_new_args). Command number 0x1b.
 */
#define AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW \
AMDKFD_IOWR(0x1b, struct kfd_ioctl_map_memory_to_gpu_new_args)
/*
 * Variant of the unmap request whose argument block also carries a device ID
 * array (struct kfd_ioctl_unmap_memory_from_gpu_new_args). Command number 0x1c.
 */
#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW \
AMDKFD_IOWR(0x1c, struct kfd_ioctl_unmap_memory_from_gpu_new_args)
#define AMDKFD_COMMAND_START 0x01
#define AMDKFD_COMMAND_END 0x1b
#define AMDKFD_COMMAND_END 0x1d
#endif
+1 -2
Voir le fichier
@@ -77,7 +77,7 @@ hsaKmtCreateEvent(
if (is_dgpu && events_page == NULL) {
events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8,
TONGA_PAGE_SIZE,
args.node_id);
args.node_id, true);
if (!events_page) {
return HSAKMT_STATUS_ERROR;
}
@@ -143,7 +143,6 @@ hsaKmtDestroyEvent(
}
free(Event);
return HSAKMT_STATUS_SUCCESS;
}
+135 -12
Voir le fichier
@@ -66,6 +66,11 @@ struct vm_object {
uint64_t handle; /* opaque */
struct vm_object *next;
struct vm_object *prev;
/*
* Nodes to map on SVM mGPU
*/
uint32_t *device_ids_array;
uint32_t device_ids_array_size;
};
typedef struct vm_object vm_object_t;
@@ -165,6 +170,7 @@ static vm_object_t *vm_create_and_init_object(void *start, uint64_t size,
object->size = size;
object->handle = handle;
object->next = object->prev = NULL;
object->device_ids_array_size = 0;
}
return object;
@@ -698,7 +704,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
{
manageble_aperture_t *aperture;
int32_t gpu_mem_id;
uint32_t flags;
uint32_t flags, offset;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
@@ -712,13 +718,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
* In that way the host access range won't be used for local memory
*/
aperture = &svm.dgpu_aperture;
offset = 0;
} else {
flags = KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE;
aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
offset = GPUVM_APP_OFFSET;
}
return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
aperture, GPUVM_APP_OFFSET, NULL,
aperture, offset, NULL,
flags);
}
@@ -812,7 +820,7 @@ void *fmm_open_graphic_handle(uint32_t gpu_id,
void *mem = NULL;
int32_t i = gpu_mem_find_by_gpu_id(gpu_id);
struct kfd_ioctl_open_graphic_handle_args open_graphic_handle_args;
struct kfd_ioctl_unmap_memory_from_gpu_args unmap_args;
struct kfd_ioctl_unmap_memory_from_gpu_new_args unmap_args;
/* If not found or aperture isn't properly initialized/supported */
if (i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base,
@@ -850,7 +858,9 @@ void *fmm_open_graphic_handle(uint32_t gpu_id,
release_mem:
unmap_args.handle = open_graphic_handle_args.handle;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap_args);
unmap_args.device_ids_array = NULL;
unmap_args.device_ids_array_size = 0;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &unmap_args);
release_area:
aperture_release_area(&gpu_mem[i].gpuvm_aperture, mem,
MemorySizeInBytes);
@@ -1113,7 +1123,7 @@ HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSA
static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture,
void *address, uint64_t size)
{
struct kfd_ioctl_map_memory_to_gpu_args args;
struct kfd_ioctl_map_memory_to_gpu_new_args args;
vm_object_t *object;
pthread_mutex_lock(&aperture->fmm_mutex);
@@ -1125,7 +1135,9 @@ static int _fmm_map_to_gpu_gtt(manageble_aperture_t *aperture,
}
args.handle = object->handle;
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args))
args.device_ids_array = object->device_ids_array;
args.device_ids_array_size = object->device_ids_array_size;
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args))
goto err_map_ioctl_failed;
pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -1207,7 +1219,7 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
void *address, uint64_t size,
uint64_t *gpuvm_address)
{
struct kfd_ioctl_map_memory_to_gpu_args args;
struct kfd_ioctl_map_memory_to_gpu_new_args args;
vm_object_t *object;
/* Check that address space was previously reserved */
@@ -1222,7 +1234,9 @@ static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture,
goto err_object_not_found;
args.handle = object->handle;
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args))
args.device_ids_array = object->device_ids_array;
args.device_ids_array_size = object->device_ids_array_size;
if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW, &args))
goto err_map_ioctl_failed;
pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -1291,7 +1305,7 @@ int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address)
static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address)
{
vm_object_t *object;
struct kfd_ioctl_unmap_memory_from_gpu_args args;
struct kfd_ioctl_unmap_memory_from_gpu_new_args args;
pthread_mutex_lock(&aperture->fmm_mutex);
@@ -1301,7 +1315,9 @@ static int _fmm_unmap_from_gpu(manageble_aperture_t *aperture, void *address)
goto err;
args.handle = object->handle;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
args.device_ids_array = object->device_ids_array;
args.device_ids_array_size = object->device_ids_array_size;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args);
pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -1318,7 +1334,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
int32_t gpu_mem_id;
vm_object_t *object;
uint64_t size;
struct kfd_ioctl_unmap_memory_from_gpu_args args;
struct kfd_ioctl_unmap_memory_from_gpu_new_args args;
/* Retrieve gpu_mem id according to gpu_id */
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
@@ -1339,7 +1355,9 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
/* unmap from GPU */
args.handle = object->handle;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
args.device_ids_array = object->device_ids_array;
args.device_ids_array_size = object->device_ids_array_size;
kmtIoctl(kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW, &args);
pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -1590,3 +1608,108 @@ void fmm_release_global_resources(void)
dgpu_shared_aperture_base = NULL;
dgpu_shared_aperture_limit = NULL;
}
/*
 * Attach a device ID array to the SVM object that starts at 'address'.
 *
 * The object is looked up first in svm.dgpu_aperture and then in
 * svm.dgpu_alt_aperture; only SVM objects can be mapped on several devices.
 *
 * address        - start address of a previously allocated object.
 * size_in_bytes  - accepted for interface compatibility; not used here.
 * nodes_arr      - heap-allocated array of device IDs. On success the object
 *                  keeps this pointer (no copy is made) and
 *                  fmm_deregister_memory() later releases it with free().
 *                  On failure the caller still owns it.
 * nodes_arr_size - number of entries in nodes_arr.
 *
 * Returns 0 on success, 1 when no object is found at 'address'.
 */
int fmm_register_memory(void *address, uint32_t size_in_bytes,
			uint32_t *nodes_arr, uint32_t nodes_arr_size)
{
	manageble_aperture_t *apertures[] = {
		&svm.dgpu_aperture,
		&svm.dgpu_alt_aperture,
	};
	unsigned int i;

	(void)size_in_bytes;

	for (i = 0; i < sizeof(apertures) / sizeof(apertures[0]); i++) {
		manageble_aperture_t *aperture = apertures[i];
		vm_object_t *object;

		/*
		 * The map/unmap paths read device_ids_array under this same
		 * mutex, so the object has to be updated before it is dropped.
		 */
		pthread_mutex_lock(&aperture->fmm_mutex);

		object = vm_find_object_by_address(aperture, address, 0);
		if (object) {
			/*
			 * Release an array left by an earlier registration.
			 * The pointer is only trusted when the size is
			 * non-zero, the same test fmm_deregister_memory() uses.
			 */
			if (object->device_ids_array_size > 0 &&
			    object->device_ids_array != nodes_arr) {
				free(object->device_ids_array);
			}

			object->device_ids_array = nodes_arr;
			/* The size is kept in bytes, not in entries. */
			object->device_ids_array_size =
				nodes_arr_size * sizeof(uint32_t);

			pthread_mutex_unlock(&aperture->fmm_mutex);
			return 0;
		}

		pthread_mutex_unlock(&aperture->fmm_mutex);
	}

	return 1;
}
/*
 * Drop the device ID array attached to the SVM object that starts at
 * 'address'.
 *
 * The object is looked up first in svm.dgpu_aperture and then in
 * svm.dgpu_alt_aperture. If it is found and has a non-empty array, the array
 * is released with free() and the object's array fields are reset. Nothing
 * happens when no object is found or nothing was registered.
 */
void fmm_deregister_memory(void *address)
{
	manageble_aperture_t *apertures[] = {
		&svm.dgpu_aperture,
		&svm.dgpu_alt_aperture,
	};
	unsigned int i;

	for (i = 0; i < sizeof(apertures) / sizeof(apertures[0]); i++) {
		manageble_aperture_t *aperture = apertures[i];
		vm_object_t *object;

		/*
		 * The map/unmap paths hand device_ids_array to the kernel
		 * while holding this mutex; freeing it outside the lock could
		 * pull the array out from under them.
		 */
		pthread_mutex_lock(&aperture->fmm_mutex);

		object = vm_find_object_by_address(aperture, address, 0);
		if (object) {
			/* A size of 0 means nothing is registered. */
			if (object->device_ids_array_size > 0) {
				free(object->device_ids_array);
				object->device_ids_array = NULL;
				object->device_ids_array_size = 0;
			}

			pthread_mutex_unlock(&aperture->fmm_mutex);
			return;
		}

		pthread_mutex_unlock(&aperture->fmm_mutex);
	}
}
/*
 * Build a heap-allocated array holding the gpu_id of every populated entry
 * in gpu_mem[] (entries whose gpu_id is 0 are skipped).
 *
 * array     - out: receives the new array, or NULL on failure. The caller
 *             owns the array and releases it with free(), unless ownership
 *             is handed over through fmm_register_memory().
 * nodes     - not consulted: the IDs always come from gpu_mem[].
 * nodes_num - not consulted, see 'nodes'.
 *
 * Returns the number of IDs stored in *array. Returns 0 with *array set to
 * NULL when 'array' is NULL or the allocation fails; a zero count with a
 * non-NULL *array means no GPU is populated.
 */
int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num)
{
	uint32_t i, gpu_count = 0, filled = 0, *arr;

	/*
	 * NOTE(review): the caller-supplied node list is ignored. The buffer
	 * is sized from gpu_mem[] rather than from nodes_num so that it can
	 * never be smaller than what the fill loop below writes.
	 */
	(void)nodes;
	(void)nodes_num;

	if (!array) {
		return 0;
	}
	*array = NULL;

	for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) {
		if (gpu_mem[i].gpu_id != 0) {
			gpu_count++;
		}
	}

	/*
	 * Ask for at least one element so that "no GPUs" still yields a
	 * non-NULL array and stays distinguishable from an allocation failure.
	 */
	arr = calloc(gpu_count ? gpu_count : 1, sizeof(*arr));
	if (!arr) {
		return 0;
	}

	for (i = 0; i < NUM_OF_SUPPORTED_GPUS && filled < gpu_count; i++) {
		if (gpu_mem[i].gpu_id == 0) {
			continue;
		}
		arr[filled++] = gpu_mem[i].gpu_id;
	}

	*array = arr;
	return (int)filled;
}
+5
Voir le fichier
@@ -67,4 +67,9 @@ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id);
HSAKMT_STATUS fmm_node_removed(HSAuint32 gpu_id);
HSAKMT_STATUS fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
HSAuint64 *aperture_base, HSAuint64 *aperture_limit);
/*
 * Attach the device ID array nodes_arr (nodes_arr_size entries) to the SVM
 * object that starts at address. The pointer is stored, not copied, and is
 * later released with free() by fmm_deregister_memory(), so it must be
 * heap-allocated. size_in_bytes is not used by the implementation.
 * Returns 0 on success, 1 when no object is found at address.
 */
int fmm_register_memory(void *address, uint32_t size_in_bytes,
uint32_t *nodes_arr, uint32_t nodes_arr_size);
/*
 * Release the device ID array attached to the SVM object that starts at
 * address, if there is one. Does nothing when no object is found.
 */
void fmm_deregister_memory(void *address);
/*
 * Allocate an array on the heap, fill it with the gpu_id of every populated
 * gpu_mem entry, and store it in *array. The IDs always come from the
 * gpu_mem table; the contents of nodes are not read.
 * Returns the number of IDs placed in *array.
 */
int fmm_build_nodes_array(uint32_t **array, uint32_t *nodes, uint32_t nodes_num);
#endif /* FMM_H_ */
+2 -1
Voir le fichier
@@ -77,7 +77,8 @@ bool topology_is_dgpu(uint16_t device_id);
HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId);
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
uint32_t NodeId, bool peer_to_peer);
void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
extern int kmtIoctl(int fd, unsigned long request, void *arg);
+15
Voir le fichier
@@ -195,8 +195,21 @@ hsaKmtRegisterMemory(
HSAuint64 MemorySizeInBytes /* IN (page-aligned) */
)
{
uint32_t *NodesArray;
uint32_t NodesArraySize;
CHECK_KFD_OPEN();
/*
* Build NodesArray from all dGPU nodes.
*/
NodesArraySize = fmm_build_nodes_array(&NodesArray, NULL, 0);
if (!NodesArray)
return HSAKMT_STATUS_INVALID_PARAMETER;
if (fmm_register_memory(MemoryAddress, MemorySizeInBytes,
NodesArray, NodesArraySize) != 0)
return HSAKMT_STATUS_ERROR;
return HSAKMT_STATUS_SUCCESS;
}
@@ -208,6 +221,8 @@ hsaKmtDeregisterMemory(
{
CHECK_KFD_OPEN();
fmm_deregister_memory(MemoryAddress);
return HSAKMT_STATUS_SUCCESS;
}
+13 -2
Voir le fichier
@@ -187,7 +187,8 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
return ptr;
}
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t NodeId)
void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
uint32_t NodeId, bool peer_to_peer)
{
void *mem;
HSAuint64 gpu_va;
@@ -205,6 +206,14 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t N
if (ret != HSAKMT_STATUS_SUCCESS) {
return NULL;
}
if (peer_to_peer) {
if (hsaKmtRegisterMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
hsaKmtFreeMemory(mem, size);
return NULL;
}
}
if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) {
hsaKmtFreeMemory(mem, size);
return NULL;
@@ -220,6 +229,7 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) {
hsaKmtFreeMemory(addr, size);
}
hsaKmtDeregisterMemory(addr);
}
static void* allocate_exec_aligned_memory(uint32_t size,
@@ -228,7 +238,7 @@ static void* allocate_exec_aligned_memory(uint32_t size,
uint32_t NodeId)
{
if (IS_DGPU(type))
return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId);
return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE, NodeId, false);
return allocate_exec_aligned_memory_cpu(size, align);
}
@@ -236,6 +246,7 @@ static void release_exec_aligned_memory_gpu(void *addr, uint32_t size)
{
if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS)
hsaKmtFreeMemory(addr, (HSAuint64)size);
hsaKmtDeregisterMemory(addr);
}
static void release_exec_aligned_memory(void *addr, uint32_t size, enum asic_family_type type)