From 9db147f2d44172a6c4f2a2fb498350a24cde4e25 Mon Sep 17 00:00:00 2001 From: Ben Goz Date: Sun, 23 Aug 2015 17:42:27 +0300 Subject: [PATCH] Support gfx802 dGPU Signed-off-by: Ben Goz [ROCm/ROCR-Runtime commit: fb8378a18b5aa5c7874d0e65758991cffa781703] --- .../rocr-runtime/include/linux/kfd_ioctl.h | 18 +- projects/rocr-runtime/src/events.c | 11 + projects/rocr-runtime/src/fmm.c | 351 +++++++++++++++--- projects/rocr-runtime/src/fmm.h | 3 + projects/rocr-runtime/src/globals.c | 1 + projects/rocr-runtime/src/libhsakmt.h | 10 + projects/rocr-runtime/src/memory.c | 28 +- projects/rocr-runtime/src/queues.c | 129 ++++++- projects/rocr-runtime/src/topology.c | 80 ++-- 9 files changed, 513 insertions(+), 118 deletions(-) diff --git a/projects/rocr-runtime/include/linux/kfd_ioctl.h b/projects/rocr-runtime/include/linux/kfd_ioctl.h index 91aa69a345..39fd4d5be4 100644 --- a/projects/rocr-runtime/include/linux/kfd_ioctl.h +++ b/projects/rocr-runtime/include/linux/kfd_ioctl.h @@ -249,7 +249,7 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { uint64_t size; /* to KFD */ uint64_t handle; /* from KFD */ uint32_t gpu_id; /* to KFD */ - uint32_t pad; + uint64_t mmap_offset; /* from KFD */ }; struct kfd_ioctl_free_memory_of_gpu_args { @@ -273,6 +273,12 @@ struct kfd_ioctl_open_graphic_handle_args { uint32_t pad; }; +struct kfd_ioctl_set_process_dgpu_aperture_args { + uint32_t node_id; + uint64_t dgpu_base; + uint64_t dgpu_limit; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -342,13 +348,17 @@ struct kfd_ioctl_open_graphic_handle_args { #define AMDKFD_IOC_OPEN_GRAPHIC_HANDLE \ AMDKFD_IOWR(0x15, struct kfd_ioctl_open_graphic_handle_args) -#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ - AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args) +#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ + AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args) #define AMDKFD_IOC_SET_CU_MASK \ AMDKFD_IOW(0x17, struct kfd_ioctl_set_cu_mask_args) +#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \ + AMDKFD_IOW(0x18, struct kfd_ioctl_set_process_dgpu_aperture_args) + + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x18 +#define AMDKFD_COMMAND_END 0x19 #endif diff --git a/projects/rocr-runtime/src/events.c b/projects/rocr-runtime/src/events.c index f4217c2c58..7cedacb459 100644 --- a/projects/rocr-runtime/src/events.c +++ b/projects/rocr-runtime/src/events.c @@ -30,7 +30,9 @@ #include #include #include +#include #include "linux/kfd_ioctl.h" +#include "fmm.h" static HSAuint64 *events_page = NULL; @@ -70,6 +72,15 @@ hsaKmtCreateEvent( args.event_type = EventDesc->EventType; args.auto_reset = !ManualReset; + /* dGPU code */ + if (is_dgpu && events_page == NULL) { + events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, 0x9000); + if (!events_page) { + return HSAKMT_STATUS_ERROR; + } + fmm_get_handle(events_page, &args.event_page_offset); + } + if (kmtIoctl(kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) { free(e); *Event = NULL; diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c index 67e46162da..df864779ea 100644 --- a/projects/rocr-runtime/src/fmm.c +++ b/projects/rocr-runtime/src/fmm.c @@ -92,10 +92,16 @@ typedef struct { manageble_aperture_t scratch_aperture; manageble_aperture_t scratch_physical; manageble_aperture_t gpuvm_aperture; + manageble_aperture_t dgpu_aperture; } gpu_mem_t; static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM; +static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit); +static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit); +static void __fmm_release(uint32_t gpu_id, void *address, + uint64_t MemorySizeInBytes, manageble_aperture_t *aperture); + static vm_area_t *vm_create_and_init_area(void *start, void *end) { vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t)); @@ -373,45 +379,24 @@ static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id) return -1; } -static manageble_aperture_t *find_valid_gpuvm_apperture_of_gpu(uint32_t gpu_id) -{ - manageble_aperture_t *aperture; - int32_t gpu_mem_id; - - /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); - if (gpu_mem_id < 0) - return NULL; - - aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; - - /* Check that aperture is properly initialized/supported */ - if (!aperture_is_valid(aperture->base, aperture->limit)) - return NULL; - - return aperture; -} - static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem, - uint64_t MemorySizeInBytes) + uint64_t MemorySizeInBytes, + manageble_aperture_t *aperture, + uint64_t *mmap_offset) { struct kfd_ioctl_alloc_memory_of_gpu_args args; struct kfd_ioctl_free_memory_of_gpu_args free_args; - manageble_aperture_t *aperture; if (!mem) return -1; - /* Retrieve gpuvm aperture according to gpu_id */ - aperture = find_valid_gpuvm_apperture_of_gpu(gpu_id); - if (!aperture) - return -1; - /* Allocate memory from amdkfd */ args.gpu_id = gpu_id; args.size = MemorySizeInBytes; - args.va_addr = VOID_PTRS_SUB(mem, aperture->base); + args.va_addr = (uint64_t)mem; + if (!mmap_offset) + args.va_addr = VOID_PTRS_SUB(mem, aperture->base); if (kmtIoctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args)) return -1; @@ -423,6 +408,9 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem, goto err_object_allocation_failed; pthread_mutex_unlock(&aperture->fmm_mutex); + if (mmap_offset) + *mmap_offset = args.mmap_offset; + return 0; err_object_allocation_failed: @@ -541,24 +529,10 @@ void *fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes) return (void*)(((((uint64_t)mem) >> 16) + 1) << 16); } -/* - * The offset from GPUVM aperture base address to ensure that address 0 - * (after base subtraction) won't be used - */ -#define GPUVM_APP_OFFSET 0x10000 -void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) +static void* __fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes, + manageble_aperture_t *aperture, uint64_t offset, uint64_t *mmap_offset) { - manageble_aperture_t *aperture; - int32_t gpu_mem_id; void *mem = NULL; - - /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); - if (gpu_mem_id < 0) - return NULL; - - aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; - /* Check that aperture is properly initialized/supported */ if (!aperture_is_valid(aperture->base, aperture->limit)) return NULL; @@ -566,14 +540,15 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) /* Allocate address space */ pthread_mutex_lock(&aperture->fmm_mutex); mem = aperture_allocate_area(aperture, - MemorySizeInBytes, GPUVM_APP_OFFSET); + MemorySizeInBytes, offset); pthread_mutex_unlock(&aperture->fmm_mutex); /* * Now that we have the area reserved, allocate memory in the device * itself */ - if (fmm_allocate_memory_in_device(gpu_id, mem, MemorySizeInBytes)) { + if (fmm_allocate_memory_in_device(gpu_id, mem, + MemorySizeInBytes, aperture, mmap_offset)) { /* * allocation of memory in device failed. * Release region in aperture @@ -589,6 +564,89 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) return mem; } +/* + * The offset from GPUVM aperture base address to ensure that address 0 + * (after base subtraction) won't be used + */ +#define GPUVM_APP_OFFSET 0x10000 +void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes) +{ + manageble_aperture_t *aperture; + int32_t gpu_mem_id; + + /* Retrieve gpu_mem id according to gpu_id */ + gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + if (gpu_mem_id < 0) + return NULL; + + aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; + + return __fmm_allocate_device(gpu_id, MemorySizeInBytes, + aperture, GPUVM_APP_OFFSET, NULL); +} + +static void* fmm_allocate_host_cpu(uint32_t gpu_id, + uint64_t MemorySizeInBytes, HsaMemFlags flags) +{ + int err; + HSAuint64 page_size; + void *mem = NULL; + + page_size = PageSizeFromFlags(flags.ui32.PageSize); + err = posix_memalign(&mem, page_size, MemorySizeInBytes); + if (err != 0) + return NULL; + + if (flags.ui32.ExecuteAccess) { + err = mprotect(mem, MemorySizeInBytes, + PROT_READ | PROT_WRITE | PROT_EXEC); + + if (err != 0) { + free(mem); + return NULL; + } + } + return mem; +} + +static void* fmm_allocate_host_gpu(uint32_t gpu_id, + uint64_t MemorySizeInBytes, HsaMemFlags flags) +{ + void *mem; + manageble_aperture_t *aperture; + int32_t gpu_mem_id; + uint64_t mmap_offset; + + /* Retrieve gpu_mem id according to gpu_id */ + gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + if (gpu_mem_id < 0) + return NULL; + + aperture = &gpu_mem[gpu_mem_id].dgpu_aperture; + + MemorySizeInBytes += 0x8000 - (MemorySizeInBytes % 0x8000); + + mem = __fmm_allocate_device(gpu_id, MemorySizeInBytes, + aperture, 0, &mmap_offset); + + void *ret = mmap(mem, MemorySizeInBytes, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset); + if (ret == MAP_FAILED) { + __fmm_release(gpu_id, mem, MemorySizeInBytes, aperture); + return NULL; + } + + return ret; +} + +void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags, uint16_t dev_id) +{ + if (topology_is_dgpu(dev_id)) + return fmm_allocate_host_gpu(gpu_id, MemorySizeInBytes, flags); + return fmm_allocate_host_cpu(gpu_id, MemorySizeInBytes, flags); +} + void *fmm_open_graphic_handle(uint32_t gpu_id, int32_t graphic_device_handle, uint32_t graphic_handle, @@ -647,20 +705,14 @@ out: } static void __fmm_release(uint32_t gpu_id, void *address, - uint64_t MemorySizeInBytes) + uint64_t MemorySizeInBytes, manageble_aperture_t *aperture) { struct kfd_ioctl_free_memory_of_gpu_args args; - manageble_aperture_t *aperture; vm_object_t *object; if (!address) return; - /* Retrieve gpuvm aperture according to gpu_id */ - aperture = find_valid_gpuvm_apperture_of_gpu(gpu_id); - if (!aperture) - return; - pthread_mutex_lock(&aperture->fmm_mutex); /* Find the object to retrieve the handle */ @@ -696,7 +748,16 @@ void fmm_release(void *address, uint64_t MemorySizeInBytes) if (address >= gpu_mem[i].gpuvm_aperture.base && address <= gpu_mem[i].gpuvm_aperture.limit) { found = true; - __fmm_release(gpu_mem[i].gpu_id, address, MemorySizeInBytes); + __fmm_release(gpu_mem[i].gpu_id, address, + MemorySizeInBytes, &gpu_mem[i].gpuvm_aperture); + fmm_print(gpu_mem[i].gpu_id); + } + + if (address >= gpu_mem[i].dgpu_aperture.base && + address <= gpu_mem[i].dgpu_aperture.limit) { + found = true; + __fmm_release(gpu_mem[i].gpu_id, address, + MemorySizeInBytes, &gpu_mem[i].dgpu_aperture); fmm_print(gpu_mem[i].gpu_id); } } @@ -713,6 +774,8 @@ HSAKMT_STATUS fmm_init_process_apertures(void) { struct kfd_ioctl_get_process_apertures_args args; uint8_t node_id; + uint32_t gpu_id; + HsaNodeProperties props; if (kmtIoctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void *) &args)) return HSAKMT_STATUS_ERROR; @@ -721,6 +784,17 @@ HSAKMT_STATUS fmm_init_process_apertures(void) gpu_mem[node_id].gpu_id = args.process_apertures[node_id].gpu_id; + + if (topology_sysfs_get_node_props(node_id, &props, &gpu_id) == + HSAKMT_STATUS_SUCCESS) { + if (topology_is_dgpu(props.DeviceId)) { + dgpu_mem_init(node_id, &gpu_mem[node_id].dgpu_aperture.base, + &gpu_mem[node_id].dgpu_aperture.limit); + set_dgpu_aperture(node_id, (uint64_t)gpu_mem[node_id].dgpu_aperture.base, + (uint64_t)gpu_mem[node_id].dgpu_aperture.limit); + } + } + gpu_mem[node_id].lds_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_base); @@ -804,6 +878,34 @@ HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id) } } +static int _fmm_map_to_gpu_gtt(uint32_t gpu_id, manageble_aperture_t *aperture, + void *address, uint64_t size) +{ + struct kfd_ioctl_map_memory_to_gpu_args args; + vm_object_t *object; + + pthread_mutex_lock(&aperture->fmm_mutex); + + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (!object) { + goto err_object_not_found; + } + + args.handle = object->handle; + if (kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) + goto err_map_ioctl_failed; + + pthread_mutex_unlock(&aperture->fmm_mutex); + + return 0; + +err_map_ioctl_failed: +err_object_not_found: + pthread_mutex_unlock(&aperture->fmm_mutex); + return -1; +} + static int _fmm_map_to_gpu(uint32_t gpu_id, manageble_aperture_t *aperture, void *address, uint64_t size, uint64_t *gpuvm_address) @@ -855,6 +957,12 @@ int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address) return _fmm_map_to_gpu(gpu_mem[i].gpu_id, &gpu_mem[i].gpuvm_aperture, address, size, gpuvm_address); + if ((address >= gpu_mem[i].dgpu_aperture.base) && + (address <= gpu_mem[i].dgpu_aperture.limit)) + /* map it */ + return _fmm_map_to_gpu_gtt(gpu_mem[i].gpu_id, + &gpu_mem[i].dgpu_aperture, + address, size); } /* @@ -904,7 +1012,144 @@ int fmm_unmap_from_gpu(void *address) /* unmap it */ return _fmm_unmap_from_gpu(&gpu_mem[i].gpuvm_aperture, address); + else if ((address >= gpu_mem[i].dgpu_aperture.base) && + (address <= gpu_mem[i].dgpu_aperture.limit)) + /* unmap it */ + return _fmm_unmap_from_gpu(&gpu_mem[i].dgpu_aperture, + address); } return 0; } + +/* Tonga dGPU specific functions */ +static bool is_dgpu_mem_init = false; +static void *dgpu_shared_aperture_base = NULL; +static void *dgpu_shared_aperture_limit = NULL; + +static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit) +{ + struct kfd_ioctl_set_process_dgpu_aperture_args args; + + args.node_id = node_id; + args.dgpu_base = base; + args.dgpu_limit = limit; + + return kmtIoctl(kfd_fd, AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, &args); +} + +static void *reserve_address(void *addr, long long unsigned int len) +{ + void *ret_addr; + + if (len <= 0) + return NULL; + + ret_addr = mmap(addr, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) + return NULL; + + return ret_addr; +} + +#define ADDRESS_RANGE_LIMIT_MASK 0xFFFFFFFFFF + +static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit) +{ + bool found; + HSAKMT_STATUS ret; + void *addr, *ret_addr; + uint32_t max_len; + long long unsigned int temp; + uint32_t gpu_id; + HsaNodeProperties props; + + if (is_dgpu_mem_init) { + if (base) + base = dgpu_shared_aperture_base; + if (limit) + limit = dgpu_shared_aperture_limit; + return HSAKMT_STATUS_SUCCESS; + } + + ret = topology_sysfs_get_node_props(node_id, &props, &gpu_id); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + max_len = (uint32_t)props.LocalMemSize; + found = false; + + for (addr = (void *)PAGE_SIZE, ret_addr = NULL; + ret_addr != addr; + addr = (void *)((unsigned long)addr + 0x8000)) + { + ret_addr = reserve_address(addr, max_len); + if (!ret_addr) + continue; + temp = (long long unsigned int)ret_addr + max_len; + if (temp < ADDRESS_RANGE_LIMIT_MASK) { + found = true; + break; + } + else + munmap(ret_addr, max_len); + } + + if (found) { + if (base) + *base = ret_addr; + dgpu_shared_aperture_base = ret_addr; + if (limit) + *limit = (void *)((long long unsigned int)ret_addr + max_len); + dgpu_shared_aperture_limit = (void *)((long long unsigned int)ret_addr + max_len); + is_dgpu_mem_init = true; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; +} + +bool fmm_get_handle(void *address, uint64_t *handle) +{ + int32_t i; + manageble_aperture_t *aperture; + vm_object_t *object; + bool found; + + found = false; + aperture = NULL; + + /* Find the aperture the requested address belongs to */ + for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) { + if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + continue; + + if ((address >= gpu_mem[i].gpuvm_aperture.base) && + (address <= gpu_mem[i].gpuvm_aperture.limit)) { + aperture = &gpu_mem[i].gpuvm_aperture; + break; + } + + else if ((address >= gpu_mem[i].dgpu_aperture.base) && + (address <= gpu_mem[i].dgpu_aperture.limit)) { + aperture = &gpu_mem[i].dgpu_aperture; + break; + } + } + + if (!aperture) + return false; + + pthread_mutex_lock(&aperture->fmm_mutex); + /* Find the object to retrieve the handle */ + object = vm_find_object_by_address(aperture, address, 0); + if (object && handle) { + *handle = object->handle; + found = true; + } + pthread_mutex_unlock(&aperture->fmm_mutex); + + + return found; +} diff --git a/projects/rocr-runtime/src/fmm.h b/projects/rocr-runtime/src/fmm.h index d326c83f06..c2ff77e074 100644 --- a/projects/rocr-runtime/src/fmm.h +++ b/projects/rocr-runtime/src/fmm.h @@ -49,6 +49,8 @@ HSAKMT_STATUS fmm_init_process_apertures(void); */ void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes); void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes); +void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, + HsaMemFlags flags, uint16_t dev_id); void* fmm_open_graphic_handle(uint32_t gpu_id, int32_t graphic_device_handle, uint32_t graphic_handle, @@ -58,6 +60,7 @@ bool fmm_is_inside_some_aperture(void* address); void fmm_release(void* address, HSAuint64 MemorySizeInBytes); int fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address); int fmm_unmap_from_gpu(void *address); +bool fmm_get_handle(void *address, uint64_t *handle); /* Topology interface*/ HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id); diff --git a/projects/rocr-runtime/src/globals.c b/projects/rocr-runtime/src/globals.c index cad6b1f989..d055cecbde 100644 --- a/projects/rocr-runtime/src/globals.c +++ b/projects/rocr-runtime/src/globals.c @@ -31,3 +31,4 @@ int kfd_fd; unsigned long kfd_open_count; unsigned long system_properties_count; pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER; +bool is_dgpu = false; diff --git a/projects/rocr-runtime/src/libhsakmt.h b/projects/rocr-runtime/src/libhsakmt.h index 0d73c8fa8b..37584ac66d 100644 --- a/projects/rocr-runtime/src/libhsakmt.h +++ b/projects/rocr-runtime/src/libhsakmt.h @@ -34,6 +34,7 @@ extern int kfd_fd; extern unsigned long kfd_open_count; extern pthread_mutex_t hsakmt_mutex; +extern bool is_dgpu; #undef HSAKMTAPI #define HSAKMTAPI __attribute__((visibility ("default"))) @@ -65,6 +66,15 @@ HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); uint16_t get_device_id_by_node(HSAuint32 node_id); +HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id); +HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props, uint32_t *gpu_id); +bool topology_is_dgpu(uint16_t gpu_id); + +HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); + +void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align); +void free_exec_aligned_memory_gpu(void *addr, uint32_t size); + extern int kmtIoctl(int fd, unsigned long request, void *arg); /* Void pointer arithmetic (or remove -Wpointer-arith to allow void pointers arithmetic) */ diff --git a/projects/rocr-runtime/src/memory.c b/projects/rocr-runtime/src/memory.c index 7adc4fb914..40020f702f 100644 --- a/projects/rocr-runtime/src/memory.c +++ b/projects/rocr-runtime/src/memory.c @@ -86,7 +86,7 @@ hsaKmtSetMemoryPolicy( return (err == -1) ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS; } -static HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) +HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) { switch (pageSizeFlags) { case HSA_PAGE_SIZE_4KB: return 4*1024; @@ -109,9 +109,8 @@ hsaKmtAllocMemory( ) { HSAKMT_STATUS result; - HSAuint64 page_size; uint32_t gpu_id; - int err; + HSAuint64 page_size; CHECK_KFD_OPEN(); @@ -119,26 +118,18 @@ hsaKmtAllocMemory( if (result != HSAKMT_STATUS_SUCCESS) return result; - /* The required size should be page aligned (GDS?) */ page_size = PageSizeFromFlags(MemFlags.ui32.PageSize); + if ((!MemoryAddress) || (!SizeInBytes) || - (SizeInBytes & (page_size-1))) + (SizeInBytes & (page_size-1))) { return HSAKMT_STATUS_INVALID_PARAMETER; + } if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) { - err = posix_memalign(MemoryAddress, page_size, SizeInBytes); - if (err != 0) - return HSAKMT_STATUS_NO_MEMORY; - - if (MemFlags.ui32.ExecuteAccess) { - err = mprotect(*MemoryAddress, SizeInBytes, - PROT_READ | PROT_WRITE | PROT_EXEC); - - if (err != 0) { - free(*MemoryAddress); - return err; - } - } + *MemoryAddress = fmm_allocate_host(gpu_id, SizeInBytes, MemFlags, + get_device_id_by_node(PreferredNode)); + if (*MemoryAddress == NULL) + return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } @@ -224,6 +215,7 @@ hsaKmtUnmapMemoryToGPU( ) { CHECK_KFD_OPEN(); + if (!fmm_unmap_from_gpu(MemoryAddress)) return HSAKMT_STATUS_SUCCESS; else diff --git a/projects/rocr-runtime/src/queues.c b/projects/rocr-runtime/src/queues.c index 5bcbe63087..24b737071d 100644 --- a/projects/rocr-runtime/src/queues.c +++ b/projects/rocr-runtime/src/queues.c @@ -34,25 +34,42 @@ #include #include +#define TONGA_PAGE_SIZE 0x9000 + /* 1024 doorbells, 4 bytes each doorbell */ #define DOORBELLS_PAGE_SIZE 1024 * 4 +enum asic_family_type { + CHIP_KAVERI = 0, + CHIP_CARRIZO, + CHIP_TONGA +}; + struct device_info { + enum asic_family_type asic_family; uint32_t ctx_save_restore_size; uint32_t eop_buffer_size; }; struct device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, .ctx_save_restore_size = 0, .eop_buffer_size = 0, }; struct device_info carrizo_device_info = { + .asic_family = CHIP_CARRIZO, .ctx_save_restore_size = 2756608, .eop_buffer_size = 4096, }; +struct device_info tonga_device_info = { + .asic_family = CHIP_TONGA, + .ctx_save_restore_size = TONGA_PAGE_SIZE, + .eop_buffer_size = TONGA_PAGE_SIZE, +}; + struct device_id { uint16_t dev_id; @@ -87,6 +104,8 @@ struct device_id supported_devices[] = { { 0x9875, &carrizo_device_info }, /* Carrizo */ { 0x9876, &carrizo_device_info }, /* Carrizo */ { 0x9877, &carrizo_device_info }, /* Carrizo */ + { 0x6939, &tonga_device_info }, + { 0x692b, &tonga_device_info }, { 0, NULL } }; @@ -97,6 +116,7 @@ struct queue uint32_t rptr; void *eop_buffer; void *ctx_save_restore; + enum asic_family_type type; }; struct process_doorbells @@ -121,7 +141,7 @@ static struct device_info *get_device_info_by_dev_id(uint16_t dev_id) return NULL; } -static void free_queue(struct queue *q) +static void free_queue_cpu(struct queue *q) { if (q->eop_buffer) free(q->eop_buffer); @@ -130,7 +150,7 @@ static void free_queue(struct queue *q) free(q); } -static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align) +static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align) { void *ptr; int retval; @@ -149,13 +169,89 @@ static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align) return ptr; } +void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align) +{ + void *mem; + HSAuint64 gpu_va; + HsaMemFlags flags; + HSAKMT_STATUS ret; + + flags.Value = 0; + flags.ui32.HostAccess = 1; + flags.ui32.ExecuteAccess = 1; + flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; + + size += align - (size % align); + + ret = hsaKmtAllocMemory(0, size, flags, &mem); + if (ret != HSAKMT_STATUS_SUCCESS) { + return NULL; + } + if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) { + hsaKmtFreeMemory(mem, size); + return NULL; + } + + return mem; +} + +void free_exec_aligned_memory_gpu(void *addr, uint32_t size) +{ + size += TONGA_PAGE_SIZE - (size % TONGA_PAGE_SIZE); + + if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) { + hsaKmtFreeMemory(addr, size); + } +} + +static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align, enum asic_family_type type) +{ + if (type == CHIP_TONGA) + return allocate_exec_aligned_memory_gpu(size, TONGA_PAGE_SIZE); + return allocate_exec_aligned_memory_cpu(size, align); +} + +static void release_exec_aligned_memory_gpu(void *addr, uint32_t size) +{ + if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) + hsaKmtFreeMemory(addr, (HSAuint64)size); +} + +static void release_exec_aligned_memory(void *addr, uint32_t size, enum asic_family_type type) +{ + if (type == CHIP_TONGA) + release_exec_aligned_memory_gpu(addr, TONGA_PAGE_SIZE); + else + free(addr); +} + +static void free_queue_gpu(struct queue *q) +{ + if (q->eop_buffer) { + hsaKmtUnmapMemoryToGPU(q->eop_buffer); + hsaKmtFreeMemory(q->eop_buffer, TONGA_PAGE_SIZE); + } + if (q->ctx_save_restore) { + hsaKmtUnmapMemoryToGPU(q->ctx_save_restore); + hsaKmtFreeMemory(q->ctx_save_restore, TONGA_PAGE_SIZE); + } + release_exec_aligned_memory((void *)q, sizeof(*q), q->type); +} + +static void free_queue(struct queue *q, enum asic_family_type type) +{ + if (type == CHIP_TONGA) + return free_queue_gpu(q); + return free_queue_cpu(q); +} + static int handle_concrete_asic(struct device_info *dev_info, struct queue *q, struct kfd_ioctl_create_queue_args *args) { if (dev_info) { if (dev_info->eop_buffer_size > 0) { q->eop_buffer = - allocate_exec_aligned_memory(dev_info->eop_buffer_size, PAGE_SIZE); + allocate_exec_aligned_memory(dev_info->eop_buffer_size, PAGE_SIZE, dev_info->asic_family); if (q->eop_buffer == NULL) { return HSAKMT_STATUS_NO_MEMORY; } @@ -165,7 +261,7 @@ static int handle_concrete_asic(struct device_info *dev_info, struct queue *q, if (dev_info->ctx_save_restore_size > 0) { args->ctx_save_restore_size = dev_info->ctx_save_restore_size; q->ctx_save_restore = - allocate_exec_aligned_memory(dev_info->ctx_save_restore_size, PAGE_SIZE); + allocate_exec_aligned_memory(dev_info->ctx_save_restore_size, PAGE_SIZE, dev_info->asic_family); if (q->ctx_save_restore == NULL) {; return HSAKMT_STATUS_NO_MEMORY; } @@ -201,30 +297,35 @@ hsaKmtCreateQueue( if (result != HSAKMT_STATUS_SUCCESS) return result; - struct queue *q = malloc(sizeof(*q)); + dev_id = get_device_id_by_node(NodeId); + dev_info = get_device_info_by_dev_id(dev_id); + + struct queue *q = allocate_exec_aligned_memory(sizeof (*q), + PAGE_SIZE, dev_info->asic_family); if (q == NULL) return HSAKMT_STATUS_NO_MEMORY; + memset(q, 0, sizeof(*q)); struct kfd_ioctl_create_queue_args args; memset(&args, 0, sizeof(args)); - dev_id = get_device_id_by_node(NodeId); - dev_info = get_device_info_by_dev_id(dev_id); args.gpu_id = gpu_id; + q->type = dev_info->asic_family; + err = handle_concrete_asic(dev_info, q, &args); if (err != HSAKMT_STATUS_SUCCESS) { - free_queue(q); + free_queue(q, dev_info->asic_family); return err; } switch (Type) { case HSA_QUEUE_COMPUTE: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE; break; - case HSA_QUEUE_SDMA: free_queue(q); return HSAKMT_STATUS_UNAVAILABLE; + case HSA_QUEUE_SDMA: free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_UNAVAILABLE; case HSA_QUEUE_COMPUTE_AQL: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE_AQL; break; - default: free_queue(q); return HSAKMT_STATUS_INVALID_PARAMETER; + default: free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_INVALID_PARAMETER; } if (Type != HSA_QUEUE_COMPUTE_AQL) @@ -244,7 +345,7 @@ hsaKmtCreateQueue( if (err == -1) { - free_queue(q); + free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_ERROR; } @@ -259,7 +360,7 @@ hsaKmtCreateQueue( if (ptr == MAP_FAILED) { pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex); hsaKmtDestroyQueue(q->queue_id); - free_queue(q); + free_queue(q, dev_info->asic_family); return HSAKMT_STATUS_ERROR; } @@ -321,7 +422,7 @@ hsaKmtDestroyQueue( struct kfd_ioctl_destroy_queue_args args; if (q == NULL) - return (HSAKMT_STATUS_INVALID_PARAMETER); + return (HSAKMT_STATUS_INVALID_PARAMETER); memset(&args, 0, sizeof(args)); @@ -335,7 +436,7 @@ hsaKmtDestroyQueue( } else { - free_queue(q); + free_queue(q, q->type); return HSAKMT_STATUS_SUCCESS; } } diff --git a/projects/rocr-runtime/src/topology.c b/projects/rocr-runtime/src/topology.c index 523df0d8da..608efa2d71 100644 --- a/projects/rocr-runtime/src/topology.c +++ b/projects/rocr-runtime/src/topology.c @@ -59,36 +59,40 @@ static struct hsa_gfxip_table { unsigned char major; // GFXIP Major engine version unsigned char minor; // GFXIP Minor engine version unsigned char stepping; // GFXIP Stepping info + unsigned char is_dgpu; // Predicat for dGPU devices } gfxip_lookup_table[] = { /* Kaveri Family */ - { 0x1304, 7, 0, 0 }, - { 0x1305, 7, 0, 0 }, - { 0x1306, 7, 0, 0 }, - { 0x1307, 7, 0, 0 }, - { 0x1309, 7, 0, 0 }, - { 0x130A, 7, 0, 0 }, - { 0x130B, 7, 0, 0 }, - { 0x130C, 7, 0, 0 }, - { 0x130D, 7, 0, 0 }, - { 0x130E, 7, 0, 0 }, - { 0x130F, 7, 0, 0 }, - { 0x1310, 7, 0, 0 }, - { 0x1311, 7, 0, 0 }, - { 0x1312, 7, 0, 0 }, - { 0x1313, 7, 0, 0 }, - { 0x1315, 7, 0, 0 }, - { 0x1316, 7, 0, 0 }, - { 0x1317, 7, 0, 0 }, - { 0x1318, 7, 0, 0 }, - { 0x131B, 7, 0, 0 }, - { 0x131C, 7, 0, 0 }, - { 0x131D, 7, 0, 0 }, + { 0x1304, 7, 0, 0, 0 }, + { 0x1305, 7, 0, 0, 0 }, + { 0x1306, 7, 0, 0, 0 }, + { 0x1307, 7, 0, 0, 0 }, + { 0x1309, 7, 0, 0, 0 }, + { 0x130A, 7, 0, 0, 0 }, + { 0x130B, 7, 0, 0, 0 }, + { 0x130C, 7, 0, 0, 0 }, + { 0x130D, 7, 0, 0, 0 }, + { 0x130E, 7, 0, 0, 0 }, + { 0x130F, 7, 0, 0, 0 }, + { 0x1310, 7, 0, 0, 0 }, + { 0x1311, 7, 0, 0, 0 }, + { 0x1312, 7, 0, 0, 0 }, + { 0x1313, 7, 0, 0, 0 }, + { 0x1315, 7, 0, 0, 0 }, + { 0x1316, 7, 0, 0, 0 }, + { 0x1317, 7, 0, 0, 0 }, + { 0x1318, 7, 0, 0, 0 }, + { 0x131B, 7, 0, 0, 0 }, + { 0x131C, 7, 0, 0, 0 }, + { 0x131D, 7, 0, 0, 0 }, /* Carrizo Family */ - { 0x9870, 8, 0, 1 }, - { 0x9874, 8, 0, 1 }, - { 0x9875, 8, 0, 1 }, - { 0x9876, 8, 0, 1 }, - { 0x9877, 8, 0, 1 } + { 0x9870, 8, 0, 1, 0 }, + { 0x9874, 8, 0, 1, 0 }, + { 0x9875, 8, 0, 1, 0 }, + { 0x9876, 8, 0, 1, 0 }, + { 0x9877, 8, 0, 1, 0 }, + /* Tonga Family */ + { 0x6939, 8, 0, 0, 1 }, + { 0x692b, 8, 0, 0, 1 } }; static void @@ -203,7 +207,7 @@ err1: return ret; } -static HSAKMT_STATUS +HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id) { FILE *fd; char path[256]; @@ -222,7 +226,25 @@ topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id) { return ret; } -static HSAKMT_STATUS +bool topology_is_dgpu(uint16_t gpu_id) +{ + uint32_t i, table_size; + + if (is_dgpu) + return is_dgpu; + + table_size = sizeof(gfxip_lookup_table)/sizeof(struct hsa_gfxip_table); + for (i=0; i