Review notes. Text ahead of the first "diff --git" header is skipped by
git-apply and patch(1).

1. Line breaks were restored from a whitespace-collapsed copy of this
   patch using the hunk line counts. Tab indentation and the wrap points
   of multi-line context lines are inferred; check them against the tree
   or apply with --ignore-whitespace.
2. fmm_allocate_host_gpu() returns NULL as soon as the allocation fails,
   instead of passing a NULL address to mmap() with MAP_FIXED. This
   assumes the __fmm_allocate_device() call just above the hunk stores
   its result in mem (TODO confirm). Last src/fmm.c hunk: new-side count
   31 -> 34.
3. The TODO added to fmm_allocate_device() now sits above the return
   statement instead of after it.
4. The "index" lines keep the blob IDs of the original patch.
5. Still open: the second mmap() in the AQL queue path ignores its
   return value.

diff --git a/src/events.c b/src/events.c
index 50c3e1d3bf..a1359d6647 100644
--- a/src/events.c
+++ b/src/events.c
@@ -75,9 +75,8 @@ hsaKmtCreateEvent(
 
 	/* dGPU code */
 	if (is_dgpu && events_page == NULL) {
-		events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8,
-								PAGE_SIZE,
-								args.node_id, true);
+		events_page = allocate_exec_aligned_memory_gpu(
+			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0);
 		if (!events_page) {
 			return HSAKMT_STATUS_ERROR;
 		}
diff --git a/src/fmm.c b/src/fmm.c
index 7c6837843b..6976a44723 100644
--- a/src/fmm.c
+++ b/src/fmm.c
@@ -137,6 +137,23 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 				       manageble_aperture_t *aperture,
 				       void *address);
 
+static int32_t find_first_dgpu(HSAuint32 *gpu_id) {
+	int32_t i;
+
+	*gpu_id = NON_VALID_GPU_ID;
+
+	for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) {
+		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
+			continue;
+		if (!topology_is_dgpu(gpu_mem[i].device_id))
+			continue;
+		*gpu_id = gpu_mem[i].gpu_id;
+		return i;
+	}
+
+	return -1;
+}
+
 static vm_area_t *vm_create_and_init_area(void *start, void *end)
 {
 	vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -720,6 +737,8 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
 
+	/* TODO: honor host access mem flag and map to user mode VM if
+	 * needed */
 	return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
 			aperture, offset, NULL, flags);
 }
 
 static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
@@ -746,19 +765,19 @@ static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
 	return mem;
 }
 
-static void* fmm_allocate_host_gpu(uint32_t gpu_id,
-		uint64_t MemorySizeInBytes, HsaMemFlags flags)
+static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
+		HsaMemFlags flags)
 {
 	void *mem;
 	manageble_aperture_t *aperture;
-	int32_t gpu_mem_id;
 	uint64_t mmap_offset;
 	uint32_t ioc_flags;
 	uint32_t size;
+	int32_t i;
+	uint32_t gpu_id;
 
-	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
-	if (gpu_mem_id < 0)
+	i = find_first_dgpu(&gpu_id);
+	if (i < 0)
 		return NULL;
 
 	size = MemorySizeInBytes;
@@ -776,30 +795,34 @@ static void* fmm_allocate_host_gpu(uint32_t gpu_id,
 			aperture, 0, &mmap_offset,
 			ioc_flags);
 
-	/* FIXME: host memory allocated in this way should be mapped on all GPUs */
-	void *ret = mmap(mem, MemorySizeInBytes,
-			PROT_READ | PROT_WRITE,
-			MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	if (ret == MAP_FAILED) {
-		__fmm_release(mem, MemorySizeInBytes, aperture);
-		return NULL;
+	if (!mem)
+		return NULL;
+
+	if (flags.ui32.HostAccess) {
+		void *ret = mmap(mem, MemorySizeInBytes,
+				PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		if (ret == MAP_FAILED) {
+			__fmm_release(mem, MemorySizeInBytes, aperture);
+			return NULL;
+		}
+		if (flags.ui32.AQLQueueMemory) {
+			uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
+			memset(ret, 0, MemorySizeInBytes);
+			mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
+					PROT_READ | PROT_WRITE,
+					MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		}
 	}
 
-	if (flags.ui32.AQLQueueMemory) {
-		uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
-		memset(ret, 0, MemorySizeInBytes);
-		mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
-				PROT_READ | PROT_WRITE,
-				MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	}
 
-	return ret;
+	return mem;
 }
 
-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags, uint16_t dev_id)
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags)
 {
-	if (topology_is_dgpu(dev_id))
-		return fmm_allocate_host_gpu(gpu_id, MemorySizeInBytes, flags);
+	if (is_dgpu)
+		return fmm_allocate_host_gpu(MemorySizeInBytes, flags);
 	return fmm_allocate_host_cpu(MemorySizeInBytes, flags);
 }
 
diff --git a/src/fmm.h b/src/fmm.h
index 881413bbdf..75b5cac4a7 100644
--- a/src/fmm.h
+++ b/src/fmm.h
@@ -51,8 +51,7 @@ void fmm_destroy_process_apertures(void);
  */
 void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes);
-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes,
-		HsaMemFlags flags, uint16_t dev_id);
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags);
 void* fmm_open_graphic_handle(uint32_t gpu_id,
 		int32_t graphic_device_handle,
 		uint32_t graphic_handle,
diff --git a/src/libhsakmt.h b/src/libhsakmt.h
index d277da07b0..e7c3975646 100644
--- a/src/libhsakmt.h
+++ b/src/libhsakmt.h
@@ -74,7 +74,7 @@ bool topology_is_dgpu(uint16_t device_id);
 HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
 
 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-				       uint32_t NodeId, bool peer_to_peer);
+				       uint32_t NodeId);
 void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
 HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
 void destroy_process_doorbells(void);
diff --git a/src/memory.c b/src/memory.c
index 514673d003..08b08bd734 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -132,30 +132,16 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
 
-	if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
-		if (gpu_id == 0 && PreferredNode == 0) {
-			/* HACK: Currently we need a GPU node for
-			 * system memory allocations on dGPUs and
-			 * MapMemoryToGPU will always map to the same
-			 * GPU used for allocation. Therefore we need
-			 * to allocate system memory from node 1 if
-			 * we're running on a dGPU (indicated by node
-			 * 0 being a CPU with gpu_id==0). This will be
-			 * cleaned up when multi-GPU support is
-			 * implemented. */
-			PreferredNode = 1;
-			result = validate_nodeid(PreferredNode, &gpu_id);
-			if (result != HSAKMT_STATUS_SUCCESS)
-				return result;
-		}
-		*MemoryAddress = fmm_allocate_host(gpu_id, SizeInBytes, MemFlags,
-					get_device_id_by_node(PreferredNode));
+	if (gpu_id == 0 && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
 		if (*MemoryAddress == NULL)
 			return HSAKMT_STATUS_ERROR;
+
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	if (!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+	if (gpu_id && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
 		*MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes);
 
 		if (*MemoryAddress == NULL)
@@ -172,6 +158,17 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
+	/* Backwards compatibility hack: Allocate system memory if app
+	 * asks for paged memory from a GPU node. */
+	if (gpu_id && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
+		if (*MemoryAddress == NULL)
+			return HSAKMT_STATUS_ERROR;
+
+		return HSAKMT_STATUS_SUCCESS;
+	}
+
 	return HSAKMT_STATUS_INVALID_PARAMETER;
 }
 
diff --git a/src/queues.c b/src/queues.c
index e7e26b87ca..0867548e7d 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -218,7 +218,7 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
 }
 
 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-				       uint32_t NodeId, bool peer_to_peer)
+				       uint32_t NodeId)
 {
 	void *mem;
 	HSAuint64 gpu_va;
@@ -232,12 +232,12 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
 
 	size = ALIGN_UP(size, align);
 
-	ret = hsaKmtAllocMemory(NodeId, size, flags, &mem);
+	ret = hsaKmtAllocMemory(0, size, flags, &mem);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		return NULL;
 	}
 
-	if (!peer_to_peer) {
+	if (NodeId != 0) {
 		uint32_t nodes_array[1] = {NodeId};
 
 		if (hsaKmtRegisterMemoryToNodes(mem, size, 1, nodes_array) != HSAKMT_STATUS_SUCCESS) {
@@ -269,7 +269,7 @@ static void* allocate_exec_aligned_memory(uint32_t size,
 					  uint32_t NodeId)
 {
 	if (IS_DGPU(type))
-		return allocate_exec_aligned_memory_gpu(size, align, NodeId, false);
+		return allocate_exec_aligned_memory_gpu(size, align, NodeId);
 	return allocate_exec_aligned_memory_cpu(size, align);
 }
 