diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h index 7a37bec049..99dc019773 100644 --- a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h @@ -406,6 +406,21 @@ hsaKmtAllocMemory( void** MemoryAddress //IN/OUT (page-aligned) ); +/** + Allocates a memory buffer, placed with the requested alignment, that may be accessed by the GPU. + If Alignment is 0 the default alignment of the aperture serving the request is used (this is what hsaKmtAllocMemory passes); a non-zero Alignment is rejected for scratch memory and for host memory on non-dGPU systems. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocMemoryAlign( + HSAuint32 PreferredNode, //IN + HSAuint64 SizeInBytes, //IN (multiple of page size) + HSAuint64 Alignment, //IN (0, or power of 2 and >= page size) + HsaMemFlags MemFlags, //IN + void** MemoryAddress //IN/OUT (page-aligned) + ); + /** Frees a memory buffer */ diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c index b99c138345..c54e17719d 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.c +++ b/projects/rocr-runtime/libhsakmt/src/fmm.c @@ -856,7 +856,7 @@ static void *aperture_allocate_area_aligned(manageable_aperture_t *app, uint64_t MemorySizeInBytes, uint64_t align) { - return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align); + return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align ? 
align : app->align); } static void *aperture_allocate_area(manageable_aperture_t *app, void *address, uint64_t MemorySizeInBytes) @@ -1448,7 +1448,7 @@ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeIn static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes, manageable_aperture_t *aperture, uint64_t *mmap_offset, - uint32_t ioc_flags, vm_object_t **vm_obj) + uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj) { void *mem = NULL; vm_object_t *obj; @@ -1459,7 +1459,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo /* Allocate address space */ pthread_mutex_lock(&aperture->fmm_mutex); - mem = aperture_allocate_area(aperture, address, MemorySizeInBytes); + mem = aperture_allocate_area_aligned(aperture, address, MemorySizeInBytes, alignment); pthread_mutex_unlock(&aperture->fmm_mutex); /* @@ -1504,7 +1504,7 @@ static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access, } static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size, - manageable_aperture_t *aperture, HsaMemFlags mflags) + manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags) { void *mem = NULL; vm_object_t *vm_obj = NULL; @@ -1515,7 +1515,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size, /* Allocate address space */ pthread_mutex_lock(&aperture->fmm_mutex); - mem = aperture_allocate_area(aperture, address, size); + mem = aperture_allocate_area_aligned(aperture, address, size, alignment); /* assing handle 0 to vm_obj since no mem allocted */ vm_obj = aperture_allocate_object(aperture, mem, 0, size, mflags); @@ -1533,7 +1533,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size, } void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, - uint64_t MemorySizeInBytes, HsaMemFlags mflags) + uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) { 
manageable_aperture_t *aperture; int32_t gpu_mem_id; @@ -1564,7 +1564,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, /* special case for va allocation without vram alloc */ if (mflags.ui32.OnlyAddress) - return fmm_allocate_va(gpu_id, address, size, aperture, mflags); + return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags); /* special case for vram allocation without addr */ if(mflags.ui32.NoAddress) @@ -1583,7 +1583,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT; mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, - ioc_flags, &vm_obj); + ioc_flags, alignment, &vm_obj); if (mem && vm_obj) { pthread_mutex_lock(&aperture->fmm_mutex); @@ -1637,7 +1637,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL, - ioc_flags, &vm_obj); + ioc_flags, 0, &vm_obj); if (mem && vm_obj) { HsaMemFlags mflags; @@ -1768,7 +1768,7 @@ static int bind_mem_to_numa(uint32_t node_id, void *mem, } static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address, - uint64_t MemorySizeInBytes, HsaMemFlags mflags) + uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) { manageable_aperture_t *aperture; vm_object_t *vm_obj = NULL; @@ -1822,7 +1822,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) { /* Allocate address space */ pthread_mutex_lock(&aperture->fmm_mutex); - mem = aperture_allocate_area(aperture, address, size); + mem = aperture_allocate_area_aligned(aperture, address, size, alignment); pthread_mutex_unlock(&aperture->fmm_mutex); if (!mem) return NULL; @@ -1854,7 +1854,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr } else 
{ ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT; mem = __fmm_allocate_device(preferred_gpu_id, address, size, aperture, - &mmap_offset, ioc_flags, &vm_obj); + &mmap_offset, ioc_flags, alignment, &vm_obj); if (mem && mflags.ui32.HostAccess) { void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes, @@ -1896,10 +1896,16 @@ out_release_area: } void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, - uint64_t MemorySizeInBytes, HsaMemFlags mflags) + uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) { if (is_dgpu) - return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, mflags); + return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags); + + if (alignment) {// fmm_allocate_host_cpu() below takes no alignment argument, so on non-dgpu only the default (0) is accepted + pr_err("Non-default alignment not supported on non-dgpu\n"); + return NULL; + } + return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags); } @@ -2365,7 +2371,7 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture, - &mmap_offset, ioc_flags, &vm_obj); + &mmap_offset, ioc_flags, 0, &vm_obj); if (!mem || !vm_obj) return NULL; @@ -3455,6 +3461,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) | (ext_coherent ? 
KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0), + 0, &obj); if (!svm_addr) return HSAKMT_STATUS_ERROR; diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.h b/projects/rocr-runtime/libhsakmt/src/fmm.h index 80ccfd559f..d40f9027e7 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.h +++ b/projects/rocr-runtime/libhsakmt/src/fmm.h @@ -52,10 +52,10 @@ void fmm_destroy_process_apertures(void); /* Memory interface */ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes); void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, - uint64_t MemorySizeInBytes, HsaMemFlags flags); + uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags); void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset); void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes, - HsaMemFlags flags); + uint64_t alignment, HsaMemFlags flags); void fmm_print(uint32_t node); HSAKMT_STATUS fmm_release(void *address); HSAKMT_STATUS fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address); diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c index b239b1a076..9178b56dc2 100644 --- a/projects/rocr-runtime/libhsakmt/src/memory.c +++ b/projects/rocr-runtime/libhsakmt/src/memory.c @@ -108,6 +108,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, HSAuint64 SizeInBytes, HsaMemFlags MemFlags, void **MemoryAddress) +{ + return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress); +} + +#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 
1 : 0) /* 1 if x is a non-zero power of two, else 0; x is evaluated more than once, so pass only side-effect-free expressions */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress) { HSAKMT_STATUS result; uint32_t gpu_id; @@ -128,6 +139,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, page_size = PageSizeFromFlags(MemFlags.ui32.PageSize); + if (Alignment && (Alignment < page_size || !POWER_OF_2(Alignment))) + return HSAKMT_STATUS_INVALID_PARAMETER; + if (!MemoryAddress || !SizeInBytes || (SizeInBytes & (page_size-1))) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -143,6 +157,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, return HSAKMT_STATUS_INVALID_PARAMETER; if (MemFlags.ui32.Scratch) { + if (Alignment) { + // Scratch memory currently forced to SCRATCH_ALIGN + pr_err("[%s] Alignment not supported for scratch memory: %u\n", __func__, PreferredNode); + return HSAKMT_STATUS_NOT_IMPLEMENTED; + } + *MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes); if (!(*MemoryAddress)) { @@ -165,7 +185,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, MemFlags.ui32.CoarseGrain = 1; *MemoryAddress = fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 
0 : PreferredNode, - *MemoryAddress, SizeInBytes, MemFlags); + *MemoryAddress, SizeInBytes, Alignment, MemFlags); if (!(*MemoryAddress)) { pr_err("[%s] failed to allocate %lu bytes from host\n", @@ -185,7 +205,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, } *MemoryAddress = fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress, - SizeInBytes, MemFlags); + SizeInBytes, Alignment, MemFlags); if (!(*MemoryAddress)) { pr_err("[%s] failed to allocate %lu bytes from device\n", diff --git a/projects/rocr-runtime/libhsakmt/src/queues.c b/projects/rocr-runtime/libhsakmt/src/queues.c index 5a5f079bd2..c263a7546a 100644 --- a/projects/rocr-runtime/libhsakmt/src/queues.c +++ b/projects/rocr-runtime/libhsakmt/src/queues.c @@ -337,7 +337,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g size = ALIGN_UP(size, align); if (DeviceLocal && !zfb_support) - mem = fmm_allocate_device(gpu_id, NodeId, mem, size, flags); + mem = fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags); else { /* VRAM under ZFB mode should be supported here without any * additional code @@ -352,7 +352,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g cpu_id = 0; } } - mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, flags); + mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags); } if (!mem) {