libhsakmt: Add alignment for memory allocations

Add hsaKmtAllocMemoryAlign(), a new API that accepts an optional alignment parameter for memory allocations. A non-zero alignment must be a power of 2 and greater than or equal to the page size; an alignment of 0 selects the default alignment. Change-Id: Ic3fec43b3c4281f74dd33a57ab4143dcf76e1186 Signed-off-by: Chris Freehill <cfreehil@amd.com> [ROCm/ROCR-Runtime commit: a31e84eaef]
2024-05-16 16:32:23 +00:00
@@ -406,6 +406,21 @@ hsaKmtAllocMemory(
    void**          MemoryAddress           //IN/OUT (page-aligned)
    );

+/**
+  Allocates a memory buffer that may be accessed by the GPU, aligned to the requested Alignment.
+  Alignment is either 0 (default alignment) or a power of 2 >= page size, else HSAKMT_STATUS_INVALID_PARAMETER is returned; non-zero values are rejected for scratch memory.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAllocMemoryAlign(
+    HSAuint32       PreferredNode,          //IN
+    HSAuint64       SizeInBytes,            //IN  (non-zero multiple of page size)
+    HSAuint64       Alignment,              //IN  (0 for default, or power of 2 and >= page size)
+    HsaMemFlags     MemFlags,               //IN
+    void**          MemoryAddress           //IN/OUT (page-aligned)
+    );
+
 /**
  Frees a memory buffer
 */
@@ -856,7 +856,7 @@ static void *aperture_allocate_area_aligned(manageable_aperture_t *app,
 					    uint64_t MemorySizeInBytes,
 					    uint64_t align)
 {
-	return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align);
+	return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes, align ? align : app->align);
 }
 static void *aperture_allocate_area(manageable_aperture_t *app, void *address,
 				    uint64_t MemorySizeInBytes)
@@ -1448,7 +1448,7 @@ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeIn

 static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
 		manageable_aperture_t *aperture, uint64_t *mmap_offset,
-		uint32_t ioc_flags, vm_object_t **vm_obj)
+		uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj)
 {
 	void *mem = NULL;
 	vm_object_t *obj;
@@ -1459,7 +1459,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo

 	/* Allocate address space */
 	pthread_mutex_lock(&aperture->fmm_mutex);
-	mem = aperture_allocate_area(aperture, address, MemorySizeInBytes);
+	mem = aperture_allocate_area_aligned(aperture, address, MemorySizeInBytes, alignment);
 	pthread_mutex_unlock(&aperture->fmm_mutex);

 	/*
@@ -1504,7 +1504,7 @@ static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access,
 }

 static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
-			manageable_aperture_t *aperture, HsaMemFlags mflags)
+			manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags)
 {
 	void *mem = NULL;
 	vm_object_t *vm_obj = NULL;
@@ -1515,7 +1515,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,

 	/* Allocate address space */
 	pthread_mutex_lock(&aperture->fmm_mutex);
-	mem = aperture_allocate_area(aperture, address, size);
+	mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
 	/* assing handle 0 to vm_obj since no mem allocted */
 	vm_obj = aperture_allocate_object(aperture, mem, 0,
 					size, mflags);
@@ -1533,7 +1533,7 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
 }

 void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
-			  uint64_t MemorySizeInBytes, HsaMemFlags mflags)
+			  uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
 {
 	manageable_aperture_t *aperture;
 	int32_t gpu_mem_id;
@@ -1564,7 +1564,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,

 	/* special case for va allocation without vram alloc */
 	if (mflags.ui32.OnlyAddress)
-		return fmm_allocate_va(gpu_id, address, size, aperture, mflags);
+		return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags);

 	/* special case for vram allocation without addr */
 	if(mflags.ui32.NoAddress)
@@ -1583,7 +1583,7 @@ void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;

 	mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
-				    ioc_flags, &vm_obj);
+				    ioc_flags, alignment, &vm_obj);

 	if (mem && vm_obj) {
 		pthread_mutex_lock(&aperture->fmm_mutex);
@@ -1637,7 +1637,7 @@ void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 		    KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;

 	mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL,
-				    ioc_flags, &vm_obj);
+				    ioc_flags, 0, &vm_obj);

 	if (mem && vm_obj) {
 		HsaMemFlags mflags;
@@ -1768,7 +1768,7 @@ static int bind_mem_to_numa(uint32_t node_id, void *mem,
 }

 static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address,
-				   uint64_t MemorySizeInBytes, HsaMemFlags mflags)
+				   uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
 {
 	manageable_aperture_t *aperture;
 	vm_object_t *vm_obj = NULL;
@@ -1822,7 +1822,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
 	if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) {
 		/* Allocate address space */
 		pthread_mutex_lock(&aperture->fmm_mutex);
-		mem = aperture_allocate_area(aperture, address, size);
+		mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		if (!mem)
 			return NULL;
@@ -1854,7 +1854,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
 	} else {
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT;
 		mem =  __fmm_allocate_device(preferred_gpu_id, address, size, aperture,
-					     &mmap_offset, ioc_flags, &vm_obj);
+					     &mmap_offset, ioc_flags, alignment, &vm_obj);

 		if (mem && mflags.ui32.HostAccess) {
 			void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
@@ -1896,10 +1896,16 @@ out_release_area:
 }

 void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
-			uint64_t MemorySizeInBytes, HsaMemFlags mflags)
+			uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
 {
 	if (is_dgpu)
-		return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, mflags);
+		return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags);
+	/* Non-dGPU path: fmm_allocate_host_cpu() has no alignment argument, so only the default (0) is accepted. */
+	if (alignment) {
+		pr_err("Non-default alignment not supported on non-dgpu\n");
+		return NULL;
+	}
+
 	return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
 }

@@ -2365,7 +2371,7 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
 		KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 		KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
 	mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture,
-			&mmap_offset, ioc_flags, &vm_obj);
+			&mmap_offset, ioc_flags, 0, &vm_obj);

 	if (!mem || !vm_obj)
 		return NULL;
@@ -3455,6 +3461,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
 			 KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
 			 (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
 			 (ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
+			 0,
 			 &obj);
 	if (!svm_addr)
 		return HSAKMT_STATUS_ERROR;
@@ -52,10 +52,10 @@ void fmm_destroy_process_apertures(void);
 /* Memory interface */
 void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes);
 void *fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
-			uint64_t MemorySizeInBytes, HsaMemFlags flags);
+			uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags);
 void *fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
 void *fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes,
-			HsaMemFlags flags);
+			uint64_t alignment, HsaMemFlags flags);
 void fmm_print(uint32_t node);
 HSAKMT_STATUS fmm_release(void *address);
 HSAKMT_STATUS fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
@@ -108,6 +108,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
 					  HSAuint64 SizeInBytes,
 					  HsaMemFlags MemFlags,
 					  void **MemoryAddress)
+{
+	return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
+}
+/* 1 if x is a non-zero power of two, else 0. x is expanded more than once: pass a side-effect-free expression. */
+#define POWER_OF_2(x) (((x) && !((x) & ((x) - 1))) ? 1 : 0)
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+					  HSAuint64 SizeInBytes,
+					  HSAuint64 Alignment,
+					  HsaMemFlags MemFlags,
+					  void **MemoryAddress)
 {
 	HSAKMT_STATUS result;
 	uint32_t gpu_id;
@@ -128,6 +139,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,

 	page_size = PageSizeFromFlags(MemFlags.ui32.PageSize);

+	if (Alignment && (Alignment < page_size || !POWER_OF_2(Alignment)))
+		return HSAKMT_STATUS_INVALID_PARAMETER;
+
 	if (!MemoryAddress || !SizeInBytes || (SizeInBytes & (page_size-1)))
 		return HSAKMT_STATUS_INVALID_PARAMETER;

@@ -143,6 +157,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
 		return HSAKMT_STATUS_INVALID_PARAMETER;

 	if (MemFlags.ui32.Scratch) {
+		if (Alignment) {
+			/* Scratch memory is currently forced to SCRATCH_ALIGN, so a caller-chosen alignment cannot be honoured. */
+			pr_err("[%s] Alignment not supported for scratch memory: %u\n", __func__, PreferredNode);
+			return HSAKMT_STATUS_NOT_IMPLEMENTED;
+		}
+
 		*MemoryAddress = fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);

 		if (!(*MemoryAddress)) {
@@ -165,7 +185,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
 			MemFlags.ui32.CoarseGrain = 1;

 		*MemoryAddress = fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
-											*MemoryAddress, SizeInBytes, MemFlags);
+						   *MemoryAddress, SizeInBytes, Alignment, MemFlags);

 		if (!(*MemoryAddress)) {
 			pr_err("[%s] failed to allocate %lu bytes from host\n",
@@ -185,7 +205,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
 	}

 	*MemoryAddress = fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress,
-					     SizeInBytes, MemFlags);
+					     SizeInBytes, Alignment, MemFlags);

 	if (!(*MemoryAddress)) {
 		pr_err("[%s] failed to allocate %lu bytes from device\n",
@@ -337,7 +337,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
 	size = ALIGN_UP(size, align);

 	if (DeviceLocal && !zfb_support)
-		mem = fmm_allocate_device(gpu_id, NodeId, mem, size, flags);
+		mem = fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags);
 	else {
 		/* VRAM under ZFB mode should be supported here without any
 		 * additional code
@@ -352,7 +352,7 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t g
 				cpu_id = 0;
 			}
 		}
-		mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, flags);
+		mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags);
 	}

 	if (!mem) {