Fix various dgpu memory management issues

Fix TONGA_PAGE_SIZE value and move it to libhsakmt.h so that it is used consistently in all places that require the same alignment for the same reason. Create a generic alignment helper macro to replace some incorrect hand-coded size alignments. Move virtual address and size alignments down into aperture management functions. Alignment is a per-aperture property that is set during fmm_init_process_apertures. Doing the alignment there ensures that all allocations in the same aperture are aligned the same way. Finding objects by size and address can take the alignment into account. Also align the size of physical allocations to back aligned virtual address allocations. CPU mappings do not need to be aligned. Map anonymous pages over released memory mappings to allow the backing pages to be released, while keeping the address space reserved. Add alignment parameter to free_exec_aligned_memory_gpu to match the interface of allocate_exec_aligned_memory_cpu. It doesn't make sense to allow an alignment parameter in one but assume a specific alignment in the other. Change-Id: I74226ca6938f4948f643e5aee1d474720cd89e78 [ROCm/ROCR-Runtime commit: 6a5ca4bc5a]
2015-10-13 19:05:39 -04:00
@@ -74,7 +74,7 @@ hsaKmtCreateEvent(

 	/* dGPU code */
 	if (is_dgpu && events_page == NULL) {
-		events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, 0x9000);
+		events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8, TONGA_PAGE_SIZE);
 		if (!events_page) {
 			return HSAKMT_STATUS_ERROR;
 		}
@@ -42,6 +42,7 @@
 #define INIT_MANAGEBLE_APERTURE(base_value, limit_value) {	\
 	.base = (void *) base_value,				\
 	.limit = (void *) limit_value,				\
+	.align = PAGE_SIZE,					\
 	.vm_ranges = NULL,					\
 	.vm_objects = NULL,					\
 	.fmm_mutex = PTHREAD_MUTEX_INITIALIZER			\
@@ -78,6 +79,7 @@ typedef struct vm_area vm_area_t;
 typedef struct {
 	void *base;
 	void *limit;
+	uint64_t align;
 	vm_area_t *vm_ranges;
 	vm_object_t *vm_objects;
 	pthread_mutex_t fmm_mutex;
@@ -103,6 +105,8 @@ typedef struct {
 } gpu_mem_t;

 static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM;
+static void *dgpu_shared_aperture_base = NULL;
+static void *dgpu_shared_aperture_limit = NULL;

 static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit);
 static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit);
@@ -224,6 +228,8 @@ static vm_object_t *vm_find_object_by_address(manageble_aperture_t *app,
 {
 	vm_object_t *cur = app->vm_objects;

+	size = ALIGN_UP(size, app->align);
+
 	/* Look up the appropriate address range containing the given address */
 	while (cur) {
 		if (cur->start == address && (cur->size == size || size == 0))
@@ -264,6 +270,8 @@ static void aperture_release_area(manageble_aperture_t *app, void *address,
 	vm_area_t *area;
 	uint64_t SizeOfRegion;

+	MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align);
+
 	area = vm_find(app, address);
 	if (!area)
 		return;
@@ -302,6 +310,8 @@ static void *aperture_allocate_area(manageble_aperture_t *app,
 	next = NULL;
 	new_area = NULL;

+	MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align);
+
 	cur = app->vm_ranges;
 	if (cur) { /* not empty */
 		/*
@@ -358,6 +368,8 @@ static int aperture_allocate_object(manageble_aperture_t *app,
 {
 	vm_object_t *new_object;

+	MemorySizeInBytes = ALIGN_UP(MemorySizeInBytes, app->align);
+
 	/* Allocate new object */
 	new_object = vm_create_and_init_object(new_address,
 						MemorySizeInBytes,
@@ -400,7 +412,7 @@ static int fmm_allocate_memory_in_device(uint32_t gpu_id, void *mem,

 	/* Allocate memory from amdkfd */
 	args.gpu_id = gpu_id;
-	args.size = MemorySizeInBytes;
+	args.size = ALIGN_UP(MemorySizeInBytes, aperture->align);

 	args.flags = flags;
 	args.va_addr = (uint64_t)mem;
@@ -596,8 +608,6 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)

 	if (topology_is_dgpu(get_device_id_by_gpu_id(gpu_id))) {
 		flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE;
-		/* Alignment is needed to match a workaround for a VI HW bug in the kernel */
-		MemorySizeInBytes = (MemorySizeInBytes + 0x7fffULL) & ~0x7fffULL;
 		/*
 		 * TODO: Once VA limit is raised from 0x200000000 (8GB) use gpuvm_aperture.
 		 * In that way the host access range won't be used for local memory
@@ -655,10 +665,6 @@ static void* fmm_allocate_host_gpu(uint32_t gpu_id,
 	else
 		aperture = &gpu_mem[gpu_mem_id].dgpu_alt_aperture; /* coherent */

-	/* Alignment is needed to match a workaround for a VI HW bug in the kernel */
-	/* FIXME: this breaks fmm_release! */
-	MemorySizeInBytes = (MemorySizeInBytes + 0x7fffULL) & ~0x7fffULL;
-
 	mem =  __fmm_allocate_device(gpu_id, MemorySizeInBytes,
 			aperture, 0, &mmap_offset,
 			KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST);
@@ -806,6 +812,14 @@ void fmm_release(void *address, uint64_t MemorySizeInBytes)
 		}
 	}

+	if (found &&
+	    address >= dgpu_shared_aperture_base &&
+	    address <= dgpu_shared_aperture_limit) {
+		/* Remove any CPU mapping, but keep the address range reserved */
+		mmap(address, MemorySizeInBytes, PROT_READ | PROT_WRITE,
+		     MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0);
+	}
+
 	/*
 	 * If memory address isn't inside of any defined aperture - it refers
 	 * to the system memory
@@ -872,6 +886,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
 						&gpu_mem[node_id].dgpu_aperture.limit);
 				set_dgpu_aperture(node_id, (uint64_t)gpu_mem[node_id].dgpu_aperture.base,
 						(uint64_t)gpu_mem[node_id].dgpu_aperture.limit);
+				gpu_mem[node_id].dgpu_aperture.align = TONGA_PAGE_SIZE;

 				/* Place GPUVM aperture after dGPU aperture
 				 * (FK: I think this is broken but leaving it for now) */
@@ -880,6 +895,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
 						gpu_mem[node_id].dgpu_aperture.base);
 				gpu_mem[node_id].gpuvm_aperture.limit = VOID_PTR_ADD(gpu_mem[node_id].gpuvm_aperture.limit,
 						(unsigned long)gpu_mem[node_id].gpuvm_aperture.base);
+				gpu_mem[node_id].gpuvm_aperture.align = TONGA_PAGE_SIZE;

 				/* Use the first 1/4 of the dGPU aperture as
 				 * alternate aperture for coherent access.
@@ -900,6 +916,7 @@ HSAKMT_STATUS fmm_init_process_apertures(void)
 					fprintf(stderr, "Error! Failed to set alt aperture for node %d\n", node_id);
 					ret = HSAKMT_STATUS_ERROR;
 				}
+				gpu_mem[node_id].dgpu_alt_aperture.align = TONGA_PAGE_SIZE;
 			}
 		}
 	}
@@ -1129,8 +1146,6 @@ int fmm_unmap_from_gpu(void *address)

 /* Tonga dGPU specific functions */
 static bool is_dgpu_mem_init = false;
-static void *dgpu_shared_aperture_base = NULL;
-static void *dgpu_shared_aperture_limit = NULL;

 static int set_dgpu_aperture(uint32_t node_id, uint64_t base, uint64_t limit)
 {
@@ -1208,9 +1223,9 @@ static HSAKMT_STATUS dgpu_mem_init(uint8_t node_id, void **base, void **limit)
 	max_len = props.LocalMemSize;
 	found = false;

-	for (addr = (void *)PAGE_SIZE, ret_addr = NULL;
+	for (addr = (void *)TONGA_PAGE_SIZE, ret_addr = NULL;
 		ret_addr != addr;
-		addr = (void *)((unsigned long)addr + 0x8000))
+		addr = (void *)((unsigned long)addr + TONGA_PAGE_SIZE))
 	{
 		ret_addr = reserve_address(addr, max_len);
 		if (!ret_addr)
@@ -49,11 +49,14 @@ extern bool is_dgpu;
 	do { if (kfd_open_count == 0) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)

 #define PAGE_SIZE 4096
+/* VI HW bug requires this virtual address alignment */
+#define TONGA_PAGE_SIZE 0x8000

 #define CHECK_PAGE_MULTIPLE(x) \
 	do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)

-#define PAGE_ALIGN_UP(x) (((uint64_t)(x) + PAGE_SIZE - 1) & ~(uint64_t)(PAGE_SIZE-1))
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
 #define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL)

 /*
@@ -74,7 +77,7 @@ bool topology_is_dgpu(uint16_t gpu_id);
 HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);

 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align);
-void free_exec_aligned_memory_gpu(void *addr, uint32_t size);
+void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);

 extern int kmtIoctl(int fd, unsigned long request, void *arg);

@@ -34,8 +34,6 @@
 #include <sys/mman.h>
 #include <fcntl.h>

-#define TONGA_PAGE_SIZE 0x9000
-
 /* 1024 doorbells, 4 bytes each doorbell */
 #define DOORBELLS_PAGE_SIZE	1024 * 4

@@ -201,7 +199,7 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align)
 	flags.ui32.ExecuteAccess = 1;
 	flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;

-	size += align - (size % align);
+	size = ALIGN_UP(size, align);

 	ret = hsaKmtAllocMemory(0, size, flags, &mem);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
@@ -215,9 +213,9 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align)
 	return mem;
 }

-void free_exec_aligned_memory_gpu(void *addr, uint32_t size)
+void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
 {
-	size += TONGA_PAGE_SIZE - (size % TONGA_PAGE_SIZE);
+	size = ALIGN_UP(size, align);

 	if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) {
 		hsaKmtFreeMemory(addr, size);