Make hsaKmtAllocMemory more compliant with the Thunk spec

Allocations from GPU nodes will return VRAM, not system memory. Only non-paged allocation from GPU nodes is supported. System memory can only be allocated from CPU nodes (usually node 0). The HostAccess flag is no longer used to distinguish the memory type; it only indicates whether the memory is mapped for CPU access. Maintain compatibility with broken KfdTests by returning system memory when paged memory is requested from GPU nodes. Change-Id: I514defede735f55e6de436f41944125b6f2c4ccf
2016-02-06 18:47:40 -05:00
@@ -75,9 +75,8 @@ hsaKmtCreateEvent(

 	/* dGPU code */
 	if (is_dgpu && events_page == NULL) {
-		events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8,
-			PAGE_SIZE,
-			args.node_id, true);
+		events_page = allocate_exec_aligned_memory_gpu(
+			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0);
 		if (!events_page) {
 			return HSAKMT_STATUS_ERROR;
 		}
@@ -137,6 +137,23 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 				       manageble_aperture_t *aperture,
 				       void *address);

+static int32_t find_first_dgpu(HSAuint32 *gpu_id) {
+	int32_t i;
+
+	*gpu_id = NON_VALID_GPU_ID;
+
+	for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) {
+		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
+			continue;
+		if (!topology_is_dgpu(gpu_mem[i].device_id))
+			continue;
+		*gpu_id = gpu_mem[i].gpu_id;
+		return i;
+	}
+
+	return -1;
+}
+
 static vm_area_t *vm_create_and_init_area(void *start, void *end)
 {
 	vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -720,6 +737,8 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
 	return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
 			aperture, offset, NULL,
 			flags);
+	/* TODO: honor host access mem flag and map to user mode VM if
+	 * needed */
 }

 static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
@@ -746,19 +765,19 @@ static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
 	return mem;
 }

-static void* fmm_allocate_host_gpu(uint32_t gpu_id,
-		uint64_t MemorySizeInBytes, HsaMemFlags flags)
+static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
+				   HsaMemFlags flags)
 {
 	void *mem;
 	manageble_aperture_t *aperture;
-	int32_t gpu_mem_id;
 	uint64_t mmap_offset;
 	uint32_t ioc_flags;
 	uint32_t size;
+	int32_t i;
+	uint32_t gpu_id;

-	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
-	if (gpu_mem_id < 0)
+	i = find_first_dgpu(&gpu_id);
+	if (i < 0)
 		return NULL;

 	size = MemorySizeInBytes;
@@ -776,30 +795,31 @@ static void* fmm_allocate_host_gpu(uint32_t gpu_id,
 			aperture, 0, &mmap_offset,
 			ioc_flags);

-	/* FIXME: host memory allocated in this way should be mapped on all GPUs */
-	void *ret = mmap(mem, MemorySizeInBytes,
-			PROT_READ | PROT_WRITE,
-		       MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	if (ret == MAP_FAILED) {
-		__fmm_release(mem, MemorySizeInBytes, aperture);
-		return NULL;
+	if (flags.ui32.HostAccess) {
+		void *ret = mmap(mem, MemorySizeInBytes,
+				 PROT_READ | PROT_WRITE,
+				 MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		if (ret == MAP_FAILED) {
+			__fmm_release(mem, MemorySizeInBytes, aperture);
+			return NULL;
+		}
+		if (flags.ui32.AQLQueueMemory) {
+			uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
+			memset(ret, 0, MemorySizeInBytes);
+			mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
+			     PROT_READ | PROT_WRITE,
+			     MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		}
 	}

-	if (flags.ui32.AQLQueueMemory) {
-		uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
-		memset(ret, 0, MemorySizeInBytes);
-		mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
-			PROT_READ | PROT_WRITE,
-		       MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	}

-	return ret;
+	return mem;
 }

-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags, uint16_t dev_id)
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags)
 {
-	if (topology_is_dgpu(dev_id))
-		return fmm_allocate_host_gpu(gpu_id, MemorySizeInBytes, flags);
+	if (is_dgpu)
+		return fmm_allocate_host_gpu(MemorySizeInBytes, flags);
 	return fmm_allocate_host_cpu(MemorySizeInBytes, flags);
 }

@@ -51,8 +51,7 @@ void fmm_destroy_process_apertures(void);
 */
 void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes);
-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes,
-		HsaMemFlags flags, uint16_t dev_id);
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags);
 void* fmm_open_graphic_handle(uint32_t gpu_id,
        int32_t graphic_device_handle,
        uint32_t graphic_handle,
@@ -74,7 +74,7 @@ bool topology_is_dgpu(uint16_t device_id);
 HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);

 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-		uint32_t NodeId, bool peer_to_peer);
+                                       uint32_t NodeId);
 void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
 HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
 void destroy_process_doorbells(void);
@@ -132,30 +132,16 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}

-	if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
-		if (gpu_id == 0 && PreferredNode == 0) {
-			/* HACK: Currently we need a GPU node for
-			 * system memory allocations on dGPUs and
-			 * MapMemoryToGPU will always map to the same
-			 * GPU used for allocation. Therefore we need
-			 * to allocate system memory from node 1 if
-			 * we're running on a dGPU (indicated by node
-			 * 0 being a CPU with gpu_id==0). This will be
-			 * cleaned up when multi-GPU support is
-			 * implemented. */
-			PreferredNode = 1;
-			result = validate_nodeid(PreferredNode, &gpu_id);
-			if (result != HSAKMT_STATUS_SUCCESS)
-				return result;
-		}
-		*MemoryAddress = fmm_allocate_host(gpu_id, SizeInBytes, MemFlags,
-				get_device_id_by_node(PreferredNode));
+	if (gpu_id == 0 && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
 		if (*MemoryAddress == NULL)
 			return HSAKMT_STATUS_ERROR;
+
 		return HSAKMT_STATUS_SUCCESS;
 	}

-	if (!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+	if (gpu_id && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
 		*MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes);

 		if (*MemoryAddress == NULL)
@@ -172,6 +158,17 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_SUCCESS;
 	}

+	/* Backwards compatibility hack: Allocate system memory if app
+	 * asks for paged memory from a GPU node. */
+	if (gpu_id && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
+		if (*MemoryAddress == NULL)
+			return HSAKMT_STATUS_ERROR;
+
+		return HSAKMT_STATUS_SUCCESS;
+	}
+
 	return HSAKMT_STATUS_INVALID_PARAMETER;
 }

@@ -218,7 +218,7 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
 }

 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-		uint32_t NodeId, bool peer_to_peer)
+				       uint32_t NodeId)
 {
 	void *mem;
 	HSAuint64 gpu_va;
@@ -232,12 +232,12 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,

 	size = ALIGN_UP(size, align);

-	ret = hsaKmtAllocMemory(NodeId, size, flags, &mem);
+	ret = hsaKmtAllocMemory(0, size, flags, &mem);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		return NULL;
 	}

-	if (!peer_to_peer) {
+	if (NodeId != 0) {
 		uint32_t nodes_array[1] = {NodeId};
 		if (hsaKmtRegisterMemoryToNodes(mem, size, 1, nodes_array)
 		    != HSAKMT_STATUS_SUCCESS) {
@@ -269,7 +269,7 @@ static void* allocate_exec_aligned_memory(uint32_t size,
 					uint32_t NodeId)
 {
 	if (IS_DGPU(type))
-		return allocate_exec_aligned_memory_gpu(size, align, NodeId, false);
+		return allocate_exec_aligned_memory_gpu(size, align, NodeId);
 	return allocate_exec_aligned_memory_cpu(size, align);
 }