diff --git a/src/events.c b/src/events.c
index 50c3e1d3bf..a1359d6647 100644
--- a/src/events.c
+++ b/src/events.c
@@ -75,9 +75,8 @@ hsaKmtCreateEvent(
 
 	/* dGPU code */
 	if (is_dgpu && events_page == NULL) {
-		events_page = allocate_exec_aligned_memory_gpu(KFD_SIGNAL_EVENT_LIMIT * 8,
-			PAGE_SIZE,
-			args.node_id, true);
+		events_page = allocate_exec_aligned_memory_gpu(
+			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0);
 		if (!events_page) {
 			return HSAKMT_STATUS_ERROR;
 		}
diff --git a/src/fmm.c b/src/fmm.c
index 7c6837843b..6976a44723 100644
--- a/src/fmm.c
+++ b/src/fmm.c
@@ -137,6 +137,23 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 				       manageble_aperture_t *aperture,
 				       void *address);
 
+static int32_t find_first_dgpu(HSAuint32 *gpu_id) {
+	int32_t i;
+	/* Returns the gpu_mem[] index of the first usable dGPU entry, or -1. */
+	*gpu_id = NON_VALID_GPU_ID; /* value the caller sees when nothing matches */
+	/* Entries are scanned in index order; the first match wins. */
+	for (i = 0; i < NUM_OF_SUPPORTED_GPUS; i++) {
+		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
+			continue; /* slot has no valid gpu_id */
+		if (!topology_is_dgpu(gpu_mem[i].device_id))
+			continue; /* topology_is_dgpu() rejected this device_id */
+		*gpu_id = gpu_mem[i].gpu_id;
+		return i;
+	}
+	/* No entry passed both checks. */
+	return -1;
+}
+
 static vm_area_t *vm_create_and_init_area(void *start, void *end)
 {
 	vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -720,6 +737,8 @@ void *fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes)
 	return __fmm_allocate_device(gpu_id, MemorySizeInBytes,
 			aperture, offset, NULL,
 			flags);
+	/* TODO: honor the HostAccess memory flag and map the allocation
+	 * into the user-mode VM if needed */
 }
 
 static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
@@ -746,19 +765,19 @@ static void* fmm_allocate_host_cpu(uint64_t MemorySizeInBytes,
 	return mem;
 }
 
-static void* fmm_allocate_host_gpu(uint32_t gpu_id,
-		uint64_t MemorySizeInBytes, HsaMemFlags flags)
+static void* fmm_allocate_host_gpu(uint64_t MemorySizeInBytes,
+				   HsaMemFlags flags)
 {
 	void *mem;
 	manageble_aperture_t *aperture;
-	int32_t gpu_mem_id;
 	uint64_t mmap_offset;
 	uint32_t ioc_flags;
 	uint32_t size;
+	int32_t i;
+	uint32_t gpu_id;
 
-	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
-	if (gpu_mem_id < 0)
+	i = find_first_dgpu(&gpu_id);
+	if (i < 0)
 		return NULL;
 
 	size = MemorySizeInBytes;
@@ -776,30 +795,31 @@ static void* fmm_allocate_host_gpu(uint32_t gpu_id,
 			aperture, 0, &mmap_offset,
 			ioc_flags);
 
-	/* FIXME: host memory allocated in this way should be mapped on all GPUs */
-	void *ret = mmap(mem, MemorySizeInBytes,
-			PROT_READ | PROT_WRITE,
-		       MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	if (ret == MAP_FAILED) {
-		__fmm_release(mem, MemorySizeInBytes, aperture);
-		return NULL;
+	if (flags.ui32.HostAccess) {
+		void *ret = mmap(mem, MemorySizeInBytes,
+				 PROT_READ | PROT_WRITE,
+				 MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		if (ret == MAP_FAILED) {
+			__fmm_release(mem, MemorySizeInBytes, aperture);
+			return NULL;
+		}
+		if (flags.ui32.AQLQueueMemory) {
+			uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
+			memset(ret, 0, MemorySizeInBytes);
+			mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
+			     PROT_READ | PROT_WRITE,
+			     MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
+		}
 	}
 
-	if (flags.ui32.AQLQueueMemory) {
-		uint64_t my_buf_size = ALIGN_UP(size, aperture->align) / 2;
-		memset(ret, 0, MemorySizeInBytes);
-		mmap(VOID_PTR_ADD(mem, my_buf_size), MemorySizeInBytes,
-			PROT_READ | PROT_WRITE,
-		       MAP_SHARED | MAP_FIXED, kfd_fd , mmap_offset);
-	}
 
-	return ret;
+	return mem;
 }
 
-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes, HsaMemFlags flags, uint16_t dev_id)
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags) /* caller no longer names a GPU or device */
 {
-	if (topology_is_dgpu(dev_id))
-		return fmm_allocate_host_gpu(gpu_id, MemorySizeInBytes, flags);
+	if (is_dgpu) /* flag declared outside this hunk; replaces the per-device topology_is_dgpu() check */
+		return fmm_allocate_host_gpu(MemorySizeInBytes, flags); /* GPU path picks its own GPU; NULL on failure */
 	return fmm_allocate_host_cpu(MemorySizeInBytes, flags);
 }
 
diff --git a/src/fmm.h b/src/fmm.h
index 881413bbdf..75b5cac4a7 100644
--- a/src/fmm.h
+++ b/src/fmm.h
@@ -51,8 +51,7 @@ void fmm_destroy_process_apertures(void);
  */
 void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes);
 void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes);
-void* fmm_allocate_host(uint32_t gpu_id, uint64_t MemorySizeInBytes,
-		HsaMemFlags flags, uint16_t dev_id);
+void* fmm_allocate_host(uint64_t MemorySizeInBytes, HsaMemFlags flags);
 void* fmm_open_graphic_handle(uint32_t gpu_id,
         int32_t graphic_device_handle,
         uint32_t graphic_handle,
diff --git a/src/libhsakmt.h b/src/libhsakmt.h
index d277da07b0..e7c3975646 100644
--- a/src/libhsakmt.h
+++ b/src/libhsakmt.h
@@ -74,7 +74,7 @@ bool topology_is_dgpu(uint16_t device_id);
 HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
 
 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-		uint32_t NodeId, bool peer_to_peer);
+                                       uint32_t NodeId);
 void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
 HSAKMT_STATUS init_process_doorbells(unsigned int NumNodes);
 void destroy_process_doorbells(void);
diff --git a/src/memory.c b/src/memory.c
index 514673d003..08b08bd734 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -132,30 +132,16 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
 
-	if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
-		if (gpu_id == 0 && PreferredNode == 0) {
-			/* HACK: Currently we need a GPU node for
-			 * system memory allocations on dGPUs and
-			 * MapMemoryToGPU will always map to the same
-			 * GPU used for allocation. Therefore we need
-			 * to allocate system memory from node 1 if
-			 * we're running on a dGPU (indicated by node
-			 * 0 being a CPU with gpu_id==0). This will be
-			 * cleaned up when multi-GPU support is
-			 * implemented. */
-			PreferredNode = 1;
-			result = validate_nodeid(PreferredNode, &gpu_id);
-			if (result != HSAKMT_STATUS_SUCCESS)
-				return result;
-		}
-		*MemoryAddress = fmm_allocate_host(gpu_id, SizeInBytes, MemFlags,
-				get_device_id_by_node(PreferredNode));
+	if (gpu_id == 0 && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
 		if (*MemoryAddress == NULL)
 			return HSAKMT_STATUS_ERROR;
+
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	if (!MemFlags.ui32.HostAccess && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+	if (gpu_id && MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
 		*MemoryAddress = fmm_allocate_device(gpu_id, SizeInBytes);
 
 		if (*MemoryAddress == NULL)
@@ -172,6 +158,17 @@ hsaKmtAllocMemory(
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
+	/* Backwards compatibility hack: Allocate system memory if app
+	 * asks for paged memory from a GPU node. */
+	if (gpu_id && !MemFlags.ui32.NonPaged && !MemFlags.ui32.Scratch) {
+		*MemoryAddress = fmm_allocate_host(SizeInBytes, MemFlags);
+
+		if (*MemoryAddress == NULL)
+			return HSAKMT_STATUS_ERROR;
+
+		return HSAKMT_STATUS_SUCCESS;
+	}
+
 	return HSAKMT_STATUS_INVALID_PARAMETER;
 }
 
diff --git a/src/queues.c b/src/queues.c
index e7e26b87ca..0867548e7d 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -218,7 +218,7 @@ static void* allocate_exec_aligned_memory_cpu(uint32_t size, uint32_t align)
 }
 
 void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
-		uint32_t NodeId, bool peer_to_peer)
+				       uint32_t NodeId)
 {
 	void *mem;
 	HSAuint64 gpu_va;
@@ -232,12 +232,12 @@ void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
 
 	size = ALIGN_UP(size, align);
 
-	ret = hsaKmtAllocMemory(NodeId, size, flags, &mem);
+	ret = hsaKmtAllocMemory(0, size, flags, &mem);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		return NULL;
 	}
 
-	if (!peer_to_peer) {
+	if (NodeId != 0) {
 		uint32_t nodes_array[1] = {NodeId};
 		if (hsaKmtRegisterMemoryToNodes(mem, size, 1, nodes_array)
 		    != HSAKMT_STATUS_SUCCESS) {
@@ -269,7 +269,7 @@ static void* allocate_exec_aligned_memory(uint32_t size,
 					uint32_t NodeId)
 {
 	if (IS_DGPU(type))
-		return allocate_exec_aligned_memory_gpu(size, align, NodeId, false);
+		return allocate_exec_aligned_memory_gpu(size, align, NodeId);
 	return allocate_exec_aligned_memory_cpu(size, align);
 }