diff --git a/projects/rocr-runtime/src/fmm.c b/projects/rocr-runtime/src/fmm.c
index ba810e088f..e2374a403f 100644
--- a/projects/rocr-runtime/src/fmm.c
+++ b/projects/rocr-runtime/src/fmm.c
@@ -171,6 +171,8 @@ typedef struct {
  */
 static gpu_mem_t *gpu_mem;
 static unsigned int gpu_mem_count;
+static gpu_mem_t *g_first_gpu_mem;
+
 static bool hsa_debug;
 static void *dgpu_shared_aperture_base;
 static void *dgpu_shared_aperture_limit;
@@ -238,24 +240,6 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 				       void *address);
 static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size);
 
-static int32_t find_first_dgpu(HSAuint32 *gpu_id)
-{
-	uint32_t i;
-
-	*gpu_id = NON_VALID_GPU_ID;
-
-	for (i = 0; i < gpu_mem_count; i++) {
-		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
-			continue;
-		if (!topology_is_dgpu(gpu_mem[i].device_id))
-			continue;
-		*gpu_id = gpu_mem[i].gpu_id;
-		return i;
-	}
-
-	return -1;
-}
-
 static vm_area_t *vm_create_and_init_area(void *start, void *end)
 {
 	vm_area_t *area = (vm_area_t *) malloc(sizeof(vm_area_t));
@@ -1213,14 +1197,16 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
 	uint64_t mmap_offset;
 	uint32_t ioc_flags;
 	uint64_t size;
-	int32_t gpu_mem_id;
+	int32_t gpu_drm_fd;
 	uint32_t gpu_id;
 	vm_object_t *vm_obj = NULL;
 
-	gpu_mem_id = find_first_dgpu(&gpu_id);
-	if (gpu_mem_id < 0)
+	if (!g_first_gpu_mem)
 		return NULL;
 
+	gpu_id = g_first_gpu_mem->gpu_id;
+	gpu_drm_fd = g_first_gpu_mem->drm_render_fd;
+
 	size = MemorySizeInBytes;
 	ioc_flags = 0;
 	if (flags.ui32.CoarseGrain)
@@ -1296,8 +1282,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, uint64_t MemorySizeInBytes,
 					     ioc_flags, &vm_obj);
 
 		if (mem && flags.ui32.HostAccess) {
-			int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
-						gpu_mem[gpu_mem_id].drm_render_fd;
+			int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd : gpu_drm_fd;
 			void *ret = mmap(mem, MemorySizeInBytes,
 					 PROT_READ | PROT_WRITE,
 					 MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
@@ -1709,7 +1694,7 @@ static void fmm_init_rbtree(void)
 
 HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 {
-	uint32_t i = 0;
+	uint32_t i;
 	int32_t gpu_mem_id = 0;
 	uint32_t gpu_id;
 	HsaNodeProperties props;
@@ -1748,6 +1733,9 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 	if (!guardPagesStr || sscanf(guardPagesStr, "%u", &guardPages) != 1)
 		guardPages = 1;
 
+	gpu_mem_count = 0;
+	g_first_gpu_mem = NULL;
+
 	/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
 	 * systems with CPU node, slightly more memory is allocated than
 	 * necessary
@@ -1760,10 +1748,10 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 	 * set to 0 by calloc. This is necessary because this function
 	 * gets called before hsaKmtAcquireSystemProperties() is called.
 	 */
-	gpu_mem_count = 0;
+
 	pacc = pci_alloc();
 	pci_init(pacc);
-	while (i < NumNodes) {
+	for (i = 0; i < NumNodes; i++) {
 		memset(&props, 0, sizeof(props));
 		ret = topology_sysfs_get_node_props(i, &props, &gpu_id, pacc);
 		if (ret != HSAKMT_STATUS_SUCCESS)
@@ -1790,9 +1778,12 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 				get_vm_alignment(props.DeviceId);
 			gpu_mem[gpu_mem_count].gpuvm_aperture.guard_pages = guardPages;
 			pthread_mutex_init(&gpu_mem[gpu_mem_count].gpuvm_aperture.fmm_mutex, NULL);
+
+			if (!g_first_gpu_mem)
+				g_first_gpu_mem = &gpu_mem[gpu_mem_count];
+
 			gpu_mem_count++;
 		}
-		i++;
 	}
 	pci_cleanup(pacc);
 
@@ -2592,7 +2583,6 @@ static HSAuint8 fmm_check_user_memory(const void *addr, HSAuint64 size)
 
 static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_object_t **obj_ret)
 {
-	int32_t i;
 	HSAuint32 gpu_id;
 	manageable_aperture_t *aperture;
 	void *svm_addr = NULL;
@@ -2601,10 +2591,12 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_obj
 	HSAuint64 aligned_addr = (HSAuint64)addr - page_offset;
 	HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
 
-	/* Find first dGPU for creating the userptr BO */
-	i = find_first_dgpu(&gpu_id);
-	if (i < 0)
+	/* Find first GPU for creating the userptr BO */
+	if (!g_first_gpu_mem)
 		return HSAKMT_STATUS_ERROR;
+
+	gpu_id = g_first_gpu_mem->gpu_id;
+
 	aperture = &svm.dgpu_aperture;
 
 	/* Check if this address was already registered */
@@ -2856,12 +2848,13 @@ HSAKMT_STATUS fmm_share_memory(void *MemoryAddress,
 	if (r != HSAKMT_STATUS_SUCCESS)
 		return r;
 	if (!gpu_id && is_dgpu) {
-		/* Sharing non paged system memory. Use first dgpu which was
+		/* Sharing non paged system memory. Use first GPU which was
 		 * used during allocation. See fmm_allocate_host_gpu()
 		 */
-		r = find_first_dgpu(&gpu_id);
-		if (r != HSAKMT_STATUS_SUCCESS)
-			return r;
+		if (!g_first_gpu_mem)
+			return HSAKMT_STATUS_ERROR;
+
+		gpu_id = g_first_gpu_mem->gpu_id;
 	}
 	exportArgs.handle = obj->handle;
 	exportArgs.gpu_id = gpu_id;