From 9da1572c423fd6b17fe3b0be21b950f6c57ce428 Mon Sep 17 00:00:00 2001
From: Junhua Shen <Junhua.Shen@amd.com>
Date: Mon, 10 Nov 2025 11:19:58 +0800
Subject: [PATCH] libhsakmt: Refactor for Multi-KFD Context Support (Multiple
 KFD FDs per Process) (#1701)

* Introduce HsaKFDContext structure and infrastructure for multiple KFD contexts, enabling
   independent contexts within a single process.
* Refactor core components (queue, event, FMM, topology) to be context-aware,
   using explicit HsaKFDContext parameters instead of global state.
* Replace global hsakmt_kfd_fd with context-specific file descriptors, ensuring full context isolation.
* Maintain backward compatibility by redirecting legacy APIs to use the primary context.

This refactoring establishes a foundation for multi-context support while preserving existing functionality.

Signed-off-by: Junhua Shen <Junhua.Shen@amd.com>
---
 .../rocr-runtime/libhsakmt/CMakeLists.txt     |   3 +-
 projects/rocr-runtime/libhsakmt/src/ais.c     |   4 +-
 projects/rocr-runtime/libhsakmt/src/debug.c   |  28 +-
 projects/rocr-runtime/libhsakmt/src/events.c  | 221 +++--
 projects/rocr-runtime/libhsakmt/src/fmm.c     | 727 ++++++++-------
 projects/rocr-runtime/libhsakmt/src/fmm.h     | 144 ++-
 projects/rocr-runtime/libhsakmt/src/globals.c |   2 -
 .../rocr-runtime/libhsakmt/src/hsakmtctx.h    | 827 ++++++++++++++++++
 .../rocr-runtime/libhsakmt/src/hsakmtmodel.c  |  14 +-
 .../rocr-runtime/libhsakmt/src/kfdcontext.c   |  63 ++
 .../rocr-runtime/libhsakmt/src/kfdcontext.h   |  74 ++
 .../rocr-runtime/libhsakmt/src/libhsakmt.h    |  25 +-
 projects/rocr-runtime/libhsakmt/src/memory.c  | 363 ++++++--
 .../rocr-runtime/libhsakmt/src/openclose.c    |  50 +-
 .../rocr-runtime/libhsakmt/src/pc_sampling.c  |  10 +-
 projects/rocr-runtime/libhsakmt/src/queues.c  | 340 ++++---
 projects/rocr-runtime/libhsakmt/src/spm.c     |   6 +-
 projects/rocr-runtime/libhsakmt/src/svm.c     |  48 +-
 projects/rocr-runtime/libhsakmt/src/time.c    |   2 +-
 .../rocr-runtime/libhsakmt/src/topology.c     | 107 ++-
 projects/rocr-runtime/libhsakmt/src/version.c |   2 +-
 21 files changed, 2377 insertions(+), 683 deletions(-)
 create mode 100644 projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
 create mode 100644 projects/rocr-runtime/libhsakmt/src/kfdcontext.c
 create mode 100644 projects/rocr-runtime/libhsakmt/src/kfdcontext.h

diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists.txt b/projects/rocr-runtime/libhsakmt/CMakeLists.txt
index 44b5dc603e..25b3af4af8 100644
--- a/projects/rocr-runtime/libhsakmt/CMakeLists.txt
+++ b/projects/rocr-runtime/libhsakmt/CMakeLists.txt
@@ -130,7 +130,8 @@ set ( HSAKMT_SRC "src/debug.c"
                  "src/version.c"
                  "src/svm.c"
                  "src/pc_sampling.c"
-                 "src/ais.c")
+                 "src/ais.c"
+                 "src/kfdcontext.c")
 
 ## Declare the library target name
 add_library (${HSAKMT_TARGET} STATIC "")
diff --git a/projects/rocr-runtime/libhsakmt/src/ais.c b/projects/rocr-runtime/libhsakmt/src/ais.c
index aca8acc48f..7a46264c86 100644
--- a/projects/rocr-runtime/libhsakmt/src/ais.c
+++ b/projects/rocr-runtime/libhsakmt/src/ais.c
@@ -47,7 +47,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress,
 	/* Support is only for dGPUs */
 
 
-	if (!hsakmt_fmm_get_handle(MemoryAddress, &handle, &size_offset)) {
+	if (!hsakmt_fmm_get_handle(&hsakmt_primary_kfd_ctx, MemoryAddress, &handle, &size_offset)) {
 		pr_err("Address/size out of range: %p/%lu\n", MemoryAddress, MemorySizeInBytes);
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
@@ -66,7 +66,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress,
 	}
 
 	args.in.handle_offset = size_offset;
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_AIS_OP, &args);
+	ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_AIS_OP, &args);
 
 	if (SizeCopiedInBytes)
 		*SizeCopiedInBytes = args.out.size_copied;
diff --git a/projects/rocr-runtime/libhsakmt/src/debug.c b/projects/rocr-runtime/libhsakmt/src/debug.c
index 6aad5ea183..7fe450d123 100644
--- a/projects/rocr-runtime/libhsakmt/src/debug.c
+++ b/projects/rocr-runtime/libhsakmt/src/debug.c
@@ -78,7 +78,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId)
 
 	args.gpu_id = gpu_id;
 
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_REGISTER_DEPRECATED, &args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_REGISTER_DEPRECATED, &args);
 
 	if (err == 0)
 		result = HSAKMT_STATUS_SUCCESS;
@@ -105,7 +105,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId)
 	struct kfd_ioctl_dbg_unregister_args args = {0};
 
 	args.gpu_id = gpu_id;
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, &args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, &args);
 
 	if (err)
 		return HSAKMT_STATUS_ERROR;
@@ -168,7 +168,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId,
 	run_ptr += sizeof(DbgWaveMsgRing->MemoryVA);
 
 	/* send to kernel */
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, args);
 
 	free(args);
 
@@ -256,7 +256,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId,
 	}
 
 	/* send to kernel */
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, args);
 
 	free(args);
 
@@ -316,7 +316,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
 		((setupTtmp) ? KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK : 0);
 	args.r_debug = (HSAuint64)rDebug;
 
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
 
 	if (err) {
 		if (errno == EBUSY)
@@ -340,7 +340,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
 	memset(&args, 0x00, sizeof(args));
 	args.mode_mask = 0; //Disable
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args))
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
@@ -363,7 +363,7 @@ static HSAKMT_STATUS dbg_trap_get_device_data(void *data,
 	args.device_snapshot.entry_size = entry_size;
 	args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
 	args.pid = getpid();
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
 		return HSAKMT_STATUS_ERROR;
 	*n_entries = args.device_snapshot.num_devices;
 
@@ -384,7 +384,7 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data,
 	args.queue_snapshot.snapshot_buf_ptr = (uint64_t) data;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	*n_entries = args.queue_snapshot.num_queues;
@@ -410,7 +410,7 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids,
 	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
 	args.pid = getpid();
 
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
+	r = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args);
 	if (r < 0)
 		return HSAKMT_STATUS_ERROR;
 
@@ -429,7 +429,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
 	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);
 	*data_size = sizeof(struct kfd_runtime_info);
 	args.enable.rinfo_size = *data_size;
-	args.enable.dbg_fd = hsakmt_kfd_fd;
+	args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd;
 	*runtime_info = malloc(args.enable.rinfo_size);
 	if (!*runtime_info)
 		return HSAKMT_STATUS_NO_MEMORY;
@@ -437,7 +437,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
 	args.op = KFD_IOC_DBG_TRAP_ENABLE;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) {
 		free(*runtime_info);
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -450,11 +450,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void)
 
 	CHECK_KFD_OPEN();
 	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);
-	args.enable.dbg_fd = hsakmt_kfd_fd;
+	args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd;
 	args.op = KFD_IOC_DBG_TRAP_DISABLE;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
@@ -540,7 +540,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg
 		free(queue_ids);
 	}
 
-	long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, args);
+	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, args);
 	if (DebugReturn)
 		*DebugReturn = err;
 
diff --git a/projects/rocr-runtime/libhsakmt/src/events.c b/projects/rocr-runtime/libhsakmt/src/events.c
index 2421a80dfd..df97cf6c64 100644
--- a/projects/rocr-runtime/libhsakmt/src/events.c
+++ b/projects/rocr-runtime/libhsakmt/src/events.c
@@ -34,12 +34,36 @@
 #include "hsakmt/linux/kfd_ioctl.h"
 #include "fmm.h"
 #include "hsakmt/hsakmtmodel.h"
+#include <assert.h>
 
-static HSAuint64 *events_page = NULL;
 
-void hsakmt_clear_events_page(void)
+struct hsa_kfd_event_context
 {
-	events_page = NULL;
+	HSAuint64 *events_page;
+};
+
+/* Lazily create and return ctx's event state; NULL if calloc() fails. */
+struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+
+	if (!ctx->event_context) {
+		/* First request for this context: allocate zero-filled state. */
+		ctx->event_context = calloc(1, sizeof(struct hsa_kfd_event_context));
+		if (!ctx->event_context)
+			pr_err("Alloc memory failed for struct hsa_kfd_event_context size %zu\n",
+					 sizeof(struct hsa_kfd_event_context));
+	}
+	/* Still NULL here when the allocation above failed. */
+	return ctx->event_context;
+}
+
+/* Reset ctx's cached events-page pointer; the getter may allocate the state first. */
+void hsakmt_clear_events_page(HsaKFDContext *ctx)
+{
+	struct hsa_kfd_event_context *state = hsakmt_kfdcontext_get_event_context(ctx);
+	if (state)
+		state->events_page = NULL;
 }
 
 static bool IsSystemEventType(HSA_EVENTTYPE type)
@@ -48,14 +72,18 @@ static bool IsSystemEventType(HSA_EVENTTYPE type)
 	return (type != HSA_EVENTTYPE_SIGNAL && type != HSA_EVENTTYPE_DEBUG_EVENT);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
-					  bool ManualReset, bool IsSignaled,
-					  HsaEvent **Event)
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEventCtx(HsaKFDContext *ctx,
+						 HsaEventDescriptor *EventDesc,
+						 bool ManualReset, bool IsSignaled,
+						 HsaEvent **Event)
 {
 	unsigned int event_limit = KFD_SIGNAL_EVENT_LIMIT;
 
 	CHECK_KFD_OPEN();
 
+	struct hsa_kfd_event_context *event_ctx = NULL;
+	HSAuint64 *events_page = NULL;
+
 	if (EventDesc->EventType >= HSA_EVENTTYPE_MAXID)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
@@ -74,9 +102,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 
 	/* dGPU code */
 	pthread_mutex_lock(&hsakmt_mutex);
+	event_ctx = hsakmt_kfdcontext_get_event_context(ctx);
+	events_page = event_ctx->events_page;
 
 	if (hsakmt_is_dgpu && !events_page) {
-		events_page = hsakmt_allocate_exec_aligned_memory_gpu(
+		events_page = hsakmt_allocate_exec_aligned_memory_gpu(ctx,
 			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, 0, true, false, true);
 		if (!events_page) {
 			free(e);
@@ -86,10 +116,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 		if (hsakmt_use_model)
 			model_set_event_page(events_page, KFD_SIGNAL_EVENT_LIMIT);
 		else
-			hsakmt_fmm_get_handle(events_page, (uint64_t *)&args.event_page_offset, NULL);
+			hsakmt_fmm_get_handle(ctx, events_page, (uint64_t *)&args.event_page_offset, NULL);
 	}
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) {
 		free(e);
 		*Event = NULL;
 		pthread_mutex_unlock(&hsakmt_mutex);
@@ -100,17 +130,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 
 	if (!events_page && args.event_page_offset > 0) {
 		events_page = mmap(NULL, event_limit * 8, PROT_WRITE | PROT_READ,
-				MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset);
+				MAP_SHARED, ctx->fd, args.event_page_offset);
 		if (events_page == MAP_FAILED) {
 			/* old kernels only support 256 events */
 			event_limit = 256;
 			events_page = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ,
-					   MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset);
+					   MAP_SHARED, ctx->fd, args.event_page_offset);
 		}
 		if (events_page == MAP_FAILED) {
 			events_page = NULL;
 			pthread_mutex_unlock(&hsakmt_mutex);
-			hsaKmtDestroyEvent(e);
+			hsaKmtDestroyEventCtx(ctx, e);
 			return HSAKMT_STATUS_ERROR;
 		}
 	}
@@ -118,10 +148,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 	if (args.event_page_offset > 0 && args.event_slot_index < event_limit)
 		e->EventData.HWData2 = (HSAuint64)&events_page[args.event_slot_index];
 
-        pthread_mutex_unlock(&hsakmt_mutex);
+    pthread_mutex_unlock(&hsakmt_mutex);
 
-        e->EventData.EventType = EventDesc->EventType;
-        e->EventData.HWData1 = args.event_id;
+    e->EventData.EventType = EventDesc->EventType;
+    e->EventData.HWData1 = args.event_id;
 
 	e->EventData.HWData3 = args.event_trigger_data;
 	e->EventData.EventData.SyncVar.SyncVar.UserData =
@@ -134,19 +164,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
 
 		set_args.event_id = args.event_id;
 
-                if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT,
-                                 &set_args) != 0) {
-                  hsaKmtDestroyEvent(e);
-                  return HSAKMT_STATUS_ERROR;
-                }
-        }
+		if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_EVENT, &set_args) != 0) {
+			hsaKmtDestroyEventCtx(ctx, e);
+			return HSAKMT_STATUS_ERROR;
+		}
+	}
 
-        *Event = e;
+	*Event = e;
+	if (!event_ctx->events_page)
+		event_ctx->events_page = events_page;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEventCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event)
 {
 	CHECK_KFD_OPEN();
 
@@ -157,14 +189,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
 
 	args.event_id = Event->EventId;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_EVENT, &args) != 0)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DESTROY_EVENT, &args) != 0)
 		return HSAKMT_STATUS_ERROR;
 
 	free(Event);
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetEventCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event)
 {
 	CHECK_KFD_OPEN();
 
@@ -181,13 +214,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
 
 	args.event_id = Event->EventId;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT, &args) == -1)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_EVENT, &args) == -1)
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
+HSAKMT_STATUS HSAKMTAPI hsaKmtResetEventCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event)
 {
 	CHECK_KFD_OPEN();
 
@@ -204,13 +238,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
 
 	args.event_id = Event->EventId;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RESET_EVENT, &args) == -1)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RESET_EVENT, &args) == -1)
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventStateCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event)
 {
 	CHECK_KFD_OPEN();
 
@@ -220,22 +255,25 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
-		HSAuint32 Milliseconds)
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEventCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event,
+						 HSAuint32 Milliseconds)
 {
-	return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL);
+	return hsaKmtWaitOnEvent_ExtCtx(ctx, Event, Milliseconds, NULL);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
-		HSAuint32 Milliseconds, uint64_t *event_age)
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_ExtCtx(HsaKFDContext *ctx,
+						 HsaEvent *Event,
+						 HSAuint32 Milliseconds, uint64_t *event_age)
 {
 	if (!Event)
 		return HSAKMT_STATUS_INVALID_HANDLE;
 
-	return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, event_age);
+	return hsaKmtWaitOnMultipleEvents_ExtCtx(ctx, &Event,
+						1, true, Milliseconds, event_age);
 }
 
-static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
+static HSAKMT_STATUS get_mem_info_svm_api(HsaKFDContext *ctx, uint64_t address, uint32_t gpu_id)
 {
 	struct kfd_ioctl_svm_args *args;
         uint32_t node_id = 0;
@@ -258,7 +296,7 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
 	args->op = KFD_IOCTL_SVM_OP_GET_ATTR;
 	args->nattr = s_attr / sizeof(*attrs);
 	memcpy(args->attrs, attrs, s_attr);
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
 		pr_debug("op get range attrs failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -312,8 +350,8 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
 	return HSAKMT_STATUS_SUCCESS;
 }
 //Analysis memory exception data, print debug messages
-static void analysis_memory_exception(struct kfd_hsa_memory_exception_data *
-						memory_exception_data)
+static void analysis_memory_exception(HsaKFDContext *ctx,
+				struct kfd_hsa_memory_exception_data *memory_exception_data)
 {
 	HSAKMT_STATUS ret;
 	HsaPointerInfo info;
@@ -331,9 +369,9 @@ static void analysis_memory_exception(struct kfd_hsa_memory_exception_data *
 	else if (memory_exception_data->failure.NoExecute)
 		pr_err("Execute to none-executable page\n");
 
-	ret = hsakmt_fmm_get_mem_info((const void *)addr, &info);
+	ret = hsakmt_fmm_get_mem_info(ctx, (const void *)addr, &info);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
-		ret = get_mem_info_svm_api(addr, memory_exception_data->gpu_id);
+		ret = get_mem_info_svm_api(ctx, addr, memory_exception_data->gpu_id);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			pr_err("Address does not belong to a known buffer\n");
 		return;
@@ -378,19 +416,22 @@ static void analysis_memory_exception(struct kfd_hsa_memory_exception_data *
 	}
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
-						   HSAuint32 NumEvents,
-						   bool WaitOnAll,
-						   HSAuint32 Milliseconds)
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEventsCtx(HsaKFDContext *ctx,
+						 HsaEvent *Events[],
+						 HSAuint32 NumEvents,
+						 bool WaitOnAll,
+						 HSAuint32 Milliseconds)
 {
-	return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, Milliseconds, NULL);
+	return hsaKmtWaitOnMultipleEvents_ExtCtx(ctx, Events,
+						 NumEvents, WaitOnAll, Milliseconds, NULL);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
-						   HSAuint32 NumEvents,
-						   bool WaitOnAll,
-						   HSAuint32 Milliseconds,
-						   uint64_t *event_age)
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx,
+						 HsaEvent *Events[],
+						 HSAuint32 NumEvents,
+						 bool WaitOnAll,
+						 HSAuint32 Milliseconds,
+						 uint64_t *event_age)
 {
         HSAKMT_STATUS result;
         CHECK_KFD_OPEN();
@@ -417,7 +458,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
 	args.num_events = NumEvents;
 	args.events_ptr = (uint64_t)(uintptr_t)event_data;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1)
 		result = HSAKMT_STATUS_ERROR;
 	else if (args.wait_result == KFD_IOC_WAIT_RESULT_TIMEOUT)
 		result = HSAKMT_STATUS_WAIT_TIMEOUT;
@@ -438,7 +479,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
 				Events[i]->EventData.EventData.MemoryAccessFault.Failure.ECC =
 						((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0;
 				Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
-				analysis_memory_exception(&event_data[i].memory_exception_data);
+				analysis_memory_exception(ctx, &event_data[i].memory_exception_data);
 			} else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
 				event_data[i].hw_exception_data.gpu_id) {
 
@@ -464,7 +505,7 @@ out:
 	return result;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMICtx(HsaKFDContext *ctx, HSAuint32 NodeId, int *fd)
 {
 	struct kfd_ioctl_smi_events_args args;
 	HSAKMT_STATUS result;
@@ -481,7 +522,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
 	}
 
 	args.gpuid = gpuid;
-	result = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args);
+	result = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SMI_EVENTS, &args);
 	if (result) {
 		pr_debug("open SMI event fd failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
@@ -490,3 +531,73 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
 	*fd = args.anon_fd;
 	return HSAKMT_STATUS_SUCCESS;
 }
+
+
+
+/* Legacy context-less event API: each entry point forwards to its *Ctx
+ * counterpart with &hsakmt_primary_kfd_ctx as the context argument.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
+					  bool ManualReset, bool IsSignaled,
+					  HsaEvent **Event)
+{
+	return hsaKmtCreateEventCtx(&hsakmt_primary_kfd_ctx, EventDesc, ManualReset,
+					IsSignaled, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
+{
+	return hsaKmtDestroyEventCtx(&hsakmt_primary_kfd_ctx, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
+{
+	return hsaKmtSetEventCtx(&hsakmt_primary_kfd_ctx, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
+{
+	return hsaKmtResetEventCtx(&hsakmt_primary_kfd_ctx, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
+{
+	return hsaKmtQueryEventStateCtx(&hsakmt_primary_kfd_ctx, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
+						 HSAuint32 Milliseconds)
+{
+	return hsaKmtWaitOnEventCtx(&hsakmt_primary_kfd_ctx, Event, Milliseconds);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
+						 HSAuint32 Milliseconds, uint64_t *event_age)
+{
+	return hsaKmtWaitOnEvent_ExtCtx(&hsakmt_primary_kfd_ctx, Event,
+						 Milliseconds, event_age);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
+						 HSAuint32 NumEvents,
+						 bool WaitOnAll,
+						 HSAuint32 Milliseconds)
+{
+	return hsaKmtWaitOnMultipleEventsCtx(&hsakmt_primary_kfd_ctx, Events,
+						 NumEvents, WaitOnAll, Milliseconds);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
+						 HSAuint32 NumEvents,
+						 bool WaitOnAll,
+						 HSAuint32 Milliseconds,
+						 uint64_t *event_age)
+{
+	return hsaKmtWaitOnMultipleEvents_ExtCtx(&hsakmt_primary_kfd_ctx,
+						 Events, NumEvents, WaitOnAll, Milliseconds, event_age);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
+{
+	return hsaKmtOpenSMICtx(&hsakmt_primary_kfd_ctx, NodeId, fd);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c
index 75b9b481de..ea0b43fbd7 100644
--- a/projects/rocr-runtime/libhsakmt/src/fmm.c
+++ b/projects/rocr-runtime/libhsakmt/src/fmm.c
@@ -245,12 +245,44 @@ typedef struct {
 	uint32_t alignment_order;
 } svm_t;
 
-/* The other apertures are specific to each GPU. gpu_mem_t manages GPU
- * specific memory apertures.
- */
-static gpu_mem_t *gpu_mem;
-static unsigned int gpu_mem_count;
-static gpu_mem_t *g_first_gpu_mem;
+struct hsa_kfd_fmm_context
+{
+	/* The other apertures are specific to each GPU. gpu_mem_t manages GPU
+	 * specific memory apertures.
+	 */
+	gpu_mem_t *gpu_mem;		/* array indexed 0 .. gpu_mem_count - 1 */
+	unsigned int gpu_mem_count;	/* number of entries in gpu_mem */
+	gpu_mem_t *first_gpu_mem;	/* while NULL, the SVM register/map helpers return an error */
+
+#define DRM_FIRST_RENDER_NODE 128	/* first minor covered by the arrays below */
+#define DRM_LAST_RENDER_NODE 255	/* last minor covered (inclusive) */
+
+	/* The VMs from DRM render nodes are used by KFD for the lifetime of
+	 * the process. Therefore we have to keep using the same FDs for the
+	 * lifetime of the process, even when we close and reopen KFD. There
+	 * are up to 128 render nodes that we cache in this array.
+	 */
+	int drm_render_fds[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];
+
+	/* amdgpu device handle for each gpu that libdrm uses */
+	struct amdgpu_device *amdgpu_handle[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];
+};
+
+/* Lazily create and return ctx's FMM state; NULL if calloc() fails. */
+struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+
+	if (!ctx->fmm_context) {
+		/* First request for this context: allocate zero-filled state. */
+		ctx->fmm_context = calloc(1, sizeof(struct hsa_kfd_fmm_context));
+		if (!ctx->fmm_context)
+			pr_err("Alloc memory failed for struct hsa_kfd_fmm_context size %zu\n",
+					 sizeof(struct hsa_kfd_fmm_context));
+	}
+	/* Still NULL here when the allocation above failed. */
+	return ctx->fmm_context;
+}
 
 static void *dgpu_shared_aperture_base;
 static void *dgpu_shared_aperture_limit;
@@ -322,10 +354,9 @@ static inline HsaSharedMemoryHandle *to_hsa_shared_memory_handle(
 	return (HsaSharedMemoryHandle *)SharedMemoryStruct;
 }
 
-static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture);
-static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
-				       manageable_aperture_t *aperture,
-				       void *address);
+static int __fmm_release(HsaKFDContext *ctx, vm_object_t *object, manageable_aperture_t *aperture);
+static int _fmm_unmap_from_gpu_scratch(HsaKFDContext *ctx, uint32_t gpu_id,
+				       manageable_aperture_t *aperture, void *address);
 static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size);
 
 static vm_area_t *vm_create_and_init_area(void *start, void *end)
@@ -927,29 +958,29 @@ static vm_object_t *aperture_allocate_object(manageable_aperture_t *app,
 	return new_object;
 }
 
-static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id)
+static int32_t gpu_mem_find_by_gpu_id(struct hsa_kfd_fmm_context *fmm_ctx, uint32_t gpu_id)
 {
 	uint32_t i;
 
-	for (i = 0 ; i < gpu_mem_count ; i++)
-		if (gpu_mem[i].gpu_id == gpu_id)
+	for (i = 0 ; i < fmm_ctx->gpu_mem_count ; i++)
+		if (fmm_ctx->gpu_mem[i].gpu_id == gpu_id)
 			return i;
 
 	return -1;
 }
 
-static int32_t gpu_mem_find_by_node_id(uint32_t node_id)
+static int32_t gpu_mem_find_by_node_id(struct hsa_kfd_fmm_context *fmm_ctx, uint32_t node_id)
 {
 	uint32_t i;
 
-	for (i = 0 ; i < gpu_mem_count ; i++)
-		if (gpu_mem[i].node_id == node_id)
+	for (i = 0 ; i < fmm_ctx->gpu_mem_count ; i++)
+		if (fmm_ctx->gpu_mem[i].node_id == node_id)
 			return i;
 
 	return -1;
 }
 
-static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info)
+static manageable_aperture_t *fmm_get_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HsaApertureInfo info)
 {
 	switch (info.type) {
 	case HSA_APERTURE_DGPU:
@@ -957,7 +988,7 @@ static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info)
 	case HSA_APERTURE_DGPU_ALT:
 		return svm.dgpu_alt_aperture;
 	case HSA_APERTURE_GPUVM:
-		return &gpu_mem[info.idx].gpuvm_aperture;
+		return &fmm_ctx->gpu_mem[info.idx].gpuvm_aperture;
 	case HSA_APERTURE_CPUVM:
 		return &cpuvm_aperture;
 	case HSA_APERTURE_MEMHANDLE:
@@ -967,23 +998,24 @@ static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info)
 	}
 }
 
-static gpu_mem_t *fmm_is_scratch_aperture(const void *address)
+static gpu_mem_t *fmm_is_scratch_aperture(struct hsa_kfd_fmm_context *fmm_ctx, const void *address)
 {
 	uint32_t i;
 
-	for (i = 0; i < gpu_mem_count; i++) {
-		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
+	for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
+		if (fmm_ctx->gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
 			continue;
 
-		if ((address >= gpu_mem[i].scratch_physical.base) &&
-			(address <= gpu_mem[i].scratch_physical.limit))
-			return &gpu_mem[i];
+		if ((address >= fmm_ctx->gpu_mem[i].scratch_physical.base) &&
+			(address <= fmm_ctx->gpu_mem[i].scratch_physical.limit))
+			return &fmm_ctx->gpu_mem[i];
 
 	}
 	return NULL;
 }
 
-static manageable_aperture_t *fmm_find_aperture(const void *address,
+static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_ctx,
+						const void *address,
 						HsaApertureInfo *info)
 {
 	manageable_aperture_t *aperture = NULL;
@@ -1001,7 +1033,7 @@ static manageable_aperture_t *fmm_find_aperture(const void *address,
 		if (address >= svm.dgpu_aperture->base &&
 			address <= svm.dgpu_aperture->limit) {
 
-			gpu_mem_ptr = fmm_is_scratch_aperture(address);
+			gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address);
 			if (gpu_mem_ptr) {
 				aperture = &gpu_mem_ptr->scratch_physical;
 			} else {
@@ -1023,10 +1055,10 @@ static manageable_aperture_t *fmm_find_aperture(const void *address,
 			_info.type = HSA_APERTURE_DGPU;
 		} else {
 			/* gpuvm_aperture */
-			for (i = 0; i < gpu_mem_count; i++) {
-				if ((address >= gpu_mem[i].gpuvm_aperture.base) &&
-					(address <= gpu_mem[i].gpuvm_aperture.limit)) {
-					aperture = &gpu_mem[i].gpuvm_aperture;
+			for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
+				if ((address >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base) &&
+					(address <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit)) {
+					aperture = &fmm_ctx->gpu_mem[i].gpuvm_aperture;
 					_info.type = HSA_APERTURE_GPUVM;
 					_info.idx = i;
 				}
@@ -1060,7 +1092,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
 	return mflags;
 }
 
-static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
+static HSAKMT_STATUS fmm_register_mem_svm_api(HsaKFDContext *ctx,
+						  void *address,
 					      uint64_t size, HsaMemFlags flags)
 {
 	struct kfd_ioctl_svm_args *args;
@@ -1068,8 +1101,9 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
 	HSAuint32 page_offset = (HSAuint64)address & (PAGE_SIZE-1);
 	HSAuint64 aligned_addr = (HSAuint64)address - page_offset;
 	HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	if (!g_first_gpu_mem)
+	if (!fmm_ctx->first_gpu_mem)
 		return HSAKMT_STATUS_ERROR;
 
 	s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute);
@@ -1087,7 +1121,7 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
 	pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
 		 aligned_size);
 	/* Driver does one copy_from_user, with extra attrs size */
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
 		pr_debug("op set range attrs failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -1095,7 +1129,8 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS fmm_map_mem_svm_api(void *address,
+static HSAKMT_STATUS fmm_map_mem_svm_api(HsaKFDContext *ctx,
+					      void *address,
 					      uint64_t size,
 					      uint32_t *nodes_to_map,
 					      uint32_t nodes_array_size)
@@ -1103,8 +1138,9 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address,
 	struct kfd_ioctl_svm_args *args;
 	size_t s_attr;
 	uint32_t i, nattr;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	if (!g_first_gpu_mem)
+	if (!fmm_ctx->first_gpu_mem)
 		return HSAKMT_STATUS_ERROR;
 
 	nattr = nodes_array_size;
@@ -1120,7 +1156,7 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address,
 		args->attrs[i].value = nodes_to_map[i];
 	}
 	/* Driver does one copy_from_user, with extra attrs size */
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
 		pr_debug("op set range attrs failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -1131,7 +1167,8 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address,
 /* After allocating the memory, return the vm_object created for this memory.
  * Return NULL if any failure.
  */
-static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem,
+static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx,
+						uint32_t gpu_id, void *mem,
 						uint64_t MemorySizeInBytes,
 						manageable_aperture_t *aperture,
 						uint64_t *mmap_offset,
@@ -1176,7 +1213,7 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem,
 	do {
 		args.size = size;
 
-		if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
+		if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
 			goto err_hsakmt_ioctl_failed;
 
 		/* Allocate object */
@@ -1212,14 +1249,14 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem,
 
 err_object_allocation_failed:
 	free_args.handle = args.handle;
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) {
 		pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle);
 	}
 err_hsakmt_ioctl_failed:
 	if (vm_obj) {
 		do {
 			free_args.handle = vm_obj->handles[--vm_obj->handle_num];
-			if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args))
+			if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args))
 				pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle);
 		} while (vm_obj->handle_num);
 		pthread_mutex_lock(&aperture->fmm_mutex);
@@ -1258,19 +1295,20 @@ static void manageable_aperture_print(manageable_aperture_t *app)
 	}
 }
 
-void hsakmt_fmm_print(uint32_t gpu_id)
+void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id)
 {
-	int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 
 	if (gpu_mem_id >= 0) { /* Found */
 		pr_info("LDS aperture:\n");
-		aperture_print(&gpu_mem[gpu_mem_id].lds_aperture);
+		aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].lds_aperture);
 		pr_info("GPUVM aperture:\n");
-		manageable_aperture_print(&gpu_mem[gpu_mem_id].gpuvm_aperture);
+		manageable_aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture);
 		pr_info("Scratch aperture:\n");
-		aperture_print(&gpu_mem[gpu_mem_id].scratch_aperture);
+		aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].scratch_aperture);
 		pr_info("Scratch backing memory:\n");
-		manageable_aperture_print(&gpu_mem[gpu_mem_id].scratch_physical);
+		manageable_aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical);
 	}
 
 	pr_info("dGPU aperture:\n");
@@ -1282,7 +1320,7 @@ void hsakmt_fmm_print(uint32_t gpu_id)
 		manageable_aperture_print(svm.dgpu_alt_aperture);
 }
 #else
-void hsakmt_fmm_print(uint32_t gpu_id)
+void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id)
 {
 }
 #endif
@@ -1298,7 +1336,8 @@ void hsakmt_fmm_print(uint32_t gpu_id)
  * object is found, this function returns with the
  * (*out_aper)->fmm_mutex locked.
  */
-static vm_object_t *vm_find_object(const void *addr, uint64_t size,
+static vm_object_t *vm_find_object(struct hsa_kfd_fmm_context *fmm_ctx,
+				   const void *addr, uint64_t size,
 				   manageable_aperture_t **out_aper)
 {
 	manageable_aperture_t *aper = NULL;
@@ -1307,11 +1346,11 @@ static vm_object_t *vm_find_object(const void *addr, uint64_t size,
 	vm_object_t *obj = NULL;
 	uint32_t i;
 
-	for (i = 0; i < gpu_mem_count; i++)
-		if (gpu_mem[i].gpu_id != NON_VALID_GPU_ID &&
-		    addr >= gpu_mem[i].gpuvm_aperture.base &&
-		    addr <= gpu_mem[i].gpuvm_aperture.limit) {
-			aper = &gpu_mem[i].gpuvm_aperture;
+	for (i = 0; i < fmm_ctx->gpu_mem_count; i++)
+		if (fmm_ctx->gpu_mem[i].gpu_id != NON_VALID_GPU_ID &&
+		    addr >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base &&
+		    addr <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit) {
+			aper = &fmm_ctx->gpu_mem[i].gpuvm_aperture;
 			break;
 		}
 
@@ -1409,19 +1448,20 @@ static HSAuint8 fmm_check_user_memory(const void *addr, HSAuint64 size)
 	return sum;
 }
 
-static void fmm_release_scratch(uint32_t gpu_id)
+static void fmm_release_scratch(HsaKFDContext *ctx, uint32_t gpu_id)
 {
 	int32_t gpu_mem_id;
 	uint64_t size;
 	vm_object_t *obj;
 	manageable_aperture_t *aperture;
 	rbtree_node_t *n;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return;
 
-	aperture = &gpu_mem[gpu_mem_id].scratch_physical;
+	aperture = &fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical;
 
 	size = VOID_PTRS_SUB(aperture->limit, aperture->base) + 1;
 
@@ -1435,7 +1475,7 @@ static void fmm_release_scratch(uint32_t gpu_id)
 
 			pthread_mutex_unlock(&aperture->fmm_mutex);
 
-			_fmm_unmap_from_gpu_scratch(gpu_id, aperture, obj_addr);
+			_fmm_unmap_from_gpu_scratch(ctx, gpu_id, aperture, obj_addr);
 
 			pthread_mutex_lock(&aperture->fmm_mutex);
 		}
@@ -1444,16 +1484,16 @@ static void fmm_release_scratch(uint32_t gpu_id)
 		/* release address space */
 		pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
 		aperture_release_area(svm.dgpu_aperture,
-				      gpu_mem[gpu_mem_id].scratch_physical.base,
+				      fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base,
 				      size);
 		pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
 	} else
 		/* release address space */
-		munmap(gpu_mem[gpu_mem_id].scratch_physical.base, size);
+		munmap(fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size);
 
 	/* invalidate scratch backing aperture */
-	gpu_mem[gpu_mem_id].scratch_physical.base = NULL;
-	gpu_mem[gpu_mem_id].scratch_physical.limit = NULL;
+	fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base = NULL;
+	fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.limit = NULL;
 }
 
 static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags)
@@ -1471,20 +1511,22 @@ static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags)
 }
 
 #define SCRATCH_ALIGN 0x10000
-void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes)
+void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx,
+			uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes)
 {
 	manageable_aperture_t *aperture_phy;
 	struct kfd_ioctl_set_scratch_backing_va_args args = {0};
 	int32_t gpu_mem_id;
 	void *mem = NULL;
 	uint64_t aligned_size = ALIGN_UP(MemorySizeInBytes, SCRATCH_ALIGN);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return NULL;
 
-	aperture_phy = &gpu_mem[gpu_mem_id].scratch_physical;
+	aperture_phy = &fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical;
 	if (aperture_phy->base || aperture_phy->limit)
 		/* Scratch was already allocated for this GPU */
 		return NULL;
@@ -1515,15 +1557,16 @@ void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t Memor
 	args.gpu_id = gpu_id;
 	args.va_addr = ((uint64_t)mem) >> 16;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, &args)) {
-		fmm_release_scratch(gpu_id);
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, &args)) {
+		fmm_release_scratch(ctx, gpu_id);
 		return NULL;
 	}
 
 	return mem;
 }
 
-static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
+static void *__fmm_allocate_device(HsaKFDContext *ctx,
+		uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
 		manageable_aperture_t *aperture, uint64_t *mmap_offset,
 		uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj)
 {
@@ -1545,7 +1588,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo
 	 * Now that we have the area reserved, allocate memory in the device
 	 * itself
 	 */
-	obj = fmm_allocate_memory_object(gpu_id, mem,
+	obj = fmm_allocate_memory_object(ctx, gpu_id, mem,
 			MemorySizeInBytes, aperture, mmap_offset, ioc_flags);
 	if (!obj) {
 		/*
@@ -1613,7 +1656,8 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
 }
 
 /* use udmabuf driver to allocate buf */
-static void* udmabuf_allocation(uint32_t gpu_id, uint32_t node_id, uint64_t size,
+static void* udmabuf_allocation(HsaKFDContext *ctx,
+                               uint32_t gpu_id, uint32_t node_id, uint64_t size,
                                manageable_aperture_t *aperture, uint64_t alignment,
                                HsaMemFlags mflags, vm_object_t** vm_obj)
 {
@@ -1699,7 +1743,7 @@ static void* udmabuf_allocation(uint32_t gpu_id, uint32_t node_id, uint64_t size
 	importArgs.gpu_id = gpu_id;
 	importArgs.dmabuf_fd = dmabuf_fd;
 
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
+	ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
 	if (ret) {
 		pr_debug("ioctl AMDKFD_IOC_IMPORT_DMABUF failed\n, ret 0x%x", ret);
 		goto error_release_dmabuf;
@@ -1732,7 +1776,8 @@ error_release_memfd:
 	return NULL;
 }
 
-void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
+void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
+			  uint32_t gpu_id, uint32_t node_id, void *address,
 			  uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
 {
 	manageable_aperture_t *aperture;
@@ -1741,9 +1786,10 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres
 	uint64_t size, mmap_offset;
 	void *mem;
 	vm_object_t *vm_obj = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return NULL;
 
@@ -1754,12 +1800,12 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres
 
 	ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags);
 
-	if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) {
+	if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) {
 		aperture = svm.dgpu_aperture;
 		if (mflags.ui32.AQLQueueMemory)
 			size = MemorySizeInBytes * 2;
 	} else {
-		aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
+		aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture;
 	}
 
 	/* special case for va allocation without vram alloc */
@@ -1785,7 +1831,7 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres
 	mem = NULL;
 	if (hsakmt_udmabuf_dev_fd > 0 && aperture == svm.dgpu_aperture && !hsakmt_is_dgpu
 		 && aperture->ops == &mmap_aperture_ops) {
-		mem  = udmabuf_allocation(gpu_id, node_id, size, aperture, alignment,
+		mem  = udmabuf_allocation(ctx, gpu_id, node_id, size, aperture, alignment,
                                         mflags, &vm_obj);
 		pr_debug("udmabuf_allocation mem %p\n", mem);
 		if (!mem)
@@ -1796,24 +1842,25 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres
 	 * fall back to use device driver to allocate memory
 	 */
 	if (!mem) {
-		mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
+		mem = __fmm_allocate_device(ctx,
+					   gpu_id, address, size, aperture, &mmap_offset,
 					   ioc_flags, alignment, &vm_obj);
 
 		/* if alloc vram-only not mmap to cpu vm since no va */
 		if (mem && !mflags.ui32.NoAddress) {
 			void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
 					   mflags.ui32.HostAccess,
-					   gpu_mem[gpu_mem_id].drm_render_fd,
+					   fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd,
 					   mmap_offset);
 
 			if (ret == MAP_FAILED) {
-				__fmm_release(vm_obj, aperture);
+				__fmm_release(ctx, vm_obj, aperture);
 				return NULL;
 			}
 #ifdef SANITIZER_AMDGPU
 			if (vm_obj) {
 				vm_obj->mmap_flags = mflags.ui32.HostAccess ? PROT_READ | PROT_WRITE : PROT_NONE;
-				vm_obj->mmap_fd = gpu_mem[gpu_mem_id].drm_render_fd;
+				vm_obj->mmap_fd = fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd;
 				vm_obj->mmap_offset = mmap_offset;
 			}
 #endif
@@ -1832,7 +1879,8 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres
 	return mem;
 }
 
-void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
+void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx,
+				uint32_t gpu_id, uint64_t MemorySizeInBytes,
 			    uint64_t doorbell_mmap_offset)
 {
 	manageable_aperture_t *aperture;
@@ -1840,9 +1888,10 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 	uint32_t ioc_flags;
 	void *mem;
 	vm_object_t *vm_obj = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return NULL;
 
@@ -1852,8 +1901,8 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 		    KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 		    KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
 
-	mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL,
-				    ioc_flags, 0, &vm_obj);
+	mem = __fmm_allocate_device(ctx, gpu_id, NULL, MemorySizeInBytes,
+						aperture, NULL, ioc_flags, 0, &vm_obj);
 
 	if (mem && vm_obj) {
 		HsaMemFlags mflags;
@@ -1872,10 +1921,10 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
 	if (mem) {
 		void *ret = mmap(mem, MemorySizeInBytes,
 				 PROT_READ | PROT_WRITE,
-				 MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd,
+				 MAP_SHARED | MAP_FIXED, ctx->fd,
 				 doorbell_mmap_offset);
 		if (ret == MAP_FAILED) {
-			__fmm_release(vm_obj, aperture);
+			__fmm_release(ctx, vm_obj, aperture);
 			return NULL;
 		}
 	}
@@ -1986,8 +2035,10 @@ static int bind_mem_to_numa(uint32_t numa_node_id, void *mem,
 	return 0;
 }
 
-static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address,
-				   uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
+static void *fmm_allocate_host_gpu(HsaKFDContext *ctx,
+				   uint32_t gpu_id, uint32_t node_id, void *address,
+				   uint64_t MemorySizeInBytes,
+				   uint64_t alignment, HsaMemFlags mflags)
 {
 	manageable_aperture_t *aperture;
 	vm_object_t *vm_obj = NULL;
@@ -1995,21 +2046,22 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
 	int32_t gpu_drm_fd;
 	uint32_t ioc_flags;
 	uint32_t preferred_gpu_id;
-	int gpu_mem_id = 0; /* default to g_first_gpu_mem */
+	int gpu_mem_id = 0; /* default to first_gpu_mem */
 	uint64_t size;
 	void *mem;
 
-	if (!g_first_gpu_mem)
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	if (!fmm_ctx->first_gpu_mem)
 		return NULL;
 
 	if (gpu_id) {
-		gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+		gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 		if (gpu_mem_id < 0)
 			return NULL;
 	}
 
-	preferred_gpu_id = gpu_mem[gpu_mem_id].gpu_id;
-	gpu_drm_fd = gpu_mem[gpu_mem_id].drm_render_fd;
+	preferred_gpu_id = fmm_ctx->gpu_mem[gpu_mem_id].gpu_id;
+	gpu_drm_fd = fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd;
 
 	size = MemorySizeInBytes;
 	ioc_flags = 0;
@@ -2068,14 +2120,14 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
 		/* Create userptr BO */
 		mmap_offset = (uint64_t)mem;
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_USERPTR;
-		vm_obj = fmm_allocate_memory_object(preferred_gpu_id, mem, size,
+		vm_obj = fmm_allocate_memory_object(ctx, preferred_gpu_id, mem, size,
 						       aperture, &mmap_offset,
 						       ioc_flags);
 		if (!vm_obj)
 			goto out_release_area;
 	} else {
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT;
-		mem =  __fmm_allocate_device(preferred_gpu_id, address, size, aperture,
+		mem =  __fmm_allocate_device(ctx, preferred_gpu_id, address, size, aperture,
 					     &mmap_offset, ioc_flags, alignment, &vm_obj);
 
 		if (mem && mflags.ui32.HostAccess) {
@@ -2084,7 +2136,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr
 						   gpu_drm_fd, mmap_offset);
 
 			if (ret == MAP_FAILED) {
-				__fmm_release(vm_obj, aperture);
+				__fmm_release(ctx, vm_obj, aperture);
 				return NULL;
 			}
 		}
@@ -2119,11 +2171,12 @@ out_release_area:
 	return NULL;
 }
 
-void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
+void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx,
+			uint32_t gpu_id, uint32_t node_id, void *address,
 			uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
 {
 	if (hsakmt_is_dgpu)
-		return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags);
+		return fmm_allocate_host_gpu(ctx, gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags);
 
 	if (alignment) {//Alignment not supported on non-dgpu
 		pr_err("Non-default alignment not supported on non-dgpu\n");
@@ -2133,7 +2186,8 @@ void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
 	return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
 }
 
-static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture)
+static int __fmm_release(HsaKFDContext *ctx,
+			vm_object_t *object, manageable_aperture_t *aperture)
 {
 	struct kfd_ioctl_free_memory_of_gpu_args args = {0};
 	int ret = 0;
@@ -2161,7 +2215,7 @@ static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture)
 		args.handle = object->handles[i];
 		if (args.handle == 0)
 			continue;
-		if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args))
+		if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args))
 			ret = -errno;
 	}
 
@@ -2176,20 +2230,21 @@ err_free_mem_failed:
 	return ret;
 }
 
-HSAKMT_STATUS hsakmt_fmm_release(void *address)
+HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address)
 {
 	manageable_aperture_t *aperture = NULL;
 	vm_object_t *object = NULL;
 	gpu_mem_t *gpu_mem_ptr = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Special handling for scratch memory */
-	gpu_mem_ptr = fmm_is_scratch_aperture(address);
+	gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address);
 	if (gpu_mem_ptr) {
-		fmm_release_scratch(gpu_mem_ptr->gpu_id);
+		fmm_release_scratch(ctx, gpu_mem_ptr->gpu_id);
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	object = vm_find_object(address, 0, &aperture);
+	object = vm_find_object(fmm_ctx, address, 0, &aperture);
 
 	if (!object)
 		return hsakmt_is_svm_api_supported ?
@@ -2207,14 +2262,15 @@ HSAKMT_STATUS hsakmt_fmm_release(void *address)
 	} else {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 
-		if (__fmm_release(object, aperture))
+		if (__fmm_release(ctx, object, aperture))
 			return HSAKMT_STATUS_ERROR;
 	}
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_policy,
+static int fmm_set_memory_policy(HsaKFDContext *ctx,
+				 uint32_t gpu_id, int default_policy, int alt_policy,
 				 uintptr_t alt_base, uint64_t alt_size,
 				 uint32_t misc_process_flags)
 {
@@ -2227,7 +2283,7 @@ static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_po
 	args.alternate_aperture_size = alt_size;
 	args.misc_process_flag = misc_process_flags;
 
-	return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
+	return hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
 }
 
 static uint32_t get_vm_alignment(uint32_t device_id)
@@ -2242,7 +2298,7 @@ static uint32_t get_vm_alignment(uint32_t device_id)
 	return MAX(PAGE_SIZE, page_size);
 }
 
-static HSAKMT_STATUS get_process_apertures(
+static HSAKMT_STATUS get_process_apertures(HsaKFDContext *ctx,
 	struct kfd_process_device_apertures *process_apertures,
 	uint32_t *num_of_nodes)
 {
@@ -2251,7 +2307,7 @@ static HSAKMT_STATUS get_process_apertures(
 
 	args_new.kfd_process_device_apertures_ptr = (uintptr_t)process_apertures;
 	args_new.num_of_nodes = *num_of_nodes;
-	if (!hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
+	if (!hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
 		      (void *)&args_new)) {
 		*num_of_nodes = args_new.num_of_nodes;
 		return HSAKMT_STATUS_SUCCESS;
@@ -2261,7 +2317,7 @@ static HSAKMT_STATUS get_process_apertures(
 	 * a really old kernel */
 	memset(&args_old, 0, sizeof(args_old));
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES,
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_PROCESS_APERTURES,
 		     (void *)&args_old))
 		return HSAKMT_STATUS_ERROR;
 
@@ -2274,29 +2330,18 @@ static HSAKMT_STATUS get_process_apertures(
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-/* The VMs from DRM render nodes are used by KFD for the lifetime of
- * the process. Therefore we have to keep using the same FDs for the
- * lifetime of the process, even when we close and reopen KFD. There
- * are up to 128 render nodes that we cache in this array.
- */
-#define DRM_FIRST_RENDER_NODE 128
-#define DRM_LAST_RENDER_NODE 255
-static int drm_render_fds[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];
-
-/* amdgpu device handle for each gpu that libdrm uses */
-static struct amdgpu_device *amdgpu_handle[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];
-
-int hsakmt_open_drm_render_device(int minor)
+int hsakmt_open_drm_render_device(HsaKFDContext *ctx, int minor)
 {
 	char path[128];
 	int index, fd;
 	uint32_t major_drm, minor_drm;
 	struct amdgpu_device **device_handle;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	/* Bypass amdgpu if we're running a model. Return hsakmt_kfd_fd, which is the
+	/* Bypass amdgpu if we're running a model. Return ctx->fd, which is the
 	 * backing for all our "GPU" memory. */
 	if (hsakmt_use_model)
-		return hsakmt_kfd_fd;
+		return ctx->fd;
 
 	if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
 		pr_err("DRM render minor %d out of range [%d, %d]\n", minor,
@@ -2306,8 +2351,8 @@ int hsakmt_open_drm_render_device(int minor)
 	index = minor - DRM_FIRST_RENDER_NODE;
 
 	/* If the render node was already opened, keep using the same FD */
-	if (drm_render_fds[index])
-		return drm_render_fds[index];
+	if (fmm_ctx->drm_render_fds[index])
+		return fmm_ctx->drm_render_fds[index];
 
 	sprintf(path, "/dev/dri/renderD%d", minor);
 	fd = open(path, O_RDWR | O_CLOEXEC);
@@ -2319,9 +2364,9 @@ int hsakmt_open_drm_render_device(int minor)
 		}
 		return -errno;
 	}
-	drm_render_fds[index] = fd;
+	fmm_ctx->drm_render_fds[index] = fd;
 
-	device_handle = &amdgpu_handle[index];
+	device_handle = &fmm_ctx->amdgpu_handle[index];
 	if (!amdgpu_device_initialize(fd, &major_drm, &minor_drm, device_handle)) {
 		/* if amdgpu_device_get_fd available query render fd that libdrm uses,
 		 * then close drm_render_fds above, replace it by fd libdrm uses.
@@ -2329,8 +2374,8 @@ int hsakmt_open_drm_render_device(int minor)
 		if (hsakmt_fn_amdgpu_device_get_fd) {
 			fd = hsakmt_fn_amdgpu_device_get_fd(*device_handle);
 			if (fd > 0) {
-				close(drm_render_fds[index]);
-				drm_render_fds[index] = fd;
+				close(fmm_ctx->drm_render_fds[index]);
+				fmm_ctx->drm_render_fds[index] = fd;
 			} else {
 				pr_err("amdgpu_device_get_fd failed: %d\n", fd);
 				amdgpu_device_deinitialize(*device_handle);
@@ -2342,14 +2387,14 @@ int hsakmt_open_drm_render_device(int minor)
 	return fd;
 }
 
-static HSAKMT_STATUS acquire_vm(uint32_t gpu_id, int fd)
+static HSAKMT_STATUS acquire_vm(HsaKFDContext *ctx, uint32_t gpu_id, int fd)
 {
 	struct kfd_ioctl_acquire_vm_args args;
 
 	args.gpu_id = gpu_id;
 	args.drm_fd = fd;
 	pr_info("acquiring VM for %x using %d\n", gpu_id, fd);
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ACQUIRE_VM, (void *)&args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ACQUIRE_VM, (void *)&args)) {
 		pr_err("AMDKFD_IOC_ACQUIRE_VM failed\n");
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -2565,10 +2610,10 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static void fmm_init_rbtree(void)
+static void fmm_init_rbtree(struct hsa_kfd_fmm_context *fmm_ctx)
 {
 	static int once;
-	int i = gpu_mem_count;
+	int i = fmm_ctx->gpu_mem_count;
 
 	if (once++ == 0) {
 		rbtree_init(&svm.apertures[SVM_DEFAULT].tree);
@@ -2582,14 +2627,15 @@ static void fmm_init_rbtree(void)
 	}
 
 	while (i--) {
-		rbtree_init(&gpu_mem[i].scratch_physical.tree);
-		rbtree_init(&gpu_mem[i].scratch_physical.user_tree);
-		rbtree_init(&gpu_mem[i].gpuvm_aperture.tree);
-		rbtree_init(&gpu_mem[i].gpuvm_aperture.user_tree);
+		rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.tree);
+		rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.user_tree);
+		rbtree_init(&fmm_ctx->gpu_mem[i].gpuvm_aperture.tree);
+		rbtree_init(&fmm_ctx->gpu_mem[i].gpuvm_aperture.user_tree);
 	}
 }
 
-static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
+static void *map_mmio(HsaKFDContext *ctx,
+				uint32_t node_id, uint32_t gpu_id, int mmap_fd)
 {
 	void *mem;
 	manageable_aperture_t *aperture = svm.dgpu_alt_aperture;
@@ -2603,7 +2649,8 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
 	ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP |
 		KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 		KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
-	mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture,
+	mem = __fmm_allocate_device(ctx,
+			gpu_id, NULL, PAGE_SIZE, aperture,
 			&mmap_offset, ioc_flags, 0, &vm_obj);
 
 	if (!mem || !vm_obj)
@@ -2628,36 +2675,39 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
 			 MAP_SHARED | MAP_FIXED, mmap_fd,
 			 mmap_offset);
 	if (ret == MAP_FAILED) {
-		__fmm_release(vm_obj, aperture);
+		__fmm_release(ctx, vm_obj, aperture);
 		return NULL;
 	}
 
 	/* Map for GPU access*/
-	if (hsakmt_fmm_map_to_gpu(mem, PAGE_SIZE, NULL)) {
-		__fmm_release(vm_obj, aperture);
+	if (hsakmt_fmm_map_to_gpu(ctx, mem, PAGE_SIZE, NULL)) {
+		__fmm_release(ctx, vm_obj, aperture);
 		return NULL;
 	}
 
 	return mem;
 }
 
-static void release_mmio(void)
+static void release_mmio(HsaKFDContext *ctx)
 {
 	uint32_t gpu_mem_id;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) {
-		if (!gpu_mem[gpu_mem_id].mmio_aperture.base)
+	for (gpu_mem_id = 0; gpu_mem_id < fmm_ctx->gpu_mem_count; gpu_mem_id++) {
+		if (!fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base)
 			continue;
-		hsakmt_fmm_unmap_from_gpu(gpu_mem[gpu_mem_id].mmio_aperture.base);
-		munmap(gpu_mem[gpu_mem_id].mmio_aperture.base, PAGE_SIZE);
-		hsakmt_fmm_release(gpu_mem[gpu_mem_id].mmio_aperture.base);
+		hsakmt_fmm_unmap_from_gpu(ctx, fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base);
+		munmap(fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base, PAGE_SIZE);
+		hsakmt_fmm_release(ctx, fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base);
 	}
 }
 
-HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id,
+HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(HsaKFDContext *ctx,
+						uint32_t node_id,
 						HsaAMDGPUDeviceHandle *DeviceHandle)
 {
-	int32_t i = gpu_mem_find_by_node_id(node_id);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	int32_t i = gpu_mem_find_by_node_id(fmm_ctx, node_id);
 	int index;
 
 	if (i < 0)
@@ -2668,11 +2718,11 @@ HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id,
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	index = gpu_mem[i].drm_render_minor - DRM_FIRST_RENDER_NODE;
-	if (!amdgpu_handle[index])
+	index = fmm_ctx->gpu_mem[i].drm_render_minor - DRM_FIRST_RENDER_NODE;
+	if (!fmm_ctx->amdgpu_handle[index])
 		return HSAKMT_STATUS_INVALID_HANDLE;
 
-	*DeviceHandle = amdgpu_handle[index];
+	*DeviceHandle = fmm_ctx->amdgpu_handle[index];
 	return HSAKMT_STATUS_SUCCESS;
 }
 
@@ -2681,7 +2731,7 @@ static bool two_apertures_overlap(void *start_1, void *limit_1, void *start_2, v
     return (start_1 >= start_2 && start_1 <= limit_2) || (start_2 >= start_1 && start_2 <= limit_1);
 }
 
-static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages)
+static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuint32 align, HSAuint32 guard_pages)
 {
 	bool found;
 	uint32_t i;
@@ -2695,24 +2745,24 @@ static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages)
 	while (PORT_VPTR_TO_UINT64(mem_handle_aperture.base) < END_NON_CANONICAL_ADDR - 1) {
 
 		found = true;
-		for (i = 0; i < gpu_mem_count; i++) {
+		for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
 
-			if (gpu_mem[i].lds_aperture.base &&
-				two_apertures_overlap(gpu_mem[i].lds_aperture.base, gpu_mem[i].lds_aperture.limit,
+			if (fmm_ctx->gpu_mem[i].lds_aperture.base &&
+				two_apertures_overlap(fmm_ctx->gpu_mem[i].lds_aperture.base, fmm_ctx->gpu_mem[i].lds_aperture.limit,
 									mem_handle_aperture.base, mem_handle_aperture.limit)) {
 					found = false;
 					break;
 			}
 
-			if (gpu_mem[i].scratch_aperture.base &&
-				two_apertures_overlap(gpu_mem[i].scratch_aperture.base, gpu_mem[i].scratch_aperture.limit,
+			if (fmm_ctx->gpu_mem[i].scratch_aperture.base &&
+				two_apertures_overlap(fmm_ctx->gpu_mem[i].scratch_aperture.base, fmm_ctx->gpu_mem[i].scratch_aperture.limit,
 									mem_handle_aperture.base, mem_handle_aperture.limit)){
 					found = false;
 					break;
 			}
 
-			if (gpu_mem[i].gpuvm_aperture.base &&
-			   two_apertures_overlap(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit,
+			if (fmm_ctx->gpu_mem[i].gpuvm_aperture.base &&
+			   two_apertures_overlap(fmm_ctx->gpu_mem[i].gpuvm_aperture.base, fmm_ctx->gpu_mem[i].gpuvm_aperture.limit,
 									mem_handle_aperture.base, mem_handle_aperture.limit)){
 					found = false;
 					break;
@@ -2737,10 +2787,13 @@ static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages)
 	return false;
 }
 
-HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
+HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
+				unsigned int NumNodes)
 {
 	uint32_t i;
+	uint32_t gpu_mem_count = 0;
 	int32_t gpu_mem_id = 0;
+	gpu_mem_t *gpu_mem = NULL;
 	struct kfd_process_device_apertures *process_apertures;
 	uint32_t num_of_sysfs_nodes;
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
@@ -2801,8 +2854,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 	}
 	pr_info("SVM alignment default order is %d.", svm.alignment_order);
 
-	gpu_mem_count = 0;
-	g_first_gpu_mem = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
 	 * systems with CPU node, slightly more memory is allocated than
@@ -2830,7 +2882,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 
 		/* Skip non-GPU nodes */
 		if (props.KFDGpuID) {
-			int fd = hsakmt_open_drm_render_device(props.DrmRenderMinor);
+			int fd = hsakmt_open_drm_render_device(ctx, props.DrmRenderMinor);
 			if (fd <= 0) {
 				ret = HSAKMT_STATUS_ERROR;
 				goto gpu_mem_init_failed;
@@ -2867,13 +2919,14 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 			gpu_mem[gpu_mem_count].gpuvm_aperture.ops = &reserved_aperture_ops;
 			pthread_mutex_init(&gpu_mem[gpu_mem_count].gpuvm_aperture.fmm_mutex, NULL);
 
-			if (!g_first_gpu_mem)
-				g_first_gpu_mem = &gpu_mem[gpu_mem_count];
-
 			gpu_mem_count++;
 		}
 	}
 
+	fmm_ctx->gpu_mem = gpu_mem;
+	fmm_ctx->gpu_mem_count = gpu_mem_count;
+	fmm_ctx->first_gpu_mem = gpu_mem_count ? gpu_mem : NULL; /* stay NULL when no GPU node was found */
+
 	/* The ioctl will also return Number of Nodes if
 	 * args.kfd_process_device_apertures_ptr is set to NULL. This is not
 	 * required since Number of nodes is already known. Kernel will fill in
@@ -2895,7 +2948,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 	 * The Kernel driver could be not aware of this.
 	 * Get from Kernel driver information of all the nodes and then filter it.
 	 */
-	ret = get_process_apertures(process_apertures, &num_of_sysfs_nodes);
+	ret = get_process_apertures(ctx, process_apertures, &num_of_sysfs_nodes);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		goto get_aperture_ioctl_failed;
 
@@ -2918,7 +2971,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 		/* Map Kernel process device data node i <--> gpu_mem_id which
 		 * indexes into gpu_mem[] based on gpu_id
 		 */
-		gpu_mem_id = gpu_mem_find_by_gpu_id(process_apertures[i].gpu_id);
+		gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, process_apertures[i].gpu_id);
 		if (gpu_mem_id < 0)
 			continue;
 
@@ -2943,7 +2996,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 			goto aperture_init_failed;
 		for (j = 0; j < nodeProps.NumIOLinks; j++) {
 			int32_t to_gpu_mem_id =
-				gpu_mem_find_by_node_id(linkProps[j].NodeTo);
+				gpu_mem_find_by_node_id(fmm_ctx, linkProps[j].NodeTo);
 			uint32_t peer;
 
 			if (to_gpu_mem_id < 0)
@@ -3002,7 +3055,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 		}
 
 		/* Acquire the VM from the DRM render node for KFD use */
-		ret = acquire_vm(gpu_mem[gpu_mem_id].gpu_id,
+		ret = acquire_vm(ctx,
+				 gpu_mem[gpu_mem_id].gpu_id,
 				 gpu_mem[gpu_mem_id].drm_render_fd);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto aperture_init_failed;
@@ -3030,7 +3084,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 			alt_base = (uintptr_t)svm.dgpu_alt_aperture->base;
 			alt_size = VOID_PTRS_SUB(svm.dgpu_alt_aperture->limit,
 				svm.dgpu_alt_aperture->base) + 1;
-			err = fmm_set_memory_policy(process_apertures[i].gpu_id,
+			err = fmm_set_memory_policy(ctx,
+						    process_apertures[i].gpu_id,
 						    svm.disable_cache ?
 						    KFD_IOC_CACHE_POLICY_COHERENT :
 						    KFD_IOC_CACHE_POLICY_NONCOHERENT,
@@ -3050,18 +3105,18 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
 	cpuvm_aperture.align = PAGE_SIZE;
 	cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */
 
-	fmm_init_rbtree();
+	fmm_init_rbtree(fmm_ctx);
 
-	if (!init_mem_handle_aperture(PAGE_SIZE, guardPages))
+	if (!init_mem_handle_aperture(fmm_ctx, PAGE_SIZE, guardPages))
 		pr_err("Failed to init mem_handle_aperture\n");
 
 	for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) {
 		if (!hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId))
 			continue;
-		gpu_mem[gpu_mem_id].mmio_aperture.base = map_mmio(
+		gpu_mem[gpu_mem_id].mmio_aperture.base = map_mmio(ctx,
 				gpu_mem[gpu_mem_id].node_id,
 				gpu_mem[gpu_mem_id].gpu_id,
-				hsakmt_kfd_fd);
+				ctx->fd);
 		if (gpu_mem[gpu_mem_id].mmio_aperture.base)
 			gpu_mem[gpu_mem_id].mmio_aperture.limit = (void *)
 			((char *)gpu_mem[gpu_mem_id].mmio_aperture.base +
@@ -3083,13 +3138,15 @@ get_aperture_ioctl_failed:
 	free(process_apertures);
 sysfs_parse_failed:
 gpu_mem_init_failed:
-	hsakmt_fmm_destroy_process_apertures();
+	hsakmt_fmm_destroy_process_apertures(ctx);
 	return ret;
 }
 
-void hsakmt_fmm_destroy_process_apertures(void)
+void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx)
 {
-	release_mmio();
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+
+	release_mmio(ctx);
 
 	if (all_gpu_id_array) {
 		free(all_gpu_id_array);
@@ -3097,48 +3154,51 @@ void hsakmt_fmm_destroy_process_apertures(void)
 	}
 	all_gpu_id_array_size = 0;
 
-	if (gpu_mem) {
-		while (gpu_mem_count-- > 0)
-			free(gpu_mem[gpu_mem_count].usable_peer_id_array);
-		free(gpu_mem);
-		gpu_mem = NULL;
+	if (fmm_ctx->gpu_mem) {
+		while (fmm_ctx->gpu_mem_count-- > 0)
+			free(fmm_ctx->gpu_mem[fmm_ctx->gpu_mem_count].usable_peer_id_array);
+		free(fmm_ctx->gpu_mem);
+		fmm_ctx->gpu_mem = NULL;
+		fmm_ctx->first_gpu_mem = NULL;
 	}
-	gpu_mem_count = 0;
+	fmm_ctx->gpu_mem_count = 0;
 }
 
-HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
+HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx,
+			aperture_type_e aperture_type, HSAuint32 gpu_id,
 			HSAuint64 *aperture_base, HSAuint64 *aperture_limit)
 {
 	HSAKMT_STATUS err = HSAKMT_STATUS_ERROR;
-	int32_t slot = gpu_mem_find_by_gpu_id(gpu_id);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	int32_t slot = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 
 	if (slot < 0)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
 	switch (aperture_type) {
 	case FMM_GPUVM:
-		if (aperture_is_valid(gpu_mem[slot].gpuvm_aperture.base,
-			gpu_mem[slot].gpuvm_aperture.limit)) {
-			*aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.base);
-			*aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.limit);
+		if (aperture_is_valid(fmm_ctx->gpu_mem[slot].gpuvm_aperture.base,
+			fmm_ctx->gpu_mem[slot].gpuvm_aperture.limit)) {
+			*aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].gpuvm_aperture.base);
+			*aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].gpuvm_aperture.limit);
 			err = HSAKMT_STATUS_SUCCESS;
 		}
 		break;
 
 	case FMM_SCRATCH:
-		if (aperture_is_valid(gpu_mem[slot].scratch_aperture.base,
-			gpu_mem[slot].scratch_aperture.limit)) {
-			*aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.base);
-			*aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.limit);
+		if (aperture_is_valid(fmm_ctx->gpu_mem[slot].scratch_aperture.base,
+			fmm_ctx->gpu_mem[slot].scratch_aperture.limit)) {
+			*aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].scratch_aperture.base);
+			*aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].scratch_aperture.limit);
 			err = HSAKMT_STATUS_SUCCESS;
 		}
 		break;
 
 	case FMM_LDS:
-		if (aperture_is_valid(gpu_mem[slot].lds_aperture.base,
-			gpu_mem[slot].lds_aperture.limit)) {
-			*aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.base);
-			*aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.limit);
+		if (aperture_is_valid(fmm_ctx->gpu_mem[slot].lds_aperture.base,
+			fmm_ctx->gpu_mem[slot].lds_aperture.limit)) {
+			*aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].lds_aperture.base);
+			*aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].lds_aperture.limit);
 			err = HSAKMT_STATUS_SUCCESS;
 		}
 		break;
@@ -3156,10 +3216,10 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_ty
 		break;
 
 	case FMM_MMIO:
-		if (aperture_is_valid(gpu_mem[slot].mmio_aperture.base,
-			gpu_mem[slot].mmio_aperture.limit)) {
-			*aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].mmio_aperture.base);
-			*aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].mmio_aperture.limit);
+		if (aperture_is_valid(fmm_ctx->gpu_mem[slot].mmio_aperture.base,
+			fmm_ctx->gpu_mem[slot].mmio_aperture.limit)) {
+			*aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].mmio_aperture.base);
+			*aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].mmio_aperture.limit);
 			err = HSAKMT_STATUS_SUCCESS;
 		}
 		break;
@@ -3241,7 +3301,8 @@ static void add_device_ids_to_mapped_array(vm_object_t *obj,
 
 
 /* If nodes_to_map is not NULL, map the nodes specified; otherwise map all. */
-static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
+static HSAKMT_STATUS _fmm_map_to_gpu(HsaKFDContext *ctx,
+			manageable_aperture_t *aperture,
 			void *address, uint64_t size, vm_object_t *obj,
 			uint32_t *nodes_to_map, uint32_t nodes_array_size)
 {
@@ -3250,6 +3311,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	int ret_ioctl;
 	uint32_t i;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (!obj)
 		pthread_mutex_lock(&aperture->fmm_mutex);
@@ -3285,14 +3347,14 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
 			sizeof(uint32_t);
 	} else {
 	/* not specified, not registered: map all GPUs */
-		int32_t gpu_mem_id = gpu_mem_find_by_node_id(obj->node_id);
+		int32_t gpu_mem_id = gpu_mem_find_by_node_id(fmm_ctx, obj->node_id);
 
 		if (!obj->userptr && hsakmt_get_device_id_by_node_id(obj->node_id) &&
 		    gpu_mem_id >= 0) {
 			args.device_ids_array_ptr = (uint64_t)
-				gpu_mem[gpu_mem_id].usable_peer_id_array;
+				fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_array;
 			args.n_devices =
-				gpu_mem[gpu_mem_id].usable_peer_id_num;
+				fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_num;
 		} else {
 			args.device_ids_array_ptr = (uint64_t)all_gpu_id_array;
 			args.n_devices = all_gpu_id_array_size / sizeof(uint32_t);
@@ -3303,7 +3365,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
 		args.n_success = 0;
 		args.handle = object->handles[i];
 
-		ret_ioctl = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args);
+		ret_ioctl = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args);
 		if (ret_ioctl) {
 			pr_err("GPU mapping failed (%d) for obj at %p, userptr %p, size %lu",
 				ret_ioctl, object->start, object->userptr, object->size);
@@ -3330,7 +3392,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
 err_map_failed:
 	while (ret && i--) {
 		args.handle = object->handles[i];
-		hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
+		hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
 	}
 exit_ok:
 err_object_not_found:
@@ -3339,7 +3401,8 @@ err_object_not_found:
 	return ret;
 }
 
-static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *aperture,
+static HSAKMT_STATUS _fmm_map_to_gpu_scratch(HsaKFDContext *ctx,
+				   uint32_t gpu_id, manageable_aperture_t *aperture,
 				   void *address, uint64_t size)
 {
 	int32_t gpu_mem_id;
@@ -3349,9 +3412,10 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_apertur
 	void *mmap_ret = NULL;
 	uint64_t mmap_offset = 0;
 	vm_object_t *obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
@@ -3363,33 +3427,35 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_apertur
 	    VOID_PTR_ADD(address, size - 1) > aperture->limit)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	is_debugger = hsakmt_debug_get_reg_status(gpu_mem[gpu_mem_id].node_id);
+	is_debugger = hsakmt_debug_get_reg_status(fmm_ctx->gpu_mem[gpu_mem_id].node_id);
 	flags = is_debugger ? KFD_IOC_ALLOC_MEM_FLAGS_GTT :
 			      KFD_IOC_ALLOC_MEM_FLAGS_VRAM;
 	flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
 	/* allocate object within the scratch backing aperture */
-	obj = fmm_allocate_memory_object(gpu_id, address, size,
+	obj = fmm_allocate_memory_object(ctx,
+					 gpu_id, address, size,
 					 aperture, &mmap_offset, flags);
 	if (!obj)
 		return HSAKMT_STATUS_INVALID_HANDLE;
 	/* Create a CPU mapping for the debugger */
 	mmap_ret = fmm_map_to_cpu(address, size, is_debugger,
-				  gpu_mem[gpu_mem_id].drm_render_fd,
+				  fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd,
 				  mmap_offset);
 	if (mmap_ret == MAP_FAILED) {
-		__fmm_release(obj, aperture);
+		__fmm_release(ctx, obj, aperture);
 		return HSAKMT_STATUS_ERROR;
 	}
 
 	/* map to GPU */
-	ret = _fmm_map_to_gpu(aperture, address, size, NULL, &gpu_id, sizeof(uint32_t));
+	ret = _fmm_map_to_gpu(ctx, aperture, address, size, NULL, &gpu_id, sizeof(uint32_t));
 	if (ret != HSAKMT_STATUS_SUCCESS)
-		__fmm_release(obj, aperture);
+		__fmm_release(ctx, obj, aperture);
 
 	return ret;
 }
 
-static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size,
+static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx,
+					     void *addr, uint64_t size,
 					     uint64_t *gpuvm_addr, vm_object_t *object,
 					     uint32_t *nodes_to_map, uint32_t nodes_array_size)
 {
@@ -3411,14 +3477,14 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size,
 		}
 		pr_debug("%s Mapping Address %p size aligned: %ld offset: %x\n",
 			__func__, svm_addr, PAGE_ALIGN_UP(page_offset + size), page_offset);
-		ret = fmm_map_mem_svm_api(svm_addr,
+		ret = fmm_map_mem_svm_api(ctx, svm_addr,
 						  PAGE_ALIGN_UP(page_offset + size),
 						  nodes_to_map,
 						  nodes_array_size / sizeof(uint32_t));
 
 	} else if (object) {
 		svm_addr = object->start;
-		ret = _fmm_map_to_gpu(aperture, svm_addr, object->size, object, NULL, 0);
+		ret = _fmm_map_to_gpu(ctx, aperture, svm_addr, object->size, object, NULL, 0);
 	} else {
 		pr_err("Object is null and SVM API is not supported.\n");
 		return HSAKMT_STATUS_ERROR;
@@ -3429,22 +3495,24 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size,
 	return ret;
 }
 
-HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address)
+HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx,
+				void *address, uint64_t size, uint64_t *gpuvm_address)
 {
 	manageable_aperture_t *aperture = NULL;
 	vm_object_t *object;
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	gpu_mem_t *gpu_mem_ptr = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Special handling for scratch memory */
-	gpu_mem_ptr = fmm_is_scratch_aperture(address);
+	gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address);
 	if (gpu_mem_ptr) {
-		return _fmm_map_to_gpu_scratch(gpu_mem_ptr->gpu_id,
+		return _fmm_map_to_gpu_scratch(ctx, gpu_mem_ptr->gpu_id,
 							&gpu_mem_ptr->scratch_physical,
 							address, size);
 	}
 
-	object = vm_find_object(address, size, &aperture);
+	object = vm_find_object(fmm_ctx, address, size, &aperture);
 	if (!object && !hsakmt_is_svm_api_supported) {
 		if (!hsakmt_is_dgpu) {
 			/* Prefetch memory on APUs with dummy-reads */
@@ -3473,9 +3541,9 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuv
 		fmm_check_user_memory(address, size);
 		ret = HSAKMT_STATUS_SUCCESS;
 	} else if ((hsakmt_is_svm_api_supported && !object) || (object && (object->userptr))) {
-		ret = _fmm_map_to_gpu_userptr(address, size, gpuvm_address, object, NULL, 0);
+		ret = _fmm_map_to_gpu_userptr(ctx, address, size, gpuvm_address, object, NULL, 0);
 	} else if (aperture) {
-		ret = _fmm_map_to_gpu(aperture, address, size, object, NULL, 0);
+		ret = _fmm_map_to_gpu(ctx, aperture, address, size, object, NULL, 0);
 		/* Update alternate GPUVM address only for
 		 * CPU-invisible apertures on old APUs
 		 */
@@ -3500,7 +3568,8 @@ static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_
 #endif
 }
 
-static int _fmm_unmap_from_gpu(manageable_aperture_t *aperture, void *address,
+static int _fmm_unmap_from_gpu(HsaKFDContext *ctx,
+		manageable_aperture_t *aperture, void *address,
 		uint32_t *device_ids_array, uint32_t device_ids_array_size,
 		vm_object_t *obj)
 {
@@ -3553,7 +3622,7 @@ static int _fmm_unmap_from_gpu(manageable_aperture_t *aperture, void *address,
 		args.handle = object->handles[i];
 		args.n_success = 0;
 
-		tmp_ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
+		tmp_ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
 		if (tmp_ret)
 			ret = tmp_ret;
 	}
@@ -3574,7 +3643,8 @@ out:
 	return ret;
 }
 
-static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
+static int _fmm_unmap_from_gpu_scratch(HsaKFDContext *ctx,
+				       uint32_t gpu_id,
 				       manageable_aperture_t *aperture,
 				       void *address)
 {
@@ -3582,9 +3652,10 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 	vm_object_t *object;
 	struct kfd_ioctl_unmap_memory_from_gpu_args args = {0};
 	int ret;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Retrieve gpu_mem id according to gpu_id */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id);
 	if (gpu_mem_id < 0)
 		return -1;
 
@@ -3611,7 +3682,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 	args.device_ids_array_ptr = (uint64_t)object->mapped_device_id_array;
 	args.n_devices = object->mapped_device_id_array_size / sizeof(uint32_t);
 	args.n_success = 0;
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
+	ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
 
 	/* unmap from CPU while keeping the address space reserved */
 	mmap(address, object->size, PROT_NONE,
@@ -3632,29 +3703,31 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 
 	/* free object in scratch backing aperture */
-	return __fmm_release(object, aperture);
+	return __fmm_release(ctx, object, aperture);
 
 err:
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 	return ret;
 }
 
-int hsakmt_fmm_unmap_from_gpu(void *address)
+int hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address)
 {
 	manageable_aperture_t *aperture;
 	vm_object_t *object;
 	int ret;
 	gpu_mem_t *gpu_mem_ptr = NULL;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Special handling for scratch memory */
-	gpu_mem_ptr = fmm_is_scratch_aperture(address);
+	gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address);
 	if (gpu_mem_ptr) {
-		return _fmm_unmap_from_gpu_scratch(gpu_mem_ptr->gpu_id,
-							&gpu_mem_ptr->scratch_physical,
-							address);
+		return _fmm_unmap_from_gpu_scratch(ctx,
+					    gpu_mem_ptr->gpu_id,
+					    &gpu_mem_ptr->scratch_physical,
+					    address);
 	}
 
-	object = vm_find_object(address, 0, &aperture);
+	object = vm_find_object(fmm_ctx, address, 0, &aperture);
 	if (!object)
 		/* On APUs GPU unmapping of system memory is a no-op */
 		return (!hsakmt_is_dgpu || hsakmt_is_svm_api_supported) ? 0 : -EINVAL;
@@ -3664,7 +3737,7 @@ int hsakmt_fmm_unmap_from_gpu(void *address)
 		/* On APUs GPU unmapping of system memory is a no-op */
 		ret = 0;
 	else
-		ret = _fmm_unmap_from_gpu(aperture, address, NULL, 0, object);
+		ret = _fmm_unmap_from_gpu(ctx, aperture, address, NULL, 0, object);
 
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 
@@ -3681,24 +3754,23 @@ int hsakmt_fmm_unmap_from_gpu(void *address)
  *
  * Returns true if the handle is found, false otherwise.
  */
-bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offset)
+bool hsakmt_fmm_get_handle(HsaKFDContext *ctx,
+						 void *address, uint64_t *handle, uint64_t *size_offset)
 {
 	uint32_t i;
-	manageable_aperture_t *aperture;
+	manageable_aperture_t *aperture = NULL;
 	vm_object_t *object;
-	bool found;
-
-	found = false;
-	aperture = NULL;
+	bool found = false;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Find the aperture the requested address belongs to */
-	for (i = 0; i < gpu_mem_count; i++) {
-		if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
+	for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
+		if (fmm_ctx->gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
 			continue;
 
-		if ((address >= gpu_mem[i].gpuvm_aperture.base) &&
-			(address <= gpu_mem[i].gpuvm_aperture.limit)) {
-			aperture = &gpu_mem[i].gpuvm_aperture;
+		if ((address >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base) &&
+			(address <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit)) {
+			aperture = &fmm_ctx->gpu_mem[i].gpuvm_aperture;
 			break;
 		}
 	}
@@ -3744,7 +3816,8 @@ bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offse
 	return found;
 }
 
-static HSAKMT_STATUS fmm_register_user_memory(void *addr,
+static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx,
+						void *addr,
 						HSAuint64 size,
 						vm_object_t **obj_ret,
 						HsaMemFlags flags)
@@ -3756,19 +3829,21 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
 	void *svm_addr;
 	HSAuint32 gpu_id;
 	vm_object_t *obj, *exist_obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* Find first GPU for creating the userptr BO */
-	if (!g_first_gpu_mem)
+	if (!fmm_ctx->first_gpu_mem)
 		return HSAKMT_STATUS_ERROR;
 
-	gpu_id = g_first_gpu_mem->gpu_id;
+	gpu_id = fmm_ctx->first_gpu_mem->gpu_id;
 
 	/* Optionally check that the CPU mapping is valid */
 	if (svm.check_userptr)
 		fmm_check_user_memory(addr, size);
 
 	/* Allocate BO, userptr address is passed in mmap_offset */
-	svm_addr = __fmm_allocate_device(gpu_id, NULL, aligned_size, aperture,
+	svm_addr = __fmm_allocate_device(ctx,
+			 gpu_id, NULL, aligned_size, aperture,
 			 &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
 			 KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 			 KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
@@ -3801,14 +3876,15 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 
 	if (exist_obj)
-		__fmm_release(obj, aperture);
+		__fmm_release(ctx, obj, aperture);
 
 	if (obj_ret)
 		*obj_ret = exist_obj ? exist_obj : obj;
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
+HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx,
+				  void *address, uint64_t size_in_bytes,
 				  uint32_t *gpu_id_array,
 				  uint32_t gpu_id_array_size,
 				  HsaMemFlags flags)
@@ -3816,6 +3892,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
 	manageable_aperture_t *aperture = NULL;
 	vm_object_t *object = NULL;
 	HSAKMT_STATUS ret;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (gpu_id_array_size > 0 && !gpu_id_array)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -3823,7 +3900,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
 	if (flags.ui32.CoarseGrain && flags.ui32.ExtendedCoherent)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	object = vm_find_object(address, size_in_bytes, &aperture);
+	object = vm_find_object(fmm_ctx, address, size_in_bytes, &aperture);
 	if (!object) {
 		if (!hsakmt_is_dgpu)
 			/* System memory registration on APUs is a no-op */
@@ -3831,12 +3908,12 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
 
 		/* Register a new user ptr */
 		if (hsakmt_is_svm_api_supported) {
-			ret = fmm_register_mem_svm_api(address, size_in_bytes, flags);
+			ret = fmm_register_mem_svm_api(ctx, address, size_in_bytes, flags);
 			if (ret == HSAKMT_STATUS_SUCCESS)
 				return ret;
 			pr_debug("SVM failed, falling back to old registration\n");
 		}
-		ret = fmm_register_user_memory(address, size_in_bytes, &object, flags);
+		ret = fmm_register_user_memory(ctx, address, size_in_bytes, &object, flags);
 
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			return ret;
@@ -3891,7 +3968,8 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
 }
 
 #define GRAPHICS_METADATA_DEFAULT_SIZE 64
-HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
+HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
+					   HSAuint64 GraphicsResourceHandle,
 					   HsaGraphicsResourceInfo *GraphicsResourceInfo,
 					   uint32_t *gpu_id_array,
 					   uint32_t gpu_id_array_size,
@@ -3909,6 +3987,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand
 	int r;
 	HSAKMT_STATUS status = HSAKMT_STATUS_ERROR;
 	static const uint64_t IMAGE_ALIGN = 256*1024;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (gpu_id_array_size > 0 && !gpu_id_array)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -3919,7 +3998,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand
 	if (!metadata)
 		return HSAKMT_STATUS_NO_MEMORY;
 	infoArgs.metadata_ptr = (uint64_t)metadata;
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
 	if (r && infoArgs.metadata_size > GRAPHICS_METADATA_DEFAULT_SIZE) {
 		/* Try again with bigger metadata */
 		free(metadata);
@@ -3927,24 +4006,24 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand
 		if (!metadata)
 			return HSAKMT_STATUS_NO_MEMORY;
 		infoArgs.metadata_ptr = (uint64_t)metadata;
-		r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
+		r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
 	}
 
 	if (r)
 		goto error_free_metadata;
 
 	/* Choose aperture based on GPU and allocate virtual address */
-	gpu_mem_id = gpu_mem_find_by_gpu_id(infoArgs.gpu_id);
+	gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, infoArgs.gpu_id);
 	if (gpu_mem_id < 0)
 		goto error_free_metadata;
 
 	/* import DMA buffer without VA assigned */
 	if (!gpu_id_array && gpu_id_array_size == 0 && !RegisterFlags.ui32.requiresVAddr) {
 		aperture = &mem_handle_aperture;
-	} else if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) {
+	} else if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) {
 		aperture = svm.dgpu_aperture;
 	} else {
-		aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
+		aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture;
 		aperture_base = aperture->base;
 	}
 	if (!aperture_is_valid(aperture->base, aperture->limit))
@@ -3965,7 +4044,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand
 
 	importArgs.gpu_id = infoArgs.gpu_id;
 	importArgs.dmabuf_fd = GraphicsResourceHandle;
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
 	if (r) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		goto error_release_aperture;
@@ -3996,7 +4075,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand
 
 error_release_buffer:
 	freeArgs.handle = importArgs.handle;
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
 		/* Handle error if memory is not freed properly */
 		pr_err("Failed to free GPU memory\n");
 	}
@@ -4008,7 +4087,8 @@ error_free_metadata:
 	return status;
 }
 
-HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
+HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(HsaKFDContext *ctx,
+				    void *MemoryAddress,
 				    HSAuint64 MemorySizeInBytes,
 				    int *DMABufFd,
 				    HSAuint64 *Offset)
@@ -4019,8 +4099,9 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
 	vm_object_t *obj;
 	HSAuint64 offset;
 	int r;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	aperture = fmm_find_aperture(MemoryAddress, &ApeInfo);
+	aperture = fmm_find_aperture(fmm_ctx, MemoryAddress, &ApeInfo);
 	if (!aperture)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
@@ -4040,7 +4121,7 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
 	if (!obj)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_EXPORT_DMABUF, (void *)&exportArgs);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_EXPORT_DMABUF, (void *)&exportArgs);
 	if (r)
 		return HSAKMT_STATUS_ERROR;
 
@@ -4050,7 +4131,8 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
+HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx,
+				void *MemoryAddress,
 				HSAuint64 SizeInBytes,
 				HsaSharedMemoryHandle *SharedMemoryHandle)
 {
@@ -4062,11 +4144,12 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
 	HsaApertureInfo ApeInfo;
 	HsaSharedMemoryStruct *SharedMemoryStruct =
 		to_hsa_shared_memory_struct(SharedMemoryHandle);
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (SizeInBytes >= (1ULL << ((sizeof(HSAuint32) * 8) + PAGE_SHIFT)))
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	aperture = fmm_find_aperture(MemoryAddress, &ApeInfo);
+	aperture = fmm_find_aperture(fmm_ctx, MemoryAddress, &ApeInfo);
 	if (!aperture)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
@@ -4083,16 +4166,16 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
 		/* Sharing non paged system memory. Use first GPU which was
 		 * used during allocation. See fmm_allocate_host_gpu()
 		 */
-		if (!g_first_gpu_mem)
+		if (!fmm_ctx->first_gpu_mem)
 			return HSAKMT_STATUS_ERROR;
 
-		gpu_id = g_first_gpu_mem->gpu_id;
+		gpu_id = fmm_ctx->first_gpu_mem->gpu_id;
 	}
 	exportArgs.handle = obj->handles[0];
 	exportArgs.gpu_id = gpu_id;
 	exportArgs.flags = obj->mflags.Value;
 
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_EXPORT_HANDLE, (void *)&exportArgs);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IPC_EXPORT_HANDLE, (void *)&exportArgs);
 	if (r)
 		return HSAKMT_STATUS_ERROR;
 
@@ -4105,7 +4188,8 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle,
+HSAKMT_STATUS hsakmt_fmm_register_shared_memory(HsaKFDContext *ctx,
+						const HsaSharedMemoryHandle *SharedMemoryHandle,
 						HSAuint64 *SizeInBytes,
 						void **MemoryAddress,
 						uint32_t *gpu_id_array,
@@ -4122,6 +4206,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 		to_const_hsa_shared_memory_struct(SharedMemoryHandle);
 	HSAuint64 SizeInPages = SharedMemoryStruct->SizeInPages;
 	HsaMemFlags mflags;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (gpu_id_array_size > 0 && !gpu_id_array)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -4130,7 +4215,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 			sizeof(importArgs.share_handle));
 	importArgs.gpu_id = SharedMemoryStruct->ExportGpuId;
 
-	aperture = fmm_get_aperture(SharedMemoryStruct->ApeInfo);
+	aperture = fmm_get_aperture(fmm_ctx, SharedMemoryStruct->ApeInfo);
 	if (!aperture)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
@@ -4143,7 +4228,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 	}
 
 	importArgs.va_addr = (uint64_t)reservedMem;
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_IMPORT_HANDLE, (void *)&importArgs);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IPC_IMPORT_HANDLE, (void *)&importArgs);
 	if (r) {
 		err = HSAKMT_STATUS_ERROR;
 		goto err_import;
@@ -4158,7 +4243,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 	}
 
 	if (importArgs.mmap_offset) {
-		int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(importArgs.gpu_id);
+		int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, importArgs.gpu_id);
 		void *ret;
 
 		if (gpu_mem_id < 0) {
@@ -4168,11 +4253,11 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 			err = HSAKMT_STATUS_ERROR;
 			goto err_free_mem;
 		}
-		obj->node_id = gpu_mem[gpu_mem_id].node_id;
+		obj->node_id = fmm_ctx->gpu_mem[gpu_mem_id].node_id;
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 
 		ret = fmm_map_to_cpu(reservedMem, (SizeInPages << PAGE_SHIFT),
-				true, gpu_mem[gpu_mem_id].drm_render_fd,
+				true, fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd,
 				importArgs.mmap_offset);
 
 		if (ret == MAP_FAILED) {
@@ -4199,7 +4284,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha
 	return HSAKMT_STATUS_SUCCESS;
 err_free_mem_handle:
 	freeArgs.handle = importArgs.handle;
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
 		pr_err("Failed to free GPU memory for handle %llu\n", freeArgs.handle);
 	}
 err_free_mem:
@@ -4209,12 +4294,13 @@ err_import:
 	return err;
 }
 
-HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address)
+HSAKMT_STATUS hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address)
 {
 	manageable_aperture_t *aperture;
 	vm_object_t *object;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	object = vm_find_object(address, 0, &aperture);
+	object = vm_find_object(fmm_ctx, address, 0, &aperture);
 	if (!object)
 		/* On APUs we assume it's a random system memory address
 		 * where registration and dergistration is a no-op
@@ -4238,7 +4324,7 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address)
 		 * userptrs means releasing the BO.
 		 */
 		pthread_mutex_unlock(&aperture->fmm_mutex);
-		__fmm_release(object, aperture);
+		__fmm_release(ctx, object, aperture);
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
@@ -4268,7 +4354,8 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address)
  * and maps nodes_to_map
  */
 
-HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
+HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx,
+		void *address, uint64_t size,
 		uint32_t *nodes_to_map, uint64_t num_of_nodes,
 		uint64_t *gpuvm_address)
 {
@@ -4278,11 +4365,12 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
 	uint32_t *registered_node_id_array, registered_node_id_array_size;
 	HSAKMT_STATUS ret;
 	int retcode = 0;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (!num_of_nodes || !nodes_to_map || !address)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	object = vm_find_object(address, size, &aperture);
+	object = vm_find_object(fmm_ctx, address, size, &aperture);
 	if (!object && !hsakmt_is_svm_api_supported)
 		return HSAKMT_STATUS_ERROR;
 	/* Successful vm_find_object returns with aperture locked */
@@ -4307,7 +4395,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
 	}
 
 	if ((hsakmt_is_svm_api_supported && !object) || object->userptr) {
-		retcode = _fmm_map_to_gpu_userptr(address, size, gpuvm_address,
+		retcode = _fmm_map_to_gpu_userptr(ctx, address, size, gpuvm_address,
 				object, nodes_to_map, num_of_nodes * sizeof(uint32_t));
 		if (object)
 			pthread_mutex_unlock(&aperture->fmm_mutex);
@@ -4345,7 +4433,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
 		temp_node_id_array_size *= sizeof(uint32_t);
 
 		if (temp_node_id_array_size) {
-			ret = _fmm_unmap_from_gpu(aperture, address,
+			ret = _fmm_unmap_from_gpu(ctx, aperture, address,
 					temp_node_id_array,
 					temp_node_id_array_size,
 					object);
@@ -4371,7 +4459,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
 	}
 
 	if (map_node_id_array_size)
-		retcode = _fmm_map_to_gpu(aperture, address, size, object,
+		retcode = _fmm_map_to_gpu(ctx, aperture, address, size, object,
 				map_node_id_array,
 				map_node_id_array_size * sizeof(uint32_t));
 
@@ -4383,16 +4471,18 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info)
+HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx,
+				const void *address, HsaPointerInfo *info)
 {
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	uint32_t i;
 	manageable_aperture_t *aperture;
 	vm_object_t *vm_obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	memset(info, 0, sizeof(HsaPointerInfo));
 
-	vm_obj = vm_find_object(address, UINT64_MAX, &aperture);
+	vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture);
 	if (!vm_obj) {
 		info->Type = HSA_POINTER_UNKNOWN;
 		return HSAKMT_STATUS_ERROR;
@@ -4468,13 +4558,14 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info)
 }
 
 #ifdef SANITIZER_AMDGPU
-HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address)
+HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(HsaKFDContext *ctx, void* address)
 {
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	manageable_aperture_t* aperture;
 	vm_object_t* vm_obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	vm_obj = vm_find_object(address, UINT64_MAX, &aperture);
+	vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture);
 	if (!vm_obj)
 		return HSAKMT_STATUS_ERROR;
 	/* Successful vm_find_object returns with the aperture locked */
@@ -4495,13 +4586,14 @@ HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address)
 	return ret;
 }
 
-HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address)
+HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(HsaKFDContext *ctx, void* address)
 {
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	manageable_aperture_t* aperture;
 	vm_object_t* vm_obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	vm_obj = vm_find_object(address, UINT64_MAX, &aperture);
+	vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture);
 	if (!vm_obj)
 		return HSAKMT_STATUS_ERROR;
 	/* Successful vm_find_object returns with the aperture locked */
@@ -4525,12 +4617,14 @@ HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address)
 }
 #endif
 
-HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data)
+HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(HsaKFDContext *ctx,
+					    const void *mem, void *usr_data)
 {
 	manageable_aperture_t *aperture;
 	vm_object_t *vm_obj;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	vm_obj = vm_find_object(mem, 0, &aperture);
+	vm_obj = vm_find_object(fmm_ctx, mem, 0, &aperture);
 	if (!vm_obj)
 		return HSAKMT_STATUS_ERROR;
 
@@ -4560,29 +4654,32 @@ static void fmm_clear_aperture(manageable_aperture_t *app)
  * after a fork(). This will clear all vm_objects and mmaps duplicated from
  * the parent.
  */
-void hsakmt_fmm_clear_all_mem(void)
+void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx)
 {
 	uint32_t i;
-
+	
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 	/* Close render node FDs. The child process needs to open new ones */
 	for (i = 0; i <= DRM_LAST_RENDER_NODE - DRM_FIRST_RENDER_NODE; i++) {
 
-		if (amdgpu_handle[i]) {
-			amdgpu_device_deinitialize(amdgpu_handle[i]);
-			amdgpu_handle[i] = NULL;
-		} else if (drm_render_fds[i]) {
-			close(drm_render_fds[i]);
+		if (fmm_ctx->amdgpu_handle[i]) {
+			amdgpu_device_deinitialize(fmm_ctx->amdgpu_handle[i]);
+			fmm_ctx->amdgpu_handle[i] = NULL;
+		} else if (fmm_ctx->drm_render_fds[i]) {
+			close(fmm_ctx->drm_render_fds[i]);
 		}
-		drm_render_fds[i] = 0;
+		fmm_ctx->drm_render_fds[i] = 0;
 	}
 
-	hsakmt_fmm_clear_all_aperture();
+	hsakmt_fmm_clear_all_aperture(ctx);
 }
 
-void hsakmt_fmm_clear_all_aperture(void)
+void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx)
 {
 	uint32_t i;
 	void *map_addr;
+	
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	fmm_clear_aperture(&mem_handle_aperture);
 	fmm_clear_aperture(&cpuvm_aperture);
@@ -4609,13 +4706,13 @@ void hsakmt_fmm_clear_all_aperture(void)
 	}
 
 	/* Nothing is initialized. */
-	if (!gpu_mem)
+	if (!fmm_ctx->gpu_mem)
 		return;
 
-	for (i = 0; i < gpu_mem_count; i++) {
-		fmm_clear_aperture(&gpu_mem[i].gpuvm_aperture);
-		fmm_clear_aperture(&gpu_mem[i].scratch_physical);
+	for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
+		fmm_clear_aperture(&fmm_ctx->gpu_mem[i].gpuvm_aperture);
+		fmm_clear_aperture(&fmm_ctx->gpu_mem[i].scratch_physical);
 	}
 
-	hsakmt_fmm_destroy_process_apertures();
+	hsakmt_fmm_destroy_process_apertures(ctx);
 }
diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.h b/projects/rocr-runtime/libhsakmt/src/fmm.h
index f98b129b5d..ebb2ce7900 100644
--- a/projects/rocr-runtime/libhsakmt/src/fmm.h
+++ b/projects/rocr-runtime/libhsakmt/src/fmm.h
@@ -45,59 +45,113 @@ typedef struct {
 	void *start_address;
 } aperture_properties_t;
 
-HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id,  HsaAMDGPUDeviceHandle *DeviceHandle);
-HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes);
-void hsakmt_fmm_destroy_process_apertures(void);
+HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(HsaKFDContext *ctx,
+						uint32_t node_id,  HsaAMDGPUDeviceHandle *DeviceHandle);
+HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, unsigned int NumNodes);
+void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx);
 
 /* Memory interface */
-void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes);
-void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
-			uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags);
-void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
-void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes,
-			uint64_t alignment, HsaMemFlags flags);
-void hsakmt_fmm_print(uint32_t node);
-HSAKMT_STATUS hsakmt_fmm_release(void *address);
-HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
-int hsakmt_fmm_unmap_from_gpu(void *address);
-bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offset);
-HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info);
-HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data);
+// Memory allocation/free functions
+void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx,
+						uint32_t gpu_id,
+						void *address,
+						uint64_t MemorySizeInBytes);
+
+void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
+						uint32_t gpu_id,
+						uint32_t node_id,
+						void *address,
+						uint64_t MemorySizeInBytes,
+						uint64_t alignment,
+						HsaMemFlags flags);
+
+void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx,
+						uint32_t gpu_id,
+						uint32_t node_id,
+						void *address,
+						uint64_t MemorySizeInBytes,
+						uint64_t alignment,
+						HsaMemFlags flags);
+
+void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx,
+						uint32_t gpu_id,
+						uint64_t MemorySizeInBytes,
+						uint64_t doorbell_offset);
+
+void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t node);
+HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address);
+
+// Memory mmap/munmap functions
+HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx,
+						void *address,
+						uint64_t size,
+						uint64_t *gpuvm_address);
+
+HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx,
+						void *address,
+						uint64_t size,
+						uint32_t *nodes_to_map,
+						uint64_t num_of_nodes,
+						uint64_t *gpuvm_address);
+
+int hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address);
+
+// Memory register/deregister functions
+HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx,
+						void *address, uint64_t size_in_bytes,
+						uint32_t *gpu_id_array,
+						uint32_t gpu_id_array_size,
+						HsaMemFlags flags);
+
+HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
+						HSAuint64 GraphicsResourceHandle,
+						HsaGraphicsResourceInfo *GraphicsResourceInfo,
+						uint32_t *gpu_id_array,
+						uint32_t gpu_id_array_size,
+						HSA_REGISTER_MEM_FLAGS RegisterFlags);
+
+HSAKMT_STATUS hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address);
+
+// Memory export functions
+HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(HsaKFDContext *ctx,
+						void *MemoryAddress,
+						HSAuint64 MemorySizeInBytes,
+						int *DMABufFd,
+						HSAuint64 *Offset);
+
+HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx,
+						void *MemoryAddress,
+						HSAuint64 SizeInBytes,
+						HsaSharedMemoryHandle *SharedMemoryHandle);
+
+HSAKMT_STATUS hsakmt_fmm_register_shared_memory(HsaKFDContext *ctx,
+						const HsaSharedMemoryHandle *SharedMemoryHandle,
+						HSAuint64 *SizeInBytes,
+						void **MemoryAddress,
+						uint32_t *gpu_id_array,
+						uint32_t gpu_id_array_size);
+
+bool hsakmt_fmm_get_handle(HsaKFDContext *ctx,
+						void *address,
+						uint64_t *handle,
+						uint64_t *size_offset);
+HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx,
+						const void *address,
+						 HsaPointerInfo *info);
+HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(HsaKFDContext *ctx,
+						const void *mem,
+						void *usr_data);
 #ifdef SANITIZER_AMDGPU
-HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address);
-HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address);
+HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(HsaKFDContext *ctx, void* address);
+HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(HsaKFDContext *ctx, void* address);
 #endif
 
 /* Topology interface*/
-HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
+HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx,
+		aperture_type_e aperture_type, HSAuint32 gpu_id,
 		HSAuint64 *aperture_base, HSAuint64 *aperture_limit);
 
-HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
-								  uint32_t *gpu_id_array,
-								  uint32_t gpu_id_array_size,
-								  HsaMemFlags flags);
-HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
-					   HsaGraphicsResourceInfo *GraphicsResourceInfo,
-					   uint32_t *gpu_id_array,
-					   uint32_t gpu_id_array_size,
-					   HSA_REGISTER_MEM_FLAGS RegisterFlags);
-HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address);
-HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
-				    HSAuint64 MemorySizeInBytes,
-				    int *DMABufFd,
-				    HSAuint64 *Offset);
-HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
-			       HSAuint64 SizeInBytes,
-			       HsaSharedMemoryHandle *SharedMemoryHandle);
-HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle,
-					 HSAuint64 *SizeInBytes,
-					 void **MemoryAddress,
-					 uint32_t *gpu_id_array,
-					 uint32_t gpu_id_array_size);
-HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
-		uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address);
-
-int hsakmt_open_drm_render_device(int minor);
+int hsakmt_open_drm_render_device(HsaKFDContext *ctx, int minor);
 void *hsakmt_mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align,
 			    uint64_t guard_size, void *aper_base, void *aper_limit, int fd);
 
diff --git a/projects/rocr-runtime/libhsakmt/src/globals.c b/projects/rocr-runtime/libhsakmt/src/globals.c
index 9a36e6f5cc..fa9799c9d5 100644
--- a/projects/rocr-runtime/libhsakmt/src/globals.c
+++ b/projects/rocr-runtime/libhsakmt/src/globals.c
@@ -27,10 +27,8 @@
 
 // HSAKMT global data
 
-int hsakmt_kfd_fd = -1;
 int hsakmt_udmabuf_dev_fd = -1;
 unsigned long hsakmt_kfd_open_count;
-unsigned long hsakmt_system_properties_count;
 pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER;
 bool hsakmt_is_dgpu;
 
diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
new file mode 100644
index 0000000000..b2f04dbcdf
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
@@ -0,0 +1,827 @@
+/*
+ * Copyright © 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _HSAKMTCTX_H_
+#define _HSAKMTCTX_H_
+
+#include "hsakmt/hsakmttypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _HsaKFDContext HsaKFDContext;
+
+/**
+  The context-aware version for opening the kfd device.
+
+  "Opens" the HSA kernel driver for user-kernel mode communication.
+
+  On Windows, this function gets a handle to the KFD's AMDKFDIO device object that
+  is responsible for user-kernel communication, this handle is used internally by
+  the thunk library to send device I/O control to the HSA kernel driver.
+  No other thunk library function may be called unless the user-kernel communication
+  channel is opened first.
+
+  On Linux this call opens the "/dev/kfd" device file to establish a communication
+  path to the kernel.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtOpenKFDCtx(
+    HsaKFDContext **pCtx   //IN/OUT
+    );
+
+/**
+  The context-aware version for closing the kfd device.
+
+  "Closes" the user-kernel communication path.
+
+  On Windows, the handle obtained by the hsaKmtOpenKFDCtx() function is closed;
+  no other communication with the kernel driver is possible after the successful
+  execution of the hsaKmtCloseKFDCtx() function. Depending on the failure reason,
+  the user-kernel communication path may or may not be still active.
+
+  On Linux the function closes the "/dev/kfd" device file.
+  No further communication to the kernel driver is allowed until hsaKmtOpenKFDCtx()
+  function is called again.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtCloseKFDCtx( void );
+
+/**
+  The function takes a "snapshot" of the topology information within the KFD
+  to avoid any changes during the enumeration process.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAcquireSystemPropertiesCtx(
+    HsaKFDContext         *ctx,               //IN
+    HsaSystemProperties*  SystemProperties    //OUT
+    );
+
+/**
+  Releases the topology "snapshot" taken by hsaKmtAcquireSystemPropertiesCtx()
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtReleaseSystemPropertiesCtx(
+    HsaKFDContext         *ctx              //IN
+    );
+
+/**
+  Retrieves the discoverable sub-properties for a given HSA
+  node. The parameters returned allow the application or runtime to size the
+  management structures necessary to store the information.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetNodePropertiesCtx(
+    HsaKFDContext          *ctx,              //IN
+    HSAuint32              NodeId,            //IN
+    HsaNodeProperties*     NodeProperties     //OUT
+    );
+
+/**
+  Retrieves the memory properties of a specific HSA node.
+  The memory pointer passed as MemoryProperties is sized as
+  NumBanks * sizeof(HsaMemoryProperties). NumBanks is retrieved with the
+  hsaKmtGetNodePropertiesCtx() call.
+
+  Some of the data returned is optional. Not all implementations may return all
+  parameters in the MemoryProperties.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetNodeMemoryPropertiesCtx(
+    HsaKFDContext         *ctx,               //IN
+    HSAuint32             NodeId,             //IN
+    HSAuint32             NumBanks,           //IN
+    HsaMemoryProperties*  MemoryProperties    //OUT
+    );
+
+/**
+  Retrieves the cache properties of a specific HSA node and processor ID.
+  ProcessorID refers to either a CPU core or a SIMD unit as enumerated earlier
+  via the hsaKmtGetNodePropertiesCtx() call.
+  The memory pointer passed as CacheProperties is sized as
+  NumCaches * sizeof(HsaCacheProperties). NumCaches is retrieved with the
+  hsaKmtGetNodePropertiesCtx() call.
+
+  The data returned is optional. Not all implementations may return all
+  parameters in the CacheProperties.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetNodeCachePropertiesCtx(
+    HsaKFDContext       *ctx,           //IN
+    HSAuint32           NodeId,         //IN
+    HSAuint32           ProcessorId,    //IN
+    HSAuint32           NumCaches,      //IN
+    HsaCacheProperties* CacheProperties //OUT
+    );
+
+/**
+  Retrieves the HSA IO affinity properties of a specific HSA node.
+  The memory pointer passed as IoLinkProperties is sized as
+  NumIoLinks * sizeof(HsaIoLinkProperties). NumIoLinks is retrieved with the
+  hsaKmtGetNodePropertiesCtx() call.
+
+  The data returned is optional. Not all implementations may return all
+  parameters in the IoLinkProperties.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetNodeIoLinkPropertiesCtx(
+    HsaKFDContext        *ctx,              //IN
+    HSAuint32            NodeId,            //IN
+    HSAuint32            NumIoLinks,        //IN
+    HsaIoLinkProperties* IoLinkProperties   //OUT
+    );
+
+
+/**
+  Creates an operating system event associated with a HSA event ID
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtCreateEventCtx(
+    HsaKFDContext       *ctx,           //IN
+    HsaEventDescriptor* EventDesc,      //IN
+    bool                ManualReset,    //IN
+    bool                IsSignaled,     //IN
+    HsaEvent**          Event           //OUT
+    );
+
+/**
+  Destroys an operating system event associated with a HSA event ID
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDestroyEventCtx(
+    HsaKFDContext       *ctx,    //IN
+    HsaEvent*           Event    //IN
+    );
+
+/**
+  Sets the specified event object to the signaled state
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetEventCtx(
+    HsaKFDContext       *ctx,    //IN
+    HsaEvent*           Event    //IN
+    );
+
+/**
+  Sets the specified event object to the non-signaled state
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtResetEventCtx(
+    HsaKFDContext       *ctx,    //IN
+    HsaEvent*           Event    //IN
+    );
+
+/**
+  Queries the state of the specified event object
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueryEventStateCtx(
+    HsaKFDContext       *ctx,    //IN
+    HsaEvent*           Event    //IN
+    );
+
+/**
+  Checks the current state of the event object. If the object's state is
+  nonsignaled, the calling thread enters the wait state.
+
+ The function returns when one of the following occurs:
+- The specified event object is in the signaled state.
+- The time-out interval elapses.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtWaitOnEventCtx(
+    HsaKFDContext       *ctx,           //IN
+    HsaEvent*           Event,          //IN
+    HSAuint32           Milliseconds    //IN
+    );
+
+/**
+  Checks the current state of the event object. If the object's state is
+  nonsignaled, the calling thread enters the wait state. event_age can
+  help avoid race conditions.
+
+ The function returns when one of the following occurs:
+- The specified event object is in the signaled state.
+- The time-out interval elapses.
+- Tracking event age
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtWaitOnEvent_ExtCtx(
+    HsaKFDContext       *ctx,           //IN
+    HsaEvent*           Event,          //IN
+    HSAuint32           Milliseconds,   //IN
+    uint64_t            *event_age      //IN/OUT
+    );
+
+/**
+  Checks the current state of multiple event objects.
+
+ The function returns when one of the following occurs:
+- Either any one or all of the specified objects are in the signaled state
+  - if "WaitOnAll" is "true" the function returns when the state of all
+    objects in array is signaled
+  - if "WaitOnAll" is "false" the function returns when the state of any
+    one of the objects is set to signaled
+- The time-out interval elapses.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtWaitOnMultipleEventsCtx(
+    HsaKFDContext       *ctx,           //IN
+    HsaEvent*           Events[],       //IN
+    HSAuint32           NumEvents,      //IN
+    bool                WaitOnAll,      //IN
+    HSAuint32           Milliseconds    //IN
+    );
+
+/**
+  Checks the current state of multiple event objects.
+  event_age can help avoid race conditions.
+
+ The function returns when one of the following occurs:
+- Either any one or all of the specified objects are in the signaled state
+  - if "WaitOnAll" is "true" the function returns when the state of all
+    objects in array is signaled
+  - if "WaitOnAll" is "false" the function returns when the state of any
+    one of the objects is set to signaled
+- The time-out interval elapses.
+- Tracking event age
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtWaitOnMultipleEvents_ExtCtx(
+    HsaKFDContext       *ctx,           //IN
+    HsaEvent*           Events[],       //IN
+    HSAuint32           NumEvents,      //IN
+    bool                WaitOnAll,      //IN
+    HSAuint32           Milliseconds,   //IN
+    uint64_t            *event_age      //IN/OUT
+    );
+
+/**
+  Creates a GPU queue with user-mode access rights
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtCreateQueueCtx(
+    HsaKFDContext       *ctx,             //IN
+    HSAuint32           NodeId,           //IN
+    HSA_QUEUE_TYPE      Type,             //IN
+    HSAuint32           QueuePercentage,  //IN
+    HSA_QUEUE_PRIORITY  Priority,         //IN
+    void*               QueueAddress,     //IN
+    HSAuint64           QueueSizeInBytes, //IN
+    HsaEvent*           Event,            //IN
+    HsaQueueResource*   QueueResource     //OUT
+    );
+
+/**
+  Creates a GPU queue with user-mode access rights
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtCreateQueueExtCtx(
+    HsaKFDContext       *ctx,             //IN
+    HSAuint32           NodeId,           //IN
+    HSA_QUEUE_TYPE      Type,             //IN
+    HSAuint32           QueuePercentage,  //IN
+    HSA_QUEUE_PRIORITY  Priority,         //IN
+    HSAuint32           SdmaEngineId,     //IN
+    void*               QueueAddress,     //IN
+    HSAuint64           QueueSizeInBytes, //IN
+    HsaEvent*           Event,            //IN
+    HsaQueueResource*   QueueResource     //OUT
+    );
+
+/**
+  Updates a queue
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtUpdateQueueCtx(
+    HsaKFDContext       *ctx,             //IN
+    HSA_QUEUEID         QueueId,          //IN
+    HSAuint32           QueuePercentage,  //IN
+    HSA_QUEUE_PRIORITY  Priority,         //IN
+    void*               QueueAddress,     //IN
+    HSAuint64           QueueSize,        //IN
+    HsaEvent*           Event             //IN
+    );
+
+/**
+  Destroys a queue
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDestroyQueueCtx(
+    HsaKFDContext       *ctx,           //IN
+    HSA_QUEUEID         QueueId         //IN
+    );
+
+/**
+  Set cu mask for a queue
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetQueueCUMaskCtx(
+    HsaKFDContext       *ctx,           //IN
+    HSA_QUEUEID         QueueId,        //IN
+    HSAuint32           CUMaskCount,    //IN
+    HSAuint32*          QueueCUMask     //IN
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetQueueInfoCtx(
+    HsaKFDContext       *ctx,           //IN
+    HSA_QUEUEID         QueueId,        //IN
+    HsaQueueInfo        *QueueInfo      //IN
+    );
+
+/**
+  Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetMemoryPolicyCtx(
+    HsaKFDContext      *ctx,                      //IN
+    HSAuint32          Node,                      //IN
+    HSAuint32          DefaultPolicy,             //IN
+    HSAuint32          AlternatePolicy,           //IN
+    void*              MemoryAddressAlternate,    //IN (page-aligned)
+    HSAuint64          MemorySizeInBytes          //IN (page-aligned)
+    );
+
+/**
+  Allocates a memory buffer that may be accessed by the GPU
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAllocMemoryCtx(
+    HsaKFDContext     *ctx,                   //IN
+    HSAuint32         PreferredNode,          //IN
+    HSAuint64         SizeInBytes,            //IN  (multiple of page size)
+    HsaMemFlags       MemFlags,               //IN
+    void**            MemoryAddress           //IN/OUT (page-aligned)
+    );
+
+/**
+  Allocates a memory buffer with specific alignment that may be accessed by the GPU
+  If Alignment is 0, the smallest possible alignment will be used
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAllocMemoryAlignCtx(
+    HsaKFDContext     *ctx,                  //IN
+    HSAuint32         PreferredNode,          //IN
+    HSAuint64         SizeInBytes,            //IN  (multiple of page size)
+    HSAuint64         Alignment,              //IN  (power of 2 and >= page size)
+    HsaMemFlags       MemFlags,               //IN
+    void**            MemoryAddress           //IN/OUT (page-aligned)
+    );
+
+/**
+  Frees a memory buffer
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtFreeMemoryCtx(
+    HsaKFDContext     *ctx,                 //IN
+    void*             MemoryAddress,        //IN (page-aligned)
+    HSAuint64         SizeInBytes           //IN
+    );
+
+/**
+  Inquires memory available for allocation as a memory buffer
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAvailableMemoryCtx(
+    HsaKFDContext     *ctx,                //IN
+    HSAuint32         Node,                //IN
+    HSAuint64         *AvailableBytes      //OUT
+    );
+
+/**
+  Registers with KFD a memory buffer that may be accessed by the GPU
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterMemoryCtx(
+    HsaKFDContext     *ctx,               //IN
+    void*             MemoryAddress,      //IN (cache-aligned)
+    HSAuint64         MemorySizeInBytes   //IN (cache-aligned)
+    );
+
+
+/**
+  Registers with KFD a memory buffer that may be accessed by specific GPUs
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterMemoryToNodesCtx(
+    HsaKFDContext    *ctx,               //IN
+    void             *MemoryAddress,     //IN (cache-aligned)
+    HSAuint64        MemorySizeInBytes,  //IN (cache-aligned)
+    HSAuint64        NumberOfNodes,      //IN
+    HSAuint32*       NodeArray           //IN
+    );
+
+
+/**
+  Registers with KFD a memory buffer with memory attributes
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterMemoryWithFlagsCtx(
+    HsaKFDContext    *ctx,                //IN
+    void             *MemoryAddress,      //IN (cache-aligned)
+    HSAuint64        MemorySizeInBytes,   //IN (cache-aligned)
+    HsaMemFlags      MemFlags             //IN
+    );
+
+/**
+  Registers with KFD a graphics buffer and returns graphics metadata
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterGraphicsHandleToNodesCtx(
+    HsaKFDContext             *ctx,                         //IN
+    HSAuint64                 GraphicsResourceHandle,       //IN
+    HsaGraphicsResourceInfo   *GraphicsResourceInfo,        //OUT
+    HSAuint64                 NumberOfNodes,                //IN
+    HSAuint32*                NodeArray                     //IN
+    );
+
+/**
+  Similar to hsaKmtRegisterGraphicsHandleToNodes but provides registration
+  options via RegisterFlags.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterGraphicsHandleToNodesExtCtx(
+    HsaKFDContext           *ctx,                         //IN
+    HSAuint64               GraphicsResourceHandle,       //IN
+    HsaGraphicsResourceInfo *GraphicsResourceInfo,        //OUT
+    HSAuint64               NumberOfNodes,                //IN
+    HSAuint32*              NodeArray,                    //IN
+    HSA_REGISTER_MEM_FLAGS  RegisterFlags                 //IN
+    );
+
+/**
+ * Export a dmabuf handle and offset for a given memory address
+ *
+ * Validates that @MemoryAddress belongs to a valid allocation and that the
+ * @MemorySizeInBytes doesn't exceed the end of that allocation. Returns a
+ * dmabuf fd of the allocation and the offset of MemoryAddress within that
+ * allocation. The memory will remain allocated even after the allocation is
+ * freed by hsaKmtFreeMemory for as long as a dmabuf fd remains open or any
+ * importer of that fd maintains an active reference to the memory.
+ */
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtExportDMABufHandleCtx(
+    HsaKFDContext     *ctx,               //IN
+    void              *MemoryAddress,     //IN
+    HSAuint64         MemorySizeInBytes,  //IN
+    int               *DMABufFd,          //OUT
+    HSAuint64         *Offset             //OUT
+    );
+
+/**
+ Export a memory buffer for sharing with other processes
+
+ NOTE: for the current revision of the thunk spec, SizeInBytes
+ must match whole allocation.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtShareMemoryCtx(
+    HsaKFDContext         *ctx,               //IN
+    void                  *MemoryAddress,     //IN
+    HSAuint64             SizeInBytes,        //IN
+    HsaSharedMemoryHandle *SharedMemoryHandle //OUT
+);
+
+/**
+ Register shared memory handle
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterSharedHandleCtx(
+    HsaKFDContext               *ctx,                //IN
+    const HsaSharedMemoryHandle *SharedMemoryHandle, //IN
+    void                        **MemoryAddress,     //OUT
+    HSAuint64                   *SizeInBytes         //OUT
+);
+
+/**
+ Register shared memory handle to specific nodes only
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRegisterSharedHandleToNodesCtx(
+    HsaKFDContext               *ctx,                //IN
+    const HsaSharedMemoryHandle *SharedMemoryHandle, //IN
+    void                        **MemoryAddress,     //OUT
+    HSAuint64                   *SizeInBytes,        //OUT
+    HSAuint64                   NumberOfNodes,       //IN
+    HSAuint32*                  NodeArray            //IN
+);
+
+/**
+  Unregisters with KFD a memory buffer
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDeregisterMemoryCtx(
+    HsaKFDContext     *ctx,           //IN
+    void*             MemoryAddress   //IN
+    );
+
+/**
+  Ensures that the memory is resident and can be accessed by GPU
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtMapMemoryToGPUCtx(
+    HsaKFDContext     *ctx,              //IN
+    void*             MemoryAddress,     //IN (page-aligned)
+    HSAuint64         MemorySizeInBytes, //IN (page-aligned)
+    HSAuint64*        AlternateVAGPU     //OUT (page-aligned)
+    );
+
+/**
+  Ensures that the memory is resident and can be accessed by GPUs
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtMapMemoryToGPUNodesCtx(
+    HsaKFDContext     *ctx,                  //IN
+    void*             MemoryAddress,         //IN (page-aligned)
+    HSAuint64         MemorySizeInBytes,     //IN (page-aligned)
+    HSAuint64*        AlternateVAGPU,        //OUT (page-aligned)
+    HsaMemMapFlags    MemMapFlags,           //IN
+    HSAuint64         NumberOfNodes,         //IN
+    HSAuint32*        NodeArray              //IN
+    );
+
+/**
+  Releases the residency of the memory
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtUnmapMemoryToGPUCtx(
+    HsaKFDContext     *ctx,              //IN
+    void*             MemoryAddress      //IN (page-aligned)
+    );
+
+/**
+  Stub for Unmap Graphic Handle
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtUnmapGraphicHandleCtx(
+    HsaKFDContext      *ctx,                  //IN
+    HSAuint32          NodeId,                //IN
+    HSAuint64          FlatMemoryAddress,     //IN
+    HSAuint64          SizeInBytes            //IN
+    );
+
+/**
+ * Get an AMDGPU device handle for a GPU node
+ */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetAMDGPUDeviceHandleCtx(
+    HsaKFDContext           *ctx,          //IN
+    HSAuint32               NodeId,        //IN
+    HsaAMDGPUDeviceHandle   *DeviceHandle  //OUT
+    );
+
+/**
+  Sets trap handler and trap buffer to be used for all queues
+  associated with the specified NodeId within this process context
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetTrapHandlerCtx(
+    HsaKFDContext      *ctx,                     //IN
+    HSAuint32          NodeId,                   //IN
+    void*              TrapHandlerBaseAddress,   //IN
+    HSAuint64          TrapHandlerSizeInBytes,   //IN
+    void*              TrapBufferBaseAddress,    //IN
+    HSAuint64          TrapBufferSizeInBytes     //IN
+    );
+
+/**
+  Gets image tile configuration.
+ */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetTileConfigCtx(
+    HsaKFDContext       *ctx,       //IN
+    HSAuint32           NodeId,     //IN
+    HsaGpuTileConfig    *config     //IN/OUT
+    );
+
+/**
+  Returns information about pointers
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtQueryPointerInfoCtx(
+    HsaKFDContext     *ctx,            //IN
+    const void        *Pointer,        //IN
+    HsaPointerInfo    *PointerInfo     //OUT
+    );
+
+/**
+  Associates user data with a memory allocation
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetMemoryUserDataCtx(
+    HsaKFDContext    *ctx,       //IN
+    const void *     Pointer,    //IN
+    void *           UserData    //IN
+    );
+
+/**
+  Allocate GWS resource for a queue
+ */
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAllocQueueGWSCtx(
+    HsaKFDContext      *ctx,           //IN
+    HSA_QUEUEID        QueueId,        //IN
+    HSAuint32          nGWS,           //IN
+    HSAuint32          *firstGWS       //OUT
+    );
+
+/* Helper functions for calling KFD SVM ioctl */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSVMSetAttrCtx(
+    HsaKFDContext     *ctx,         //IN
+    void              *start_addr,  //IN: Start of the virtual address range (page-aligned)
+    HSAuint64         size,         //IN: size (page-aligned)
+    unsigned int      nattr,        //IN: number of attributes
+    HSA_SVM_ATTRIBUTE *attrs        //IN: array of attributes
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSVMGetAttrCtx(
+    HsaKFDContext     *ctx,         //IN
+    void              *start_addr,  //IN: Start of the virtual address range (page-aligned)
+    HSAuint64         size,         //IN: size (page aligned)
+    unsigned int      nattr,        //IN: number of attributes
+    HSA_SVM_ATTRIBUTE *attrs        //IN/OUT: array of attributes
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSetXNACKModeCtx(
+    HsaKFDContext     *ctx,       //IN
+    HSAint32          enable      //IN: enable/disable XNACK mode.
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetXNACKModeCtx(
+    HsaKFDContext     *ctx,       //IN
+    HSAint32          *enable     //OUT: returns XNACK value.
+    );
+
+/**
+   Open anonymous file handle to enable events and read SMI events.
+
+   To enable events, write a 64-bit event mask to fd, using the event enums as bit indexes.
+   For example, the mask (HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1) enables all events.
+
+   Reading events from fd is non-blocking; use poll with a timeout to check whether an event is available.
+   Events are dropped if the kernel event FIFO is full.
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtOpenSMICtx(
+    HsaKFDContext     *ctx,      //IN
+    HSAuint32         NodeId,    //IN: GPU node_id to receive the SMI event from
+    int               *fd        //OUT: anonymous file handle
+    );
+
+/**
+   If this is GPU Mapped memory, remap the first page at this address to be normal system memory
+
+   This is used in ASAN mode to remap the first page of device memory to share host ASAN logic.
+   This function is only supported when libhsakmt is compiled in ASAN mode.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtReplaceAsanHeaderPageCtx(
+    HsaKFDContext     *ctx,      //IN
+    void              *addr      //IN: Start of the virtual address page
+    );
+
+/**
+   If this is GPU Mapped memory, remap the first page back to the original GPU memory
+
+   This is used in ASAN mode to remap the first page back to its original mapping.
+   This function is only supported when libhsakmt is compiled in ASAN mode.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtReturnAsanHeaderPageCtx(
+    HsaKFDContext     *ctx,       //IN
+    void              *addr       //IN: Start of the virtual address page
+    );
+
+#ifdef __cplusplus
+}   //extern "C"
+#endif
+
+#endif //_HSAKMTCTX_H_
diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
index c065c3f0a3..7b1c69b76b 100644
--- a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
+++ b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
@@ -145,8 +145,8 @@ void model_init_env_vars(void)
 			abort();
 #endif
 		}
-		assert(hsakmt_kfd_fd < 0);
-		hsakmt_kfd_fd = fd;
+		assert(hsakmt_primary_kfd_ctx.fd < 0);
+		hsakmt_kfdcontext_init_context(fd, &hsakmt_primary_kfd_ctx);
 		pthread_condattr_t condattr;
 		pthread_condattr_init(&condattr);
 		pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC);
@@ -193,7 +193,7 @@ static uint64_t allocate_from_memfd(uint64_t size, uint64_t align)
 	model_memfd_size = (model_memfd_size + align - 1) & ~(align - 1);
 	uint64_t offset = model_memfd_size;
 	model_memfd_size += size;
-	int ret = ftruncate(hsakmt_kfd_fd, model_memfd_size);
+	int ret = ftruncate(hsakmt_primary_kfd_ctx.fd, model_memfd_size);
 	if (ret < 0)
 	{
 		fprintf(stderr, "model: ftruncate on memfd failed\n");
@@ -269,7 +269,7 @@ void model_init(void)
 	HSAKMT_STATUS result;
 	HsaSystemProperties props;
 	/* Read the topology to determine nodes. */
-	result = hsakmt_topology_sysfs_get_system_props(&props);
+	result = hsakmt_topology_sysfs_get_system_props(&hsakmt_primary_kfd_ctx, &props);
 	if (result != HSAKMT_STATUS_SUCCESS)
 	{
 		fprintf(stderr, "model: Failed to parse topology\n");
@@ -503,7 +503,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg)
 		// unclear whether the current implementation causes kernel data
 		// structures to grow. But in practice, it almost certainly never
 		// matters.
-		int ret = fallocate(hsakmt_kfd_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		int ret = fallocate(hsakmt_primary_kfd_ctx.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 							mem_data->file_offset, mem_data->size);
 		if (ret != 0)
 		{
@@ -539,7 +539,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg)
 			pr_debug("MODEL IOCTL: AMDKFD_IOC_MAP_MEMORY_TO_GPU: VA: %lx : Size: %lu, Flags: %x\n", mem_data->va_addr, mem_data->size, mem_data->flags);
 			void *ret = mmap(VOID_PTR_ADD(model_nodes[node_id].aperture, mem_data->va_addr),
 							 mem_data->size, prot,
-							 MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd, mem_data->file_offset);
+							 MAP_SHARED | MAP_FIXED, hsakmt_primary_kfd_ctx.fd, mem_data->file_offset);
 			if (ret == MAP_FAILED)
 			{
 				fprintf(stderr, "model: mmap failed\n");
@@ -767,7 +767,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg)
 			model_functions->register_queue(model_nodes[node_id].model, &info);
 		model_queues[queue_id].node_id = node_id;
 		args->queue_id = queue_id;
-		// Note that strictly speaking, this is the offset into the hsakmt_kfd_fd
+		// Note that strictly speaking, this is the offset into the hsakmt_primary_kfd_ctx.fd
 		// file, not the DRM fd (but they are the same in our case).
 		args->doorbell_offset = model_nodes[node_id].doorbell_offset + 8 * queue_id;
 		return 0;
diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c
new file mode 100644
index 0000000000..981c53eb4a
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfdcontext.h"
+#include "libhsakmt.h"
+#include <stdlib.h>
+#include <stddef.h>
+#include <assert.h>
+#include <stdio.h>
+#include <errno.h>
+
+/* Bind @fd to @ctx and reset all three sub-context pointers to NULL. */
+void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx)
+{
+    assert(fd >= 0);
+    assert(ctx);
+
+    /* Members left out of the initializer are zeroed, i.e. NULL pointers. */
+    HsaKFDContext fresh = { .fd = fd };
+    *ctx = fresh;
+}
+
+/*
+ * Free the queue, FMM and event sub-contexts of @ctx, reset the pointers to
+ * NULL and set fd to -1. The descriptor itself is not closed here. A NULL
+ * @ctx is ignored.
+ */
+void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx)
+{
+    if (!ctx)
+        return;
+
+    /* free(NULL) is a no-op, so no per-pointer guard is needed. */
+    free(ctx->queue_context);
+    ctx->queue_context = NULL;
+    free(ctx->fmm_context);
+    ctx->fmm_context = NULL;
+    free(ctx->event_context);
+    ctx->event_context = NULL;
+    ctx->fd = -1;
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h
new file mode 100644
index 0000000000..8053e74f7b
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _KFDCONTEXT_H_
+#define _KFDCONTEXT_H_
+
+#include <stdint.h>
+
+struct hsa_kfd_queue_context;
+struct hsa_kfd_fmm_context;
+struct hsa_kfd_event_context;
+
+/*
+ * HsaKFDContext
+ *
+ * Represents the execution context for a connection to the Kernel Fusion Driver (KFD).
+ *
+ * This structure encapsulates all state required to manage a KFD session, including:
+ *   - The file descriptor associated with the open KFD device
+ *   - Related resources tied to this file descriptor
+ *
+ * Multiple HsaKFDContext instances can coexist simultaneously, each maintaining its own
+ * independent set of resources. These contexts are fully isolated from one another and
+ * must not have their resources mixed. For example, memory resources created in
+ * context A cannot be used in context B directly. If resources need to be shared between
+ * contexts, they must be explicitly exported and imported using the appropriate APIs.
+ */
+typedef struct _HsaKFDContext
+{
+    /* KFD device file descriptor; hsakmt_kfdcontext_clear_context() sets it to -1 */
+    int fd;
+
+    /* User-queue state: NULL after hsakmt_kfdcontext_init_context(), freed by hsakmt_kfdcontext_clear_context() */
+    struct hsa_kfd_queue_context *queue_context;
+
+    /* Memory-management state: NULL after hsakmt_kfdcontext_init_context(), freed by hsakmt_kfdcontext_clear_context() */
+    struct hsa_kfd_fmm_context *fmm_context;
+
+    /* Event state: NULL after hsakmt_kfdcontext_init_context(), freed by hsakmt_kfdcontext_clear_context() */
+    struct hsa_kfd_event_context *event_context;
+} HsaKFDContext;
+
+// Initialize a pre-allocated HsaKFDContext with the given file descriptor
+void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx);
+// Free the queue/FMM/event sub-contexts and set fd to -1; does not close the fd
+void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx);
+
+struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx);
+struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx);
+struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx);
+
+#endif /* _KFDCONTEXT_H_ */
diff --git a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
index 6f66d20bf4..4ac445c025 100644
--- a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
+++ b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
@@ -28,11 +28,12 @@
 
 #include "hsakmt/linux/kfd_ioctl.h"
 #include "hsakmt/hsakmt.h"
+#include "kfdcontext.h"
+#include "hsakmtctx.h"
 #include <pthread.h>
 #include <stdint.h>
 #include <limits.h>
 
-extern int hsakmt_kfd_fd;
 extern int hsakmt_udmabuf_dev_fd;
 extern unsigned long hsakmt_kfd_open_count;
 extern bool hsakmt_forked;
@@ -42,6 +43,7 @@ extern bool hsakmt_is_svm_api_supported;
 extern int hsakmt_zfb_support;
 
 extern HsaVersionInfo hsakmt_kfd_version_info;
+extern HsaKFDContext hsakmt_primary_kfd_ctx;
 
 #undef HSAKMTAPI
 #define HSAKMTAPI __attribute__((visibility ("default")))
@@ -196,7 +198,7 @@ int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id);
 HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray);
 
-HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props);
+HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, HsaSystemProperties *props);
 HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
 				      HsaNodeProperties *NodeProperties);
 HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
@@ -207,13 +209,16 @@ bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId);
 
 HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags);
 
-void* hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
+void* hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx,
+					   uint32_t size, uint32_t align,
 				       uint32_t gpu_id,
 				       uint32_t NodeId, bool NonPaged,
 				       bool DeviceLocal, bool Uncached);
-void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
-HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes);
-void hsakmt_destroy_process_doorbells(void);
+void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx,
+				       void *addr, uint32_t size, uint32_t align);
+HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx,
+					   unsigned int NumNodes);
+void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx);
 HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes);
 void hsakmt_destroy_device_debugging_memory(void);
 bool hsakmt_debug_get_reg_status(uint32_t node_id);
@@ -239,10 +244,10 @@ extern int hsakmt_ioctl(int fd, unsigned long request, void *arg);
 
 #define POWER_OF_2(x) ((x && (!(x & (x - 1)))) ? 1 : 0)
 
-void hsakmt_clear_events_page(void);
-void hsakmt_fmm_clear_all_mem(void);
-void hsakmt_fmm_clear_all_aperture(void);
-void hsakmt_clear_process_doorbells(void);
+void hsakmt_clear_events_page(HsaKFDContext *ctx);
+void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx);
+void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx);
+void hsakmt_clear_process_doorbells(HsaKFDContext *ctx);
 uint32_t hsakmt_get_num_sysfs_nodes(void);
 
 bool hsakmt_is_forked_child(void);
diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c
index fb317f71d6..db71264f6d 100644
--- a/projects/rocr-runtime/libhsakmt/src/memory.c
+++ b/projects/rocr-runtime/libhsakmt/src/memory.c
@@ -34,7 +34,8 @@
 #include <fcntl.h>
 #include "fmm.h"
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicyCtx(HsaKFDContext *ctx,
+						  HSAuint32 Node,
 					      HSAuint32 DefaultPolicy,
 					      HSAuint32 AlternatePolicy,
 					      void *MemoryAddressAlternate,
@@ -86,7 +87,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
 	args.alternate_aperture_base = (uintptr_t) MemoryAddressAlternate;
 	args.alternate_aperture_size = MemorySizeInBytes;
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
 
 	return (err == -1) ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS;
 }
@@ -104,15 +105,17 @@ HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags)
 	}
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryCtx(HsaKFDContext *ctx,
+					  HSAuint32 PreferredNode,
 					  HSAuint64 SizeInBytes,
 					  HsaMemFlags MemFlags,
 					  void **MemoryAddress)
 {
-	return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
+	return hsaKmtAllocMemoryAlignCtx(ctx, PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlignCtx(HsaKFDContext *ctx,
+					  HSAuint32 PreferredNode,
 					  HSAuint64 SizeInBytes,
 					  HSAuint64 Alignment,
 					  HsaMemFlags MemFlags,
@@ -160,7 +163,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
 			return HSAKMT_STATUS_NOT_IMPLEMENTED;
 		}
 
-		*MemoryAddress = hsakmt_fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);
+		*MemoryAddress = hsakmt_fmm_allocate_scratch(ctx, gpu_id, *MemoryAddress, SizeInBytes);
 
 		if (!(*MemoryAddress)) {
 			pr_err("[%s] failed to allocate %lu bytes from scratch\n",
@@ -183,7 +186,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
 		if (hsakmt_zfb_support && gpu_id && MemFlags.ui32.NonPaged == 1)
 			MemFlags.ui32.CoarseGrain = 1;
 
-		*MemoryAddress = hsakmt_fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
+		*MemoryAddress = hsakmt_fmm_allocate_host(ctx, gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
 						   *MemoryAddress, SizeInBytes, Alignment, MemFlags);
 
 		if (!(*MemoryAddress)) {
@@ -204,7 +207,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
 
-	*MemoryAddress = hsakmt_fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress,
+	*MemoryAddress = hsakmt_fmm_allocate_device(ctx, gpu_id, PreferredNode, *MemoryAddress,
 					     SizeInBytes, Alignment, MemFlags);
 
 	if (!(*MemoryAddress)) {
@@ -218,7 +221,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
 
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemoryCtx(HsaKFDContext *ctx,
+					 void *MemoryAddress,
 					 HSAuint64 SizeInBytes)
 {
 	CHECK_KFD_OPEN();
@@ -230,11 +234,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
 		return HSAKMT_STATUS_ERROR;
 	}
 
-	return hsakmt_fmm_release(MemoryAddress);
+	return hsakmt_fmm_release(ctx, MemoryAddress);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
-					      HSAuint64 *AvailableBytes)
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemoryCtx(HsaKFDContext *ctx,
+						  HSAuint32 Node,
+						  HSAuint64 *AvailableBytes)
 {
 	struct kfd_ioctl_get_available_memory_args args = {};
 	HSAKMT_STATUS result;
@@ -250,14 +255,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
 		return result;
 	}
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args))
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	*AvailableBytes = args.available;
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryCtx(HsaKFDContext *ctx,
+					     void *MemoryAddress,
 					     HSAuint64 MemorySizeInBytes)
 {
 	CHECK_KFD_OPEN();
@@ -271,11 +277,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
 	HsaMemFlags flags;
 	flags.ui32.CoarseGrain = 1;
 	flags.ui32.ExtendedCoherent = 0;
-	return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
+	return hsakmt_fmm_register_memory(ctx,
+				   MemoryAddress, MemorySizeInBytes,
 				   NULL, 0, flags);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodesCtx(HsaKFDContext *ctx,
+						    void *MemoryAddress,
 						    HSAuint64 MemorySizeInBytes,
 						    HSAuint64 NumberOfNodes,
 						    HSAuint32 *NodeArray)
@@ -299,7 +307,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
 		flags.ui32.CoarseGrain = 1;
 		flags.ui32.ExtendedCoherent = 0;
 
-		ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
+		ret = hsakmt_fmm_register_memory(ctx,
+					  MemoryAddress, MemorySizeInBytes,
 					  gpu_id_array,
 					  NumberOfNodes*sizeof(uint32_t),
 					  flags);
@@ -310,7 +319,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
 	return ret;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlagsCtx(HsaKFDContext *ctx,
+						    void *MemoryAddress,
 						    HSAuint64 MemorySizeInBytes,
 						    HsaMemFlags MemFlags)
 {
@@ -331,21 +341,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
 		/* TODO: support mixed APU and dGPU configurations */
 		return HSAKMT_STATUS_NOT_SUPPORTED;
 
-	ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
+	ret = hsakmt_fmm_register_memory(ctx,
+		MemoryAddress, MemorySizeInBytes,
 		NULL, 0, MemFlags);
 
 	return ret;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesCtx(HsaKFDContext *ctx,
+							    HSAuint64 GraphicsResourceHandle,
 							    HsaGraphicsResourceInfo *GraphicsResourceInfo,
 							    HSAuint64 NumberOfNodes,
 							    HSAuint32 *NodeArray)
 {
        HSA_REGISTER_MEM_FLAGS regFlags;
        regFlags.Value = 0;
-        
-       return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
+
+       return hsaKmtRegisterGraphicsHandleToNodesExtCtx(ctx,
+						     GraphicsResourceHandle,
 						     GraphicsResourceInfo,
 						     NumberOfNodes,
 						     NodeArray,
@@ -353,7 +366,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsRe
 
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExtCtx(HsaKFDContext *ctx,
+							       HSAuint64 GraphicsResourceHandle,
 							       HsaGraphicsResourceInfo *GraphicsResourceInfo,
 							       HSAuint64 NumberOfNodes,
 							       HSAuint32 *NodeArray,
@@ -371,7 +385,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic
 	}
 
 	if (ret == HSAKMT_STATUS_SUCCESS) {
-		ret = hsakmt_fmm_register_graphics_handle(
+		ret = hsakmt_fmm_register_graphics_handle(ctx,
 			GraphicsResourceHandle, GraphicsResourceInfo,
 			gpu_id_array, NumberOfNodes * sizeof(uint32_t), RegisterFlags);
 		if (ret != HSAKMT_STATUS_SUCCESS)
@@ -381,7 +395,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic
 	return ret;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandleCtx(HsaKFDContext *ctx,
+						 void *MemoryAddress,
 						 HSAuint64 MemorySizeInBytes,
 						 int *DMABufFd,
 						 HSAuint64 *Offset)
@@ -391,11 +406,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
 
 	pr_debug("[%s] address %p\n", __func__, MemoryAddress);
 
-	return hsakmt_fmm_export_dma_buf_fd(MemoryAddress, MemorySizeInBytes,
+	return hsakmt_fmm_export_dma_buf_fd(ctx,
+				     MemoryAddress, MemorySizeInBytes,
 				     DMABufFd, Offset);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemoryCtx(HsaKFDContext *ctx,
+					  void *MemoryAddress,
 					  HSAuint64 SizeInBytes,
 					  HsaSharedMemoryHandle *SharedMemoryHandle)
 {
@@ -406,25 +423,28 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress,
 	if (!SharedMemoryHandle)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	return hsakmt_fmm_share_memory(MemoryAddress, SizeInBytes, SharedMemoryHandle);
+	return hsakmt_fmm_share_memory(ctx, MemoryAddress, SizeInBytes, SharedMemoryHandle);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
-						   void **MemoryAddress,
-						   HSAuint64 *SizeInBytes)
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleCtx(HsaKFDContext *ctx,
+					  const HsaSharedMemoryHandle *SharedMemoryHandle,
+					  void **MemoryAddress,
+					  HSAuint64 *SizeInBytes)
 {
 	CHECK_KFD_OPEN();
 
 	pr_debug("[%s] handle %p\n", __func__, SharedMemoryHandle);
 
-	return hsaKmtRegisterSharedHandleToNodes(SharedMemoryHandle,
+	return hsaKmtRegisterSharedHandleToNodesCtx(ctx,
+						 SharedMemoryHandle,
 						 MemoryAddress,
 						 SizeInBytes,
 						 0,
 						 NULL);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryHandle *SharedMemoryHandle,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodesCtx(HsaKFDContext *ctx,
+							  const HsaSharedMemoryHandle *SharedMemoryHandle,
 							  void **MemoryAddress,
 							  HSAuint64 *SizeInBytes,
 							  HSAuint64 NumberOfNodes,
@@ -447,7 +467,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryH
 			goto error;
 	}
 
-	ret = hsakmt_fmm_register_shared_memory(SharedMemoryHandle,
+	ret = hsakmt_fmm_register_shared_memory(ctx,
+					 SharedMemoryHandle,
 					 SizeInBytes,
 					 MemoryAddress,
 					 gpu_id_array,
@@ -487,17 +508,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
 	return HSAKMT_STATUS_NOT_IMPLEMENTED;
 }
 
-
-HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress)
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemoryCtx(HsaKFDContext *ctx, void *MemoryAddress)
 {
 	CHECK_KFD_OPEN();
 
 	pr_debug("[%s] address %p\n", __func__, MemoryAddress);
 
-	return hsakmt_fmm_deregister_memory(MemoryAddress);
+	return hsakmt_fmm_deregister_memory(ctx, MemoryAddress);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUCtx(HsaKFDContext *ctx,
+					     void *MemoryAddress,
 					     HSAuint64 MemorySizeInBytes,
 					     HSAuint64 *AlternateVAGPU)
 {
@@ -513,10 +534,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
 	if (AlternateVAGPU)
 		*AlternateVAGPU = 0;
 
-	return hsakmt_fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU);
+	return hsakmt_fmm_map_to_gpu(ctx, MemoryAddress, MemorySizeInBytes, AlternateVAGPU);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress,
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodesCtx(HsaKFDContext *ctx,
+						  void *MemoryAddress,
 						  HSAuint64 MemorySizeInBytes,
 						  HSAuint64 *AlternateVAGPU,
 						  HsaMemMapFlags MemMapFlags,
@@ -537,16 +559,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress,
 	}
 
 	if (!hsakmt_is_dgpu && NumberOfNodes == 1)
-		return hsaKmtMapMemoryToGPU(MemoryAddress,
-				MemorySizeInBytes,
-				AlternateVAGPU);
+		return hsaKmtMapMemoryToGPUCtx(ctx, MemoryAddress,
+					MemorySizeInBytes, AlternateVAGPU);
 
 	ret = hsakmt_validate_nodeid_array(&gpu_id_array,
 				NumberOfNodes, NodeArray);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
-	ret = hsakmt_fmm_map_to_gpu_nodes(MemoryAddress, MemorySizeInBytes,
+	ret = hsakmt_fmm_map_to_gpu_nodes(ctx, MemoryAddress, MemorySizeInBytes,
 		gpu_id_array, NumberOfNodes, AlternateVAGPU);
 
 	if (gpu_id_array)
@@ -555,7 +576,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress,
 	return ret;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress)
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPUCtx(HsaKFDContext *ctx, void *MemoryAddress)
 {
 	CHECK_KFD_OPEN();
 
@@ -567,7 +588,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress)
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	if (!hsakmt_fmm_unmap_from_gpu(MemoryAddress))
+	if (!hsakmt_fmm_unmap_from_gpu(ctx, MemoryAddress))
 		return HSAKMT_STATUS_SUCCESS;
 	else
 		return HSAKMT_STATUS_ERROR;
@@ -588,16 +609,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
 	return HSAKMT_STATUS_NOT_IMPLEMENTED;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandleCtx(HsaKFDContext *ctx,
+						 HSAuint32 NodeId,
 						 HSAuint64 FlatMemoryAddress,
 						 HSAuint64 SizeInBytes)
 {
-	CHECK_KFD_OPEN();
-
-	return hsaKmtUnmapMemoryToGPU(PORT_UINT64_TO_VPTR(FlatMemoryAddress));
+	return hsaKmtUnmapMemoryToGPUCtx(ctx, PORT_UINT64_TO_VPTR(FlatMemoryAddress));
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *config)
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfigCtx(HsaKFDContext *ctx,
+						 HSAuint32 NodeId, HsaGpuTileConfig *config)
 {
 	struct kfd_ioctl_get_tile_config_args args = {0};
 	uint32_t gpu_id;
@@ -623,7 +644,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *
 	args.num_tile_configs = config->NumTileConfigs;
 	args.num_macro_tile_configs = config->NumMacroTileConfigs;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_TILE_CONFIG, &args) != 0)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_TILE_CONFIG, &args) != 0)
 		return HSAKMT_STATUS_ERROR;
 
 	config->NumTileConfigs = args.num_tile_configs;
@@ -637,7 +658,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfoCtx(HsaKFDContext *ctx,
+					       const void *Pointer,
 					       HsaPointerInfo *PointerInfo)
 {
 	CHECK_KFD_OPEN();
@@ -646,47 +668,264 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
 
 	if (!PointerInfo)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
-	return hsakmt_fmm_get_mem_info(Pointer, PointerInfo);
+	return hsakmt_fmm_get_mem_info(ctx, Pointer, PointerInfo);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserDataCtx(HsaKFDContext *ctx,
+						const void *Pointer,
 						void *UserData)
 {
 	CHECK_KFD_OPEN();
 
 	pr_debug("[%s] pointer %p\n", __func__, Pointer);
 
-	return hsakmt_fmm_set_mem_user_data(Pointer, UserData);
+	return hsakmt_fmm_set_mem_user_data(ctx, Pointer, UserData);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr)
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPageCtx(HsaKFDContext *ctx, void *addr)
 {
 #ifdef SANITIZER_AMDGPU
 	pr_debug("[%s] address %p\n", __func__, addr);
 	CHECK_KFD_OPEN();
 
-	return hsakmt_fmm_replace_asan_header_page(addr);
+	return hsakmt_fmm_replace_asan_header_page(ctx, addr);
 #else
 	return HSAKMT_STATUS_NOT_SUPPORTED;
 #endif
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr)
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPageCtx(HsaKFDContext *ctx, void *addr)
 {
 #ifdef SANITIZER_AMDGPU
 	pr_debug("[%s] address %p\n", __func__, addr);
 	CHECK_KFD_OPEN();
 
-	return hsakmt_fmm_return_asan_header_page(addr);
+	return hsakmt_fmm_return_asan_header_page(ctx, addr);
 #else
 	return HSAKMT_STATUS_NOT_SUPPORTED;
 #endif
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle( HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandleCtx(HsaKFDContext *ctx,
+						HSAuint32 NodeId,
 						HsaAMDGPUDeviceHandle   *DeviceHandle)
 {
 	CHECK_KFD_OPEN();
 
-	return hsakmt_fmm_get_amdgpu_device_handle(NodeId, DeviceHandle);
+	return hsakmt_fmm_get_amdgpu_device_handle(ctx, NodeId, DeviceHandle);
+}
+
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
+					      HSAuint32 DefaultPolicy,
+					      HSAuint32 AlternatePolicy,
+					      void *MemoryAddressAlternate,
+					      HSAuint64 MemorySizeInBytes)
+{
+	return hsaKmtSetMemoryPolicyCtx(&hsakmt_primary_kfd_ctx, Node,
+					  DefaultPolicy, AlternatePolicy,
+					  MemoryAddressAlternate, MemorySizeInBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
+					  HSAuint64 SizeInBytes,
+					  HsaMemFlags MemFlags,
+					  void **MemoryAddress)
+{
+	return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
+					  HSAuint64 SizeInBytes,
+					  HSAuint64 Alignment,
+					  HsaMemFlags MemFlags,
+					  void **MemoryAddress)
+{
+	return hsaKmtAllocMemoryAlignCtx(&hsakmt_primary_kfd_ctx, PreferredNode,
+					  SizeInBytes, Alignment, MemFlags, MemoryAddress);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
+					  HSAuint64 SizeInBytes)
+{
+	return hsaKmtFreeMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, SizeInBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
+					  HSAuint64 *AvailableBytes)
+{
+	return hsaKmtAvailableMemoryCtx(&hsakmt_primary_kfd_ctx, Node, AvailableBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes)
+{
+	return hsaKmtRegisterMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, MemorySizeInBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes,
+					      HSAuint64 NumberOfNodes,
+					      HSAuint32 *NodeArray)
+{
+	return hsaKmtRegisterMemoryToNodesCtx(&hsakmt_primary_kfd_ctx,
+					      MemoryAddress, MemorySizeInBytes,
+					      NumberOfNodes, NodeArray);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
+					      HSAuint64 MemorySizeInBytes,
+					      HsaMemFlags MemFlags)
+{
+	return hsaKmtRegisterMemoryWithFlagsCtx(&hsakmt_primary_kfd_ctx,
+					      MemoryAddress, MemorySizeInBytes, MemFlags);
+}
+
+/* Legacy wrapper: same as the Ext variant with every register flag cleared. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
+							    HsaGraphicsResourceInfo *GraphicsResourceInfo,
+							    HSAuint64 NumberOfNodes,
+							    HSAuint32 *NodeArray)
+{
+	HSA_REGISTER_MEM_FLAGS no_flags;
+
+	no_flags.Value = 0;
+	return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
+						      GraphicsResourceInfo,
+						      NumberOfNodes,
+						      NodeArray,
+						      no_flags);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
+							       HsaGraphicsResourceInfo *GraphicsResourceInfo,
+							       HSAuint64 NumberOfNodes,
+							       HSAuint32 *NodeArray,
+							       HSA_REGISTER_MEM_FLAGS RegisterFlags)
+{
+	return hsaKmtRegisterGraphicsHandleToNodesExtCtx(&hsakmt_primary_kfd_ctx,
+							       GraphicsResourceHandle,
+							       GraphicsResourceInfo,
+							       NumberOfNodes,
+							       NodeArray,
+							       RegisterFlags);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
+						 HSAuint64 MemorySizeInBytes,
+						 int *DMABufFd,
+						 HSAuint64 *Offset)
+{
+	return hsaKmtExportDMABufHandleCtx(&hsakmt_primary_kfd_ctx,
+					     MemoryAddress, MemorySizeInBytes,
+					     DMABufFd, Offset);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress,
+					  HSAuint64 SizeInBytes,
+					  HsaSharedMemoryHandle *SharedMemoryHandle)
+{
+	return hsaKmtShareMemoryCtx(&hsakmt_primary_kfd_ctx,
+				     MemoryAddress, SizeInBytes, SharedMemoryHandle);
+}
+
+/*
+ * Legacy entry point kept for backward compatibility.
+ *
+ * Forwards to hsaKmtRegisterSharedHandleCtx() with the primary KFD context;
+ * the Ctx variant performs the KFD-open check and the debug trace, so they
+ * are not repeated here.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle(
+					  const HsaSharedMemoryHandle *SharedMemoryHandle,
+					  void **MemoryAddress,
+					  HSAuint64 *SizeInBytes)
+{
+	return hsaKmtRegisterSharedHandleCtx(&hsakmt_primary_kfd_ctx,
+					     SharedMemoryHandle, MemoryAddress, SizeInBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryHandle *SharedMemoryHandle,
+							  void **MemoryAddress,
+							  HSAuint64 *SizeInBytes,
+							  HSAuint64 NumberOfNodes,
+							  HSAuint32 *NodeArray)
+{
+	return hsaKmtRegisterSharedHandleToNodesCtx(&hsakmt_primary_kfd_ctx,
+						     SharedMemoryHandle,
+						     MemoryAddress,
+						     SizeInBytes,
+						     NumberOfNodes,
+						     NodeArray);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress)
+{
+	return hsaKmtDeregisterMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
+					  HSAuint64 MemorySizeInBytes,
+					  HSAuint64 *AlternateVAGPU)
+{
+	return hsaKmtMapMemoryToGPUCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, MemorySizeInBytes, AlternateVAGPU);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(
+					  void *MemoryAddress,
+					  HSAuint64 MemorySizeInBytes,
+					  HSAuint64 *AlternateVAGPU,
+					  HsaMemMapFlags MemMapFlags,
+					  HSAuint64 NumberOfNodes,
+					  HSAuint32 *NodeArray)
+{
+	return hsaKmtMapMemoryToGPUNodesCtx(&hsakmt_primary_kfd_ctx, MemoryAddress,
+				MemorySizeInBytes, AlternateVAGPU, MemMapFlags, NumberOfNodes, NodeArray);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress)
+{
+	return hsaKmtUnmapMemoryToGPUCtx(&hsakmt_primary_kfd_ctx, MemoryAddress);
+}
+
+/* Legacy wrapper: unmaps a graphics handle through the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId, HSAuint64 FlatMemoryAddress,
+					  HSAuint64 SizeInBytes)
+{
+	return hsaKmtUnmapGraphicHandleCtx(&hsakmt_primary_kfd_ctx, NodeId, FlatMemoryAddress, SizeInBytes);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *config)
+{
+	return hsaKmtGetTileConfigCtx(&hsakmt_primary_kfd_ctx, NodeId, config);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
+					       HsaPointerInfo *PointerInfo)
+{
+	return hsaKmtQueryPointerInfoCtx(&hsakmt_primary_kfd_ctx, Pointer, PointerInfo);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
+						void *UserData)
+{
+	return hsaKmtSetMemoryUserDataCtx(&hsakmt_primary_kfd_ctx, Pointer, UserData);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr)
+{
+	return hsaKmtReplaceAsanHeaderPageCtx(&hsakmt_primary_kfd_ctx, addr);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr)
+{
+	return hsaKmtReturnAsanHeaderPageCtx(&hsakmt_primary_kfd_ctx, addr);
+}
+
+/* Legacy wrapper; the KFD-open check is performed by the Ctx variant. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId,
+						HsaAMDGPUDeviceHandle   *DeviceHandle)
+{
+	return hsaKmtGetAMDGPUDeviceHandleCtx(&hsakmt_primary_kfd_ctx,
+					      NodeId, DeviceHandle);
 }
diff --git a/projects/rocr-runtime/libhsakmt/src/openclose.c b/projects/rocr-runtime/libhsakmt/src/openclose.c
index d738fcc747..4d7d428891 100644
--- a/projects/rocr-runtime/libhsakmt/src/openclose.c
+++ b/projects/rocr-runtime/libhsakmt/src/openclose.c
@@ -51,6 +51,8 @@ static pid_t parent_pid = -1;
 int hsakmt_debug_level;
 bool hsakmt_forked;
 
+HsaKFDContext hsakmt_primary_kfd_ctx = {.fd = -1};
+
 /* hsakmt_is_forked_child detects when the process has forked since the last
  * time this function was called. We cannot rely on pthread_atfork
  * because the process can fork without calling the fork function in
@@ -99,16 +101,18 @@ static void child_fork_handler(void)
  * The topology information is duplicated from the parent is valid
  * in the child process so it is not cleared
  */
-static void clear_after_fork(void)
+static void clear_after_fork(HsaKFDContext *ctx)
 {
-	hsakmt_clear_process_doorbells();
-	hsakmt_clear_events_page();
-	hsakmt_fmm_clear_all_mem();
+	hsakmt_clear_process_doorbells(ctx);
+	hsakmt_clear_events_page(ctx);
+	hsakmt_fmm_clear_all_mem(ctx);
 	hsakmt_destroy_device_debugging_memory();
-	if (hsakmt_kfd_fd) {
-		close(hsakmt_kfd_fd);
-		hsakmt_kfd_fd = -1;
-	}
+
+	int fd = ctx->fd; /* saved first; presumably clear_context resets ctx->fd - confirm */
+	if (fd >= 0) {
+		hsakmt_kfdcontext_clear_context(ctx);
+		close(fd);
+	}
 	if (hsakmt_udmabuf_dev_fd > 0) {
 		close(hsakmt_udmabuf_dev_fd);
 		hsakmt_udmabuf_dev_fd = -1;
@@ -150,7 +154,7 @@ static HSAKMT_STATUS init_vars_from_env(void)
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFDCtx(HsaKFDContext **pCtx)
 {
 	HSAKMT_STATUS result;
 	int fd = -1;
@@ -166,7 +170,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
 	 * belong to the parent
 	 */
 	if (hsakmt_is_forked_child())
-		clear_after_fork();
+		clear_after_fork(&hsakmt_primary_kfd_ctx);
 
 	if (hsakmt_kfd_open_count == 0) {
 		static bool atfork_installed = false;
@@ -184,15 +188,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
 		// Check if we are using the hsakmtmodel and setup initial state
 		model_init_env_vars();
 
-		if (hsakmt_kfd_fd < 0 && !hsakmt_use_model) {
+		if (hsakmt_primary_kfd_ctx.fd < 0 && !hsakmt_use_model) {
 			fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
 
 			if (fd == -1) {
 				result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
 				goto open_failed;
 			}
-
-			hsakmt_kfd_fd = fd;
+			hsakmt_kfdcontext_init_context(fd, &hsakmt_primary_kfd_ctx);
 		}
 
 		init_page_size();
@@ -216,8 +219,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
 		useSvmStr = getenv("HSA_USE_SVM");
 		hsakmt_is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0"));
 		if(!hsakmt_use_model)
-			result = hsakmt_topology_sysfs_get_system_props(&sys_props);
-		
+			result = hsakmt_topology_sysfs_get_system_props(&hsakmt_primary_kfd_ctx, &sys_props);
+
 		if (result != HSAKMT_STATUS_SUCCESS)
 			goto topology_sysfs_failed;
 
@@ -227,6 +230,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
 			pr_warn("Insufficient Memory. Debugging unavailable\n");
 
 		hsakmt_init_counter_props(sys_props.NumNodes);
+		*pCtx = &hsakmt_primary_kfd_ctx;
 
 		if (!atfork_installed) {
 			/* Atfork handlers cannot be uninstalled and
@@ -241,6 +245,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
 		}
 	} else {
 		hsakmt_kfd_open_count++;
+		*pCtx = &hsakmt_primary_kfd_ctx;
 		result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
 	}
 
@@ -256,7 +261,7 @@ open_failed:
 	return result;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void)
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFDCtx(void)
 {
 	HSAKMT_STATUS result;
 
@@ -266,7 +271,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void)
 		if (--hsakmt_kfd_open_count == 0) {
 			hsakmt_destroy_counter_props();
 			hsakmt_destroy_device_debugging_memory();
-			hsakmt_fmm_clear_all_aperture();
+			hsakmt_fmm_clear_all_aperture(&hsakmt_primary_kfd_ctx);
 		}
 
 		result = HSAKMT_STATUS_SUCCESS;
@@ -277,3 +282,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void)
 
 	return result;
 }
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
+{
+	HsaKFDContext *pCtx = NULL;
+	return hsaKmtOpenKFDCtx(&pCtx);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void)
+{
+	return hsaKmtCloseKFDCtx();
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
index 602d35c1b3..055a30c0b1 100644
--- a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
+++ b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
@@ -65,7 +65,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void
     args.num_sample_info = sample_info_sz;
     args.flags = 0;
 
-    int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);
+    int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args);
 
     *size = args.num_sample_info;
 
@@ -111,7 +111,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, HsaPcSamplingIn
     args.num_sample_info = 1;
     args.trace_id = INVALID_TRACE_ID;
 
-    int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);
+    int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args);
     if (err) {
         switch (errno) {
         case EINVAL:
@@ -151,7 +151,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, HsaPcSamplingT
     args.gpu_id = gpu_id;
     args.trace_id = traceId;
 
-    int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);
+    int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args);
     if (err) {
         if (errno == EINVAL)
             return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -181,7 +181,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, HsaPcSamplingTra
     args.gpu_id = gpu_id;
     args.trace_id = traceId;
 
-    int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);
+    int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args);
     if (err) {
         switch (errno) {
         case EINVAL:
@@ -220,7 +220,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, HsaPcSamplingTrac
     args.gpu_id = gpu_id;
     args.trace_id = traceId;
 
-    int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);
+    int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args);
     if (err) {
         switch (errno) {
         case EINVAL:
diff --git a/projects/rocr-runtime/libhsakmt/src/queues.c b/projects/rocr-runtime/libhsakmt/src/queues.c
index 0e3500f5ef..c2a00734ea 100644
--- a/projects/rocr-runtime/libhsakmt/src/queues.c
+++ b/projects/rocr-runtime/libhsakmt/src/queues.c
@@ -35,6 +35,7 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <errno.h>
+#include <assert.h>
 
 /* 1024 doorbells, 4 or 8 bytes each doorbell depending on ASIC generation */
 #define DOORBELL_SIZE(gfxv)	(((gfxv) >= 0x90000) ? 8 : 4)
@@ -80,8 +81,28 @@ struct process_doorbells {
 	pthread_mutex_t mutex;
 };
 
-static unsigned int num_doorbells;
-static struct process_doorbells *doorbells;
+/* Per-context queue state: the doorbell array and its element count. */
+struct hsa_kfd_queue_context {
+	unsigned int num_doorbells;
+	struct process_doorbells *doorbells;
+};
+
+/* Return the queue context attached to ctx, allocating a zero-filled one on
+ * first use. Returns NULL (after logging) when the allocation fails.
+ */
+struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+
+	if (!ctx->queue_context) {
+		ctx->queue_context = calloc(1, sizeof(struct hsa_kfd_queue_context));
+		if (!ctx->queue_context)
+			pr_err("Alloc memory failed for struct hsa_kfd_queue_context size %zu\n",
+			       sizeof(struct hsa_kfd_queue_context));
+	}
+
+	return ctx->queue_context;
+}
 
 uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv)
 {
@@ -102,26 +123,34 @@ uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv)
 	return vgpr_size;
 }
 
-HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes)
+/* Allocate and reset one doorbell descriptor per topology node for ctx.
+ * Returns HSAKMT_STATUS_NO_MEMORY when either allocation fails.
+ */
+HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, unsigned int NumNodes)
 {
 	unsigned int i;
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
+
+	/* The per-context queue state is allocated on first use and is NULL on OOM. */
+	if (!queue_ctx)
+		return HSAKMT_STATUS_NO_MEMORY;
 
-	/* doorbells[] is accessed using Topology NodeId. This means doorbells[0],
+	/* queue_ctx->doorbells[] is accessed using Topology NodeId. This means doorbells[0],
 	 * which corresponds to CPU only Node, might not be used
 	 */
-	doorbells = malloc(NumNodes * sizeof(struct process_doorbells));
-	if (!doorbells)
+	queue_ctx->doorbells = malloc(NumNodes * sizeof(struct process_doorbells));
+	if (!queue_ctx->doorbells)
 		return HSAKMT_STATUS_NO_MEMORY;
 
 	for (i = 0; i < NumNodes; i++) {
-		doorbells[i].use_gpuvm = false;
-		doorbells[i].size = 0;
-		doorbells[i].mapping = NULL;
-		pthread_mutex_init(&doorbells[i].mutex, NULL);
+		queue_ctx->doorbells[i].use_gpuvm = false;
+		queue_ctx->doorbells[i].size = 0;
+		queue_ctx->doorbells[i].mapping = NULL;
+		pthread_mutex_init(&queue_ctx->doorbells[i].mutex, NULL);
 	}
 
-	num_doorbells = NumNodes;
+	queue_ctx->num_doorbells = NumNodes;
 
 	return ret;
 }
@@ -144,94 +166,105 @@ static void get_doorbell_map_info(uint32_t node_id,
 	return;
 }
 
-void hsakmt_destroy_process_doorbells(void)
+void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx)
 {
 	unsigned int i;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
+	struct process_doorbells *doorbells = queue_ctx->doorbells;
 
 	if (!doorbells)
 		return;
 
-	for (i = 0; i < num_doorbells; i++) {
+	for (i = 0; i < queue_ctx->num_doorbells; i++) {
 		if (!doorbells[i].size)
 			continue;
 
 		if (doorbells[i].use_gpuvm) {
-			hsakmt_fmm_unmap_from_gpu(doorbells[i].mapping);
-			hsakmt_fmm_release(doorbells[i].mapping);
+			hsakmt_fmm_unmap_from_gpu(ctx, doorbells[i].mapping);
+			hsakmt_fmm_release(ctx, doorbells[i].mapping);
 		} else
 			munmap(doorbells[i].mapping, doorbells[i].size);
 	}
 
 	free(doorbells);
-	doorbells = NULL;
-	num_doorbells = 0;
+	queue_ctx->doorbells = NULL;
+	queue_ctx->num_doorbells = 0;
 }
 
 /* This is a special funcion that should be called only from the child process
  * after a fork(). This will clear doorbells duplicated from the parent.
  */
-void hsakmt_clear_process_doorbells(void)
+void hsakmt_clear_process_doorbells(HsaKFDContext *ctx)
 {
 	unsigned int i;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
 
-	if (!doorbells)
+	if (!queue_ctx->doorbells)
 		return;
 
-	for (i = 0; i < num_doorbells; i++) {
-		if (!doorbells[i].size)
+	for (i = 0; i < queue_ctx->num_doorbells; i++) {
+		if (!queue_ctx->doorbells[i].size)
 			continue;
 
-		if (!doorbells[i].use_gpuvm)
-			munmap(doorbells[i].mapping, doorbells[i].size);
+		if (!queue_ctx->doorbells[i].use_gpuvm)
+			munmap(queue_ctx->doorbells[i].mapping, queue_ctx->doorbells[i].size);
 	}
 
-	free(doorbells);
-	doorbells = NULL;
-	num_doorbells = 0;
+	free(queue_ctx->doorbells);
+	queue_ctx->doorbells = NULL;
+	queue_ctx->num_doorbells = 0;
 }
 
-static HSAKMT_STATUS map_doorbell_apu(HSAuint32 NodeId, HSAuint32 gpu_id,
+static HSAKMT_STATUS map_doorbell_apu(HsaKFDContext *ctx,
+					  HSAuint32 NodeId, HSAuint32 gpu_id,
 				      HSAuint64 doorbell_mmap_offset)
 {
 	void *ptr;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
 
-	ptr = mmap(0, doorbells[NodeId].size, PROT_READ|PROT_WRITE,
-		   MAP_SHARED, hsakmt_kfd_fd, doorbell_mmap_offset);
+	ptr = mmap(0, queue_ctx->doorbells[NodeId].size, PROT_READ|PROT_WRITE,
+		   MAP_SHARED, ctx->fd, doorbell_mmap_offset);
 
 	if (ptr == MAP_FAILED)
 		return HSAKMT_STATUS_ERROR;
 
-	doorbells[NodeId].mapping = ptr;
+	queue_ctx->doorbells[NodeId].mapping = ptr;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS map_doorbell_dgpu(HSAuint32 NodeId, HSAuint32 gpu_id,
+static HSAKMT_STATUS map_doorbell_dgpu(HsaKFDContext *ctx,
+					   HSAuint32 NodeId, HSAuint32 gpu_id,
 				       HSAuint64 doorbell_mmap_offset)
 {
 	void *ptr;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
 
-	ptr = hsakmt_fmm_allocate_doorbell(gpu_id, doorbells[NodeId].size,
+	ptr = hsakmt_fmm_allocate_doorbell(ctx,
+				gpu_id, queue_ctx->doorbells[NodeId].size,
 				doorbell_mmap_offset);
 
 	if (!ptr)
 		return HSAKMT_STATUS_ERROR;
 
 	/* map for GPU access */
-	if (hsakmt_fmm_map_to_gpu(ptr, doorbells[NodeId].size, NULL)) {
-		hsakmt_fmm_release(ptr);
+	if (hsakmt_fmm_map_to_gpu(ctx, ptr, queue_ctx->doorbells[NodeId].size, NULL)) {
+		hsakmt_fmm_release(ctx, ptr);
 		return HSAKMT_STATUS_ERROR;
 	}
 
-	doorbells[NodeId].mapping = ptr;
+	queue_ctx->doorbells[NodeId].mapping = ptr;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id,
+static HSAKMT_STATUS map_doorbell(HsaKFDContext *ctx,
+				  HSAuint32 NodeId, HSAuint32 gpu_id,
 				  HSAuint64 doorbell_mmap_offset)
 {
 	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
+	struct process_doorbells *doorbells = queue_ctx->doorbells;
 
 	pthread_mutex_lock(&doorbells[NodeId].mutex);
 	if (doorbells[NodeId].size) {
@@ -242,16 +275,16 @@ static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id,
 	get_doorbell_map_info(NodeId, &doorbells[NodeId]);
 
 	if (doorbells[NodeId].use_gpuvm) {
-		status = map_doorbell_dgpu(NodeId, gpu_id, doorbell_mmap_offset);
+		status = map_doorbell_dgpu(ctx, NodeId, gpu_id, doorbell_mmap_offset);
 		if (status != HSAKMT_STATUS_SUCCESS) {
 			/* Fall back to the old method if KFD doesn't
 			 * support doorbells in GPUVM
 			 */
 			doorbells[NodeId].use_gpuvm = false;
-			status = map_doorbell_apu(NodeId, gpu_id, doorbell_mmap_offset);
+			status = map_doorbell_apu(ctx, NodeId, gpu_id, doorbell_mmap_offset);
 		}
 	} else
-		status = map_doorbell_apu(NodeId, gpu_id, doorbell_mmap_offset);
+		status = map_doorbell_apu(ctx, NodeId, gpu_id, doorbell_mmap_offset);
 
 	if (status != HSAKMT_STATUS_SUCCESS)
 		doorbells[NodeId].size = 0;
@@ -279,13 +312,13 @@ static void *allocate_exec_aligned_memory_cpu(uint32_t size)
 }
 
 /* The bool return indicate whether the queue needs a context-save-restore area*/
-static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
+static bool update_ctx_save_restore_size(HsaKFDContext *ctx, uint32_t nodeid, struct queue *q)
 {
 	HsaNodeProperties node;
 
 	if (q->gfxv < GFX_VERSION_CARRIZO)
 		return false;
-	if (hsaKmtGetNodeProperties(nodeid, &node))
+	if (hsaKmtGetNodePropertiesCtx(ctx, nodeid, &node))
 		return false;
 	if (node.NumFComputeCores && node.NumSIMDPerCU) {
 		uint32_t ctl_stack_size, wg_data_size;
@@ -316,7 +349,8 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
 	return false;
 }
 
-void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t gpu_id,
+void *hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx,
+					   uint32_t size, uint32_t align, uint32_t gpu_id,
 				       uint32_t NodeId, bool nonPaged,
 				       bool DeviceLocal,
 				       bool Uncached)
@@ -337,7 +371,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin
 	size = ALIGN_UP(size, align);
 
 	if (DeviceLocal && !hsakmt_zfb_support)
-		mem = hsakmt_fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags);
+		mem = hsakmt_fmm_allocate_device(ctx, gpu_id, NodeId, mem, size, 0, flags);
 	else {
 		/* VRAM under ZFB mode should be supported here without any
 		 * additional code
@@ -352,7 +386,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin
 				cpu_id = 0;
 			}
 		}
-		mem = hsakmt_fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags);
+		mem = hsakmt_fmm_allocate_host(ctx, gpu_id, cpu_id, mem, size, 0, flags);
 	}
 
 	if (!mem) {
@@ -366,35 +400,36 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin
 		HsaMemMapFlags map_flags = {0};
 		HSAKMT_STATUS result;
 
-		result = hsaKmtMapMemoryToGPUNodes(mem, size, &gpu_va, map_flags, 1, nodes_array);
+		result = hsaKmtMapMemoryToGPUNodesCtx(ctx, mem, size, &gpu_va, map_flags, 1, nodes_array);
 		if (result != HSAKMT_STATUS_SUCCESS) {
-			hsaKmtFreeMemory(mem, size);
+			hsaKmtFreeMemoryCtx(ctx, mem, size);
 			return NULL;
 		}
 
 		return mem;
 	}
 
-	if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) {
-		hsaKmtFreeMemory(mem, size);
+	if (hsaKmtMapMemoryToGPUCtx(ctx, mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) {
+		hsaKmtFreeMemoryCtx(ctx, mem, size);
 		return NULL;
 	}
 
 	return mem;
 }
 
-void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
+void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx, void *addr, uint32_t size, uint32_t align)
 {
 	size = ALIGN_UP(size, align);
 
-	if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS)
-		hsaKmtFreeMemory(addr, size);
+	if (hsaKmtUnmapMemoryToGPUCtx(ctx, addr) == HSAKMT_STATUS_SUCCESS)
+		hsaKmtFreeMemoryCtx(ctx, addr, size);
 }
 
 /*
  * Allocates memory aligned to sysconf(_SC_PAGESIZE)
  */
-static void *allocate_exec_aligned_memory(uint32_t size,
+static void *allocate_exec_aligned_memory(HsaKFDContext *ctx,
+					  uint32_t size,
 					  bool use_ats,
 					  uint32_t gpu_id,
 					  uint32_t NodeId,
@@ -403,17 +438,19 @@ static void *allocate_exec_aligned_memory(uint32_t size,
 					  bool Uncached)
 {
 	if (!use_ats)
-		return hsakmt_allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, gpu_id, NodeId,
+		return hsakmt_allocate_exec_aligned_memory_gpu(ctx,
+							size, PAGE_SIZE, gpu_id, NodeId,
 							nonPaged, DeviceLocal,
 							Uncached);
 	return allocate_exec_aligned_memory_cpu(size);
 }
 
-static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align,
+static void free_exec_aligned_memory(HsaKFDContext *ctx,
+				     void *addr, uint32_t size, uint32_t align,
 				     bool use_ats)
 {
 	if (!use_ats)
-		hsakmt_free_exec_aligned_memory_gpu(addr, size, align);
+		hsakmt_free_exec_aligned_memory_gpu(ctx, addr, size, align);
 	else
 		munmap(addr, size);
 }
@@ -454,20 +491,20 @@ static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
 	return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
 }
 
-static void free_queue(struct queue *q)
+static void free_queue(HsaKFDContext *ctx, struct queue *q)
 {
 	if (q->eop_buffer)
-		free_exec_aligned_memory(q->eop_buffer,
+		free_exec_aligned_memory(ctx, q->eop_buffer,
 					 q->eop_buffer_size,
 					 PAGE_SIZE, q->use_ats);
 	if (q->unified_ctx_save_restore)
 		munmap(q->ctx_save_restore, q->total_mem_alloc_size);
 	else if (q->ctx_save_restore)
-		free_exec_aligned_memory(q->ctx_save_restore,
+		free_exec_aligned_memory(ctx, q->ctx_save_restore,
 					 q->total_mem_alloc_size,
 					 PAGE_SIZE, q->use_ats);
 
-	free_exec_aligned_memory((void *)q, sizeof(*q), PAGE_SIZE, q->use_ats);
+	free_exec_aligned_memory(ctx, (void *)q, sizeof(*q), PAGE_SIZE, q->use_ats);
 }
 
 static inline void fill_cwsr_header(struct queue *q, void *addr,
@@ -488,7 +525,8 @@ static inline void fill_cwsr_header(struct queue *q, void *addr,
 	}
 }
 
-static int handle_concrete_asic(struct queue *q,
+static int handle_concrete_asic(HsaKFDContext *ctx,
+				struct queue *q,
 				struct kfd_ioctl_create_queue_args *args,
 				uint32_t gpu_id,
 				uint32_t NodeId,
@@ -503,7 +541,8 @@ static int handle_concrete_asic(struct queue *q,
 
 	if (q->eop_buffer_size > 0) {
 		pr_info("Allocating VRAM for EOP\n");
-		q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size,
+		q->eop_buffer = allocate_exec_aligned_memory(ctx,
+				q->eop_buffer_size,
 				q->use_ats, gpu_id,
 				NodeId, true, true, /* Unused for VRAM */false);
 		if (!q->eop_buffer)
@@ -513,12 +552,12 @@ static int handle_concrete_asic(struct queue *q,
 		args->eop_buffer_size = q->eop_buffer_size;
 	}
 
-	ret = update_ctx_save_restore_size(NodeId, q);
+	ret = update_ctx_save_restore_size(ctx, NodeId, q);
 
 	if (ret) {
 		HsaNodeProperties node;
 
-		if (hsaKmtGetNodeProperties(NodeId, &node))
+		if (hsaKmtGetNodePropertiesCtx(ctx, NodeId, &node))
 			return HSAKMT_STATUS_ERROR;
 
 		args->ctx_save_restore_size = q->ctx_save_restore_size;
@@ -568,7 +607,7 @@ static int handle_concrete_asic(struct queue *q,
 		}
 
 		if (!q->unified_ctx_save_restore) {
-			q->ctx_save_restore = allocate_exec_aligned_memory(
+			q->ctx_save_restore = allocate_exec_aligned_memory(ctx,
 							q->total_mem_alloc_size,
 							q->use_ats, gpu_id, NodeId,
 							false, false, false);
@@ -591,24 +630,26 @@ static int handle_concrete_asic(struct queue *q,
  */
 static uint32_t priority_map[] = {0, 3, 5, 7, 9, 11, 15};
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
-					  HSA_QUEUE_TYPE Type,
-					  HSAuint32 QueuePercentage,
-					  HSA_QUEUE_PRIORITY Priority,
-					  void *QueueAddress,
-					  HSAuint64 QueueSizeInBytes,
-					  HsaEvent *Event,
-					  HsaQueueResource *QueueResource)
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueCtx(HsaKFDContext *ctx,
+						 HSAuint32 NodeId,
+						 HSA_QUEUE_TYPE Type,
+						 HSAuint32 QueuePercentage,
+						 HSA_QUEUE_PRIORITY Priority,
+						 void *QueueAddress,
+						 HSAuint64 QueueSizeInBytes,
+						 HsaEvent *Event,
+						 HsaQueueResource *QueueResource)
 {
 	if (Type == HSA_QUEUE_SDMA_BY_ENG_ID)
 		return HSAKMT_STATUS_ERROR;
 
-	return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0,
+	return hsaKmtCreateQueueExtCtx(ctx, NodeId, Type, QueuePercentage, Priority, 0,
 				    QueueAddress, QueueSizeInBytes, Event,
 				    QueueResource);
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx,
+						 HSAuint32 NodeId,
 					     HSA_QUEUE_TYPE Type,
 					     HSAuint32 QueuePercentage,
 					     HSA_QUEUE_PRIORITY Priority,
@@ -628,6 +669,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 
 	CHECK_KFD_OPEN();
 
+	struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx);
+
 	if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
 		Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -636,7 +679,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
-	struct queue *q = allocate_exec_aligned_memory(sizeof(*q),
+	struct queue *q = allocate_exec_aligned_memory(ctx, sizeof(*q),
 			false, gpu_id, NodeId, true, false, true);
 	if (!q)
 		return HSAKMT_STATUS_NO_MEMORY;
@@ -656,7 +699,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 	/* By default, CUs are all turned on. Initialize cu_mask to '1
 	 * for all CU bits.
 	 */
-	if (hsaKmtGetNodeProperties(NodeId, &props))
+	if (hsaKmtGetNodePropertiesCtx(ctx, NodeId, &props))
 		q->cu_mask_count = 0;
 	else {
 		cu_num = props.NumFComputeCores / props.NumSIMDPerCU;
@@ -695,9 +738,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 		QueueResource->QueueWptrValue = (uintptr_t)&q->wptr;
 	}
 
-	err = handle_concrete_asic(q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason);
+	err = handle_concrete_asic(ctx, q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason);
 	if (err != HSAKMT_STATUS_SUCCESS) {
-		free_queue(q);
+		free_queue(ctx, q);
 		return err;
 	}
 
@@ -709,10 +752,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 	args.queue_priority = priority_map[Priority+3];
 	args.sdma_engine_id = SdmaEngineId;
 
-	err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args);
+	err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_CREATE_QUEUE, &args);
 
 	if (err == -1) {
-		free_queue(q);
+		free_queue(ctx, q);
 		return HSAKMT_STATUS_ERROR;
 	}
 
@@ -737,20 +780,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
 		doorbell_offset = q->queue_id * DOORBELL_SIZE(q->gfxv);
 	}
 
-	err = map_doorbell(NodeId, gpu_id, doorbell_mmap_offset);
+	err = map_doorbell(ctx, NodeId, gpu_id, doorbell_mmap_offset);
 	if (err != HSAKMT_STATUS_SUCCESS) {
-		hsaKmtDestroyQueue(q->queue_id);
+		hsaKmtDestroyQueueCtx(ctx, PORT_VPTR_TO_UINT64(q)); /* handle, not KFD queue id */
 		return HSAKMT_STATUS_ERROR;
 	}
 
 	QueueResource->QueueId = PORT_VPTR_TO_UINT64(q);
-	QueueResource->Queue_DoorBell = VOID_PTR_ADD(doorbells[NodeId].mapping,
+	QueueResource->Queue_DoorBell = VOID_PTR_ADD(queue_ctx->doorbells[NodeId].mapping,
 						     doorbell_offset);
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueueCtx(HsaKFDContext *ctx,
+					  HSA_QUEUEID QueueId,
 					  HSAuint32 QueuePercentage,
 					  HSA_QUEUE_PRIORITY Priority,
 					  void *QueueAddress,
@@ -774,7 +818,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId,
 	arg.queue_percentage = QueuePercentage;
 	arg.queue_priority = priority_map[Priority+3];
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UPDATE_QUEUE, &arg);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UPDATE_QUEUE, &arg);
 
 	if (err == -1)
 		return HSAKMT_STATUS_ERROR;
@@ -782,7 +826,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId)
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueueCtx(HsaKFDContext *ctx,
+						 HSA_QUEUEID QueueId)
 {
 	CHECK_KFD_OPEN();
 
@@ -794,20 +839,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId)
 
 	args.queue_id = q->queue_id;
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_QUEUE, &args);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DESTROY_QUEUE, &args);
 
 	if (err == -1) {
 		pr_err("Failed to destroy queue: %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
 	}
 
-	free_queue(q);
+	free_queue(ctx, q);
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
-					     HSAuint32 CUMaskCount,
-					     HSAuint32 *QueueCUMask)
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMaskCtx(HsaKFDContext *ctx,
+						 HSA_QUEUEID QueueId,
+						 HSAuint32 CUMaskCount,
+						 HSAuint32 *QueueCUMask)
 {
 	struct queue *q = PORT_UINT64_TO_VPTR(QueueId);
 	struct kfd_ioctl_set_cu_mask_args args = {0};
@@ -821,7 +867,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
 	args.num_cu_mask = CUMaskCount;
 	args.cu_mask_ptr = (uintptr_t)QueueCUMask;
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_CU_MASK, &args);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_CU_MASK, &args);
 
 	if (err == -1)
 		return HSAKMT_STATUS_ERROR;
@@ -832,12 +878,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS
-HSAKMTAPI
-hsaKmtGetQueueInfo(
-	HSA_QUEUEID QueueId,
-	HsaQueueInfo *QueueInfo
-)
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfoCtx(HsaKFDContext *ctx,
+						 HSA_QUEUEID QueueId,
+						 HsaQueueInfo *QueueInfo)
 {
 	struct queue *q = PORT_UINT64_TO_VPTR(QueueId);
 	struct kfd_ioctl_get_queue_wave_state_args args = {0};
@@ -853,7 +896,7 @@ hsaKmtGetQueueInfo(
 	args.queue_id = q->queue_id;
 	args.ctl_stack_address = (uintptr_t)q->ctx_save_restore;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &args) < 0)
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &args) < 0)
 		return HSAKMT_STATUS_ERROR;
 
 	QueueInfo->ControlStackTop = (void *)(args.ctl_stack_address +
@@ -871,7 +914,8 @@ hsaKmtGetQueueInfo(
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandlerCtx(HsaKFDContext *ctx,
+						 HSAuint32 Node,
 					     void *TrapHandlerBaseAddress,
 					     HSAuint64 TrapHandlerSizeInBytes,
 					     void *TrapBufferBaseAddress,
@@ -891,7 +935,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
 	args.tba_addr = (uintptr_t)TrapHandlerBaseAddress;
 	args.tma_addr = (uintptr_t)TrapBufferBaseAddress;
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args);
 
 	return (err == -1) ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS;
 }
@@ -921,12 +965,10 @@ uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues)
 	return queue_ids_ptr;
 }
 
-HSAKMT_STATUS
-HSAKMTAPI
-hsaKmtAllocQueueGWS(
-                HSA_QUEUEID        QueueId,
-                HSAuint32          nGWS,
-                HSAuint32          *firstGWS)
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWSCtx(HsaKFDContext *ctx,
+						 HSA_QUEUEID QueueId,
+						 HSAuint32 nGWS,
+						 HSAuint32 *firstGWS)
 {
 	struct kfd_ioctl_alloc_queue_gws_args args = {0};
 	struct queue *q = PORT_UINT64_TO_VPTR(QueueId);
@@ -936,7 +978,7 @@ hsaKmtAllocQueueGWS(
 	args.queue_id = (HSAuint32)q->queue_id;
 	args.num_gws = nGWS;
 
-	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &args);
+	int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &args);
 
 	if (!err && firstGWS)
 		*firstGWS = args.first_gws;
@@ -952,3 +994,85 @@ hsaKmtAllocQueueGWS(
 	else
 		return HSAKMT_STATUS_ERROR;
 }
+
+
+/* Legacy API: same as hsaKmtCreateQueueCtx() on the primary KFD context, which
+ * rejects HSA_QUEUE_SDMA_BY_ENG_ID and passes 0 as the SDMA engine id.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
+					  HSA_QUEUE_TYPE Type,
+					  HSAuint32 QueuePercentage,
+					  HSA_QUEUE_PRIORITY Priority,
+					  void *QueueAddress,
+					  HSAuint64 QueueSizeInBytes,
+					  HsaEvent *Event,
+					  HsaQueueResource *QueueResource)
+{
+	return hsaKmtCreateQueueCtx(&hsakmt_primary_kfd_ctx, NodeId, Type,
+				    QueuePercentage, Priority, QueueAddress,
+				    QueueSizeInBytes, Event, QueueResource);
+}
+
+/* Legacy API: forwards to hsaKmtCreateQueueExtCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
+					     HSA_QUEUE_TYPE Type,
+					     HSAuint32 QueuePercentage,
+					     HSA_QUEUE_PRIORITY Priority,
+					     HSAuint32 SdmaEngineId,
+					     void *QueueAddress,
+					     HSAuint64 QueueSizeInBytes,
+					     HsaEvent *Event,
+					     HsaQueueResource *QueueResource)
+{
+	return hsaKmtCreateQueueExtCtx(&hsakmt_primary_kfd_ctx, NodeId, Type,
+				       QueuePercentage, Priority, SdmaEngineId, QueueAddress,
+				       QueueSizeInBytes, Event, QueueResource);
+}
+
+/* Legacy API: forwards to hsaKmtUpdateQueueCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId,
+					  HSAuint32 QueuePercentage,
+					  HSA_QUEUE_PRIORITY Priority,
+					  void *QueueAddress, HSAuint64 QueueSize,
+					  HsaEvent *Event)
+{
+	return hsaKmtUpdateQueueCtx(&hsakmt_primary_kfd_ctx, QueueId, QueuePercentage,
+				    Priority, QueueAddress, QueueSize, Event);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId)
+{	/* Legacy API: forwards to hsaKmtDestroyQueueCtx() with the primary KFD context. */
+	return hsaKmtDestroyQueueCtx(&hsakmt_primary_kfd_ctx, QueueId);
+}
+
+/* Legacy API: forwards to hsaKmtSetQueueCUMaskCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
+					     HSAuint32 CUMaskCount, HSAuint32 *QueueCUMask)
+{
+	return hsaKmtSetQueueCUMaskCtx(&hsakmt_primary_kfd_ctx, QueueId, CUMaskCount, QueueCUMask);
+}
+
+/* Legacy API: forwards to hsaKmtGetQueueInfoCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId,
+					   HsaQueueInfo *QueueInfo)
+{
+	return hsaKmtGetQueueInfoCtx(&hsakmt_primary_kfd_ctx, QueueId, QueueInfo);
+}
+
+/* Legacy API: forwards to hsaKmtSetTrapHandlerCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, void *TrapHandlerBaseAddress,
+					     HSAuint64 TrapHandlerSizeInBytes,
+					     void *TrapBufferBaseAddress,
+					     HSAuint64 TrapBufferSizeInBytes)
+{
+	return hsaKmtSetTrapHandlerCtx(&hsakmt_primary_kfd_ctx, Node,
+				       TrapHandlerBaseAddress, TrapHandlerSizeInBytes,
+				       TrapBufferBaseAddress, TrapBufferSizeInBytes);
+}
+
+/* Legacy API: forwards to hsaKmtAllocQueueGWSCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
+					    HSAuint32 *firstGWS)
+{
+	return hsaKmtAllocQueueGWSCtx(&hsakmt_primary_kfd_ctx, QueueId, nGWS, firstGWS);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/spm.c b/projects/rocr-runtime/libhsakmt/src/spm.c
index 3ad72ccf2f..ec7f3d2b33 100644
--- a/projects/rocr-runtime/libhsakmt/src/spm.c
+++ b/projects/rocr-runtime/libhsakmt/src/spm.c
@@ -45,7 +45,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode)
 	args.op = KFD_IOCTL_SPM_OP_ACQUIRE;
 	args.gpu_id = gpu_id;
 
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+	ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args);
 
 	return ret;
 }
@@ -72,7 +72,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode,
 	args.op         = KFD_IOCTL_SPM_OP_SET_DEST_BUF;
 	args.gpu_id     = gpu_id;
 
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+	ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args);
 
 	*SizeCopied = args.bytes_copied;
 	*isSPMDataLoss = args.has_data_loss;
@@ -96,7 +96,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode)
 	args.op = KFD_IOCTL_SPM_OP_RELEASE;
 	args.gpu_id = gpu_id;
 
-	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+	ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args);
 
 	return ret;
 }
diff --git a/projects/rocr-runtime/libhsakmt/src/svm.c b/projects/rocr-runtime/libhsakmt/src/svm.c
index 441fc00fd7..5482dead5c 100644
--- a/projects/rocr-runtime/libhsakmt/src/svm.c
+++ b/projects/rocr-runtime/libhsakmt/src/svm.c
@@ -37,7 +37,8 @@
 /* Helper functions for calling KFD SVM ioctl */
 
 HSAKMT_STATUS HSAKMTAPI
-hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
+hsaKmtSVMSetAttrCtx(HsaKFDContext *ctx,
+		 void *start_addr, HSAuint64 size, unsigned int nattr,
 		 HSA_SVM_ATTRIBUTE *attrs)
 {
 	struct kfd_ioctl_svm_args *args;
@@ -94,7 +95,7 @@ hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
 	}
 
 	/* Driver does one copy_from_user, with extra attrs size */
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
 	if (r) {
 		pr_debug("op set range attrs failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
@@ -104,7 +105,8 @@ hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
 }
 
 HSAKMT_STATUS HSAKMTAPI
-hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
+hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx,
+		 void *start_addr, HSAuint64 size, unsigned int nattr,
 		 HSA_SVM_ATTRIBUTE *attrs)
 {
 	struct kfd_ioctl_svm_args *args;
@@ -150,7 +152,7 @@ hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
 	}
 
 	/* Driver does one copy_from_user, with extra attrs size */
-	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
 	if (r) {
 		pr_debug("op get range attrs failed %s\n", strerror(errno));
 		return HSAKMT_STATUS_ERROR;
@@ -187,7 +189,7 @@ hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
 }
 
 static HSAKMT_STATUS
-hsaKmtSetGetXNACKMode(HSAint32 * enable)
+hsaKmtSetGetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 * enable)
 {
 	struct kfd_ioctl_set_xnack_mode_args args;
 
@@ -196,7 +198,7 @@ hsaKmtSetGetXNACKMode(HSAint32 * enable)
 
 	args.xnack_enabled = *enable;
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) {
 		if (errno == EPERM) {
 			pr_debug("set mode not supported %s\n",
 				 strerror(errno));
@@ -213,6 +215,40 @@ hsaKmtSetGetXNACKMode(HSAint32 * enable)
 	return HSAKMT_STATUS_SUCCESS;
 }
 
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 enable)
+{	/* Passes the requested mode by address to the shared set/get helper. */
+	return hsaKmtSetGetXNACKModeCtx(ctx, &enable);
+}
+
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtGetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 * enable)
+{	/* NOTE(review): enable is dereferenced below without a NULL check. */
+	*enable = -1;	/* presumably -1 makes the set/get helper query only - confirm */
+	return hsaKmtSetGetXNACKModeCtx(ctx, enable);
+}
+
+
+/* Legacy API: forwards to hsaKmtSVMSetAttrCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, HSA_SVM_ATTRIBUTE *attrs)
+{
+	return hsaKmtSVMSetAttrCtx(&hsakmt_primary_kfd_ctx, start_addr, size, nattr, attrs);
+}
+
+/* Legacy API: forwards to hsaKmtSVMGetAttrCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI
+hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, HSA_SVM_ATTRIBUTE *attrs)
+{
+	return hsaKmtSVMGetAttrCtx(&hsakmt_primary_kfd_ctx, start_addr, size, nattr, attrs);
+}
+
+static HSAKMT_STATUS
+hsaKmtSetGetXNACKMode(HSAint32 * enable)
+{	/* Legacy helper: forwards to hsaKmtSetGetXNACKModeCtx() with the primary KFD context. */
+	return hsaKmtSetGetXNACKModeCtx(&hsakmt_primary_kfd_ctx, enable);
+}
+
 HSAKMT_STATUS HSAKMTAPI
 hsaKmtSetXNACKMode(HSAint32 enable)
 {
diff --git a/projects/rocr-runtime/libhsakmt/src/time.c b/projects/rocr-runtime/libhsakmt/src/time.c
index eff9ed1585..9e8b5ec451 100644
--- a/projects/rocr-runtime/libhsakmt/src/time.c
+++ b/projects/rocr-runtime/libhsakmt/src/time.c
@@ -42,7 +42,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
 
 	args.gpu_id = gpu_id;
 
-	err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
+	err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
 	if (err < 0) {
 		result = HSAKMT_STATUS_ERROR;
 	} else {
diff --git a/projects/rocr-runtime/libhsakmt/src/topology.c b/projects/rocr-runtime/libhsakmt/src/topology.c
index 324033c840..6db13f9073 100644
--- a/projects/rocr-runtime/libhsakmt/src/topology.c
+++ b/projects/rocr-runtime/libhsakmt/src/topology.c
@@ -96,7 +96,7 @@ static const char *supported_processor_vendor_name[] = {
 	"\n"			// POWER requires a different search method
 };
 
-static HSAKMT_STATUS topology_take_snapshot(void);
+static HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx);
 static void topology_drop_snapshot(void);
 
 static const struct hsa_gfxip_table gfxip_lookup_table[] = {
@@ -645,7 +645,8 @@ static HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t sysfs_node_id, uint32_t
  *	- if corresponding drm render node is not available.
  *	- if node information is not accessible (EPERM)
  */
-static HSAKMT_STATUS topology_sysfs_check_node_supported(uint32_t sysfs_node_id, bool *is_node_supported)
+static HSAKMT_STATUS topology_sysfs_check_node_supported(HsaKFDContext *ctx,
+					uint32_t sysfs_node_id, bool *is_node_supported)
 {
 	uint32_t gpu_id;
 	FILE *fd;
@@ -711,7 +712,7 @@ static HSAKMT_STATUS topology_sysfs_check_node_supported(uint32_t sysfs_node_id,
 	}
 
 	/* Open DRM Render device */
-	ret_value = hsakmt_open_drm_render_device(drm_render_minor);
+	ret_value = hsakmt_open_drm_render_device(ctx, drm_render_minor);
 	if (ret_value > 0)
 		*is_node_supported = true;
 	else if (ret_value != -ENOENT && ret_value != -EPERM)
@@ -723,7 +724,8 @@ err:
 	return ret;
 }
 
-HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props)
+HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx,
+					HsaSystemProperties *props)
 {
 	FILE *fd;
 	char *read_buf, *p;
@@ -800,7 +802,7 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props)
 	}
 
 	for (uint32_t i = 0; i < num_sysfs_nodes; i++) {
-		ret = topology_sysfs_check_node_supported(i, &is_node_supported);
+		ret = topology_sysfs_check_node_supported(ctx, i, &is_node_supported);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto sysfs_parse_failed;
 		if (is_node_supported)
@@ -1631,7 +1633,8 @@ static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, ui
  * If node_to specified by the @iolink_id is not accessible the function returns HSAKMT_STATUS_NOT_SUPPORTED.
  * If node_to is accessible, then node_to is mapped from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS.
  */
-static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
+static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx,
+						     uint32_t node_id,
 						     uint32_t iolink_id,
 						     HsaIoLinkProperties *props, bool p2pLink)
 {
@@ -1693,7 +1696,7 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
 			uint32_t sysfs_node_id;
 
 			sysfs_node_id = (uint32_t)prop_val;
-			ret = topology_sysfs_check_node_supported(sysfs_node_id, &is_node_supported);
+			ret = topology_sysfs_check_node_supported(ctx, sysfs_node_id, &is_node_supported);
 			if (!is_node_supported) {
 				ret = HSAKMT_STATUS_NOT_SUPPORTED;
 				memset(props, 0, sizeof(*props));
@@ -1955,7 +1958,7 @@ try_alt_dir:
 	}
 }
 
-HSAKMT_STATUS topology_take_snapshot(void)
+HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx)
 {
 	uint32_t gen_start, gen_end, i, mem_id, cache_id;
 	HsaSystemProperties sys_props;
@@ -1978,7 +1981,7 @@ retry:
 	ret = topology_sysfs_get_generation(&gen_start);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		goto err;
-	ret = hsakmt_topology_sysfs_get_system_props(&sys_props);
+	ret = hsakmt_topology_sysfs_get_system_props(ctx, &sys_props);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		goto err;
 	if (sys_props.NumNodes > 0) {
@@ -2059,7 +2062,7 @@ retry:
 				 */
 				while (sys_link_id < num_ioLinks &&
 					link_id < sys_props.NumNodes - 1) {
-					ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
+					ret = topology_sysfs_get_iolink_props(ctx, i, sys_link_id++,
 								&temp_props[i].link[link_id], false);
 					if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
 						continue;
@@ -2080,7 +2083,7 @@ retry:
 				 */
 				while (sys_link_id < num_p2pLinks &&
 					link_id < sys_props.NumNodes - 1) {
-					ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
+					ret = topology_sysfs_get_iolink_props(ctx, i, sys_link_id++,
 								&temp_props[i].link[link_id], true);
 					if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
 						continue;
@@ -2179,7 +2182,8 @@ HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id)
 
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties)
+HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx,
+				        HsaSystemProperties *SystemProperties)
 {
 	HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
 
@@ -2198,7 +2202,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste
 		goto out;
 	}
 
-	err = topology_take_snapshot();
+	err = topology_take_snapshot(ctx);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto out;
 
@@ -2207,11 +2211,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste
 	if (hsakmt_use_model)
 		model_init();
 
-	err = hsakmt_fmm_init_process_apertures(g_system->NumNodes);
+	err = hsakmt_fmm_init_process_apertures(ctx, g_system->NumNodes);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto init_process_apertures_failed;
 
-	err = hsakmt_init_process_doorbells(g_system->NumNodes);
+	err = hsakmt_init_process_doorbells(ctx, g_system->NumNodes);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto init_doorbells_failed;
 
@@ -2220,7 +2224,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste
 	goto out;
 
 init_doorbells_failed:
-	hsakmt_fmm_destroy_process_apertures();
+	hsakmt_fmm_destroy_process_apertures(ctx);
 init_process_apertures_failed:
 	topology_drop_snapshot();
 
@@ -2229,12 +2233,12 @@ out:
 	return err;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void)
+HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemPropertiesCtx(HsaKFDContext *ctx)
 {
 	pthread_mutex_lock(&hsakmt_mutex);
 
-	hsakmt_destroy_process_doorbells();
-	hsakmt_fmm_destroy_process_apertures();
+	hsakmt_destroy_process_doorbells(ctx);
+	hsakmt_fmm_destroy_process_apertures(ctx);
 	topology_drop_snapshot();
 
 	pthread_mutex_unlock(&hsakmt_mutex);
@@ -2252,7 +2256,9 @@ HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodePropertiesCtx(HsaKFDContext *ctx,
+						HSAuint32 NodeId,
 						HsaNodeProperties *NodeProperties)
 {
 	HSAKMT_STATUS err;
@@ -2278,7 +2284,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
 			NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
 		else
 			NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
-		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base,
+		if (hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_MMIO, gpu_id, &base,
 				&limit) == HSAKMT_STATUS_SUCCESS)
 			NodeProperties->NumMemoryBanks += 1;
 	}
@@ -2288,7 +2294,8 @@ out:
 	return err;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
+						      HSAuint32 NodeId,
 						      HSAuint32 NumBanks,
 						      HsaMemoryProperties *MemoryProperties)
 {
@@ -2319,7 +2326,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
 
 	/*Add LDS*/
 	if (i < NumBanks &&
-		hsakmt_fmm_get_aperture_base_and_limit(FMM_LDS, gpu_id,
+		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_LDS, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS;
 		MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024;
@@ -2332,7 +2339,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
 	 */
 	if (hsakmt_get_gfxv_by_node_id(NodeId) == GFX_VERSION_KAVERI && i < NumBanks &&
 		g_props[NodeId].node.LocalMemSize > 0 &&
-		hsakmt_fmm_get_aperture_base_and_limit(FMM_GPUVM, gpu_id,
+		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_GPUVM, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE;
 		MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LocalMemSize;
@@ -2341,7 +2348,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
 
 	/* Add SCRATCH */
 	if (i < NumBanks &&
-		hsakmt_fmm_get_aperture_base_and_limit(FMM_SCRATCH, gpu_id,
+		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_SCRATCH, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH;
 		MemoryProperties[i].SizeInBytes = (aperture_limit - MemoryProperties[i].VirtualBaseAddress) + 1;
@@ -2350,7 +2357,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
 
 	/* Add SVM aperture */
 	if (hsakmt_topology_is_svm_needed(g_props[NodeId].node.EngineId) && i < NumBanks &&
-	    hsakmt_fmm_get_aperture_base_and_limit(
+	    hsakmt_fmm_get_aperture_base_and_limit(ctx,
 		    FMM_SVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress,
 		    &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_DEVICE_SVM;
@@ -2360,7 +2367,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
 
 	/* Add mmio aperture */
 	if (i < NumBanks &&
-		hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id,
+		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_MMIO, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_MMIO_REMAP;
 		MemoryProperties[i].SizeInBytes = (aperture_limit - MemoryProperties[i].VirtualBaseAddress) + 1;
@@ -2372,7 +2379,8 @@ out:
 	return err;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx,
+						     HSAuint32 NodeId,
 						     HSAuint32 ProcessorId,
 						     HSAuint32 NumCaches,
 						     HsaCacheProperties *CacheProperties)
@@ -2422,7 +2430,8 @@ HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx,
+						      HSAuint32 NodeId,
 						      HSAuint32 NumIoLinks,
 						      HsaIoLinkProperties *IoLinkProperties)
 {
@@ -2536,3 +2545,43 @@ inline uint32_t hsakmt_get_num_sysfs_nodes(void)
 {
 	return num_sysfs_nodes;
 }
+
+
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties)
+{	/* Legacy API: forwards to hsaKmtAcquireSystemPropertiesCtx() with the primary KFD context. */
+	return hsaKmtAcquireSystemPropertiesCtx(&hsakmt_primary_kfd_ctx, SystemProperties);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void)
+{	/* Legacy API: forwards to hsaKmtReleaseSystemPropertiesCtx() with the primary KFD context. */
+	return hsaKmtReleaseSystemPropertiesCtx(&hsakmt_primary_kfd_ctx);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
+						HsaNodeProperties *NodeProperties)
+{	/* Legacy API: forwards to hsaKmtGetNodePropertiesCtx() with the primary KFD context. */
+	return hsaKmtGetNodePropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NodeProperties);
+}
+
+/* Legacy API: forwards to hsaKmtGetNodeMemoryPropertiesCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks,
+						      HsaMemoryProperties *MemoryProperties)
+{
+	return hsaKmtGetNodeMemoryPropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NumBanks, MemoryProperties);
+}
+
+/* Legacy API: forwards to hsaKmtGetNodeCachePropertiesCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(HSAuint32 NodeId, HSAuint32 ProcessorId,
+						     HSAuint32 NumCaches,
+						     HsaCacheProperties *CacheProperties)
+{
+	return hsaKmtGetNodeCachePropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, ProcessorId, NumCaches, CacheProperties);
+}
+
+/* Legacy API: forwards to hsaKmtGetNodeIoLinkPropertiesCtx() with the primary KFD context. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks,
+						      HsaIoLinkProperties *IoLinkProperties)
+{
+	return hsaKmtGetNodeIoLinkPropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NumIoLinks, IoLinkProperties);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/version.c b/projects/rocr-runtime/libhsakmt/src/version.c
index ceda2d2106..2865c81e04 100644
--- a/projects/rocr-runtime/libhsakmt/src/version.c
+++ b/projects/rocr-runtime/libhsakmt/src/version.c
@@ -43,7 +43,7 @@ HSAKMT_STATUS hsakmt_init_kfd_version(void)
 {
 	struct kfd_ioctl_get_version_args args = {0};
 
-	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_VERSION, &args) == -1)
+	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_VERSION, &args) == -1)
 		return HSAKMT_STATUS_ERROR;
 
 	hsakmt_kfd_version_info.KernelInterfaceMajorVersion = args.major_version;