diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
index 957141243b..6cf5ced9d5 100644
--- a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h
@@ -32,6 +32,8 @@
 extern "C" {
 #endif
 
+/* Forward declaration for debug trap ioctl arguments */
+struct kfd_ioctl_dbg_trap_args;
 
 /**
   "Opens" the HSA kernel driver for user-kernel mode communication.
@@ -852,8 +854,10 @@ hsaKmtCheckRuntimeDebugSupport(
 /**
   Debug ops call primarily used for KFD testing
  */
-HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(
-    struct kfd_ioctl_dbg_trap_args *arg,
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDebugTrapIoctl(
+    struct kfd_ioctl_dbg_trap_args *args,
     HSA_QUEUEID *Queues,
     HSAuint64 *DebugReturn
     );
diff --git a/projects/rocr-runtime/libhsakmt/src/debug.c b/projects/rocr-runtime/libhsakmt/src/debug.c
index 7fe450d123..1781b88a46 100644
--- a/projects/rocr-runtime/libhsakmt/src/debug.c
+++ b/projects/rocr-runtime/libhsakmt/src/debug.c
@@ -26,38 +26,82 @@
 #include "libhsakmt.h"
 #include "hsakmt/linux/kfd_ioctl.h"
 #include <errno.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <assert.h>
 
-static bool *is_device_debugged;
-static uint32_t runtime_capabilities_mask = 0;
+/*
+ * hsa_kfd_debug_context
+ *
+ * Per-context debug state, reached through HsaKFDContext::debug_context.
+ * Allocated zeroed on first use by hsakmt_kfdcontext_get_debug_context().
+ */
+struct hsa_kfd_debug_context {
+	/* One flag per node, indexed by node id; allocated by hsakmt_init_device_debugging_memory() */
+	bool *is_device_debugged;

-HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes)
+	/* capabilities_mask returned by the AMDKFD_IOC_RUNTIME_ENABLE ioctl in hsaKmtRuntimeEnableCtx() */
+	uint32_t runtime_capabilities_mask;
+};
+
+/* Returns ctx's debug state, zero-allocating it on first use; NULL if ctx is NULL or on OOM. */
+struct hsa_kfd_debug_context *hsakmt_kfdcontext_get_debug_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+	if (!ctx) {
+		pr_err("Expected a non-null ptr for HsaKFDContext\n");
+		return NULL;
+	}
+
+	if (!ctx->debug_context) {
+		/* First use: calloc leaves the flag array NULL and the capabilities mask 0. */
+		ctx->debug_context = calloc(1, sizeof(struct hsa_kfd_debug_context));
+		if (!ctx->debug_context)
+			pr_err("Alloc memory failed for struct hsa_kfd_debug_context size %zu\n",
+					 sizeof(struct hsa_kfd_debug_context));
+	}
+
+	return ctx->debug_context;
+}
+
+HSAKMT_STATUS hsakmt_init_device_debugging_memory(HsaKFDContext *ctx, unsigned int NumNodes)
 {
 	unsigned int i;
+	struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx);
+	if (!debug_ctx)
+		return HSAKMT_STATUS_NO_MEMORY; /* no debug context could be obtained for ctx */
 
-	is_device_debugged = malloc(NumNodes * sizeof(bool));
-	if (!is_device_debugged)
+	debug_ctx->is_device_debugged = malloc(NumNodes * sizeof(bool)); /* one flag per node */
+	if (!debug_ctx->is_device_debugged)
 		return HSAKMT_STATUS_NO_MEMORY;
 
 	for (i = 0; i < NumNodes; i++)
-		is_device_debugged[i] = false;
+		debug_ctx->is_device_debugged[i] = false; /* every node starts out not debugged */
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-void hsakmt_destroy_device_debugging_memory(void)
+void hsakmt_destroy_device_debugging_memory(HsaKFDContext *ctx)
 {
-	if (is_device_debugged) {
-		free(is_device_debugged);
-		is_device_debugged = NULL;
+	struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx);
+	if (!debug_ctx)
+		return;
+	/* NOTE(review): frees only the flag array; debug_ctx stays allocated here (the getter may even create it) - confirm teardown frees it. */
+	if (debug_ctx->is_device_debugged) {
+		free(debug_ctx->is_device_debugged);
+		debug_ctx->is_device_debugged = NULL;
 	}
 }
 
-bool hsakmt_debug_get_reg_status(uint32_t node_id)
+bool hsakmt_debug_get_reg_status(HsaKFDContext *ctx, uint32_t node_id)
 {
-	return is_device_debugged[node_id];
+	struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx);
+	if (!debug_ctx || !debug_ctx->is_device_debugged)
+		return false;
+	/* NOTE(review): node_id is not range-checked against the allocated node count - presumably validated by callers; confirm. */
+	return debug_ctx->is_device_debugged[node_id];
 }
 
 HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId)
@@ -66,11 +110,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId)
 	uint32_t gpu_id;
 
 	CHECK_KFD_OPEN();
-
-	if (!is_device_debugged)
+	struct hsa_kfd_debug_context *debug_ctx =
+				hsakmt_kfdcontext_get_debug_context(&hsakmt_primary_kfd_ctx);
+	if (!debug_ctx || !debug_ctx->is_device_debugged) /* getter returns NULL on allocation failure */
 		return HSAKMT_STATUS_NO_MEMORY;
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
@@ -94,11 +139,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId)
 	HSAKMT_STATUS result;
 
 	CHECK_KFD_OPEN();
-
-	if (!is_device_debugged)
+	struct hsa_kfd_debug_context *debug_ctx =
+				hsakmt_kfdcontext_get_debug_context(&hsakmt_primary_kfd_ctx);
+	if (!debug_ctx || !debug_ctx->is_device_debugged) /* getter returns NULL on allocation failure */
 		return HSAKMT_STATUS_NO_MEMORY;
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
@@ -126,7 +172,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId,
 
 	CHECK_KFD_OPEN();
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
@@ -195,11 +241,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId,
 	uint32_t watch_event_items = WatchEvent != NULL ? NumWatchPoints:0;
 
 	struct kfd_ioctl_dbg_address_watch_args *args;
-	HSAuint32		 i = 0;
+	HSAuint32 i = 0;
 
 	CHECK_KFD_OPEN();
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
@@ -268,19 +314,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId,
 #define HSA_RUNTIME_ENABLE_MAX_MAJOR   1
 #define HSA_RUNTIME_ENABLE_MIN_MINOR   13
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) {
+HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupportCtx(HsaKFDContext *ctx) {
 	HsaNodeProperties node = {0};
 	HsaSystemProperties props = {0};
 	HsaVersionInfo versionInfo = {0};
 
 	memset(&node, 0x00, sizeof(node));
 	memset(&props, 0x00, sizeof(props));
-	if (hsaKmtAcquireSystemProperties(&props))
+	if (hsaKmtAcquireSystemPropertiesCtx(ctx, &props))
 		return HSAKMT_STATUS_ERROR;
 
 	//the firmware of gpu node doesn't support the debugger, disable it.
 	for (uint32_t i = 0; i < props.NumNodes; i++) {
-		if (hsaKmtGetNodeProperties(i, &node))
+		if (hsaKmtGetNodePropertiesCtx(ctx, i, &node))
 			return HSAKMT_STATUS_ERROR;
 
 		//ignore cpu node
@@ -302,12 +348,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) {
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnableCtx(HsaKFDContext *ctx,
+					    void *rDebug,
 					    bool setupTtmp)
 {
-	struct kfd_ioctl_runtime_enable_args args = {0};
-	HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport();
+	struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx);
+	struct kfd_ioctl_runtime_enable_args args = {0};
+	/* Without debug state, fail through the result check below before any ioctl is issued. */
+	HSAKMT_STATUS result = debug_ctx ? hsaKmtCheckRuntimeDebugSupportCtx(ctx) : HSAKMT_STATUS_NO_MEMORY;
 	if (result)
 		return result;
 
@@ -316,7 +364,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
 		((setupTtmp) ? KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK : 0);
 	args.r_debug = (HSAuint64)rDebug;
 
-	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
+	long err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RUNTIME_ENABLE, &args);
 
 	if (err) {
 		if (errno == EBUSY)
@@ -324,15 +372,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
 		else
 			return HSAKMT_STATUS_ERROR;
 	}
-	runtime_capabilities_mask= args.capabilities_mask;
+	debug_ctx->runtime_capabilities_mask= args.capabilities_mask;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisableCtx(HsaKFDContext *ctx)
 {
 	struct kfd_ioctl_runtime_enable_args args = {0};
-	HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport();
+	HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupportCtx(ctx);
 
 	if (result)
 		return result;
@@ -340,19 +388,23 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
 	memset(&args, 0x00, sizeof(args));
 	args.mode_mask = 0; //Disable
 
-	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args))
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RUNTIME_ENABLE, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask)
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilitiesCtx(HsaKFDContext *ctx,
+						  HSAuint32 *caps_mask)
 {
-	*caps_mask = runtime_capabilities_mask;
+	struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx);
+	/* No debug state (getter returned NULL): report an empty mask instead of dereferencing NULL. */
+	*caps_mask = debug_ctx ? debug_ctx->runtime_capabilities_mask : 0;
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS dbg_trap_get_device_data(void *data,
+static HSAKMT_STATUS dbg_trap_get_device_data(HsaKFDContext *ctx,
+					      void *data,
 					      uint32_t *n_entries,
 					      uint32_t entry_size)
 {
@@ -363,14 +415,15 @@ static HSAKMT_STATUS dbg_trap_get_device_data(void *data,
 	args.device_snapshot.entry_size = entry_size;
 	args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
 	args.pid = getpid();
-	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args))
 		return HSAKMT_STATUS_ERROR;
 	*n_entries = args.device_snapshot.num_devices;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS dbg_trap_get_queue_data(void *data,
+static HSAKMT_STATUS dbg_trap_get_queue_data(HsaKFDContext *ctx,
+					     void *data,
 					     uint32_t *n_entries,
 					     uint32_t entry_size,
 					     uint32_t *queue_ids)
@@ -384,7 +437,7 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data,
 	args.queue_snapshot.snapshot_buf_ptr = (uint64_t) data;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args))
 		return HSAKMT_STATUS_ERROR;
 
 	*n_entries = args.queue_snapshot.num_queues;
@@ -398,7 +451,8 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids,
+static HSAKMT_STATUS dbg_trap_suspend_queues(HsaKFDContext *ctx,
+					     uint32_t *queue_ids,
 					     uint32_t num_queues)
 {
 	struct kfd_ioctl_dbg_trap_args args = {0};
@@ -410,7 +464,7 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids,
 	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
 	args.pid = getpid();
 
-	r = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args);
+	r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args);
 	if (r < 0)
 		return HSAKMT_STATUS_ERROR;
 
@@ -420,7 +474,8 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids,
 /* Debugger support has been in KFD ABI 1.13.  */
 #define KFD_MINOR_MIN_DEBUG 13
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnableCtx(HsaKFDContext *ctx,
+					     void **runtime_info,
 					     HSAuint32 *data_size)
 {
 	struct kfd_ioctl_dbg_trap_args args = {0};
@@ -429,7 +484,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
 	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);
 	*data_size = sizeof(struct kfd_runtime_info);
 	args.enable.rinfo_size = *data_size;
-	args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd;
+	args.enable.dbg_fd = ctx->fd;
 	*runtime_info = malloc(args.enable.rinfo_size);
 	if (!*runtime_info)
 		return HSAKMT_STATUS_NO_MEMORY;
@@ -437,30 +492,31 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
 	args.op = KFD_IOC_DBG_TRAP_ENABLE;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) {
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) {
 		free(*runtime_info);
 		return HSAKMT_STATUS_ERROR;
 	}
 
 	return HSAKMT_STATUS_SUCCESS;
 }
-HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void)
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisableCtx(HsaKFDContext *ctx)
 {
 	struct kfd_ioctl_dbg_trap_args args = {0};
 
 	CHECK_KFD_OPEN();
 	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);
-	args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd;
+	args.enable.dbg_fd = ctx->fd; /* same fd the DBG_TRAP ioctl below is issued on */
 	args.op = KFD_IOC_DBG_TRAP_DISABLE;
 	args.pid = getpid();
 
-	if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args))
+	if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) /* any non-zero result maps to a generic error */
 		return HSAKMT_STATUS_ERROR;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceDataCtx(HsaKFDContext *ctx,
+						void **data,
 						HSAuint32 *n_entries,
 						HSAuint32 *entry_size)
 {
@@ -473,14 +529,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
 	*data = malloc(*entry_size * *n_entries);
 	if (!*data)
 		return ret;
-	ret = dbg_trap_get_device_data(*data, n_entries, *entry_size);
+	ret = dbg_trap_get_device_data(ctx, *data, n_entries, *entry_size);
 	if (ret)
 		free(*data);
 
 	return ret;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueDataCtx(HsaKFDContext *ctx,
+						void **data,
 						HSAuint32 *n_entries,
 						HSAuint32 *entry_size,
 						bool suspend_queues)
@@ -491,7 +548,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
 	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);
 	*entry_size = sizeof(struct kfd_queue_snapshot_entry);
 	*n_entries = 0;
-	if (dbg_trap_get_queue_data(NULL, n_entries, *entry_size, NULL))
+	if (dbg_trap_get_queue_data(ctx, NULL, n_entries, *entry_size, NULL))
 		return HSAKMT_STATUS_ERROR;
 	*data = malloc(*n_entries * *entry_size);
 	if (!*data)
@@ -499,11 +556,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
 	if (suspend_queues && *n_entries)
 		queue_ids = (uint32_t *)malloc(sizeof(uint32_t) * *n_entries);
 	if (!queue_ids ||
-	    dbg_trap_get_queue_data(*data, n_entries, *entry_size, queue_ids))
+	    dbg_trap_get_queue_data(ctx, *data, n_entries, *entry_size, queue_ids))
 		goto free_data;
 	if (queue_ids) {
-		if (dbg_trap_suspend_queues(queue_ids, *n_entries) ||
-		    dbg_trap_get_queue_data(*data, n_entries, *entry_size, NULL))
+		if (dbg_trap_suspend_queues(ctx, queue_ids, *n_entries) ||
+		    dbg_trap_get_queue_data(ctx, *data, n_entries, *entry_size, NULL))
 			goto free_data;
 		free(queue_ids);
 	}
@@ -516,9 +573,10 @@ free_data:
 	return HSAKMT_STATUS_ERROR;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args,
-					HSA_QUEUEID *Queues,
-					HSAuint64 *DebugReturn)
+HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctlCtx(HsaKFDContext *ctx,
+						struct kfd_ioctl_dbg_trap_args *args,
+						HSA_QUEUEID *Queues,
+						HSAuint64 *DebugReturn)
 {
 	HSAKMT_STATUS result;
 
@@ -540,7 +598,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg
 		free(queue_ids);
 	}
 
-	long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, args);
+	long err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, args);
 	if (DebugReturn)
 		*DebugReturn = err;
 
@@ -557,3 +615,58 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg
 
 	return result;
 }
+/* Context-less entry points: each one forwards to its Ctx-suffixed counterpart with &hsakmt_primary_kfd_ctx. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void)
+{
+	return hsaKmtCheckRuntimeDebugSupportCtx(&hsakmt_primary_kfd_ctx);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
+					     bool setupTtmp)
+{
+	return hsaKmtRuntimeEnableCtx(&hsakmt_primary_kfd_ctx, rDebug, setupTtmp);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
+{
+	return hsaKmtRuntimeDisableCtx(&hsakmt_primary_kfd_ctx);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask)
+{
+	return hsaKmtGetRuntimeCapabilitiesCtx(&hsakmt_primary_kfd_ctx, caps_mask);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
+					     HSAuint32 *data_size)
+{
+	return hsaKmtDbgEnableCtx(&hsakmt_primary_kfd_ctx, runtime_info, data_size);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void)
+{
+	return hsaKmtDbgDisableCtx(&hsakmt_primary_kfd_ctx);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
+					     HSAuint32 *n_entries,
+					     HSAuint32 *entry_size)
+{
+	return hsaKmtDbgGetDeviceDataCtx(&hsakmt_primary_kfd_ctx, data, n_entries, entry_size);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
+						HSAuint32 *n_entries,
+						HSAuint32 *entry_size,
+						bool suspend_queues)
+{
+	return hsaKmtDbgGetQueueDataCtx(&hsakmt_primary_kfd_ctx, data,
+					        n_entries, entry_size, suspend_queues);
+}
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args,
+					HSA_QUEUEID *Queues,
+					HSAuint64 *DebugReturn)
+{
+	return hsaKmtDebugTrapIoctlCtx(&hsakmt_primary_kfd_ctx, args, Queues, DebugReturn);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/events.c b/projects/rocr-runtime/libhsakmt/src/events.c
index df97cf6c64..c9a04a3b30 100644
--- a/projects/rocr-runtime/libhsakmt/src/events.c
+++ b/projects/rocr-runtime/libhsakmt/src/events.c
@@ -307,7 +307,7 @@ static HSAKMT_STATUS get_mem_info_svm_api(HsaKFDContext *ctx, uint64_t address,
 		    args->attrs[i].value == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
 			node_id = args->attrs[i].value;
 		else
-			hsakmt_gpuid_to_nodeid(args->attrs[i].value, &node_id);
+			hsakmt_gpuid_to_nodeid(ctx, args->attrs[i].value, &node_id);
 		switch (args->attrs[i].type) {
 		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
 			pr_err("Preferred location for address 0x%lx is Node id %d\n",
@@ -359,7 +359,7 @@ static void analysis_memory_exception(HsaKFDContext *ctx,
 	uint32_t node_id = 0;
 	unsigned int i;
 
-	hsakmt_gpuid_to_nodeid(memory_exception_data->gpu_id, &node_id);
+	hsakmt_gpuid_to_nodeid(ctx, memory_exception_data->gpu_id, &node_id);
 	pr_err("Memory exception on virtual address 0x%lx, ", addr);
 	pr_err("node id %d : ", node_id);
 	if (memory_exception_data->failure.NotPresent)
@@ -468,7 +468,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx,
 			if (Events[i]->EventData.EventType == HSA_EVENTTYPE_MEMORY &&
 			    event_data[i].memory_exception_data.gpu_id) {
 				Events[i]->EventData.EventData.MemoryAccessFault.VirtualAddress = event_data[i].memory_exception_data.va;
-				result = hsakmt_gpuid_to_nodeid(event_data[i].memory_exception_data.gpu_id, &Events[i]->EventData.EventData.MemoryAccessFault.NodeId);
+				result = hsakmt_gpuid_to_nodeid(ctx, event_data[i].memory_exception_data.gpu_id, &Events[i]->EventData.EventData.MemoryAccessFault.NodeId);
 				if (result != HSAKMT_STATUS_SUCCESS)
 					goto out;
 				Events[i]->EventData.EventData.MemoryAccessFault.Failure.NotPresent = event_data[i].memory_exception_data.failure.NotPresent;
@@ -483,7 +483,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx,
 			} else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
 				event_data[i].hw_exception_data.gpu_id) {
 
-				result = hsakmt_gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId);
+				result = hsakmt_gpuid_to_nodeid(ctx, event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId);
 				if (result != HSAKMT_STATUS_SUCCESS)
 					goto out;
 
@@ -515,7 +515,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMICtx(HsaKFDContext *ctx, HSAuint32 NodeId, i
 
 	pr_debug("[%s] node %d\n", __func__, NodeId);
 
-	result = hsakmt_validate_nodeid(NodeId, &gpuid);
+	result = hsakmt_validate_nodeid(ctx, NodeId, &gpuid);
 	if (result != HSAKMT_STATUS_SUCCESS) {
 		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
 		return result;
diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c
index ea0b43fbd7..57344aafa3 100644
--- a/projects/rocr-runtime/libhsakmt/src/fmm.c
+++ b/projects/rocr-runtime/libhsakmt/src/fmm.c
@@ -254,6 +254,28 @@ struct hsa_kfd_fmm_context
 	unsigned int gpu_mem_count;
 	gpu_mem_t *first_gpu_mem;
 
+	/* GPU node array for default mappings */
+	uint32_t all_gpu_id_array_size;
+	uint32_t *all_gpu_id_array;
+
+	void *dgpu_shared_aperture_base;
+	void *dgpu_shared_aperture_limit;
+
+	svm_t svm;
+
+	/* On APU, for memory allocated on the system memory that GPU doesn't
+	 * access via GPU driver, they are not managed by GPUVM. cpuvm_aperture
+	 * keeps track of this part of memory.
+	 * Each context has its own tracking.
+	 */
+	manageable_aperture_t cpuvm_aperture;
+
+	/* mem_handle_aperture is used to generate memory handles for allocations
+	 * that don't have a valid virtual address. its size is 47bits.
+	 * Each context has its own handle space.
+	 */
+	manageable_aperture_t mem_handle_aperture;
+
 #define DRM_FIRST_RENDER_NODE 128
 #define DRM_LAST_RENDER_NODE 255
 
@@ -281,38 +303,30 @@ struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx
 				 sizeof(struct hsa_kfd_fmm_context));
 		return NULL;
 	}
+
+	/* Initialize svm members */
+	manageable_aperture_t init_aperture = INIT_MANAGEABLE_APERTURE(0, 0);
+	manageable_aperture_t mem_handle_init = INIT_MANAGEABLE_APERTURE(START_NON_CANONICAL_ADDR, (START_NON_CANONICAL_ADDR + (1ULL << 47)));
+
+	ctx->fmm_context->svm.apertures[SVM_DEFAULT] = init_aperture;
+	ctx->fmm_context->svm.apertures[SVM_COHERENT] = init_aperture;
+	ctx->fmm_context->svm.dgpu_aperture = NULL;
+	ctx->fmm_context->svm.dgpu_alt_aperture = NULL;
+	ctx->fmm_context->svm.userptr_for_paged_mem = false;
+	ctx->fmm_context->svm.check_userptr = false;
+	ctx->fmm_context->svm.reserve_svm = false;
+	ctx->fmm_context->svm.disable_cache = false;
+	ctx->fmm_context->svm.alignment_order = 0;
+
+	/* Initialize cpuvm_aperture */
+	ctx->fmm_context->cpuvm_aperture = init_aperture;
+
+	/* Initialize mem_handle_aperture */
+	ctx->fmm_context->mem_handle_aperture = mem_handle_init;
+
 	return ctx->fmm_context;
 }
 
-static void *dgpu_shared_aperture_base;
-static void *dgpu_shared_aperture_limit;
-
-static svm_t svm = {
-	.apertures = {INIT_MANAGEABLE_APERTURE(0, 0),
-		      INIT_MANAGEABLE_APERTURE(0, 0)},
-	.dgpu_aperture = NULL,
-	.dgpu_alt_aperture = NULL,
-	.userptr_for_paged_mem = false,
-	.check_userptr = false,
-	.disable_cache = false,
-};
-
-/* On APU, for memory allocated on the system memory that GPU doesn't access
- * via GPU driver, they are not managed by GPUVM. cpuvm_aperture keeps track
- * of this part of memory.
- */
-static manageable_aperture_t cpuvm_aperture = INIT_MANAGEABLE_APERTURE(0, 0);
-
-/* mem_handle_aperture is used to generate memory handles
- * for allocations that don't have a valid virtual address
- * its size is 47bits.
-*/
-static manageable_aperture_t mem_handle_aperture = INIT_MANAGEABLE_APERTURE(START_NON_CANONICAL_ADDR, (START_NON_CANONICAL_ADDR + (1ULL << 47)));
-
-/* GPU node array for default mappings */
-static uint32_t all_gpu_id_array_size;
-static uint32_t *all_gpu_id_array;
-
 /* IPC structures and helper functions */
 typedef enum _HSA_APERTURE {
 	HSA_APERTURE_UNSUPPORTED = 0,
@@ -849,8 +863,9 @@ static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper,
 					    void *address,
 					    uint64_t size, uint64_t align)
 {
-	uint64_t alignment_size = PAGE_SIZE << svm.alignment_order;
 	uint64_t guard_size;
+	svm_t *svm = container_of(aper, svm_t, apertures); /* NOTE(review): correct only when aper is the first element of svm_t::apertures - confirm for every caller */
+	uint64_t alignment_size = PAGE_SIZE << svm->alignment_order;
 
 	if (!aper->is_cpu_accessible) {
 		pr_err("MMap Aperture must be CPU accessible\n");
@@ -984,15 +999,15 @@ static manageable_aperture_t *fmm_get_aperture(struct hsa_kfd_fmm_context *fmm_c
 {
 	switch (info.type) {
 	case HSA_APERTURE_DGPU:
-		return svm.dgpu_aperture;
+		return fmm_ctx->svm.dgpu_aperture;
 	case HSA_APERTURE_DGPU_ALT:
-		return svm.dgpu_alt_aperture;
+		return fmm_ctx->svm.dgpu_alt_aperture;
 	case HSA_APERTURE_GPUVM:
 		return &fmm_ctx->gpu_mem[info.idx].gpuvm_aperture;
 	case HSA_APERTURE_CPUVM:
-		return &cpuvm_aperture;
+		return &fmm_ctx->cpuvm_aperture;
 	case HSA_APERTURE_MEMHANDLE:
-		return &mem_handle_aperture;
+		return &fmm_ctx->mem_handle_aperture;
 	default:
 		return NULL;
 	}
@@ -1023,35 +1038,35 @@ static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_
 	HsaApertureInfo _info = { .type = HSA_APERTURE_UNSUPPORTED, .idx = 0};
 	gpu_mem_t *gpu_mem_ptr = NULL;
 
-	if ((address >= mem_handle_aperture.base) &&
-		(address <= mem_handle_aperture.limit)){
+	if ((address >= fmm_ctx->mem_handle_aperture.base) &&
+		(address <= fmm_ctx->mem_handle_aperture.limit)){
 
-		aperture = &mem_handle_aperture;
+		aperture = &fmm_ctx->mem_handle_aperture;
 		_info.type = HSA_APERTURE_MEMHANDLE;
 
 	} else if (hsakmt_is_dgpu) {
-		if (address >= svm.dgpu_aperture->base &&
-			address <= svm.dgpu_aperture->limit) {
+		if (address >= fmm_ctx->svm.dgpu_aperture->base &&
+			address <= fmm_ctx->svm.dgpu_aperture->limit) {
 
 			gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address);
 			if (gpu_mem_ptr) {
 				aperture = &gpu_mem_ptr->scratch_physical;
 			} else {
-				aperture = svm.dgpu_aperture;
+				aperture = fmm_ctx->svm.dgpu_aperture;
 				_info.type = HSA_APERTURE_DGPU;
 			}
-		} else if (address >= svm.dgpu_alt_aperture->base &&
-			address <= svm.dgpu_alt_aperture->limit) {
-			aperture = svm.dgpu_alt_aperture;
+		} else if (address >= fmm_ctx->svm.dgpu_alt_aperture->base &&
+			address <= fmm_ctx->svm.dgpu_alt_aperture->limit) {
+			aperture = fmm_ctx->svm.dgpu_alt_aperture;
 			_info.type = HSA_APERTURE_DGPU_ALT;
 		} else {
 			/* Not in SVM, it can be system memory registered by userptr */
-			aperture = svm.dgpu_aperture;
+			aperture = fmm_ctx->svm.dgpu_aperture;
 			_info.type = HSA_APERTURE_DGPU;
 		}
 	} else { /* APU */
-		if (address >= svm.dgpu_aperture->base && address <= svm.dgpu_aperture->limit) {
-			aperture = svm.dgpu_aperture;
+		if (address >= fmm_ctx->svm.dgpu_aperture->base && address <= fmm_ctx->svm.dgpu_aperture->limit) {
+			aperture = fmm_ctx->svm.dgpu_aperture;
 			_info.type = HSA_APERTURE_DGPU;
 		} else {
 			/* gpuvm_aperture */
@@ -1066,7 +1081,7 @@ static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_
 		}
 		if (!aperture) {
 			/* Not in GPUVM */
-			aperture = &cpuvm_aperture;
+			aperture = &fmm_ctx->cpuvm_aperture;
 			_info.type = HSA_APERTURE_CPUVM;
 		}
 	}
@@ -1179,6 +1194,7 @@ static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx,
 	vm_object_t *vm_obj = NULL;
 	HsaMemFlags mflags;
 	uint64_t offset = 0, total_size, size;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (!mem)
 		return NULL;
@@ -1194,7 +1210,7 @@ static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx,
 		args.va_addr = VOID_PTRS_SUB(mem, aperture->base);
 
 	/* if allocate vram-only, use an invalid VA */
-	if (aperture == &mem_handle_aperture)
+	if (aperture == &fmm_ctx->mem_handle_aperture)
 		args.va_addr = 0;
 
 	total_size = 0;
@@ -1312,12 +1328,12 @@ void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id)
 	}
 
 	pr_info("dGPU aperture:\n");
-	manageable_aperture_print(svm.dgpu_aperture);
+	manageable_aperture_print(fmm_ctx->svm.dgpu_aperture);
 	pr_info("dGPU alt aperture:\n");
-	if (svm.dgpu_aperture == svm.dgpu_alt_aperture)
+	if (fmm_ctx->svm.dgpu_aperture == fmm_ctx->svm.dgpu_alt_aperture)
 		pr_info("\t Alias of dGPU aperture\n");
 	else
-		manageable_aperture_print(svm.dgpu_alt_aperture);
+		manageable_aperture_print(fmm_ctx->svm.dgpu_alt_aperture);
 }
 #else
 void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id)
@@ -1355,24 +1371,24 @@ static vm_object_t *vm_find_object(struct hsa_kfd_fmm_context *fmm_ctx,
 		}
 
 	if (!aper) {
-		if ((addr >= mem_handle_aperture.base) &&
-			 (addr <= mem_handle_aperture.limit)){
-			 aper = &mem_handle_aperture;
+		if ((addr >= fmm_ctx->mem_handle_aperture.base) &&
+			 (addr <= fmm_ctx->mem_handle_aperture.limit)){
+			 aper = &fmm_ctx->mem_handle_aperture;
 		}
 	}
 
 	if (!aper) {
-		if (!svm.dgpu_aperture)
+		if (!fmm_ctx->svm.dgpu_aperture)
 			goto no_svm;
 
-		if ((addr >= svm.dgpu_aperture->base) &&
-		    (addr <= svm.dgpu_aperture->limit))
-			aper = svm.dgpu_aperture;
-		else if ((addr >= svm.dgpu_alt_aperture->base) &&
-			 (addr <= svm.dgpu_alt_aperture->limit))
-			aper = svm.dgpu_alt_aperture;
+		if ((addr >= fmm_ctx->svm.dgpu_aperture->base) &&
+		    (addr <= fmm_ctx->svm.dgpu_aperture->limit))
+			aper = fmm_ctx->svm.dgpu_aperture;
+		else if ((addr >= fmm_ctx->svm.dgpu_alt_aperture->base) &&
+			 (addr <= fmm_ctx->svm.dgpu_alt_aperture->limit))
+			aper = fmm_ctx->svm.dgpu_alt_aperture;
 		else {
-			aper = svm.dgpu_aperture;
+			aper = fmm_ctx->svm.dgpu_aperture;
 			userptr = true;
 		}
 	}
@@ -1413,7 +1429,7 @@ no_svm:
 		if (aper)
 			pthread_mutex_unlock(&aper->fmm_mutex);
 
-		aper = &cpuvm_aperture;
+		aper = &fmm_ctx->cpuvm_aperture;
 
 		pthread_mutex_lock(&aper->fmm_mutex);
 		if (range)
@@ -1482,11 +1498,11 @@ static void fmm_release_scratch(HsaKFDContext *ctx, uint32_t gpu_id)
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 
 		/* release address space */
-		pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
-		aperture_release_area(svm.dgpu_aperture,
+		pthread_mutex_lock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex);
+		aperture_release_area(fmm_ctx->svm.dgpu_aperture,
 				      fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base,
 				      size);
-		pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
+		pthread_mutex_unlock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex);
 	} else
 		/* release address space */
 		munmap(fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size);
@@ -1533,11 +1549,11 @@ void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx,
 
 	/* Allocate address space for scratch backing, 64KB aligned */
 	if (hsakmt_is_dgpu) {
-		pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
+		pthread_mutex_lock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex);
 		mem = aperture_allocate_area_aligned(
-			svm.dgpu_aperture, address,
+			fmm_ctx->svm.dgpu_aperture, address,
 			aligned_size, SCRATCH_ALIGN);
-		pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
+		pthread_mutex_unlock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex);
 	} else {
 		if (address)
 			return NULL;
@@ -1670,6 +1686,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx,
 	uint64_t guard_size;
 	void *mem;
 	int ret;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	dmabuf_fd = -1;
 	memfd = -1;
@@ -1694,7 +1711,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx,
 		goto error_release_memfd;
 	}
 
-	alignment_size = PAGE_SIZE << svm.alignment_order;
+	alignment_size = PAGE_SIZE << fmm_ctx->svm.alignment_order;
 	alignment = alignment ? alignment : aperture->align;
 	while (alignment < alignment_size && size >= (alignment << 1))
 		alignment <<= 1;
@@ -1714,7 +1731,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx,
 	mflags.ui32.NoSubstitute = 1;
 	/* Bind to NUMA node */
 	/* node_id is gpu id, get closed numa id */
-	numa_node_id = hsakmt_get_direct_link_cpu(node_id);
+	numa_node_id = hsakmt_get_direct_link_cpu(ctx, node_id);
 	if (bind_mem_to_numa(numa_node_id, mem, size, mflags))
 		goto error_release_aperture;
 
@@ -1801,7 +1818,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
 	ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags);
 
 	if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) {
-		aperture = svm.dgpu_aperture;
+		aperture = fmm_ctx->svm.dgpu_aperture;
 		if (mflags.ui32.AQLQueueMemory)
 			size = MemorySizeInBytes * 2;
 	} else {
@@ -1814,12 +1831,12 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
 
 	/* special case for vram allocation without addr */
 	if(mflags.ui32.NoAddress)
-		aperture = &mem_handle_aperture;
+		aperture = &fmm_ctx->mem_handle_aperture;
 
-	if (!mflags.ui32.CoarseGrain || svm.disable_cache)
+	if (!mflags.ui32.CoarseGrain || fmm_ctx->svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
 
-	if (mflags.ui32.Uncached || svm.disable_cache)
+	if (mflags.ui32.Uncached || fmm_ctx->svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
 
 	if (mflags.ui32.ExtendedCoherent)
@@ -1829,7 +1846,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;
 
 	mem = NULL;
-	if (hsakmt_udmabuf_dev_fd > 0 && aperture == svm.dgpu_aperture && !hsakmt_is_dgpu
+	if (hsakmt_udmabuf_dev_fd > 0 && aperture == fmm_ctx->svm.dgpu_aperture && !hsakmt_is_dgpu
 		 && aperture->ops == &mmap_aperture_ops) {
 		mem  = udmabuf_allocation(ctx, gpu_id, node_id, size, aperture, alignment,
                                         mflags, &vm_obj);
@@ -1871,7 +1888,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx,
 		pthread_mutex_lock(&aperture->fmm_mutex);
 		/* Store memory allocation flags, not ioc flags */
 		 vm_obj->mflags = mflags;
-		 hsakmt_gpuid_to_nodeid(gpu_id, &vm_obj->node_id);
+		 hsakmt_gpuid_to_nodeid(ctx, gpu_id, &vm_obj->node_id);
 		 pthread_mutex_unlock(&aperture->fmm_mutex);
 
 	}
@@ -1896,7 +1913,7 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx,
 		return NULL;
 
 	/* Use fine-grained aperture */
-	aperture = svm.dgpu_alt_aperture;
+	aperture = fmm_ctx->svm.dgpu_alt_aperture;
 	ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
 		    KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
 		    KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
@@ -1914,7 +1931,7 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx,
 
 		pthread_mutex_lock(&aperture->fmm_mutex);
 		vm_obj->mflags = mflags;
-		hsakmt_gpuid_to_nodeid(gpu_id, &vm_obj->node_id);
+		hsakmt_gpuid_to_nodeid(ctx, gpu_id, &vm_obj->node_id);
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 	}
 
@@ -1932,12 +1949,13 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx,
 	return mem;
 }
 
-static void *fmm_allocate_host_cpu(void *address, uint64_t MemorySizeInBytes,
+static void *fmm_allocate_host_cpu(HsaKFDContext *ctx, void *address, uint64_t MemorySizeInBytes,
 				HsaMemFlags mflags)
 {
 	void *mem = NULL;
 	vm_object_t *vm_obj;
 	int mmap_prot = PROT_READ;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	if (address)
 		return NULL;
@@ -1957,12 +1975,12 @@ static void *fmm_allocate_host_cpu(void *address, uint64_t MemorySizeInBytes,
 	if (mem == MAP_FAILED)
 		return NULL;
 
-	pthread_mutex_lock(&cpuvm_aperture.fmm_mutex);
-	vm_obj = aperture_allocate_object(&cpuvm_aperture, mem, 0,
+	pthread_mutex_lock(&fmm_ctx->cpuvm_aperture.fmm_mutex);
+	vm_obj = aperture_allocate_object(&fmm_ctx->cpuvm_aperture, mem, 0,
 				      MemorySizeInBytes, mflags);
 	if (vm_obj)
 		vm_obj->node_id = 0; /* APU systems only have one CPU node */
-	pthread_mutex_unlock(&cpuvm_aperture.fmm_mutex);
+	pthread_mutex_unlock(&fmm_ctx->cpuvm_aperture.fmm_mutex);
 
 	return mem;
 }
@@ -2066,14 +2084,14 @@ static void *fmm_allocate_host_gpu(HsaKFDContext *ctx,
 	size = MemorySizeInBytes;
 	ioc_flags = 0;
 	if (mflags.ui32.CoarseGrain)
-		aperture = svm.dgpu_aperture;
+		aperture = fmm_ctx->svm.dgpu_aperture;
 	else
-		aperture = svm.dgpu_alt_aperture; /* always coherent */
+		aperture = fmm_ctx->svm.dgpu_alt_aperture; /* always coherent */
 
-	if (!mflags.ui32.CoarseGrain || svm.disable_cache)
+	if (!mflags.ui32.CoarseGrain || fmm_ctx->svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
 
-	if (mflags.ui32.Uncached || svm.disable_cache)
+	if (mflags.ui32.Uncached || fmm_ctx->svm.disable_cache)
 		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
 
 	if (mflags.ui32.ExtendedCoherent)
@@ -2091,7 +2109,7 @@ static void *fmm_allocate_host_gpu(HsaKFDContext *ctx,
 	/* Paged memory is allocated as a userptr mapping, non-paged
 	 * memory is allocated from KFD
 	 */
-	if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) {
+	if (!mflags.ui32.NonPaged && fmm_ctx->svm.userptr_for_paged_mem) {
 		int advice = MADV_NORMAL;
 
 		/* set madvise flags to HUGEPAGE always for 2MB pages */
@@ -2183,7 +2201,7 @@ void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx,
 		return NULL;
 	}
 
-	return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
+	return fmm_allocate_host_cpu(ctx, address, MemorySizeInBytes, mflags);
 }
 
 static int __fmm_release(HsaKFDContext *ctx,
@@ -2251,12 +2269,12 @@ HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address)
 			HSAKMT_STATUS_SUCCESS :
 			HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
 
-	if (aperture == &cpuvm_aperture) {
+	if (aperture == &fmm_ctx->cpuvm_aperture) {
 		/* APU system memory */
 		uint64_t size = 0;
 
 		size = object->size;
-		vm_remove_object(&cpuvm_aperture, object);
+		vm_remove_object(&fmm_ctx->cpuvm_aperture, object);
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		munmap(address, size);
 	} else {
@@ -2402,7 +2420,8 @@ static HSAKMT_STATUS acquire_vm(HsaKFDContext *ctx, uint32_t gpu_id, int fd)
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-static HSAKMT_STATUS init_mmap_apertures(HSAuint64 base, HSAuint64 limit,
+static HSAKMT_STATUS init_mmap_apertures(svm_t *svm,
+					 HSAuint64 base, HSAuint64 limit,
 					 HSAuint32 align, HSAuint32 guard_pages)
 {
 	void *addr;
@@ -2417,29 +2436,29 @@ static HSAKMT_STATUS init_mmap_apertures(HSAuint64 base, HSAuint64 limit,
 	}
 
 	/* Set up one SVM aperture */
-	svm.apertures[SVM_DEFAULT].base  = (void *)base;
-	svm.apertures[SVM_DEFAULT].limit = (void *)limit;
-	svm.apertures[SVM_DEFAULT].align = align;
-	svm.apertures[SVM_DEFAULT].guard_pages = guard_pages;
-	svm.apertures[SVM_DEFAULT].is_cpu_accessible = true;
-	svm.apertures[SVM_DEFAULT].ops = &mmap_aperture_ops;
+	svm->apertures[SVM_DEFAULT].base  = (void *)base;
+	svm->apertures[SVM_DEFAULT].limit = (void *)limit;
+	svm->apertures[SVM_DEFAULT].align = align;
+	svm->apertures[SVM_DEFAULT].guard_pages = guard_pages;
+	svm->apertures[SVM_DEFAULT].is_cpu_accessible = true;
+	svm->apertures[SVM_DEFAULT].ops = &mmap_aperture_ops;
 
-	svm.apertures[SVM_COHERENT].base = svm.apertures[SVM_COHERENT].limit =
+	svm->apertures[SVM_COHERENT].base = svm->apertures[SVM_COHERENT].limit =
 		NULL;
 
 	/* Try to allocate one page. If it fails, we'll fall back to
 	 * managing our own reserved address range.
 	 */
-	addr = aperture_allocate_area(&svm.apertures[SVM_DEFAULT], NULL, PAGE_SIZE);
+	addr = aperture_allocate_area(&svm->apertures[SVM_DEFAULT], NULL, PAGE_SIZE);
 	if (addr) {
-		aperture_release_area(&svm.apertures[SVM_DEFAULT], addr,
+		aperture_release_area(&svm->apertures[SVM_DEFAULT], addr,
 				      PAGE_SIZE);
 
-		svm.dgpu_aperture = svm.dgpu_alt_aperture =
-			&svm.apertures[SVM_DEFAULT];
+		svm->dgpu_aperture = svm->dgpu_alt_aperture =
+			&svm->apertures[SVM_DEFAULT];
 		pr_info("Initialized unreserved SVM apertures: %p - %p\n",
-			svm.apertures[SVM_DEFAULT].base,
-			svm.apertures[SVM_DEFAULT].limit);
+			svm->apertures[SVM_DEFAULT].base,
+			svm->apertures[SVM_DEFAULT].limit);
 	} else {
 		pr_info("Failed to allocate unreserved SVM address space.\n");
 		pr_info("Falling back to reserved SVM apertures.\n");
@@ -2470,18 +2489,20 @@ static void *reserve_address(void *addr, unsigned long long int len)
 #define SVM_MIN_VM_SIZE (4ULL << 30)
 #define IS_CANONICAL_ADDR(a) ((a) < (1ULL << 47))
 
-static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit,
+static HSAKMT_STATUS init_svm_apertures(struct hsa_kfd_fmm_context *fmm_ctx,
+					HSAuint64 base, HSAuint64 limit,
 					HSAuint32 align, HSAuint32 guard_pages)
 {
 	const HSAuint64 ADDR_INC = GPU_HUGE_PAGE_SIZE;
 	HSAuint64 len, map_size, alt_base, alt_size;
 	bool found = false;
 	void *addr, *ret_addr = NULL;
+	svm_t *svm = &fmm_ctx->svm;
 
 	/* If we already have an SVM aperture initialized (from a
 	 * parent process), keep using it
 	 */
-	if (dgpu_shared_aperture_limit)
+	if (fmm_ctx->dgpu_shared_aperture_limit)
 		return HSAKMT_STATUS_SUCCESS;
 
 	/* Align base and limit to huge page size */
@@ -2495,8 +2516,8 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit,
 	 * x86_64) or at least mmap is unlikely to run out of
 	 * addresses the GPUs can handle.
 	 */
-	if (limit >= (1ULL << 47) - 1 && !svm.reserve_svm) {
-		HSAKMT_STATUS status = init_mmap_apertures(base, limit, align,
+	if (limit >= (1ULL << 47) - 1 && !svm->reserve_svm) {
+		HSAKMT_STATUS status = init_mmap_apertures(svm, base, limit, align,
 							   guard_pages);
 
 		if (status == HSAKMT_STATUS_SUCCESS)
@@ -2574,57 +2595,54 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit,
 		limit = base + map_size - 1;
 
 	/* init two apertures for non-coherent and coherent memory */
-	svm.apertures[SVM_DEFAULT].base  = dgpu_shared_aperture_base  = ret_addr;
-	svm.apertures[SVM_DEFAULT].limit = dgpu_shared_aperture_limit = (void *)limit;
-	svm.apertures[SVM_DEFAULT].align = align;
-	svm.apertures[SVM_DEFAULT].guard_pages = guard_pages;
-	svm.apertures[SVM_DEFAULT].is_cpu_accessible = true;
-	svm.apertures[SVM_DEFAULT].ops = &reserved_aperture_ops;
+	svm->apertures[SVM_DEFAULT].base = fmm_ctx->dgpu_shared_aperture_base = ret_addr;
+	svm->apertures[SVM_DEFAULT].limit = fmm_ctx->dgpu_shared_aperture_limit = (void *)limit;
+	svm->apertures[SVM_DEFAULT].align = align;
+	svm->apertures[SVM_DEFAULT].guard_pages = guard_pages;
+	svm->apertures[SVM_DEFAULT].is_cpu_accessible = true;
+	svm->apertures[SVM_DEFAULT].ops = &reserved_aperture_ops;
 
 	/* Use the first 1/4 of the dGPU aperture as
 	 * alternate aperture for coherent access.
 	 * Base and size must be 64KB aligned.
 	 */
-	alt_base = (HSAuint64)svm.apertures[SVM_DEFAULT].base;
-	alt_size = (VOID_PTRS_SUB(svm.apertures[SVM_DEFAULT].limit,
-				  svm.apertures[SVM_DEFAULT].base) + 1) >> 2;
+	alt_base = (HSAuint64)svm->apertures[SVM_DEFAULT].base;
+	alt_size = (VOID_PTRS_SUB(svm->apertures[SVM_DEFAULT].limit,
+				  svm->apertures[SVM_DEFAULT].base) + 1) >> 2;
 	alt_base = (alt_base + 0xffff) & ~0xffffULL;
 	alt_size = (alt_size + 0xffff) & ~0xffffULL;
-	svm.apertures[SVM_COHERENT].base = (void *)alt_base;
-	svm.apertures[SVM_COHERENT].limit = (void *)(alt_base + alt_size - 1);
-	svm.apertures[SVM_COHERENT].align = align;
-	svm.apertures[SVM_COHERENT].guard_pages = guard_pages;
-	svm.apertures[SVM_COHERENT].is_cpu_accessible = true;
-	svm.apertures[SVM_COHERENT].ops = &reserved_aperture_ops;
+	svm->apertures[SVM_COHERENT].base = (void *)alt_base;
+	svm->apertures[SVM_COHERENT].limit = (void *)(alt_base + alt_size - 1);
+	svm->apertures[SVM_COHERENT].align = align;
+	svm->apertures[SVM_COHERENT].guard_pages = guard_pages;
+	svm->apertures[SVM_COHERENT].is_cpu_accessible = true;
+	svm->apertures[SVM_COHERENT].ops = &reserved_aperture_ops;
 
-	svm.apertures[SVM_DEFAULT].base = VOID_PTR_ADD(svm.apertures[SVM_COHERENT].limit, 1);
+	svm->apertures[SVM_DEFAULT].base = VOID_PTR_ADD(svm->apertures[SVM_COHERENT].limit, 1);
 
 	pr_info("SVM alt (coherent): %12p - %12p\n",
-		svm.apertures[SVM_COHERENT].base, svm.apertures[SVM_COHERENT].limit);
+		svm->apertures[SVM_COHERENT].base, svm->apertures[SVM_COHERENT].limit);
 	pr_info("SVM (non-coherent): %12p - %12p\n",
-		svm.apertures[SVM_DEFAULT].base, svm.apertures[SVM_DEFAULT].limit);
+		svm->apertures[SVM_DEFAULT].base, svm->apertures[SVM_DEFAULT].limit);
 
-	svm.dgpu_aperture = &svm.apertures[SVM_DEFAULT];
-	svm.dgpu_alt_aperture = &svm.apertures[SVM_COHERENT];
+	svm->dgpu_aperture = &svm->apertures[SVM_DEFAULT];
+	svm->dgpu_alt_aperture = &svm->apertures[SVM_COHERENT];
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
 static void fmm_init_rbtree(struct hsa_kfd_fmm_context *fmm_ctx)
 {
-	static int once;
 	int i = fmm_ctx->gpu_mem_count;
 
-	if (once++ == 0) {
-		rbtree_init(&svm.apertures[SVM_DEFAULT].tree);
-		rbtree_init(&svm.apertures[SVM_DEFAULT].user_tree);
-		rbtree_init(&svm.apertures[SVM_COHERENT].tree);
-		rbtree_init(&svm.apertures[SVM_COHERENT].user_tree);
-		rbtree_init(&cpuvm_aperture.tree);
-		rbtree_init(&cpuvm_aperture.user_tree);
-		rbtree_init(&mem_handle_aperture.tree);
-		rbtree_init(&mem_handle_aperture.user_tree);
-	}
+	rbtree_init(&fmm_ctx->mem_handle_aperture.tree);
+	rbtree_init(&fmm_ctx->mem_handle_aperture.user_tree);
+	rbtree_init(&fmm_ctx->cpuvm_aperture.tree);
+	rbtree_init(&fmm_ctx->cpuvm_aperture.user_tree);
+	rbtree_init(&fmm_ctx->svm.apertures[SVM_DEFAULT].tree);
+	rbtree_init(&fmm_ctx->svm.apertures[SVM_DEFAULT].user_tree);
+	rbtree_init(&fmm_ctx->svm.apertures[SVM_COHERENT].tree);
+	rbtree_init(&fmm_ctx->svm.apertures[SVM_COHERENT].user_tree);
 
 	while (i--) {
 		rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.tree);
@@ -2638,7 +2656,8 @@ static void *map_mmio(HsaKFDContext *ctx,
 				uint32_t node_id, uint32_t gpu_id, int mmap_fd)
 {
 	void *mem;
-	manageable_aperture_t *aperture = svm.dgpu_alt_aperture;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	manageable_aperture_t *aperture = fmm_ctx->svm.dgpu_alt_aperture;
 	uint32_t ioc_flags;
 	vm_object_t *vm_obj = NULL;
 	HsaMemFlags mflags;
@@ -2735,35 +2754,35 @@ static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuin
 {
 	bool found;
 	uint32_t i;
-
+	manageable_aperture_t *mem_handle_aper = &fmm_ctx->mem_handle_aperture;
 	/* init mem_handle_aperture for buffer handler management */
-	mem_handle_aperture.align = align;
-	mem_handle_aperture.guard_pages = guard_pages;
-	mem_handle_aperture.is_cpu_accessible = false;
-	mem_handle_aperture.ops = &reserved_aperture_ops;
+	mem_handle_aper->align = align;
+	mem_handle_aper->guard_pages = guard_pages;
+	mem_handle_aper->is_cpu_accessible = false;
+	mem_handle_aper->ops = &reserved_aperture_ops;
 
-	while (PORT_VPTR_TO_UINT64(mem_handle_aperture.base) < END_NON_CANONICAL_ADDR - 1) {
+	while (PORT_VPTR_TO_UINT64(mem_handle_aper->base) < END_NON_CANONICAL_ADDR - 1) {
 
 		found = true;
 		for (i = 0; i < fmm_ctx->gpu_mem_count; i++) {
 
 			if (fmm_ctx->gpu_mem[i].lds_aperture.base &&
 				two_apertures_overlap(fmm_ctx->gpu_mem[i].lds_aperture.base, fmm_ctx->gpu_mem[i].lds_aperture.limit,
-									mem_handle_aperture.base, mem_handle_aperture.limit)) {
+									mem_handle_aper->base, mem_handle_aper->limit)) {
 					found = false;
 					break;
 			}
 
 			if (fmm_ctx->gpu_mem[i].scratch_aperture.base &&
 				two_apertures_overlap(fmm_ctx->gpu_mem[i].scratch_aperture.base, fmm_ctx->gpu_mem[i].scratch_aperture.limit,
-									mem_handle_aperture.base, mem_handle_aperture.limit)){
+									mem_handle_aper->base, mem_handle_aper->limit)){
 					found = false;
 					break;
 			}
 
 			if (fmm_ctx->gpu_mem[i].gpuvm_aperture.base &&
 			   two_apertures_overlap(fmm_ctx->gpu_mem[i].gpuvm_aperture.base, fmm_ctx->gpu_mem[i].gpuvm_aperture.limit,
-									mem_handle_aperture.base, mem_handle_aperture.limit)){
+									mem_handle_aper->base, mem_handle_aper->limit)){
 					found = false;
 					break;
 			}
@@ -2771,18 +2790,18 @@ static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuin
 
 		if (found) {
 			pr_info("mem_handle_aperture start %p, mem_handle_aperture limit %p\n",
-					mem_handle_aperture.base, mem_handle_aperture.limit);
+					mem_handle_aper->base, mem_handle_aper->limit);
 			return true;
 		} else {
 			/* increase base by 1UL<<47 to check next hole */
-			mem_handle_aperture.base =  VOID_PTR_ADD(mem_handle_aperture.base, (1UL << 47));
-			mem_handle_aperture.limit = VOID_PTR_ADD(mem_handle_aperture.base, (1ULL << 47));
+			mem_handle_aper->base =  VOID_PTR_ADD(mem_handle_aper->base, (1UL << 47));
+			mem_handle_aper->limit = VOID_PTR_ADD(mem_handle_aper->base, (1ULL << 47));
 		}
 	}
 
 	/* set invalid aperture if fail locating a hole for it */
-	mem_handle_aperture.base =  0;
-	mem_handle_aperture.limit = 0;
+	mem_handle_aper->base =  0;
+	mem_handle_aper->limit = 0;
 
 	return false;
 }
@@ -2802,30 +2821,31 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 	unsigned int guardPages = 1;
 	uint64_t svm_base = 0, svm_limit = 0;
 	uint32_t svm_alignment = 0, mfma_high_precision_mode = 0;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
 	/* If HSA_DISABLE_CACHE is set to a non-0 value, disable caching */
 	disableCache = getenv("HSA_DISABLE_CACHE");
-	svm.disable_cache = (disableCache && strcmp(disableCache, "0"));
+	fmm_ctx->svm.disable_cache = (disableCache && strcmp(disableCache, "0"));
 
 	/* If HSA_USERPTR_FOR_PAGED_MEM is not set or set to a non-0
 	 * value, enable userptr for all paged memory allocations
 	 */
 	pagedUserptr = getenv("HSA_USERPTR_FOR_PAGED_MEM");
-	svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0"));
+	fmm_ctx->svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0"));
 
 	if (hsakmt_use_model)
-		svm.userptr_for_paged_mem = false;
+		fmm_ctx->svm.userptr_for_paged_mem = false;
 	/* If HSA_CHECK_USERPTR is set to a non-0 value, check all userptrs
 	 * when they are registered
 	 */
 	checkUserptr = getenv("HSA_CHECK_USERPTR");
-	svm.check_userptr = (checkUserptr && strcmp(checkUserptr, "0"));
+	fmm_ctx->svm.check_userptr = (checkUserptr && strcmp(checkUserptr, "0"));
 
 	/* If HSA_RESERVE_SVM is set to a non-0 value,
 	 * enable packet capture and replay mode.
 	 */
 	reserveSvm = getenv("HSA_RESERVE_SVM");
-	svm.reserve_svm = (reserveSvm && strcmp(reserveSvm, "0"));
+	fmm_ctx->svm.reserve_svm = (reserveSvm && strcmp(reserveSvm, "0"));
 
 	/* Specify number of guard pages for SVM apertures, default is 1 */
 	guardPagesStr = getenv("HSA_SVM_GUARD_PAGES");
@@ -2842,19 +2862,17 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 	 * ASIC is found in the system, set back to 9(2MB).
 	 */
 	maxVaAlignStr = getenv("HSA_MAX_VA_ALIGN");
-	if (!maxVaAlignStr || sscanf(maxVaAlignStr, "%u", &svm.alignment_order) != 1) {
-		svm.alignment_order = 18;
+	if (!maxVaAlignStr || sscanf(maxVaAlignStr, "%u", &fmm_ctx->svm.alignment_order) != 1) {
+		fmm_ctx->svm.alignment_order = 18;
 
 		for (i = 0; i < NumNodes; i++) {
-			if (hsakmt_get_gfxv_by_node_id(i) != GFX_VERSION_GFX950) {
-				svm.alignment_order = 9;
+			if (hsakmt_get_gfxv_by_node_id(ctx, i) != GFX_VERSION_GFX950) {
+				fmm_ctx->svm.alignment_order = 9;
 				break;
 			}
 		}
 	}
-	pr_info("SVM alignment default order is %d.", svm.alignment_order);
-
-	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
+	pr_info("SVM alignment default order is %d.", fmm_ctx->svm.alignment_order);
 
 	/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
 	 * systems with CPU node, slightly more memory is allocated than
@@ -2874,7 +2892,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 	for (i = 0; i < NumNodes; i++) {
 		HsaNodeProperties props;
 
-		ret = hsakmt_topology_get_node_props(i, &props);
+		ret = hsakmt_topology_get_node_props(ctx, i, &props);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto gpu_mem_init_failed;
 
@@ -2932,7 +2950,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 	 * required since Number of nodes is already known. Kernel will fill in
 	 * the apertures in kfd_process_device_apertures_ptr
 	 */
-	num_of_sysfs_nodes = hsakmt_get_num_sysfs_nodes();
+	num_of_sysfs_nodes = hsakmt_get_num_sysfs_nodes(ctx);
 	if (num_of_sysfs_nodes < gpu_mem_count) {
 		ret = HSAKMT_STATUS_ERROR;
 		goto sysfs_parse_failed;
@@ -2952,11 +2970,11 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		goto get_aperture_ioctl_failed;
 
-	all_gpu_id_array_size = 0;
-	all_gpu_id_array = NULL;
+	assert(fmm_ctx->all_gpu_id_array_size == 0);
+	assert(fmm_ctx->all_gpu_id_array == NULL);
 	if (num_of_sysfs_nodes > 0) {
-		all_gpu_id_array = malloc(sizeof(uint32_t) * gpu_mem_count);
-		if (!all_gpu_id_array) {
+		fmm_ctx->all_gpu_id_array = malloc(sizeof(uint32_t) * gpu_mem_count);
+		if (!fmm_ctx->all_gpu_id_array) {
 			ret = HSAKMT_STATUS_NO_MEMORY;
 			goto get_aperture_ioctl_failed;
 		}
@@ -2975,22 +2993,22 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 		if (gpu_mem_id < 0)
 			continue;
 
-		if (all_gpu_id_array_size == gpu_mem_count) {
+		if (fmm_ctx->all_gpu_id_array_size == gpu_mem_count) {
 			ret = HSAKMT_STATUS_ERROR;
 			goto aperture_init_failed;
 		}
-		all_gpu_id_array[all_gpu_id_array_size++] = process_apertures[i].gpu_id;
+		fmm_ctx->all_gpu_id_array[fmm_ctx->all_gpu_id_array_size++] = process_apertures[i].gpu_id;
 
 		/* Add this GPU to the usable_peer_id_arrays of all GPUs that
 		 * this GPU has an IO link to. This GPU can map memory
 		 * allocated on those GPUs.
 		 */
 		nodeId = gpu_mem[gpu_mem_id].node_id;
-		ret = hsakmt_topology_get_node_props(nodeId, &nodeProps);
+		ret = hsakmt_topology_get_node_props(ctx, nodeId, &nodeProps);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto aperture_init_failed;
 		assert(nodeProps.NumIOLinks <= NumNodes);
-		ret = hsakmt_topology_get_iolink_props(nodeId, nodeProps.NumIOLinks,
+		ret = hsakmt_topology_get_iolink_props(ctx, nodeId, nodeProps.NumIOLinks,
 						linkProps);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto aperture_init_failed;
@@ -3061,13 +3079,13 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto aperture_init_failed;
 	}
-	all_gpu_id_array_size *= sizeof(uint32_t);
+	fmm_ctx->all_gpu_id_array_size *= sizeof(uint32_t);
 
 	if (svm_limit) {
 		/* At least one GPU uses GPUVM in canonical address
 		 * space. Set up SVM apertures shared by all such GPUs
 		 */
-		ret = init_svm_apertures(svm_base, svm_limit, svm_alignment,
+		ret = init_svm_apertures(fmm_ctx, svm_base, svm_limit, svm_alignment,
 					 guardPages);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto init_svm_failed;
@@ -3081,17 +3099,17 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 				continue;
 
 			/* Set memory policy to match the SVM apertures */
-			alt_base = (uintptr_t)svm.dgpu_alt_aperture->base;
-			alt_size = VOID_PTRS_SUB(svm.dgpu_alt_aperture->limit,
-				svm.dgpu_alt_aperture->base) + 1;
+			alt_base = (uintptr_t)fmm_ctx->svm.dgpu_alt_aperture->base;
+			alt_size = VOID_PTRS_SUB(fmm_ctx->svm.dgpu_alt_aperture->limit,
+				fmm_ctx->svm.dgpu_alt_aperture->base) + 1;
 			err = fmm_set_memory_policy(ctx,
 						    process_apertures[i].gpu_id,
-						    svm.disable_cache ?
+						    fmm_ctx->svm.disable_cache ?
 						    KFD_IOC_CACHE_POLICY_COHERENT :
 						    KFD_IOC_CACHE_POLICY_NONCOHERENT,
 						    KFD_IOC_CACHE_POLICY_COHERENT,
 						    alt_base, alt_size,
-						    hsakmt_get_gfxv_by_node_id(i) == GFX_VERSION_GFX950 ?
+						    hsakmt_get_gfxv_by_node_id(ctx, i) == GFX_VERSION_GFX950 ?
 						    mfma_high_precision_mode : 0);
 			if (err) {
 				pr_err("Failed to set mem policy for GPU [0x%x]\n",
@@ -3102,8 +3120,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 		}
 	}
 
-	cpuvm_aperture.align = PAGE_SIZE;
-	cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */
+	fmm_ctx->cpuvm_aperture.align = PAGE_SIZE;
+	fmm_ctx->cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */
 
 	fmm_init_rbtree(fmm_ctx);
 
@@ -3132,8 +3150,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx,
 aperture_init_failed:
 init_svm_failed:
 set_memory_policy_failed:
-	free(all_gpu_id_array);
-	all_gpu_id_array = NULL;
+	free(fmm_ctx->all_gpu_id_array);
+	fmm_ctx->all_gpu_id_array = NULL;
 get_aperture_ioctl_failed:
 	free(process_apertures);
 sysfs_parse_failed:
@@ -3148,11 +3166,11 @@ void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx)
 
 	release_mmio(ctx);
 
-	if (all_gpu_id_array) {
-		free(all_gpu_id_array);
-		all_gpu_id_array = NULL;
+	if (fmm_ctx->all_gpu_id_array) {
+		free(fmm_ctx->all_gpu_id_array);
+		fmm_ctx->all_gpu_id_array = NULL;
 	}
-	all_gpu_id_array_size = 0;
+	fmm_ctx->all_gpu_id_array_size = 0;
 
 	if (fmm_ctx->gpu_mem) {
 		while (fmm_ctx->gpu_mem_count-- > 0)
@@ -3207,10 +3225,10 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx,
 		/* Report single SVM aperture, starting at base of
 		 * fine-grained, ending at limit of coarse-grained
 		 */
-		if (aperture_is_valid(svm.dgpu_alt_aperture->base,
-				      svm.dgpu_aperture->limit)) {
-			*aperture_base = PORT_VPTR_TO_UINT64(svm.dgpu_alt_aperture->base);
-			*aperture_limit = PORT_VPTR_TO_UINT64(svm.dgpu_aperture->limit);
+		if (aperture_is_valid(fmm_ctx->svm.dgpu_alt_aperture->base,
+				      fmm_ctx->svm.dgpu_aperture->limit)) {
+			*aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->svm.dgpu_alt_aperture->base);
+			*aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->svm.dgpu_aperture->limit);
 			err = HSAKMT_STATUS_SUCCESS;
 		}
 		break;
@@ -3349,15 +3367,15 @@ static HSAKMT_STATUS _fmm_map_to_gpu(HsaKFDContext *ctx,
 	/* not specified, not registered: map all GPUs */
 		int32_t gpu_mem_id = gpu_mem_find_by_node_id(fmm_ctx, obj->node_id);
 
-		if (!obj->userptr && hsakmt_get_device_id_by_node_id(obj->node_id) &&
+		if (!obj->userptr && hsakmt_get_device_id_by_node_id(ctx, obj->node_id) &&
 		    gpu_mem_id >= 0) {
 			args.device_ids_array_ptr = (uint64_t)
 				fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_array;
 			args.n_devices =
 				fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_num;
 		} else {
-			args.device_ids_array_ptr = (uint64_t)all_gpu_id_array;
-			args.n_devices = all_gpu_id_array_size / sizeof(uint32_t);
+			args.device_ids_array_ptr = (uint64_t)fmm_ctx->all_gpu_id_array;
+			args.n_devices = fmm_ctx->all_gpu_id_array_size / sizeof(uint32_t);
 		}
 	}
 
@@ -3427,7 +3445,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(HsaKFDContext *ctx,
 	    VOID_PTR_ADD(address, size - 1) > aperture->limit)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	is_debugger = hsakmt_debug_get_reg_status(fmm_ctx->gpu_mem[gpu_mem_id].node_id);
+	is_debugger = hsakmt_debug_get_reg_status(ctx, fmm_ctx->gpu_mem[gpu_mem_id].node_id);
 	flags = is_debugger ? KFD_IOC_ALLOC_MEM_FLAGS_GTT :
 			      KFD_IOC_ALLOC_MEM_FLAGS_VRAM;
 	flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
@@ -3463,8 +3481,9 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx,
 	void *svm_addr;
 	HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
+	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	aperture = svm.dgpu_aperture;
+	aperture = fmm_ctx->svm.dgpu_aperture;
 
 	/* Map and return the GPUVM address adjusted by the offset
 	 * from the start of the page
@@ -3472,8 +3491,8 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx,
 	if (!object && hsakmt_is_svm_api_supported) {
 		svm_addr = (void*)((HSAuint64)addr - page_offset);
 		if (!nodes_to_map) {
-			nodes_to_map = all_gpu_id_array;
-			nodes_array_size = all_gpu_id_array_size;
+			nodes_to_map = fmm_ctx->all_gpu_id_array;
+			nodes_array_size = fmm_ctx->all_gpu_id_array_size;
 		}
 		pr_debug("%s Mapping Address %p size aligned: %ld offset: %x\n",
 			__func__, svm_addr, PAGE_ALIGN_UP(page_offset + size), page_offset);
@@ -3531,12 +3550,12 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx,
 	}
 
 	/* allocate buffer only, should be mapped by GEM API */
-        if (aperture && (aperture == &mem_handle_aperture)) {
+        if (aperture && (aperture == &fmm_ctx->mem_handle_aperture)) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
 
-	if (aperture && (aperture == &cpuvm_aperture)) {
+	if (aperture && (aperture == &fmm_ctx->cpuvm_aperture)) {
 		/* Prefetch memory on APUs with dummy-reads */
 		fmm_check_user_memory(address, size);
 		ret = HSAKMT_STATUS_SUCCESS;
@@ -3733,7 +3752,7 @@ int hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address)
 		return (!hsakmt_is_dgpu || hsakmt_is_svm_api_supported) ? 0 : -EINVAL;
 	/* Successful vm_find_object returns with the aperture locked */
 
-	if (aperture == &cpuvm_aperture)
+	if (aperture == &fmm_ctx->cpuvm_aperture)
 		/* On APUs GPU unmapping of system memory is a no-op */
 		ret = 0;
 	else
@@ -3776,12 +3795,12 @@ bool hsakmt_fmm_get_handle(HsaKFDContext *ctx,
 	}
 
 	if (!aperture) {
-		if ((address >= svm.dgpu_aperture->base) &&
-			(address <= svm.dgpu_aperture->limit)) {
-			aperture = svm.dgpu_aperture;
-		} else if ((address >= svm.dgpu_alt_aperture->base) &&
-			(address <= svm.dgpu_alt_aperture->limit)) {
-			aperture = svm.dgpu_alt_aperture;
+		if ((address >= fmm_ctx->svm.dgpu_aperture->base) &&
+			(address <= fmm_ctx->svm.dgpu_aperture->limit)) {
+			aperture = fmm_ctx->svm.dgpu_aperture;
+		} else if ((address >= fmm_ctx->svm.dgpu_alt_aperture->base) &&
+			(address <= fmm_ctx->svm.dgpu_alt_aperture->limit)) {
+			aperture = fmm_ctx->svm.dgpu_alt_aperture;
 		}
 	}
 
@@ -3822,7 +3841,6 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx,
 						vm_object_t **obj_ret,
 						HsaMemFlags flags)
 {
-	manageable_aperture_t *aperture = svm.dgpu_aperture;
 	HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
 	HSAuint64 aligned_addr = (HSAuint64)addr - page_offset;
 	HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
@@ -3830,7 +3848,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx,
 	HSAuint32 gpu_id;
 	vm_object_t *obj, *exist_obj;
 	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
-
+	manageable_aperture_t *aperture = fmm_ctx->svm.dgpu_aperture;
 	/* Find first GPU for creating the userptr BO */
 	if (!fmm_ctx->first_gpu_mem)
 		return HSAKMT_STATUS_ERROR;
@@ -3838,7 +3856,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx,
 	gpu_id = fmm_ctx->first_gpu_mem->gpu_id;
 
 	/* Optionally check that the CPU mapping is valid */
-	if (svm.check_userptr)
+	if (fmm_ctx->svm.check_userptr)
 		fmm_check_user_memory(addr, size);
 
 	/* Allocate BO, userptr address is passed in mmap_offset */
@@ -3867,7 +3885,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx,
 		++exist_obj->registration_count;
 	} else {
 		obj->userptr = addr;
-		hsakmt_gpuid_to_nodeid(gpu_id, &obj->node_id);
+		hsakmt_gpuid_to_nodeid(ctx, gpu_id, &obj->node_id);
 		obj->userptr_size = size;
 		obj->registration_count = 1;
 		obj->user_node.key = rbtree_key((unsigned long)addr, size);
@@ -3919,7 +3937,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx,
 			return ret;
 		if (gpu_id_array_size == 0)
 			return HSAKMT_STATUS_SUCCESS;
-		aperture = svm.dgpu_aperture;
+		aperture = fmm_ctx->svm.dgpu_aperture;
 		pthread_mutex_lock(&aperture->fmm_mutex);
 		/* fall through for registered device ID array setup */
 	} else if (object->userptr) {
@@ -4019,9 +4037,9 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
 
 	/* import DMA buffer without VA assigned */
 	if (!gpu_id_array && gpu_id_array_size == 0 && !RegisterFlags.ui32.requiresVAddr) {
-		aperture = &mem_handle_aperture;
+		aperture = &fmm_ctx->mem_handle_aperture;
 	} else if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) {
-		aperture = svm.dgpu_aperture;
+		aperture = fmm_ctx->svm.dgpu_aperture;
 	} else {
 		aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture;
 		aperture_base = aperture->base;
@@ -4037,7 +4055,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
 	}
 
 	/* Import DMA buffer */
-	if (aperture == &mem_handle_aperture)
+	if (aperture == &fmm_ctx->mem_handle_aperture)
 		importArgs.va_addr = 0;
 	else
 		importArgs.va_addr = VOID_PTRS_SUB(mem, aperture_base);
@@ -4059,7 +4077,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
 		obj->metadata = metadata;
 		obj->registered_device_id_array = gpu_id_array;
 		obj->registered_device_id_array_size = gpu_id_array_size;
-		hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &obj->node_id);
+		hsakmt_gpuid_to_nodeid(ctx, infoArgs.gpu_id, &obj->node_id);
 	}
 	pthread_mutex_unlock(&aperture->fmm_mutex);
 	if (!obj)
@@ -4069,7 +4087,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx,
 	GraphicsResourceInfo->SizeInBytes = infoArgs.size;
 	GraphicsResourceInfo->Metadata = (void *)(unsigned long)infoArgs.metadata_ptr;
 	GraphicsResourceInfo->MetadataSizeInBytes = infoArgs.metadata_size;
-	hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &GraphicsResourceInfo->NodeId);
+	hsakmt_gpuid_to_nodeid(ctx, infoArgs.gpu_id, &GraphicsResourceInfo->NodeId);
 
 	return HSAKMT_STATUS_SUCCESS;
 
@@ -4159,7 +4177,7 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx,
 	if (!obj)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	r = hsakmt_validate_nodeid(obj->node_id, &gpu_id);
+	r = hsakmt_validate_nodeid(ctx, obj->node_id, &gpu_id);
 	if (r != HSAKMT_STATUS_SUCCESS)
 		return r;
 	if (!gpu_id && hsakmt_is_dgpu) {
@@ -4310,7 +4328,7 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address)
 			HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
 	/* Successful vm_find_object returns with aperture locked */
 
-	if (aperture == &cpuvm_aperture) {
+	if (aperture == &fmm_ctx->cpuvm_aperture) {
 		/* API-allocated system memory on APUs, deregistration
 		 * is a no-op
 		 */
@@ -4382,14 +4400,14 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx,
 	}
 
 	/* allocates buffer only, should be mapped by GEM API */
-	if (aperture == &mem_handle_aperture) {
+	if (aperture == &fmm_ctx->mem_handle_aperture) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 	}
 
 	/* APU memory is not supported by this function */
 	if (aperture &&
-	   (aperture == &cpuvm_aperture || !aperture->is_cpu_accessible)) {
+	   (aperture == &fmm_ctx->cpuvm_aperture || !aperture->is_cpu_accessible)) {
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 		return HSAKMT_STATUS_ERROR;
 	}
@@ -4403,8 +4421,8 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx,
 	}
 
 	/* Verify that all nodes to map are registered already */
-	registered_node_id_array = all_gpu_id_array;
-	registered_node_id_array_size = all_gpu_id_array_size;
+	registered_node_id_array = fmm_ctx->all_gpu_id_array;
+	registered_node_id_array_size = fmm_ctx->all_gpu_id_array_size;
 	if (object->registered_device_id_array_size > 0 &&
 			object->registered_device_id_array) {
 		registered_node_id_array = object->registered_device_id_array;
@@ -4518,7 +4536,7 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx,
 		 * register to new nodes) or the memory being freed
 		 */
 		for (i = 0; i < info->NRegisteredNodes; i++)
-			hsakmt_gpuid_to_nodeid(vm_obj->registered_device_id_array[i],
+			hsakmt_gpuid_to_nodeid(ctx, vm_obj->registered_device_id_array[i],
 				&vm_obj->registered_node_id_array[i]);
 	}
 	info->RegisteredNodes = vm_obj->registered_node_id_array;
@@ -4537,7 +4555,7 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx,
 		 * to new nodes) or memory being freed
 		 */
 		for (i = 0; i < info->NMappedNodes; i++)
-			hsakmt_gpuid_to_nodeid(vm_obj->mapped_device_id_array[i],
+			hsakmt_gpuid_to_nodeid(ctx, vm_obj->mapped_device_id_array[i],
 				&vm_obj->mapped_node_id_array[i]);
 	}
 	info->MappedNodes = vm_obj->mapped_node_id_array;
@@ -4681,27 +4699,28 @@ void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx)
 	
 	struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx);
 
-	fmm_clear_aperture(&mem_handle_aperture);
-	fmm_clear_aperture(&cpuvm_aperture);
-	fmm_clear_aperture(&svm.apertures[SVM_DEFAULT]);
-	fmm_clear_aperture(&svm.apertures[SVM_COHERENT]);
+	fmm_clear_aperture(&fmm_ctx->mem_handle_aperture);
+	fmm_clear_aperture(&fmm_ctx->cpuvm_aperture);
+	fmm_clear_aperture(&fmm_ctx->svm.apertures[SVM_DEFAULT]);
+	fmm_clear_aperture(&fmm_ctx->svm.apertures[SVM_COHERENT]);
 
-	if (dgpu_shared_aperture_limit) {
+	if (fmm_ctx->dgpu_shared_aperture_limit) {
 		/* Use the same dgpu range as the parent. If failed, then set
 		 * hsakmt_is_dgpu_mem_init to false. Later on dgpu_mem_init will try
 		 * to get a new range
 		 */
-		map_addr = mmap(dgpu_shared_aperture_base, (HSAuint64)(dgpu_shared_aperture_limit)-
-			(HSAuint64)(dgpu_shared_aperture_base) + 1, PROT_NONE,
+		map_addr = mmap(fmm_ctx->dgpu_shared_aperture_base,
+			(HSAuint64)(fmm_ctx->dgpu_shared_aperture_limit)-
+			(HSAuint64)(fmm_ctx->dgpu_shared_aperture_base) + 1, PROT_NONE,
 			MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0);
 
 		if (map_addr == MAP_FAILED) {
-			munmap(dgpu_shared_aperture_base,
-				   (HSAuint64)(dgpu_shared_aperture_limit) -
-				   (HSAuint64)(dgpu_shared_aperture_base) + 1);
+			munmap(fmm_ctx->dgpu_shared_aperture_base,
+				   (HSAuint64)(fmm_ctx->dgpu_shared_aperture_limit) -
+				   (HSAuint64)(fmm_ctx->dgpu_shared_aperture_base) + 1);
 
-			dgpu_shared_aperture_base = NULL;
-			dgpu_shared_aperture_limit = NULL;
+			fmm_ctx->dgpu_shared_aperture_base = NULL;
+			fmm_ctx->dgpu_shared_aperture_limit = NULL;
 		}
 	}
 
diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
index b2f04dbcdf..687d960b84 100644
--- a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
+++ b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h
@@ -740,6 +740,156 @@ hsaKmtAllocQueueGWSCtx(
     HSAuint32          *firstGWS       //OUT
     );
 
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRuntimeEnableCtx(
+    HsaKFDContext      *ctx,           //IN
+    void*              rDebug,         //IN
+    bool               setupTtmp       //IN
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtRuntimeDisableCtx(
+    HsaKFDContext      *ctx           //IN
+    );
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetRuntimeCapabilitiesCtx(
+    HsaKFDContext      *ctx,           //IN
+    HSAuint32	         *caps_mask      //OUT
+    );
+
+/**
+  Enable debug trap.
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDbgEnableCtx(
+    HsaKFDContext      *ctx,           //IN
+    void               **runtime_info, //OUT
+    HSAuint32          *data_size      //OUT
+    );
+
+/**
+  Disable debug trap.
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDbgDisableCtx(
+    HsaKFDContext      *ctx          //IN
+    );
+
+/**
+  Get device snapshot.
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDbgGetDeviceDataCtx(
+    HsaKFDContext     *ctx,          //IN
+    void              **data,        //OUT
+    HSAuint32         *n_entries,    //OUT
+    HSAuint32         *entry_size    //OUT
+    );
+
+/**
+  Get queues snapshot.
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDbgGetQueueDataCtx(
+    HsaKFDContext     *ctx,           //IN
+    void              **data,         //OUT
+    HSAuint32         *n_entries,     //OUT
+    HSAuint32         *entry_size,    //OUT
+    bool              suspend_queues  //IN
+    );
+
+/**
+  Check whether gpu firmware and kernel support debugging
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtCheckRuntimeDebugSupportCtx(
+    HsaKFDContext     *ctx           //IN
+    );
+
+/**
+  Debug ops call primarily used for KFD testing
+ */
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtDebugTrapIoctlCtx(
+    HsaKFDContext                  *ctx,           //IN
+    struct kfd_ioctl_dbg_trap_args *args,          //IN/OUT
+    HSA_QUEUEID                    *Queues,        //IN
+    HSAuint64                      *DebugReturn    //OUT
+    );
+
+/**
+  Gets GPU and CPU clock counters for particular Node
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtGetClockCountersCtx(
+    HsaKFDContext     *ctx,           //IN
+    HSAuint32         NodeId,         //IN
+    HsaClockCounters  *Counters);     //OUT
+
+/**
+  Retrieves information on the available HSA counters
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtPmcGetCounterPropertiesCtx(
+    HsaKFDContext         *ctx,                //IN
+    HSAuint32              NodeId,             //IN
+    HsaCounterProperties** CounterProperties   //OUT
+    );
+
+/**
+  Registers a set of (HW) counters to be used for tracing/profiling
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtPmcRegisterTraceCtx(
+    HsaKFDContext      *ctx,                //IN
+    HSAuint32           NodeId,             //IN
+    HSAuint32           NumberOfCounters,   //IN
+    HsaCounter*         Counters,           //IN
+    HsaPmcTraceRoot*    TraceRoot           //OUT
+    );
+
+/**
+  Allows a user mode process to get exclusive access to the defined set of (HW) counters
+  used for tracing/profiling
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtPmcAcquireTraceAccessCtx(
+    HsaKFDContext      *ctx,                //IN
+    HSAuint32          NodeId,              //IN
+    HSATraceId         TraceId              //IN
+    );
+
+/**
+  Allows a user mode process to release exclusive access to the defined set of (HW) counters
+  used for tracing/profiling
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtPmcReleaseTraceAccessCtx(
+    HsaKFDContext      *ctx,                //IN
+    HSAuint32          NodeId,              //IN
+    HSATraceId         TraceId              //IN
+    );
+
 /* Helper functions for calling KFD SVM ioctl */
 HSAKMT_STATUS
 HSAKMTAPI
diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
index 7b1c69b76b..6cf1e99762 100644
--- a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
+++ b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c
@@ -282,7 +282,7 @@ void model_init(void)
 	for (unsigned node_id = 0; node_id < props.NumNodes; node_id++)
 	{
 		HsaNodeProperties node_props;
-		result = hsakmt_topology_get_node_props(node_id, &node_props);
+		result = hsakmt_topology_get_node_props(&hsakmt_primary_kfd_ctx, node_id, &node_props);
 		if (result != HSAKMT_STATUS_SUCCESS)
 		{
 			fprintf(stderr, "model: Failed to get node %u properties\n", node_id);
diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c
index 981c53eb4a..8db36f8747 100644
--- a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c
+++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c
@@ -37,9 +37,12 @@ void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx)
     assert(ctx);
 
     ctx->fd = fd;
+    ctx->topology_context = NULL;
     ctx->queue_context = NULL;
     ctx->fmm_context = NULL;
     ctx->event_context = NULL;
+    ctx->debug_context = NULL;
+    ctx->perf_context = NULL;
 }
 
 void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx)
@@ -47,6 +50,10 @@ void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx)
     if (!ctx)
         return;
 
+    if (ctx->topology_context) {
+        free(ctx->topology_context);
+        ctx->topology_context = NULL;
+    }
     if (ctx->queue_context) {
         free(ctx->queue_context);
         ctx->queue_context = NULL;
@@ -59,5 +66,13 @@ void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx)
         free(ctx->event_context);
         ctx->event_context = NULL;
     }
+    if (ctx->debug_context) {
+        free(ctx->debug_context);
+        ctx->debug_context = NULL;
+    }
+    if (ctx->perf_context) {
+        free(ctx->perf_context);
+        ctx->perf_context = NULL;
+    }
     ctx->fd = -1;
 }
diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h
index 8053e74f7b..1b9d6ccda3 100644
--- a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h
+++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h
@@ -28,9 +28,12 @@
 
 #include <stdint.h>
 
+struct hsa_kfd_topology_context;
 struct hsa_kfd_queue_context;
 struct hsa_kfd_fmm_context;
 struct hsa_kfd_event_context;
+struct hsa_kfd_debug_context;
+struct hsa_kfd_perf_context;
 
 /*
  * HsaKFDContext
@@ -52,6 +55,9 @@ typedef struct _HsaKFDContext
     /* File descriptor for the KFD device */
     int fd;
 
+    /* Topology context for managing system topology information */
+    struct hsa_kfd_topology_context *topology_context;
+
     /* Queue context for managing user queues */
     struct hsa_kfd_queue_context *queue_context;
 
@@ -60,6 +66,12 @@ typedef struct _HsaKFDContext
 
     /* Event context for managing events */
     struct hsa_kfd_event_context *event_context;
+
+    /* Debug context for managing debug operations */
+    struct hsa_kfd_debug_context *debug_context;
+
+    /* Perf context for managing performance-counter operations */
+    struct hsa_kfd_perf_context *perf_context;
 } HsaKFDContext;
 
 // Initialize a pre-allocated HsaKFDContext with the given file descriptor
@@ -67,8 +79,10 @@ void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx);
 // Release all resources associated with the given KFD context
 void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx);
 
+struct hsa_kfd_topology_context *hsakmt_kfdcontext_get_topology_context(HsaKFDContext *ctx);
 struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx);
 struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx);
 struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx);
-
+struct hsa_kfd_debug_context *hsakmt_kfdcontext_get_debug_context(HsaKFDContext *ctx);
+struct hsa_kfd_perf_context *hsakmt_kfdcontext_get_perf_context(HsaKFDContext *ctx);
 #endif /* _KFDCONTEXT_H_ */
diff --git a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
index 7440d55e02..f3bd3a5651 100644
--- a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
+++ b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h
@@ -188,23 +188,26 @@ HSAKMT_STATUS hsakmt_init_kfd_version(void);
 
 #define IS_SOC15(gfxv) ((gfxv) >= GFX_VERSION_VEGA10)
 
-HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
-HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
-uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id);
-bool hsakmt_prefer_ats(HSAuint32 node_id);
-uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id);
-uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id);
-uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node);
+HSAKMT_STATUS hsakmt_validate_nodeid(HsaKFDContext *ctx, uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS hsakmt_gpuid_to_nodeid(HsaKFDContext *ctx, uint32_t gpu_id, uint32_t* node_id);
+uint32_t hsakmt_get_gfxv_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id);
+bool hsakmt_prefer_ats(HsaKFDContext *ctx, HSAuint32 node_id);
+uint16_t hsakmt_get_device_id_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id);
+uint16_t hsakmt_get_device_id_by_gpu_id(HsaKFDContext *ctx, HSAuint32 gpu_id);
+uint32_t hsakmt_get_direct_link_cpu(HsaKFDContext *ctx, uint32_t gpu_node);
 int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id);
-HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
+HSAKMT_STATUS hsakmt_validate_nodeid_array(HsaKFDContext *ctx,
+		uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray);
 
 HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, HsaSystemProperties *props);
-HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
+HSAKMT_STATUS hsakmt_topology_get_node_props(HsaKFDContext *ctx,
+				      HSAuint32 NodeId,
 				      HsaNodeProperties *NodeProperties);
-HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
-					HSAuint32 NumIoLinks,
-					HsaIoLinkProperties *IoLinkProperties);
+HSAKMT_STATUS hsakmt_topology_get_iolink_props(HsaKFDContext *ctx,
+				      HSAuint32 NodeId,
+				      HSAuint32 NumIoLinks,
+				      HsaIoLinkProperties *IoLinkProperties);
 void hsakmt_topology_setup_is_dgpu_param(HsaNodeProperties *props);
 bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId);
 
@@ -212,7 +215,7 @@ HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags);
 HSAuint64 MapDrmPerm(HsaMemoryMapFlags flags);
 
 void* hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx,
-					   uint32_t size, uint32_t align,
+				       uint32_t size, uint32_t align,
 				       uint32_t gpu_id,
 				       uint32_t NodeId, bool NonPaged,
 				       bool DeviceLocal, bool Uncached);
@@ -221,11 +224,11 @@ void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx,
 HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx,
 					   unsigned int NumNodes);
 void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx);
-HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes);
-void hsakmt_destroy_device_debugging_memory(void);
-bool hsakmt_debug_get_reg_status(uint32_t node_id);
-HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes);
-void hsakmt_destroy_counter_props(void);
+HSAKMT_STATUS hsakmt_init_device_debugging_memory(HsaKFDContext *ctx, unsigned int NumNodes);
+void hsakmt_destroy_device_debugging_memory(HsaKFDContext *ctx);
+bool hsakmt_debug_get_reg_status(HsaKFDContext *ctx, uint32_t node_id);
+HSAKMT_STATUS hsakmt_init_counter_props(HsaKFDContext *ctx, unsigned int NumNodes);
+void hsakmt_destroy_counter_props(HsaKFDContext *ctx);
 uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues);
 
 extern int hsakmt_ioctl(int fd, unsigned long request, void *arg);
@@ -250,7 +253,7 @@ void hsakmt_clear_events_page(HsaKFDContext *ctx);
 void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx);
 void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx);
 void hsakmt_clear_process_doorbells(HsaKFDContext *ctx);
-uint32_t hsakmt_get_num_sysfs_nodes(void);
+uint32_t hsakmt_get_num_sysfs_nodes(HsaKFDContext *ctx);
 
 bool hsakmt_is_forked_child(void);
 
diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c
index 07cfb70fb5..31e2a7551f 100644
--- a/projects/rocr-runtime/libhsakmt/src/memory.c
+++ b/projects/rocr-runtime/libhsakmt/src/memory.c
@@ -55,11 +55,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicyCtx(HsaKFDContext *ctx,
 	pr_debug("[%s] node %d; default %d; alternate %d\n",
 		__func__, Node, DefaultPolicy, AlternatePolicy);
 
-	result = hsakmt_validate_nodeid(Node, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, Node, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
-	if (hsakmt_get_gfxv_by_node_id(Node) != GFX_VERSION_KAVERI)
+	if (hsakmt_get_gfxv_by_node_id(ctx, Node) != GFX_VERSION_KAVERI)
 		/* This is a legacy API useful on Kaveri only. On dGPU
 		 * the alternate aperture is setup and used
 		 * automatically for coherent allocations. Don't let
@@ -137,7 +137,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlignCtx(HsaKFDContext *ctx,
 
 	pr_debug("[%s] node %d\n", __func__, PreferredNode);
 
-	result = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, PreferredNode, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS) {
 		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
 		return result;
@@ -254,7 +254,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemoryCtx(HsaKFDContext *ctx,
 
 	pr_debug("[%s] node %d\n", __func__, Node);
 
-	result = hsakmt_validate_nodeid(Node, &args.gpu_id);
+	result = hsakmt_validate_nodeid(ctx, Node, &args.gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS) {
 		pr_err("[%s] invalid node ID: %d\n", __func__, Node);
 		return result;
@@ -304,7 +304,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodesCtx(HsaKFDContext *ctx,
 		/* TODO: support mixed APU and dGPU configurations */
 		return HSAKMT_STATUS_NOT_SUPPORTED;
 
-	ret = hsakmt_validate_nodeid_array(&gpu_id_array,
+	ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array,
 			NumberOfNodes, NodeArray);
 
 	if (ret == HSAKMT_STATUS_SUCCESS) {
@@ -385,7 +385,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExtCtx(HsaKFDContext
 	pr_debug("[%s] number of nodes %lu\n", __func__, NumberOfNodes);
 
 	if (NodeArray != NULL || NumberOfNodes != 0) {
-		ret = hsakmt_validate_nodeid_array(&gpu_id_array,
+		ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array,
 				NumberOfNodes, NodeArray);
 	}
 
@@ -467,7 +467,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodesCtx(HsaKFDContext *ctx,
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
 	if (NodeArray) {
-		ret = hsakmt_validate_nodeid_array(&gpu_id_array, NumberOfNodes, NodeArray);
+		ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array, NumberOfNodes, NodeArray);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto error;
 	}
@@ -567,7 +567,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodesCtx(HsaKFDContext *ctx,
 		return hsaKmtMapMemoryToGPUCtx(ctx, MemoryAddress,
 					MemorySizeInBytes, AlternateVAGPU);
 
-	ret = hsakmt_validate_nodeid_array(&gpu_id_array,
+	ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array,
 				NumberOfNodes, NodeArray);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
@@ -633,7 +633,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfigCtx(HsaKFDContext *ctx,
 
 	pr_debug("[%s] node %d\n", __func__, NodeId);
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
diff --git a/projects/rocr-runtime/libhsakmt/src/openclose.c b/projects/rocr-runtime/libhsakmt/src/openclose.c
index 4d7d428891..abaa705c75 100644
--- a/projects/rocr-runtime/libhsakmt/src/openclose.c
+++ b/projects/rocr-runtime/libhsakmt/src/openclose.c
@@ -106,7 +106,7 @@ static void clear_after_fork(HsaKFDContext *ctx)
 	hsakmt_clear_process_doorbells(ctx);
 	hsakmt_clear_events_page(ctx);
 	hsakmt_fmm_clear_all_mem(ctx);
-	hsakmt_destroy_device_debugging_memory();
+	hsakmt_destroy_device_debugging_memory(ctx);
 
 	int fd = ctx->fd;
 	if (fd >= 0) {
@@ -226,10 +226,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFDCtx(HsaKFDContext **pCtx)
 
 		hsakmt_kfd_open_count = 1;
 
-		if (hsakmt_init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
+		if (hsakmt_init_device_debugging_memory(&hsakmt_primary_kfd_ctx, sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
 			pr_warn("Insufficient Memory. Debugging unavailable\n");
 
-		hsakmt_init_counter_props(sys_props.NumNodes);
+		hsakmt_init_counter_props(&hsakmt_primary_kfd_ctx, sys_props.NumNodes);
 		*pCtx = &hsakmt_primary_kfd_ctx;
 
 		if (!atfork_installed) {
@@ -269,8 +269,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFDCtx(void)
 
 	if (hsakmt_kfd_open_count > 0)	{
 		if (--hsakmt_kfd_open_count == 0) {
-			hsakmt_destroy_counter_props();
-			hsakmt_destroy_device_debugging_memory();
+			hsakmt_destroy_counter_props(&hsakmt_primary_kfd_ctx);
+			hsakmt_destroy_device_debugging_memory(&hsakmt_primary_kfd_ctx);
 			hsakmt_fmm_clear_all_aperture(&hsakmt_primary_kfd_ctx);
 		}
 
diff --git a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
index 055a30c0b1..a7a77e499f 100644
--- a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
+++ b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c
@@ -52,7 +52,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void
     CHECK_KFD_OPEN();
     CHECK_KFD_MINOR_VERSION(16);
 
-    HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id);
+    HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
     if (ret != HSAKMT_STATUS_SUCCESS) {
         pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
         return ret;
@@ -99,7 +99,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, HsaPcSamplingIn
     CHECK_KFD_OPEN();
 
     *traceId = INVALID_TRACE_ID;
-    HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id);
+    HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
     if (ret != HSAKMT_STATUS_SUCCESS) {
         pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
         return ret;
@@ -139,7 +139,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, HsaPcSamplingT
 
     CHECK_KFD_OPEN();
 
-    HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id);
+    HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
     if (ret != HSAKMT_STATUS_SUCCESS) {
         pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
         return ret;
@@ -171,7 +171,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, HsaPcSamplingTra
 
     CHECK_KFD_OPEN();
 
-    HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id);
+    HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
     if (ret != HSAKMT_STATUS_SUCCESS) {
         pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
         return ret;
@@ -210,7 +210,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, HsaPcSamplingTrac
 
     CHECK_KFD_OPEN();
 
-    HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id);
+    HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id);
     if (ret != HSAKMT_STATUS_SUCCESS) {
         pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
         return ret;
diff --git a/projects/rocr-runtime/libhsakmt/src/perfctr.c b/projects/rocr-runtime/libhsakmt/src/perfctr.c
index 0078836bcc..4e1de7956a 100644
--- a/projects/rocr-runtime/libhsakmt/src/perfctr.c
+++ b/projects/rocr-runtime/libhsakmt/src/perfctr.c
@@ -37,6 +37,7 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <semaphore.h>
+#include <assert.h>
 
 #define BITS_PER_BYTE		CHAR_BIT
 
@@ -75,8 +76,46 @@ struct perf_counts_values {
 	};
 };
 
-static HsaCounterProperties **counter_props;
-static unsigned int counter_props_count;
+/*
+ * hsa_kfd_perf_context
+ *
+ * Performance-counter state of a KFD context.
+ * Each HsaKFDContext has its own independent perf context.
+ */
+struct hsa_kfd_perf_context
+{
+	/* Table of per-node counter properties, indexed by node ID */
+	HsaCounterProperties **counter_props;
+
+	/* Number of entries allocated in counter_props */
+	unsigned int counter_props_count;
+};
+
+/*
+ * Return the perf context attached to ctx, allocating a zeroed one on
+ * first use.
+ *
+ * A NULL ctx trips the assert; when asserts are compiled out (NDEBUG)
+ * it is logged and NULL is returned. NULL is also returned when the
+ * allocation fails, so callers need to handle a NULL result.
+ */
+struct hsa_kfd_perf_context *hsakmt_kfdcontext_get_perf_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+	if (!ctx) {
+		pr_err("Expected a non-null ptr for HsaKFDContext\n");
+		return NULL;
+	}
+
+	if (ctx->perf_context)
+		return ctx->perf_context;
+
+	ctx->perf_context = calloc(1, sizeof(*ctx->perf_context));
+	if (!ctx->perf_context) {
+		pr_err("Alloc memory failed for struct hsa_kfd_perf_context size %zu\n",
+				 sizeof(*ctx->perf_context));
+		return NULL;
+	}
+
+	return ctx->perf_context;
+}
 
 static ssize_t readn(int fd, void *buf, size_t n)
 {
@@ -99,33 +124,52 @@ static ssize_t readn(int fd, void *buf, size_t n)
 	return n;
 }
 
-HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes)
+/*
+ * Allocate the counter-properties table of ctx: NumNodes entries, all
+ * zero-initialized.
+ *
+ * Returns HSAKMT_STATUS_NO_MEMORY when the perf context or the table
+ * cannot be allocated, HSAKMT_STATUS_SUCCESS otherwise.
+ */
+HSAKMT_STATUS hsakmt_init_counter_props(HsaKFDContext *ctx, unsigned int NumNodes)
 {
-	counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *));
-	if (!counter_props) {
+	struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx);
+
+	/* perf_ctx is NULL when the perf context could not be allocated */
+	if (perf_ctx)
+		perf_ctx->counter_props = calloc(NumNodes, sizeof(*perf_ctx->counter_props));
+	if (!perf_ctx || !perf_ctx->counter_props) {
 		pr_warn("Profiling is not available.\n");
 		return HSAKMT_STATUS_NO_MEMORY;
 	}
 
-	counter_props_count = NumNodes;
+	perf_ctx->counter_props_count = NumNodes;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-void hsakmt_destroy_counter_props(void)
+/*
+ * Free every per-node entry of the counter-properties table of ctx,
+ * then the table itself. Does nothing when no table was allocated.
+ */
+void hsakmt_destroy_counter_props(HsaKFDContext *ctx)
 {
 	unsigned int i;
+	struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx);
 
-	if (!counter_props)
+	if (!perf_ctx || !perf_ctx->counter_props)
 		return;
 
-	for (i = 0; i < counter_props_count; i++)
-		if (counter_props[i]) {
-			free(counter_props[i]);
-			counter_props[i] = NULL;
+	for (i = 0; i < perf_ctx->counter_props_count; i++)
+		if (perf_ctx->counter_props[i]) {
+			free(perf_ctx->counter_props[i]);
+			perf_ctx->counter_props[i] = NULL;
 		}
 
-	free(counter_props);
+	free(perf_ctx->counter_props);
+	/* perf_ctx itself is not freed here; drop the stale pointer and count */
+	perf_ctx->counter_props = NULL;
+	perf_ctx->counter_props_count = 0;
 }
 
 static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
@@ -211,11 +238,12 @@ static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
 	return rc;
 }
 
-static HSAuint32 get_block_concurrent_limit(uint32_t node_id,
+static HSAuint32 get_block_concurrent_limit(struct hsa_kfd_perf_context *perf_ctx,
+						uint32_t node_id,
 						HSAuint32 block_id)
 {
 	uint32_t i;
-	HsaCounterBlockProperties *block = &counter_props[node_id]->Blocks[0];
+	HsaCounterBlockProperties *block = &perf_ctx->counter_props[node_id]->Blocks[0];
 
 	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
 		if (block->Counters[0].BlockIndex == block_id)
@@ -254,7 +282,8 @@ static HSAKMT_STATUS query_trace(int fd, uint64_t *buf)
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterPropertiesCtx(HsaKFDContext *ctx,
+						      HSAuint32 NodeId,
 						      HsaCounterProperties **CounterProperties)
 {
 	HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS;
@@ -265,23 +294,25 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
 	struct perf_counter_block block = {0};
 	uint32_t total_blocks = 0;
 	HsaCounterBlockProperties *block_prop;
+	struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx);
 
-	if (!counter_props)
+	/* perf_ctx is NULL if the perf context could not be allocated */
+	if (!perf_ctx || !perf_ctx->counter_props)
 		return HSAKMT_STATUS_NO_MEMORY;
 
 	if (!CounterProperties)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
+	if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
 		return HSAKMT_STATUS_INVALID_NODE_UNIT;
 
-	if (counter_props[NodeId]) {
-		*CounterProperties = counter_props[NodeId];
+	if (perf_ctx->counter_props[NodeId]) {
+		*CounterProperties = perf_ctx->counter_props[NodeId];
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
 	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
-		rc = hsakmt_get_block_properties(NodeId, i, &block);
+		rc = hsakmt_get_block_properties(ctx, NodeId, i, &block);
 		if (rc != HSAKMT_STATUS_SUCCESS)
 			return rc;
 		total_concurrent += block.num_of_slots;
@@ -295,19 +325,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
 			sizeof(HsaCounterBlockProperties) * (total_blocks - 1) +
 			sizeof(HsaCounter) * (total_counters - total_blocks);
 
-	counter_props[NodeId] = malloc(counter_props_size);
-	if (!counter_props[NodeId])
+	perf_ctx->counter_props[NodeId] = malloc(counter_props_size);
+	if (!perf_ctx->counter_props[NodeId])
 		return HSAKMT_STATUS_NO_MEMORY;
 
-	counter_props[NodeId]->NumBlocks = total_blocks;
-	counter_props[NodeId]->NumConcurrent = total_concurrent;
+	perf_ctx->counter_props[NodeId]->NumBlocks = total_blocks;
+	perf_ctx->counter_props[NodeId]->NumConcurrent = total_concurrent;
 
-	block_prop = &counter_props[NodeId]->Blocks[0];
+	block_prop = &perf_ctx->counter_props[NodeId]->Blocks[0];
 	for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) {
-		rc = hsakmt_get_block_properties(NodeId, block_id, &block);
+		rc = hsakmt_get_block_properties(ctx, NodeId, block_id, &block);
 		if (rc != HSAKMT_STATUS_SUCCESS) {
-			free(counter_props[NodeId]);
-			counter_props[NodeId] = NULL;
+			free(perf_ctx->counter_props[NodeId]);
+			perf_ctx->counter_props[NodeId] = NULL;
 			return rc;
 		}
 
@@ -329,13 +359,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
 		block_prop = (HsaCounterBlockProperties *)&block_prop->Counters[block_prop->NumCounters];
 	}
 
-	*CounterProperties = counter_props[NodeId];
+	*CounterProperties = perf_ctx->counter_props[NodeId];
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
 /* Registers a set of (HW) counters to be used for tracing/profiling */
-HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTraceCtx(HsaKFDContext* ctx,
+					       HSAuint32 NodeId,
 					       HSAuint32 NumberOfCounters,
 					       HsaCounter *Counters,
 					       HsaPmcTraceRoot *TraceRoot)
@@ -353,6 +384,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
 	uint32_t block, num_blocks = 0, total_counters = 0;
 	uint64_t *counter_id_ptr;
 	int *fd_ptr;
+	struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx);
 
 	pr_debug("[%s] Number of counters %d\n", __func__, NumberOfCounters);
 
@@ -362,7 +394,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
 		return HSAKMT_STATUS_NO_MEMORY;
 	}
 
-	if (!counter_props) {
+	if (!perf_ctx->counter_props) {
 		pr_err("Profiling is not available, counter_props is NULL.\n");
 		goto no_memory_exit;
 	}
@@ -370,7 +402,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
 	if (!Counters || !TraceRoot || NumberOfCounters == 0)
 		goto invalid_parameter_exit;
 
-	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) {
+	if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) {
 		free(counter_id);
 		return HSAKMT_STATUS_INVALID_NODE_UNIT;
 	}
@@ -408,7 +440,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
 	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
 		if (!num_counters[i])
 			continue;
-		concurrent_limit = get_block_concurrent_limit(NodeId, i);
+		concurrent_limit = get_block_concurrent_limit(perf_ctx, NodeId, i);
 		if (!concurrent_limit) {
 			pr_err("Invalid block ID: %d\n", i);
 			goto invalid_parameter_exit;
@@ -509,7 +541,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
 
 /* Unregisters a set of (HW) counters used for tracing/profiling */
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTraceCtx(HsaKFDContext* ctx,
+						 HSAuint32 NodeId,
 						 HSATraceId TraceId)
 {
 	uint32_t gpu_id;
@@ -520,7 +553,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
 	if (TraceId == 0)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
+	if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
 		return HSAKMT_STATUS_INVALID_NODE_UNIT;
 
 	trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
@@ -544,7 +577,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccessCtx(HsaKFDContext* ctx,
+						    HSAuint32 NodeId,
 						    HSATraceId TraceId)
 {
 	struct perf_trace *trace;
@@ -561,7 +595,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
 	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
 		return HSAKMT_STATUS_INVALID_HANDLE;
 
-	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
+	if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
 		return HSAKMT_STATUS_INVALID_NODE_UNIT;
 
 	return ret;
@@ -692,3 +726,32 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId)
 
 	return ret;
 }
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, HsaCounterProperties **CounterProperties)
+{
+	HsaKFDContext *primary_ctx = &hsakmt_primary_kfd_ctx; /* context-less API: forward using the primary context */
+	return hsaKmtPmcGetCounterPropertiesCtx(primary_ctx, NodeId, CounterProperties);
+}
+
+/* Context-less entry point: forwards to hsaKmtPmcRegisterTraceCtx() with &hsakmt_primary_kfd_ctx. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, HSAuint32 NumberOfCounters,
+					       HsaCounter *Counters,
+					       HsaPmcTraceRoot *TraceRoot)
+{
+	HsaKFDContext *primary_ctx = &hsakmt_primary_kfd_ctx;
+	return hsaKmtPmcRegisterTraceCtx(primary_ctx, NodeId, NumberOfCounters, Counters, TraceRoot);
+}
+
+/* Context-less entry point: forwards to hsaKmtPmcUnregisterTraceCtx() with &hsakmt_primary_kfd_ctx. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, HSATraceId TraceId)
+{
+	HsaKFDContext *primary_ctx = &hsakmt_primary_kfd_ctx;
+	return hsaKmtPmcUnregisterTraceCtx(primary_ctx, NodeId, TraceId);
+}
+
+/* Context-less entry point: forwards to hsaKmtPmcAcquireTraceAccessCtx() with &hsakmt_primary_kfd_ctx. */
+HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
+						    HSATraceId TraceId)
+{
+	return hsaKmtPmcAcquireTraceAccessCtx(&hsakmt_primary_kfd_ctx, NodeId, TraceId);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/pmc_table.c b/projects/rocr-runtime/libhsakmt/src/pmc_table.c
index 6b9a9e6719..cdcecbe5f4 100644
--- a/projects/rocr-runtime/libhsakmt/src/pmc_table.c
+++ b/projects/rocr-runtime/libhsakmt/src/pmc_table.c
@@ -1958,12 +1958,13 @@ static struct perf_counter_block navi_blocks[PERFCOUNTER_BLOCKID__MAX] = {
 	},
 };
 
-HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id,
+HSAKMT_STATUS hsakmt_get_block_properties(HsaKFDContext *ctx,
+				   uint32_t node_id,
 				   enum perf_block_id block_id,
 				   struct perf_counter_block *block)
 {
-	uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id);
-	uint16_t dev_id = hsakmt_get_device_id_by_node_id(node_id);
+	uint32_t gfxv = hsakmt_get_gfxv_by_node_id(ctx, node_id);
+	uint16_t dev_id = hsakmt_get_device_id_by_node_id(ctx, node_id);
 
 	if (block_id >= PERFCOUNTER_BLOCKID__MAX ||
 			block_id < PERFCOUNTER_BLOCKID__FIRST)
diff --git a/projects/rocr-runtime/libhsakmt/src/pmc_table.h b/projects/rocr-runtime/libhsakmt/src/pmc_table.h
index 6154a8c559..213b205684 100644
--- a/projects/rocr-runtime/libhsakmt/src/pmc_table.h
+++ b/projects/rocr-runtime/libhsakmt/src/pmc_table.h
@@ -67,7 +67,8 @@ struct perf_counter_block {
 	uint64_t    counter_mask;
 };
 
-HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id,
+HSAKMT_STATUS hsakmt_get_block_properties(HsaKFDContext *ctx,
+				   uint32_t node_id,
 				   enum perf_block_id block_id,
 				   struct perf_counter_block *block);
 
diff --git a/projects/rocr-runtime/libhsakmt/src/queues.c b/projects/rocr-runtime/libhsakmt/src/queues.c
index 5a92bdbc48..c2a29a9774 100644
--- a/projects/rocr-runtime/libhsakmt/src/queues.c
+++ b/projects/rocr-runtime/libhsakmt/src/queues.c
@@ -148,14 +148,15 @@ HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, unsigned int Num
 	return ret;
 }
 
-static void get_doorbell_map_info(uint32_t node_id,
+static void get_doorbell_map_info(HsaKFDContext *ctx,
+				  uint32_t node_id,
 				  struct process_doorbells *doorbell)
 {
 	/*
 	 * GPUVM doorbell on Tonga requires a workaround for VM TLB ACTIVE bit
 	 * lookup bug. Remove ASIC check when this is implemented in amdgpu.
 	 */
-	uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id);
+	uint32_t gfxv = hsakmt_get_gfxv_by_node_id(ctx, node_id);
 	doorbell->use_gpuvm = (hsakmt_is_dgpu && gfxv != GFX_VERSION_TONGA);
 	doorbell->size = DOORBELLS_PAGE_SIZE(DOORBELL_SIZE(gfxv));
 
@@ -272,7 +273,7 @@ static HSAKMT_STATUS map_doorbell(HsaKFDContext *ctx,
 		return HSAKMT_STATUS_SUCCESS;
 	}
 
-	get_doorbell_map_info(NodeId, &doorbells[NodeId]);
+	get_doorbell_map_info(ctx, NodeId, &doorbells[NodeId]);
 
 	if (doorbells[NodeId].use_gpuvm) {
 		status = map_doorbell_dgpu(ctx, NodeId, gpu_id, doorbell_mmap_offset);
@@ -385,7 +386,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx,
 		 * nonPaged=0 system memory allocation uses GTT path
 		 */
 		if (!nonPaged) {
-			cpu_id = hsakmt_get_direct_link_cpu(NodeId);
+			cpu_id = hsakmt_get_direct_link_cpu(ctx, NodeId);
 			if (cpu_id == INVALID_NODEID) {
 				flags.ui32.NoNUMABind = 1;
 				cpu_id = 0;
@@ -460,7 +461,8 @@ static void free_exec_aligned_memory(HsaKFDContext *ctx,
 		munmap(addr, size);
 }
 
-static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
+static HSAKMT_STATUS register_svm_range(HsaKFDContext *ctx,
+				void *mem, uint32_t size,
 				uint32_t gpuNode, uint32_t prefetchNode,
 				uint32_t preferredNode, bool alwaysMapped)
 {
@@ -493,7 +495,7 @@ static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
 	attrs[5].type = HSA_SVM_ATTR_GRANULARITY;
 	attrs[5].value = 0xFF;
 
-	return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
+	return hsaKmtSVMSetAttrCtx(ctx, mem, size, nattr, attrs);
 }
 
 static void free_queue(HsaKFDContext *ctx, struct queue *q)
@@ -599,7 +601,7 @@ static int handle_concrete_asic(HsaKFDContext *ctx,
 
 				fill_cwsr_header(q, addr, Event, ErrPayload, node.NumXcc);
 
-				HSAKMT_STATUS r = register_svm_range(addr, size,
+				HSAKMT_STATUS r = register_svm_range(ctx, addr, size,
 						NodeId, NodeId, 0, true);
 
 				if (r == HSAKMT_STATUS_SUCCESS) {
@@ -680,7 +682,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx,
 		Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
@@ -691,7 +693,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx,
 
 	memset(q, 0, sizeof(*q));
 
-	q->gfxv = hsakmt_get_gfxv_by_node_id(NodeId);
+	q->gfxv = hsakmt_get_gfxv_by_node_id(ctx, NodeId);
 	q->use_ats = false;
 
 	if (q->gfxv == GFX_VERSION_TONGA)
@@ -932,7 +934,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandlerCtx(HsaKFDContext *ctx,
 
 	CHECK_KFD_OPEN();
 
-	result = hsakmt_validate_nodeid(Node, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, Node, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
diff --git a/projects/rocr-runtime/libhsakmt/src/spm.c b/projects/rocr-runtime/libhsakmt/src/spm.c
index ec7f3d2b33..3c83ae3453 100644
--- a/projects/rocr-runtime/libhsakmt/src/spm.c
+++ b/projects/rocr-runtime/libhsakmt/src/spm.c
@@ -35,7 +35,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode)
 	struct kfd_ioctl_spm_args args = {0};
 	uint32_t gpu_id;
 
-	ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
+	ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
 		return ret;
@@ -61,7 +61,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode,
 	struct kfd_ioctl_spm_args args = {0};
 	uint32_t gpu_id = 0;
 
-	ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
+	ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		return ret;
 	}
@@ -87,7 +87,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode)
 	struct kfd_ioctl_spm_args args = {0};
 	uint32_t gpu_id;
 
-	ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
+	ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
 		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
 		return ret;
diff --git a/projects/rocr-runtime/libhsakmt/src/svm.c b/projects/rocr-runtime/libhsakmt/src/svm.c
index 5482dead5c..69dc7cd0e9 100644
--- a/projects/rocr-runtime/libhsakmt/src/svm.c
+++ b/projects/rocr-runtime/libhsakmt/src/svm.c
@@ -81,7 +81,7 @@ hsaKmtSVMSetAttrCtx(HsaKFDContext *ctx,
 			continue;
 		}
 
-		r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value);
+		r = hsakmt_validate_nodeid(ctx, attrs[i].value, &args->attrs[i].value);
 		if (r != HSAKMT_STATUS_SUCCESS) {
 			pr_debug("invalid node ID: %d\n", attrs[i].value);
 			return r;
@@ -141,7 +141,7 @@ hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx,
 		    attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS)
 		    continue;
 
-		r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value);
+		r = hsakmt_validate_nodeid(ctx, attrs[i].value, &args->attrs[i].value);
 		if (r != HSAKMT_STATUS_SUCCESS) {
 			pr_debug("invalid node ID: %d\n", attrs[i].value);
 			return r;
@@ -176,7 +176,7 @@ hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx,
 			attrs[i].value = INVALID_NODEID;
 			break;
 		default:
-			r = hsakmt_gpuid_to_nodeid(attrs[i].value, &attrs[i].value);
+			r = hsakmt_gpuid_to_nodeid(ctx, attrs[i].value, &attrs[i].value);
 			if (r != HSAKMT_STATUS_SUCCESS) {
 				pr_debug("invalid GPU ID: %d\n",
 					 attrs[i].value);
diff --git a/projects/rocr-runtime/libhsakmt/src/time.c b/projects/rocr-runtime/libhsakmt/src/time.c
index 9e8b5ec451..222d53f32a 100644
--- a/projects/rocr-runtime/libhsakmt/src/time.c
+++ b/projects/rocr-runtime/libhsakmt/src/time.c
@@ -26,7 +26,8 @@
 #include "libhsakmt.h"
 #include "hsakmt/linux/kfd_ioctl.h"
 
-HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCountersCtx(HsaKFDContext *ctx,
+					       HSAuint32 NodeId,
 					       HsaClockCounters *Counters)
 {
 	HSAKMT_STATUS result;
@@ -36,13 +37,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
 
 	CHECK_KFD_OPEN();
 
-	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id);
 	if (result != HSAKMT_STATUS_SUCCESS)
 		return result;
 
 	args.gpu_id = gpu_id;
 
-	err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
+	err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
 	if (err < 0) {
 		result = HSAKMT_STATUS_ERROR;
 	} else {
@@ -55,3 +56,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
 
 	return result;
 }
+
+HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, HsaClockCounters *Counters)
+{
+	HsaKFDContext *primary_ctx = &hsakmt_primary_kfd_ctx; /* context-less API: forward using the primary context */
+	return hsaKmtGetClockCountersCtx(primary_ctx, NodeId, Counters);
+}
diff --git a/projects/rocr-runtime/libhsakmt/src/topology.c b/projects/rocr-runtime/libhsakmt/src/topology.c
index b33c8590a4..64c536151a 100644
--- a/projects/rocr-runtime/libhsakmt/src/topology.c
+++ b/projects/rocr-runtime/libhsakmt/src/topology.c
@@ -72,16 +72,6 @@ typedef struct {
 	HsaIoLinkProperties *link;
 } node_props_t;
 
-static HsaSystemProperties *g_system;
-static node_props_t *g_props;
-
-/* This array caches sysfs based node IDs of CPU nodes + all supported GPU nodes.
- * It will be used to map user-node IDs to sysfs-node IDs.
- */
-static uint32_t *map_user_to_sysfs_node_id;
-static uint32_t map_user_to_sysfs_node_id_size;
-static uint32_t num_sysfs_nodes;
-
 static int processor_vendor = -1;
 /* Supported System Vendors */
 enum SUPPORTED_PROCESSOR_VENDORS {
@@ -96,8 +86,45 @@ static const char *supported_processor_vendor_name[] = {
 	"\n"			// POWER requires a different search method
 };
 
+/*
+ * hsa_kfd_topology_context: topology state kept per HsaKFDContext (stored in ctx->topology_context).
+ */
+struct hsa_kfd_topology_context
+{
+	HsaSystemProperties* system_props; /* system properties of the held snapshot; NULL when none is held */
+	node_props_t *node_props; /* per-node properties of the held snapshot, indexed by node ID */
+
+	/* This array caches sysfs based node IDs of CPU nodes + all supported GPU nodes.
+	 * It will be used to map user-node IDs to sysfs-node IDs.
+	 */
+	uint32_t *map_user_to_sysfs_node_id;
+	uint32_t map_user_to_sysfs_node_id_size; /* number of entries allocated in map_user_to_sysfs_node_id */
+
+	uint32_t num_sysfs_nodes; /* subdirectory count found under the sysfs nodes path */
+};
+
+/* Return ctx's topology context, allocating it zeroed on first use; NULL if ctx is NULL or calloc fails. */
+struct hsa_kfd_topology_context *hsakmt_kfdcontext_get_topology_context(HsaKFDContext *ctx)
+{
+	assert(ctx);
+	if (!ctx) { /* still reached when assert() is compiled out (NDEBUG) */
+		pr_err("Expected a non-null ptr for HsaKFDContext\n");
+		return NULL;
+	}
+
+	if (ctx->topology_context)
+		return ctx->topology_context;
+	ctx->topology_context = calloc(1, sizeof(struct hsa_kfd_topology_context));
+	if (!ctx->topology_context) {
+		pr_err("Alloc memory failed for struct hsa_kfd_topology_context size %zu\n",
+				 sizeof(struct hsa_kfd_topology_context));
+		return NULL;
+	}
+	return ctx->topology_context;
+}
+
 static HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx);
-static void topology_drop_snapshot(void);
+static void topology_drop_snapshot(HsaKFDContext *ctx);
 
 static const struct hsa_gfxip_table gfxip_lookup_table[] = {
 	/* Kaveri Family */
@@ -610,12 +637,15 @@ err:
 	return ret;
 }
 
-static HSAKMT_STATUS topology_sysfs_map_node_id(uint32_t node_id, uint32_t *sys_node_id)
+/* Translate node_id to its sysfs node ID via topology_ctx's map; NOT_SUPPORTED if no ctx/map or out of range. */
+static HSAKMT_STATUS topology_sysfs_map_node_id(struct hsa_kfd_topology_context *topology_ctx,
+						uint32_t node_id, uint32_t *sys_node_id)
 {
-	if ((!map_user_to_sysfs_node_id) || (node_id >= map_user_to_sysfs_node_id_size))
+	if (!topology_ctx || !topology_ctx->map_user_to_sysfs_node_id ||
+		node_id >= topology_ctx->map_user_to_sysfs_node_id_size)
 		return HSAKMT_STATUS_NOT_SUPPORTED;
 
-	*sys_node_id = map_user_to_sysfs_node_id[node_id];
+	*sys_node_id = topology_ctx->map_user_to_sysfs_node_id[node_id];
 	return HSAKMT_STATUS_SUCCESS;
 }
 
@@ -737,6 +767,7 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx,
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	bool is_node_supported = true;
 	uint32_t num_supported_nodes = 0;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
 	assert(props);
 	snprintf(path, sizeof(path), KFD_SYSFS_PATH_SYSTEM_PROPERTIES, get_topology_dir());
@@ -779,34 +810,34 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx,
 	 * which represent the node numbers
 	 */
 	snprintf(path, sizeof(path), KFD_SYSFS_PATH_NODES, get_topology_dir());
-	num_sysfs_nodes = num_subdirs(path, "");
+	topology_ctx->num_sysfs_nodes = num_subdirs(path, "");
 
-	if (map_user_to_sysfs_node_id == NULL) {
+	if (topology_ctx->map_user_to_sysfs_node_id == NULL) {
 		/* Trade off - num_sysfs_nodes includes all CPU and GPU nodes.
 		 * Slightly more memory is allocated than necessary.
 		 */
-		map_user_to_sysfs_node_id = calloc(num_sysfs_nodes, sizeof(uint32_t));
-		if (map_user_to_sysfs_node_id == NULL) {
+		topology_ctx->map_user_to_sysfs_node_id = calloc(topology_ctx->num_sysfs_nodes, sizeof(uint32_t));
+		if (topology_ctx->map_user_to_sysfs_node_id == NULL) {
 			ret = HSAKMT_STATUS_NO_MEMORY;
 			goto err2;
 		}
-		map_user_to_sysfs_node_id_size = num_sysfs_nodes;
-	} else if (num_sysfs_nodes > map_user_to_sysfs_node_id_size) {
-		free(map_user_to_sysfs_node_id);
-		map_user_to_sysfs_node_id = calloc(num_sysfs_nodes, sizeof(uint32_t));
-		if (map_user_to_sysfs_node_id == NULL) {
+		topology_ctx->map_user_to_sysfs_node_id_size = topology_ctx->num_sysfs_nodes;
+	} else if (topology_ctx->num_sysfs_nodes > topology_ctx->map_user_to_sysfs_node_id_size) {
+		free(topology_ctx->map_user_to_sysfs_node_id);
+		topology_ctx->map_user_to_sysfs_node_id = calloc(topology_ctx->num_sysfs_nodes, sizeof(uint32_t));
+		if (topology_ctx->map_user_to_sysfs_node_id == NULL) {
 			ret = HSAKMT_STATUS_NO_MEMORY;
 			goto err2;
 		}
-		map_user_to_sysfs_node_id_size = num_sysfs_nodes;
+		topology_ctx->map_user_to_sysfs_node_id_size = topology_ctx->num_sysfs_nodes;
 	}
 
-	for (uint32_t i = 0; i < num_sysfs_nodes; i++) {
+	for (uint32_t i = 0; i < topology_ctx->num_sysfs_nodes; i++) {
 		ret = topology_sysfs_check_node_supported(ctx, i, &is_node_supported);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto sysfs_parse_failed;
 		if (is_node_supported)
-			map_user_to_sysfs_node_id[num_supported_nodes++] = i;
+			topology_ctx->map_user_to_sysfs_node_id[num_supported_nodes++] = i;
 	}
 	props->NumNodes = num_supported_nodes;
 
@@ -815,8 +846,8 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx,
 	return ret;
 
 sysfs_parse_failed:
-	free(map_user_to_sysfs_node_id);
-	map_user_to_sysfs_node_id = NULL;
+	free(topology_ctx->map_user_to_sysfs_node_id);
+	topology_ctx->map_user_to_sysfs_node_id = NULL;
 err2:
 	free(read_buf);
 err1:
@@ -1077,7 +1108,8 @@ err_device_initialize:
 	return ret;
 }
 
-static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
+static HSAKMT_STATUS topology_sysfs_get_node_props(HsaKFDContext *ctx,
+						   uint32_t node_id,
 						   HsaNodeProperties *props,
 						   bool *p2p_links,
 						   uint32_t *num_p2pLinks)
@@ -1097,9 +1129,9 @@ static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
 	uint32_t simd_arrays_count = 0;
 
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
-
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 	assert(props);
-	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
+	ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
@@ -1307,7 +1339,9 @@ out:
 	return ret;
 }
 
-static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id,
+static HSAKMT_STATUS topology_sysfs_get_mem_props(
+						  struct hsa_kfd_topology_context *topology_ctx,
+						  uint32_t node_id,
 						  uint32_t mem_id,
 						  HsaMemoryProperties *props)
 {
@@ -1322,7 +1356,7 @@ static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id,
 	uint32_t sys_node_id;
 
 	assert(props);
-	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
+	ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
@@ -1541,7 +1575,9 @@ exit:
 	return ret;
 }
 
-static HSAKMT_STATUS topology_sysfs_get_cache_props(uint32_t node_id,
+static HSAKMT_STATUS topology_sysfs_get_cache_props(
+						    struct hsa_kfd_topology_context *topology_ctx,
+						    uint32_t node_id,
 						    uint32_t cache_id,
 						    HsaCacheProperties *props)
 {
@@ -1556,7 +1592,7 @@ static HSAKMT_STATUS topology_sysfs_get_cache_props(uint32_t node_id,
 	uint32_t sys_node_id;
 
 	assert(props);
-	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
+	ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
@@ -1619,12 +1655,13 @@ err1:
 	return ret;
 }
 
-static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, uint32_t *user_node_id)
+static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(struct hsa_kfd_topology_context *topology_ctx,
+								 uint32_t sys_node_id, uint32_t *user_node_id)
 {
 	uint32_t node_id;
 
-	for (node_id = 0; node_id < map_user_to_sysfs_node_id_size; node_id++)
-		if (map_user_to_sysfs_node_id[node_id] == sys_node_id) {
+	for (node_id = 0; node_id < topology_ctx->map_user_to_sysfs_node_id_size; node_id++)
+		if (topology_ctx->map_user_to_sysfs_node_id[node_id] == sys_node_id) {
 			*user_node_id = node_id;
 			return HSAKMT_STATUS_SUCCESS;
 		}
@@ -1652,9 +1689,10 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx,
 	int read_size;
 	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
 	uint32_t sys_node_id;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
 	assert(props);
-	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
+	ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id);
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
@@ -1707,7 +1745,7 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx,
 				memset(props, 0, sizeof(*props));
 				goto err2;
 			}
-			ret = topology_map_sysfs_to_user_node_id(sysfs_node_id, &props->NodeTo);
+			ret = topology_map_sysfs_to_user_node_id(topology_ctx, sysfs_node_id, &props->NodeTo);
 			if (ret != HSAKMT_STATUS_SUCCESS)
 				goto err2;
 		} else if (strcmp(prop_name, "weight") == 0)
@@ -1974,6 +2012,7 @@ HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx)
 	uint32_t num_ioLinks;
 	bool p2p_links = false;
 	uint32_t num_p2pLinks = 0;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
 	cpuinfo = calloc(num_procs, sizeof(struct proc_cpuinfo));
 	if (!cpuinfo) {
@@ -1996,7 +2035,7 @@ retry:
 			goto err;
 		}
 		for (i = 0; i < sys_props.NumNodes; i++) {
-			ret = topology_sysfs_get_node_props(i,
+			ret = topology_sysfs_get_node_props(ctx, i,
 					&temp_props[i].node,
 					&p2p_links, &num_p2pLinks);
 			if (ret != HSAKMT_STATUS_SUCCESS) {
@@ -2016,7 +2055,7 @@ retry:
 					goto err;
 				}
 				for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) {
-					ret = topology_sysfs_get_mem_props(i, mem_id, &temp_props[i].mem[mem_id]);
+					ret = topology_sysfs_get_mem_props(topology_ctx, i, mem_id, &temp_props[i].mem[mem_id]);
 					if (ret != HSAKMT_STATUS_SUCCESS) {
 						free_properties(temp_props, i + 1);
 						goto err;
@@ -2032,7 +2071,8 @@ retry:
 					goto err;
 				}
 				for (cache_id = 0; cache_id < temp_props[i].node.NumCaches; cache_id++) {
-					ret = topology_sysfs_get_cache_props(i, cache_id, &temp_props[i].cache[cache_id]);
+					ret = topology_sysfs_get_cache_props(topology_ctx,
+							i, cache_id, &temp_props[i].cache[cache_id]);
 					if (ret != HSAKMT_STATUS_SUCCESS) {
 						free_properties(temp_props, i + 1);
 						goto err;
@@ -2122,62 +2162,72 @@ retry:
 		goto retry;
 	}
 
-	if (!g_system) {
-		g_system = malloc(sizeof(HsaSystemProperties));
-		if (!g_system) {
+	if (!topology_ctx->system_props) {
+		topology_ctx->system_props = malloc(sizeof(HsaSystemProperties));
+		if (!topology_ctx->system_props) {
 			free_properties(temp_props, sys_props.NumNodes);
 			ret = HSAKMT_STATUS_NO_MEMORY;
 			goto err;
 		}
 	}
 
-	*g_system = sys_props;
-	if (g_props)
-		free(g_props);
-	g_props = temp_props;
+	*topology_ctx->system_props = sys_props;
+	if (topology_ctx->node_props)
+		free(topology_ctx->node_props);
+	topology_ctx->node_props = temp_props;
 err:
 	free(cpuinfo);
 	return ret;
 }
 
 /* Drop the Snapshot of the HSA topology information. Assume lock is held. */
-void topology_drop_snapshot(void)
+void topology_drop_snapshot(HsaKFDContext *ctx)
 {
-	if (!!g_system != !!g_props)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	if (!topology_ctx) /* getter returns NULL when ctx is NULL or its allocation failed: nothing to drop */
+		return;
+	if (!!topology_ctx->system_props != !!topology_ctx->node_props)
 		pr_warn("Probably inconsistency?\n");
 
-	if (g_props) {
+	if (topology_ctx->node_props) {
 		/* Remove state */
-		free_properties(g_props, g_system->NumNodes);
-		g_props = NULL;
+		free_properties(topology_ctx->node_props, topology_ctx->system_props->NumNodes);
+		topology_ctx->node_props = NULL;
 	}
 
-	free(g_system);
-	g_system = NULL;
+	free(topology_ctx->system_props);
+	topology_ctx->system_props = NULL;
 
-	if (map_user_to_sysfs_node_id) {
-		free(map_user_to_sysfs_node_id);
-		map_user_to_sysfs_node_id = NULL;
-		map_user_to_sysfs_node_id_size = 0;
+	if (topology_ctx->map_user_to_sysfs_node_id) {
+		free(topology_ctx->map_user_to_sysfs_node_id);
+		topology_ctx->map_user_to_sysfs_node_id = NULL;
+		topology_ctx->map_user_to_sysfs_node_id_size = 0;
 	}
 }
 
-HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id)
+/* Validate nodeid against ctx's topology snapshot; if gpu_id is non-NULL, store the node's KFDGpuID there. */
+HSAKMT_STATUS hsakmt_validate_nodeid(HsaKFDContext *ctx, uint32_t nodeid, uint32_t *gpu_id)
 {
-	if (!g_props || !g_system || g_system->NumNodes <= nodeid)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	/* No context (getter returned NULL) or no snapshot held: no node ID can be valid. */
+	if (!topology_ctx || !topology_ctx->node_props || !topology_ctx->system_props ||
+		topology_ctx->system_props->NumNodes <= nodeid)
 		return HSAKMT_STATUS_INVALID_NODE_UNIT;
 	if (gpu_id)
-		*gpu_id = g_props[nodeid].node.KFDGpuID;
+		*gpu_id = topology_ctx->node_props[nodeid].node.KFDGpuID;
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id)
+HSAKMT_STATUS hsakmt_gpuid_to_nodeid(HsaKFDContext *ctx, uint32_t gpu_id, uint32_t *node_id)
 {
 	uint64_t node_idx;
 
-	for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) {
-		if (g_props[node_idx].node.KFDGpuID == gpu_id) {
+	struct hsa_kfd_topology_context *topology_ctx =
+				hsakmt_kfdcontext_get_topology_context(ctx);
+
+	for (node_idx = 0; node_idx < topology_ctx->system_props->NumNodes; node_idx++) {
+		if (topology_ctx->node_props[node_idx].node.KFDGpuID == gpu_id) {
 			*node_id = node_idx;
 			return HSAKMT_STATUS_SUCCESS;
 		}
@@ -2193,6 +2243,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx,
 	HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
 
 	CHECK_KFD_OPEN();
+	struct hsa_kfd_topology_context *topology_ctx =
+				hsakmt_kfdcontext_get_topology_context(ctx);
 
 	if (!SystemProperties)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -2202,8 +2254,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx,
 	/* We already have a valid snapshot. Avoid double initialization that
 	 * would leak memory.
 	 */
-	if (g_system) {
-		*SystemProperties = *g_system;
+	if (topology_ctx->system_props) {
+		*SystemProperties = *topology_ctx->system_props;
 		goto out;
 	}
 
@@ -2211,23 +2263,23 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx,
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto out;
 
-	assert(g_system);
+	assert(topology_ctx->system_props);
 
 	if (hsakmt_use_model)
 		model_init();
 
-	err = hsakmt_fmm_init_process_apertures(ctx, g_system->NumNodes);
+	err = hsakmt_fmm_init_process_apertures(ctx, topology_ctx->system_props->NumNodes);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto init_process_apertures_failed;
 
-	err = hsakmt_init_process_doorbells(ctx, g_system->NumNodes);
+	err = hsakmt_init_process_doorbells(ctx, topology_ctx->system_props->NumNodes);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto init_doorbells_failed;
 
-	*SystemProperties = *g_system;
+	*SystemProperties = *topology_ctx->system_props;
 
-	for (int node = 0; node < g_system->NumNodes; node++) {
-		if (hsakmt_get_gfxv_by_node_id(node) == GFX_VERSION_GFX1151 &&
+	for (int node = 0; node < topology_ctx->system_props->NumNodes; node++) {
+		if (hsakmt_get_gfxv_by_node_id(ctx, node) == GFX_VERSION_GFX1151 &&
 		    hsakmt_kfd_version_info.KernelInterfaceMajorVersion == 1 &&
 		    hsakmt_kfd_version_info.KernelInterfaceMinorVersion < 20)
 			pr_err_once("WARNING: KFD ABI 1.20+ is recommended for gfx1151. Current KFD ABI is %i.%i. This may result in faults, crashes and other application instability\n", hsakmt_kfd_version_info.KernelInterfaceMajorVersion, hsakmt_kfd_version_info.KernelInterfaceMinorVersion);
@@ -2238,7 +2290,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx,
 init_doorbells_failed:
 	hsakmt_fmm_destroy_process_apertures(ctx);
 init_process_apertures_failed:
-	topology_drop_snapshot();
+	topology_drop_snapshot(ctx);
 
 out:
 	pthread_mutex_unlock(&hsakmt_mutex);
@@ -2251,20 +2303,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemPropertiesCtx(HsaKFDContext *ctx)
 
 	hsakmt_destroy_process_doorbells(ctx);
 	hsakmt_fmm_destroy_process_apertures(ctx);
-	topology_drop_snapshot();
+	topology_drop_snapshot(ctx);
 
 	pthread_mutex_unlock(&hsakmt_mutex);
 
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
+/* Copy node NodeId's properties from ctx's snapshot; ERROR if no context/snapshot or NodeId out of range. */
+HSAKMT_STATUS hsakmt_topology_get_node_props(HsaKFDContext *ctx,
+				      HSAuint32 NodeId,
 				      HsaNodeProperties *NodeProperties)
 {
-	if (!g_system || !g_props || NodeId >= g_system->NumNodes)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	if (!topology_ctx || !topology_ctx->system_props || !topology_ctx->node_props ||
+		NodeId >= topology_ctx->system_props->NumNodes)
 		return HSAKMT_STATUS_ERROR;
 
-	*NodeProperties = g_props[NodeId].node;
+	*NodeProperties = topology_ctx->node_props[NodeId].node;
 	return HSAKMT_STATUS_SUCCESS;
 }
 
@@ -2282,11 +2338,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodePropertiesCtx(HsaKFDContext *ctx,
 	CHECK_KFD_OPEN();
 	pthread_mutex_lock(&hsakmt_mutex);
 
-	err = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	err = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto out;
 
-	err = hsakmt_topology_get_node_props(NodeId, NodeProperties);
+	err = hsakmt_topology_get_node_props(ctx, NodeId, NodeProperties);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto out;
 	/* For CPU only node don't add any additional GPU memory banks. */
@@ -2314,6 +2370,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
 	HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
 	uint32_t i, gpu_id;
 	HSAuint64 aperture_limit;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	node_props_t *node_props = topology_ctx->node_props;
 
 	if (!MemoryProperties)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -2321,15 +2379,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
 	CHECK_KFD_OPEN();
 	pthread_mutex_lock(&hsakmt_mutex);
 
-	err = hsakmt_validate_nodeid(NodeId, &gpu_id);
+	err = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id);
 	if (err != HSAKMT_STATUS_SUCCESS)
 		goto out;
 
 	memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties));
 
-	for (i = 0; i < MIN(g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) {
-		assert(g_props[NodeId].mem);
-		MemoryProperties[i] = g_props[NodeId].mem[i];
+	for (i = 0; i < MIN(node_props[NodeId].node.NumMemoryBanks, NumBanks); i++) {
+		assert(node_props[NodeId].mem);
+		MemoryProperties[i] = node_props[NodeId].mem[i];
 	}
 
 	/* The following memory banks does not apply to CPU only node */
@@ -2341,7 +2399,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
 		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_LDS, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS;
-		MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024;
+		MemoryProperties[i].SizeInBytes = node_props[NodeId].node.LDSSizeInKB * 1024;
 		i++;
 	}
 
@@ -2349,12 +2407,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
 	 * For dGPU the topology node contains Local Memory and it is added by
 	 * the for loop above
 	 */
-	if (hsakmt_get_gfxv_by_node_id(NodeId) == GFX_VERSION_KAVERI && i < NumBanks &&
-		g_props[NodeId].node.LocalMemSize > 0 &&
+	if (hsakmt_get_gfxv_by_node_id(ctx, NodeId) == GFX_VERSION_KAVERI && i < NumBanks &&
+		node_props[NodeId].node.LocalMemSize > 0 &&
 		hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_GPUVM, gpu_id,
 				&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
 		MemoryProperties[i].HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE;
-		MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LocalMemSize;
+		MemoryProperties[i].SizeInBytes = node_props[NodeId].node.LocalMemSize;
 		i++;
 	}
 
@@ -2368,7 +2426,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx,
 	}
 
 	/* Add SVM aperture */
-	if (hsakmt_topology_is_svm_needed(g_props[NodeId].node.EngineId) && i < NumBanks &&
+	if (hsakmt_topology_is_svm_needed(node_props[NodeId].node.EngineId) && i < NumBanks &&
 	    hsakmt_fmm_get_aperture_base_and_limit(ctx,
 		    FMM_SVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress,
 		    &aperture_limit) == HSAKMT_STATUS_SUCCESS) {
@@ -2399,6 +2457,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx,
 {
 	HSAKMT_STATUS err;
 	uint32_t i;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
 	if (!CacheProperties)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -2407,19 +2466,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx,
 	pthread_mutex_lock(&hsakmt_mutex);
 
 	/* KFD ADD page 18, snapshot protocol violation */
-	if (!g_system || NodeId >= g_system->NumNodes) {
+	if (!topology_ctx->system_props || NodeId >= topology_ctx->system_props->NumNodes) {
 		err = HSAKMT_STATUS_INVALID_NODE_UNIT;
 		goto out;
 	}
 
-	if (NumCaches > g_props[NodeId].node.NumCaches) {
+	if (NumCaches > topology_ctx->node_props[NodeId].node.NumCaches) {
 		err = HSAKMT_STATUS_INVALID_PARAMETER;
 		goto out;
 	}
 
-	for (i = 0; i < MIN(g_props[NodeId].node.NumCaches, NumCaches); i++) {
-		assert(g_props[NodeId].cache);
-		CacheProperties[i] = g_props[NodeId].cache[i];
+	for (i = 0; i < MIN(topology_ctx->node_props[NodeId].node.NumCaches, NumCaches); i++) {
+		assert(topology_ctx->node_props[NodeId].cache);
+		CacheProperties[i] = topology_ctx->node_props[NodeId].cache[i];
 	}
 
 	err = HSAKMT_STATUS_SUCCESS;
@@ -2429,14 +2488,18 @@ out:
 	return err;
 }
 
-HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
+HSAKMT_STATUS hsakmt_topology_get_iolink_props(HsaKFDContext *ctx,
+					HSAuint32 NodeId,
 					HSAuint32 NumIoLinks,
 					HsaIoLinkProperties *IoLinkProperties)
 {
-	if (!g_system || !g_props || NodeId >= g_system->NumNodes)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+
+	if (!topology_ctx->system_props || !topology_ctx->node_props ||
+		NodeId >= topology_ctx->system_props->NumNodes)
 		return HSAKMT_STATUS_ERROR;
 
-	memcpy(IoLinkProperties, g_props[NodeId].link,
+	memcpy(IoLinkProperties, topology_ctx->node_props[NodeId].link,
 	       NumIoLinks * sizeof(*IoLinkProperties));
 
 	return HSAKMT_STATUS_SUCCESS;
@@ -2448,6 +2511,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx,
 						      HsaIoLinkProperties *IoLinkProperties)
 {
 	HSAKMT_STATUS err;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
 	if (!IoLinkProperties)
 		return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -2457,79 +2521,85 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx,
 	pthread_mutex_lock(&hsakmt_mutex);
 
 	/* KFD ADD page 18, snapshot protocol violation */
-	if (!g_system || NodeId >= g_system->NumNodes ) {
+	if (!topology_ctx->system_props || NodeId >= topology_ctx->system_props->NumNodes ) {
 		err = HSAKMT_STATUS_INVALID_NODE_UNIT;
 		goto out;
 	}
 
-	if (NumIoLinks > g_props[NodeId].node.NumIOLinks) {
+	if (NumIoLinks > topology_ctx->node_props[NodeId].node.NumIOLinks) {
 		err = HSAKMT_STATUS_INVALID_PARAMETER;
 		goto out;
 	}
 
-	assert(g_props[NodeId].link);
-	err = hsakmt_topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties);
+	assert(topology_ctx->node_props[NodeId].link);
+	err = hsakmt_topology_get_iolink_props(ctx, NodeId, NumIoLinks, IoLinkProperties);
 
 out:
 	pthread_mutex_unlock(&hsakmt_mutex);
 	return err;
 }
 
-uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id)
+uint32_t hsakmt_get_gfxv_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id) /* Full GFX version decoded from the node's EngineId, looked up in ctx's topology instead of the g_props global. */
 {
-	return HSA_GET_GFX_VERSION_FULL(g_props[node_id].node.EngineId.ui32);
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	return HSA_GET_GFX_VERSION_FULL(topology_ctx->node_props[node_id].node.EngineId.ui32); /* NOTE(review): no NULL check on node_props and no range check on node_id in this function - confirm callers validate first. */
 }
 
-uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id)
+uint16_t hsakmt_get_device_id_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id) /* DeviceId of node_id in ctx's topology; 0 when the lookup is not possible. */
 {
-	if (!g_props || !g_system || g_system->NumNodes <= node_id)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+
+	if (!topology_ctx->node_props || !topology_ctx->system_props ||
+		topology_ctx->system_props->NumNodes <= node_id) /* Topology arrays missing, or node_id past NumNodes. */
 		return 0;
 
-	return g_props[node_id].node.DeviceId;
+	return topology_ctx->node_props[node_id].node.DeviceId; /* node_id was range-checked against NumNodes above. */
 }
 
-bool hsakmt_prefer_ats(HSAuint32 node_id)
+bool hsakmt_prefer_ats(HsaKFDContext *ctx, HSAuint32 node_id) /* True when the node reports HSAMMUPresent and has non-zero NumCPUCores and NumFComputeCores. */
 {
-	return g_props[node_id].node.Capability.ui32.HSAMMUPresent
-			&& g_props[node_id].node.NumCPUCores
-			&& g_props[node_id].node.NumFComputeCores;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); /* NOTE(review): node_props and node_id are used below without validation - confirm callers check them. */
+	return topology_ctx->node_props[node_id].node.Capability.ui32.HSAMMUPresent
+			&& topology_ctx->node_props[node_id].node.NumCPUCores
+			&& topology_ctx->node_props[node_id].node.NumFComputeCores;
 }
 
-uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id)
+uint16_t hsakmt_get_device_id_by_gpu_id(HsaKFDContext *ctx, HSAuint32 gpu_id) /* DeviceId of the node in ctx's topology whose KFDGpuID equals gpu_id; 0 when none is found. */
 {
 	unsigned int i;
-
-	if (!g_props || !g_system)
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	if (!topology_ctx->node_props || !topology_ctx->system_props) /* Topology arrays missing: nothing to search. */
 		return 0;
 
-	for (i = 0; i < g_system->NumNodes; i++) {
-		if (g_props[i].node.KFDGpuID == gpu_id)
-			return g_props[i].node.DeviceId;
+	for (i = 0; i < topology_ctx->system_props->NumNodes; i++) { /* Linear scan over every node; the first match wins. */
+		if (topology_ctx->node_props[i].node.KFDGpuID == gpu_id)
+			return topology_ctx->node_props[i].node.DeviceId;
 	}
 
 	return 0;
 }
 
-uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node)
+uint32_t hsakmt_get_direct_link_cpu(HsaKFDContext *ctx, HSAuint32 gpu_node) /* Node id reported by gpu_get_direct_link_cpu() for gpu_node, or INVALID_NODEID when there is none or its memory banks total 0 bytes. */
 {
 	HSAuint64 size = 0;
 	int32_t cpu_id;
 	HSAuint32 i;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
 
-	cpu_id = gpu_get_direct_link_cpu(gpu_node, g_props);
+	cpu_id = gpu_get_direct_link_cpu(gpu_node, topology_ctx->node_props); /* A result of -1 is mapped to INVALID_NODEID just below. */
 	if (cpu_id == -1)
 		return INVALID_NODEID;
 
-	assert(g_props[cpu_id].mem);
-
-	for (i = 0; i < g_props[cpu_id].node.NumMemoryBanks; i++)
-		size += g_props[cpu_id].mem[i].SizeInBytes;
+	assert(topology_ctx->node_props[cpu_id].mem);
+	for (i = 0; i < topology_ctx->node_props[cpu_id].node.NumMemoryBanks; i++) /* Sum SizeInBytes over all memory banks of the selected node. */
+		size += topology_ctx->node_props[cpu_id].mem[i].SizeInBytes;
 
 	return size ? (uint32_t)cpu_id : INVALID_NODEID;
 }
 
 
-HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
+HSAKMT_STATUS hsakmt_validate_nodeid_array(HsaKFDContext *ctx,
+		uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray)
 {
 	HSAKMT_STATUS ret;
@@ -2543,7 +2613,7 @@ HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
 	if (!(*gpu_id_array))
 		return HSAKMT_STATUS_NO_MEMORY;
 	for (i = 0; i < NumberOfNodes; i++) {
-		ret = hsakmt_validate_nodeid(NodeArray[i], *gpu_id_array + i);
+		ret = hsakmt_validate_nodeid(ctx, NodeArray[i], *gpu_id_array + i);
 		if (ret != HSAKMT_STATUS_SUCCESS) {
 			free(*gpu_id_array);
 			break;
@@ -2553,13 +2623,13 @@ HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
 	return ret;
 }
 
-inline uint32_t hsakmt_get_num_sysfs_nodes(void)
+uint32_t hsakmt_get_num_sysfs_nodes(HsaKFDContext *ctx) /* Returns the num_sysfs_nodes value stored in ctx's topology context. */
 {
-	return num_sysfs_nodes;
+	struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx);
+	return topology_ctx->num_sysfs_nodes; /* Per-context field; replaces the former file-scope num_sysfs_nodes. */
 }
 
 
-
 HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties)
 {
 	return hsaKmtAcquireSystemPropertiesCtx(&hsakmt_primary_kfd_ctx, SystemProperties);