diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h index 957141243b..6cf5ced9d5 100644 --- a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h @@ -32,6 +32,8 @@ extern "C" { #endif +/* Forward declaration for debug trap ioctl arguments */ +struct kfd_ioctl_dbg_trap_args; /** "Opens" the HSA kernel driver for user-kernel mode communication. @@ -852,8 +854,10 @@ hsaKmtCheckRuntimeDebugSupport( /** Debug ops call primarily used for KFD testing */ -HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl( - struct kfd_ioctl_dbg_trap_args *arg, +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDebugTrapIoctl( + struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues, HSAuint64 *DebugReturn ); diff --git a/projects/rocr-runtime/libhsakmt/src/debug.c b/projects/rocr-runtime/libhsakmt/src/debug.c index 7fe450d123..1781b88a46 100644 --- a/projects/rocr-runtime/libhsakmt/src/debug.c +++ b/projects/rocr-runtime/libhsakmt/src/debug.c @@ -26,38 +26,82 @@ #include "libhsakmt.h" #include "hsakmt/linux/kfd_ioctl.h" #include +#include #include #include #include +#include -static bool *is_device_debugged; -static uint32_t runtime_capabilities_mask = 0; +/* + * hsa_kfd_debug_context + * + * Represents the debug state for a KFD context. + * Each HsaKFDContext has its own independent debug context. + */ +struct hsa_kfd_debug_context { + /* Array tracking which nodes are being debugged */ + bool *is_device_debugged; -HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes) + /* Runtime debug capabilities mask */ + uint32_t runtime_capabilities_mask; +}; + +struct hsa_kfd_debug_context *hsakmt_kfdcontext_get_debug_context(HsaKFDContext *ctx) +{ + assert(ctx); + if (!ctx) { + pr_err("Expected a non-null ptr for HsaKFDContext"); + return NULL; + } + + if (ctx->debug_context) + return ctx->debug_context; + + ctx->debug_context = calloc(1, sizeof(struct hsa_kfd_debug_context)); + if (!ctx->debug_context) { + pr_err("Alloc memory failed for struct hsa_kfd_debug_context size %zu\n", + sizeof(struct hsa_kfd_debug_context)); + return NULL; + } + return ctx->debug_context; +} + +HSAKMT_STATUS hsakmt_init_device_debugging_memory(HsaKFDContext *ctx, unsigned int NumNodes) { unsigned int i; + struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx); + if (!debug_ctx) + return HSAKMT_STATUS_NO_MEMORY; - is_device_debugged = malloc(NumNodes * sizeof(bool)); - if (!is_device_debugged) + debug_ctx->is_device_debugged = malloc(NumNodes * sizeof(bool)); + if (!debug_ctx->is_device_debugged) return HSAKMT_STATUS_NO_MEMORY; for (i = 0; i < NumNodes; i++) - is_device_debugged[i] = false; + debug_ctx->is_device_debugged[i] = false; return HSAKMT_STATUS_SUCCESS; } -void hsakmt_destroy_device_debugging_memory(void) +void hsakmt_destroy_device_debugging_memory(HsaKFDContext *ctx) { - if (is_device_debugged) { - free(is_device_debugged); - is_device_debugged = NULL; + struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx); + if (!debug_ctx) + return; + + if (debug_ctx->is_device_debugged) { + free(debug_ctx->is_device_debugged); + debug_ctx->is_device_debugged = NULL; } } -bool hsakmt_debug_get_reg_status(uint32_t node_id) +bool hsakmt_debug_get_reg_status(HsaKFDContext *ctx, uint32_t node_id) { - return is_device_debugged[node_id]; + struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx); + if (!debug_ctx || !debug_ctx->is_device_debugged) + return false; + + return debug_ctx->is_device_debugged[node_id]; } HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) @@ -66,11 +110,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) uint32_t gpu_id; CHECK_KFD_OPEN(); - - if (!is_device_debugged) + struct hsa_kfd_debug_context *debug_ctx = + hsakmt_kfdcontext_get_debug_context(&hsakmt_primary_kfd_ctx); + if (!debug_ctx->is_device_debugged) return HSAKMT_STATUS_NO_MEMORY; - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -94,11 +139,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) HSAKMT_STATUS result; CHECK_KFD_OPEN(); - - if (!is_device_debugged) + struct hsa_kfd_debug_context *debug_ctx = + hsakmt_kfdcontext_get_debug_context(&hsakmt_primary_kfd_ctx); + if (!debug_ctx->is_device_debugged) return HSAKMT_STATUS_NO_MEMORY; - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -126,7 +172,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId, CHECK_KFD_OPEN(); - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -195,11 +241,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId, uint32_t watch_event_items = WatchEvent != NULL ? NumWatchPoints:0; struct kfd_ioctl_dbg_address_watch_args *args; - HSAuint32 i = 0; + HSAuint32 i = 0; CHECK_KFD_OPEN(); - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -268,19 +314,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId, #define HSA_RUNTIME_ENABLE_MAX_MAJOR 1 #define HSA_RUNTIME_ENABLE_MIN_MINOR 13 -HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) { +HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupportCtx(HsaKFDContext *ctx) { HsaNodeProperties node = {0}; HsaSystemProperties props = {0}; HsaVersionInfo versionInfo = {0}; memset(&node, 0x00, sizeof(node)); memset(&props, 0x00, sizeof(props)); - if (hsaKmtAcquireSystemProperties(&props)) + if (hsaKmtAcquireSystemPropertiesCtx(ctx, &props)) return HSAKMT_STATUS_ERROR; //the firmware of gpu node doesn't support the debugger, disable it. for (uint32_t i = 0; i < props.NumNodes; i++) { - if (hsaKmtGetNodeProperties(i, &node)) + if (hsaKmtGetNodePropertiesCtx(ctx, i, &node)) return HSAKMT_STATUS_ERROR; //ignore cpu node @@ -302,12 +348,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) { return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnableCtx(HsaKFDContext *ctx, + void *rDebug, bool setupTtmp) { - struct kfd_ioctl_runtime_enable_args args = {0}; - HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx); + struct kfd_ioctl_runtime_enable_args args = {0}; + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupportCtx(ctx); if (result) return result; @@ -316,7 +364,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, ((setupTtmp) ? KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK : 0); args.r_debug = (HSAuint64)rDebug; - long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args); + long err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RUNTIME_ENABLE, &args); if (err) { if (errno == EBUSY) @@ -324,15 +372,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, else return HSAKMT_STATUS_ERROR; } - runtime_capabilities_mask= args.capabilities_mask; + debug_ctx->runtime_capabilities_mask= args.capabilities_mask; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisableCtx(HsaKFDContext *ctx) { struct kfd_ioctl_runtime_enable_args args = {0}; - HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupportCtx(ctx); if (result) return result; @@ -340,19 +388,23 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) memset(&args, 0x00, sizeof(args)); args.mode_mask = 0; //Disable - if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RUNTIME_ENABLE, &args)) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) +HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilitiesCtx(HsaKFDContext *ctx, + HSAuint32 *caps_mask) { - *caps_mask = runtime_capabilities_mask; + struct hsa_kfd_debug_context *debug_ctx = hsakmt_kfdcontext_get_debug_context(ctx); + + *caps_mask = debug_ctx->runtime_capabilities_mask; return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS dbg_trap_get_device_data(void *data, +static HSAKMT_STATUS dbg_trap_get_device_data(HsaKFDContext *ctx, + void *data, uint32_t *n_entries, uint32_t entry_size) { @@ -363,14 +415,15 @@ static HSAKMT_STATUS dbg_trap_get_device_data(void *data, args.device_snapshot.entry_size = entry_size; args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; *n_entries = args.device_snapshot.num_devices; return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS dbg_trap_get_queue_data(void *data, +static HSAKMT_STATUS dbg_trap_get_queue_data(HsaKFDContext *ctx, + void *data, uint32_t *n_entries, uint32_t entry_size, uint32_t *queue_ids) @@ -384,7 +437,7 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data, args.queue_snapshot.snapshot_buf_ptr = (uint64_t) data; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; *n_entries = args.queue_snapshot.num_queues; @@ -398,7 +451,8 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data, return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids, +static HSAKMT_STATUS dbg_trap_suspend_queues(HsaKFDContext *ctx, + uint32_t *queue_ids, uint32_t num_queues) { struct kfd_ioctl_dbg_trap_args args = {0}; @@ -410,7 +464,7 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids, args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES; args.pid = getpid(); - r = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args); if (r < 0) return HSAKMT_STATUS_ERROR; @@ -420,7 +474,8 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids, /* Debugger support has been in KFD ABI 1.13. */ #define KFD_MINOR_MIN_DEBUG 13 -HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnableCtx(HsaKFDContext *ctx, + void **runtime_info, HSAuint32 *data_size) { struct kfd_ioctl_dbg_trap_args args = {0}; @@ -429,7 +484,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG); *data_size = sizeof(struct kfd_runtime_info); args.enable.rinfo_size = *data_size; - args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd; + args.enable.dbg_fd = ctx->fd; *runtime_info = malloc(args.enable.rinfo_size); if (!*runtime_info) return HSAKMT_STATUS_NO_MEMORY; @@ -437,30 +492,31 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, args.op = KFD_IOC_DBG_TRAP_ENABLE; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) { free(*runtime_info); return HSAKMT_STATUS_ERROR; } return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisableCtx(HsaKFDContext *ctx) { struct kfd_ioctl_dbg_trap_args args = {0}; CHECK_KFD_OPEN(); CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG); - args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd; + args.enable.dbg_fd = ctx->fd; args.op = KFD_IOC_DBG_TRAP_DISABLE; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceDataCtx(HsaKFDContext *ctx, + void **data, HSAuint32 *n_entries, HSAuint32 *entry_size) { @@ -473,14 +529,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, *data = malloc(*entry_size * *n_entries); if (!*data) return ret; - ret = dbg_trap_get_device_data(*data, n_entries, *entry_size); + ret = dbg_trap_get_device_data(ctx, *data, n_entries, *entry_size); if (ret) free(*data); return ret; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueDataCtx(HsaKFDContext *ctx, + void **data, HSAuint32 *n_entries, HSAuint32 *entry_size, bool suspend_queues) @@ -491,7 +548,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG); *entry_size = sizeof(struct kfd_queue_snapshot_entry); *n_entries = 0; - if (dbg_trap_get_queue_data(NULL, n_entries, *entry_size, NULL)) + if (dbg_trap_get_queue_data(ctx, NULL, n_entries, *entry_size, NULL)) return HSAKMT_STATUS_ERROR; *data = malloc(*n_entries * *entry_size); if (!*data) @@ -499,11 +556,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, if (suspend_queues && *n_entries) queue_ids = (uint32_t *)malloc(sizeof(uint32_t) * *n_entries); if (!queue_ids || - dbg_trap_get_queue_data(*data, n_entries, *entry_size, queue_ids)) + dbg_trap_get_queue_data(ctx, *data, n_entries, *entry_size, queue_ids)) goto free_data; if (queue_ids) { - if (dbg_trap_suspend_queues(queue_ids, *n_entries) || - dbg_trap_get_queue_data(*data, n_entries, *entry_size, NULL)) + if (dbg_trap_suspend_queues(ctx, queue_ids, *n_entries) || + dbg_trap_get_queue_data(ctx, *data, n_entries, *entry_size, NULL)) goto free_data; free(queue_ids); } @@ -516,9 +573,10 @@ free_data: return HSAKMT_STATUS_ERROR; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, - HSA_QUEUEID *Queues, - HSAuint64 *DebugReturn) +HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctlCtx(HsaKFDContext *ctx, + struct kfd_ioctl_dbg_trap_args *args, + HSA_QUEUEID *Queues, + HSAuint64 *DebugReturn) { HSAKMT_STATUS result; @@ -540,7 +598,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg free(queue_ids); } - long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, args); + long err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DBG_TRAP, args); if (DebugReturn) *DebugReturn = err; @@ -557,3 +615,58 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg return result; } + +HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) +{ + return hsaKmtCheckRuntimeDebugSupportCtx(&hsakmt_primary_kfd_ctx); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, + bool setupTtmp) +{ + return hsaKmtRuntimeEnableCtx(&hsakmt_primary_kfd_ctx, rDebug, setupTtmp); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) +{ + return hsaKmtRuntimeDisableCtx(&hsakmt_primary_kfd_ctx); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) +{ + return hsaKmtGetRuntimeCapabilitiesCtx(&hsakmt_primary_kfd_ctx, caps_mask); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, + HSAuint32 *data_size) +{ + return hsaKmtDbgEnableCtx(&hsakmt_primary_kfd_ctx, runtime_info, data_size); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) +{ + return hsaKmtDbgDisableCtx(&hsakmt_primary_kfd_ctx); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, + HSAuint32 *n_entries, + HSAuint32 *entry_size) +{ + return hsaKmtDbgGetDeviceDataCtx(&hsakmt_primary_kfd_ctx, data, n_entries, entry_size); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, + HSAuint32 *n_entries, + HSAuint32 *entry_size, + bool suspend_queues) +{ + return hsaKmtDbgGetQueueDataCtx(&hsakmt_primary_kfd_ctx, data, + n_entries, entry_size, suspend_queues); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, + HSA_QUEUEID *Queues, + HSAuint64 *DebugReturn) +{ + return hsaKmtDebugTrapIoctlCtx(&hsakmt_primary_kfd_ctx, args, Queues, DebugReturn); +} diff --git a/projects/rocr-runtime/libhsakmt/src/events.c b/projects/rocr-runtime/libhsakmt/src/events.c index df97cf6c64..c9a04a3b30 100644 --- a/projects/rocr-runtime/libhsakmt/src/events.c +++ b/projects/rocr-runtime/libhsakmt/src/events.c @@ -307,7 +307,7 @@ static HSAKMT_STATUS get_mem_info_svm_api(HsaKFDContext *ctx, uint64_t address, args->attrs[i].value == KFD_IOCTL_SVM_LOCATION_UNDEFINED) node_id = args->attrs[i].value; else - hsakmt_gpuid_to_nodeid(args->attrs[i].value, &node_id); + hsakmt_gpuid_to_nodeid(ctx, args->attrs[i].value, &node_id); switch (args->attrs[i].type) { case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: pr_err("Preferred location for address 0x%lx is Node id %d\n", @@ -359,7 +359,7 @@ static void analysis_memory_exception(HsaKFDContext *ctx, uint32_t node_id = 0; unsigned int i; - hsakmt_gpuid_to_nodeid(memory_exception_data->gpu_id, &node_id); + hsakmt_gpuid_to_nodeid(ctx, memory_exception_data->gpu_id, &node_id); pr_err("Memory exception on virtual address 0x%lx, ", addr); pr_err("node id %d : ", node_id); if (memory_exception_data->failure.NotPresent) @@ -468,7 +468,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx, if (Events[i]->EventData.EventType == HSA_EVENTTYPE_MEMORY && event_data[i].memory_exception_data.gpu_id) { Events[i]->EventData.EventData.MemoryAccessFault.VirtualAddress = event_data[i].memory_exception_data.va; - result = hsakmt_gpuid_to_nodeid(event_data[i].memory_exception_data.gpu_id, &Events[i]->EventData.EventData.MemoryAccessFault.NodeId); + result = hsakmt_gpuid_to_nodeid(ctx, event_data[i].memory_exception_data.gpu_id, &Events[i]->EventData.EventData.MemoryAccessFault.NodeId); if (result != HSAKMT_STATUS_SUCCESS) goto out; Events[i]->EventData.EventData.MemoryAccessFault.Failure.NotPresent = event_data[i].memory_exception_data.failure.NotPresent; @@ -483,7 +483,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx, } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && event_data[i].hw_exception_data.gpu_id) { - result = hsakmt_gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); + result = hsakmt_gpuid_to_nodeid(ctx, event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); if (result != HSAKMT_STATUS_SUCCESS) goto out; @@ -515,7 +515,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMICtx(HsaKFDContext *ctx, HSAuint32 NodeId, i pr_debug("[%s] node %d\n", __func__, NodeId); - result = hsakmt_validate_nodeid(NodeId, &gpuid); + result = hsakmt_validate_nodeid(ctx, NodeId, &gpuid); if (result != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return result; diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c index ea0b43fbd7..57344aafa3 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.c +++ b/projects/rocr-runtime/libhsakmt/src/fmm.c @@ -254,6 +254,28 @@ struct hsa_kfd_fmm_context unsigned int gpu_mem_count; gpu_mem_t *first_gpu_mem; + /* GPU node array for default mappings */ + uint32_t all_gpu_id_array_size; + uint32_t *all_gpu_id_array; + + void *dgpu_shared_aperture_base; + void *dgpu_shared_aperture_limit; + + svm_t svm; + + /* On APU, for memory allocated on the system memory that GPU doesn't + * access via GPU driver, they are not managed by GPUVM. cpuvm_aperture + * keeps track of this part of memory. + * Each context has its own tracking. + */ + manageable_aperture_t cpuvm_aperture; + + /* mem_handle_aperture is used to generate memory handles for allocations + * that don't have a valid virtual address. its size is 47bits. + * Each context has its own handle space. + */ + manageable_aperture_t mem_handle_aperture; + #define DRM_FIRST_RENDER_NODE 128 #define DRM_LAST_RENDER_NODE 255 @@ -281,38 +303,30 @@ struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx sizeof(struct hsa_kfd_fmm_context)); return NULL; } + + /* Initialize svm members */ + manageable_aperture_t init_aperture = INIT_MANAGEABLE_APERTURE(0, 0); + manageable_aperture_t mem_handle_init = INIT_MANAGEABLE_APERTURE(START_NON_CANONICAL_ADDR, (START_NON_CANONICAL_ADDR + (1ULL << 47))); + + ctx->fmm_context->svm.apertures[SVM_DEFAULT] = init_aperture; + ctx->fmm_context->svm.apertures[SVM_COHERENT] = init_aperture; + ctx->fmm_context->svm.dgpu_aperture = NULL; + ctx->fmm_context->svm.dgpu_alt_aperture = NULL; + ctx->fmm_context->svm.userptr_for_paged_mem = false; + ctx->fmm_context->svm.check_userptr = false; + ctx->fmm_context->svm.reserve_svm = false; + ctx->fmm_context->svm.disable_cache = false; + ctx->fmm_context->svm.alignment_order = 0; + + /* Initialize cpuvm_aperture */ + ctx->fmm_context->cpuvm_aperture = init_aperture; + + /* Initialize mem_handle_aperture */ + ctx->fmm_context->mem_handle_aperture = mem_handle_init; + return ctx->fmm_context; } -static void *dgpu_shared_aperture_base; -static void *dgpu_shared_aperture_limit; - -static svm_t svm = { - .apertures = {INIT_MANAGEABLE_APERTURE(0, 0), - INIT_MANAGEABLE_APERTURE(0, 0)}, - .dgpu_aperture = NULL, - .dgpu_alt_aperture = NULL, - .userptr_for_paged_mem = false, - .check_userptr = false, - .disable_cache = false, -}; - -/* On APU, for memory allocated on the system memory that GPU doesn't access - * via GPU driver, they are not managed by GPUVM. cpuvm_aperture keeps track - * of this part of memory. - */ -static manageable_aperture_t cpuvm_aperture = INIT_MANAGEABLE_APERTURE(0, 0); - -/* mem_handle_aperture is used to generate memory handles - * for allocations that don't have a valid virtual address - * its size is 47bits. -*/ -static manageable_aperture_t mem_handle_aperture = INIT_MANAGEABLE_APERTURE(START_NON_CANONICAL_ADDR, (START_NON_CANONICAL_ADDR + (1ULL << 47))); - -/* GPU node array for default mappings */ -static uint32_t all_gpu_id_array_size; -static uint32_t *all_gpu_id_array; - /* IPC structures and helper functions */ typedef enum _HSA_APERTURE { HSA_APERTURE_UNSUPPORTED = 0, @@ -849,8 +863,9 @@ static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper, void *address, uint64_t size, uint64_t align) { - uint64_t alignment_size = PAGE_SIZE << svm.alignment_order; uint64_t guard_size; + svm_t *svm = container_of(aper, svm_t, apertures); + uint64_t alignment_size = PAGE_SIZE << svm->alignment_order; if (!aper->is_cpu_accessible) { pr_err("MMap Aperture must be CPU accessible\n"); @@ -984,15 +999,15 @@ static manageable_aperture_t *fmm_get_aperture(struct hsa_kfd_fmm_context *fmm_c { switch (info.type) { case HSA_APERTURE_DGPU: - return svm.dgpu_aperture; + return fmm_ctx->svm.dgpu_aperture; case HSA_APERTURE_DGPU_ALT: - return svm.dgpu_alt_aperture; + return fmm_ctx->svm.dgpu_alt_aperture; case HSA_APERTURE_GPUVM: return &fmm_ctx->gpu_mem[info.idx].gpuvm_aperture; case HSA_APERTURE_CPUVM: - return &cpuvm_aperture; + return &fmm_ctx->cpuvm_aperture; case HSA_APERTURE_MEMHANDLE: - return &mem_handle_aperture; + return &fmm_ctx->mem_handle_aperture; default: return NULL; } @@ -1023,35 +1038,35 @@ static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_ HsaApertureInfo _info = { .type = HSA_APERTURE_UNSUPPORTED, .idx = 0}; gpu_mem_t *gpu_mem_ptr = NULL; - if ((address >= mem_handle_aperture.base) && - (address <= mem_handle_aperture.limit)){ + if ((address >= fmm_ctx->mem_handle_aperture.base) && + (address <= fmm_ctx->mem_handle_aperture.limit)){ - aperture = &mem_handle_aperture; + aperture = &fmm_ctx->mem_handle_aperture; _info.type = HSA_APERTURE_MEMHANDLE; } else if (hsakmt_is_dgpu) { - if (address >= svm.dgpu_aperture->base && - address <= svm.dgpu_aperture->limit) { + if (address >= fmm_ctx->svm.dgpu_aperture->base && + address <= fmm_ctx->svm.dgpu_aperture->limit) { gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address); if (gpu_mem_ptr) { aperture = &gpu_mem_ptr->scratch_physical; } else { - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; _info.type = HSA_APERTURE_DGPU; } - } else if (address >= svm.dgpu_alt_aperture->base && - address <= svm.dgpu_alt_aperture->limit) { - aperture = svm.dgpu_alt_aperture; + } else if (address >= fmm_ctx->svm.dgpu_alt_aperture->base && + address <= fmm_ctx->svm.dgpu_alt_aperture->limit) { + aperture = fmm_ctx->svm.dgpu_alt_aperture; _info.type = HSA_APERTURE_DGPU_ALT; } else { /* Not in SVM, it can be system memory registered by userptr */ - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; _info.type = HSA_APERTURE_DGPU; } } else { /* APU */ - if (address >= svm.dgpu_aperture->base && address <= svm.dgpu_aperture->limit) { - aperture = svm.dgpu_aperture; + if (address >= fmm_ctx->svm.dgpu_aperture->base && address <= fmm_ctx->svm.dgpu_aperture->limit) { + aperture = fmm_ctx->svm.dgpu_aperture; _info.type = HSA_APERTURE_DGPU; } else { /* gpuvm_aperture */ @@ -1066,7 +1081,7 @@ static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_ } if (!aperture) { /* Not in GPUVM */ - aperture = &cpuvm_aperture; + aperture = &fmm_ctx->cpuvm_aperture; _info.type = HSA_APERTURE_CPUVM; } } @@ -1179,6 +1194,7 @@ static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx, vm_object_t *vm_obj = NULL; HsaMemFlags mflags; uint64_t offset = 0, total_size, size; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (!mem) return NULL; @@ -1194,7 +1210,7 @@ static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx, args.va_addr = VOID_PTRS_SUB(mem, aperture->base); /* if allocate vram-only, use an invalid VA */ - if (aperture == &mem_handle_aperture) + if (aperture == &fmm_ctx->mem_handle_aperture) args.va_addr = 0; total_size = 0; @@ -1312,12 +1328,12 @@ void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id) } pr_info("dGPU aperture:\n"); - manageable_aperture_print(svm.dgpu_aperture); + manageable_aperture_print(fmm_ctx->svm.dgpu_aperture); pr_info("dGPU alt aperture:\n"); - if (svm.dgpu_aperture == svm.dgpu_alt_aperture) + if (fmm_ctx->svm.dgpu_aperture == fmm_ctx->svm.dgpu_alt_aperture) pr_info("\t Alias of dGPU aperture\n"); else - manageable_aperture_print(svm.dgpu_alt_aperture); + manageable_aperture_print(fmm_ctx->svm.dgpu_alt_aperture); } #else void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id) @@ -1355,24 +1371,24 @@ static vm_object_t *vm_find_object(struct hsa_kfd_fmm_context *fmm_ctx, } if (!aper) { - if ((addr >= mem_handle_aperture.base) && - (addr <= mem_handle_aperture.limit)){ - aper = &mem_handle_aperture; + if ((addr >= fmm_ctx->mem_handle_aperture.base) && + (addr <= fmm_ctx->mem_handle_aperture.limit)){ + aper = &fmm_ctx->mem_handle_aperture; } } if (!aper) { - if (!svm.dgpu_aperture) + if (!fmm_ctx->svm.dgpu_aperture) goto no_svm; - if ((addr >= svm.dgpu_aperture->base) && - (addr <= svm.dgpu_aperture->limit)) - aper = svm.dgpu_aperture; - else if ((addr >= svm.dgpu_alt_aperture->base) && - (addr <= svm.dgpu_alt_aperture->limit)) - aper = svm.dgpu_alt_aperture; + if ((addr >= fmm_ctx->svm.dgpu_aperture->base) && + (addr <= fmm_ctx->svm.dgpu_aperture->limit)) + aper = fmm_ctx->svm.dgpu_aperture; + else if ((addr >= fmm_ctx->svm.dgpu_alt_aperture->base) && + (addr <= fmm_ctx->svm.dgpu_alt_aperture->limit)) + aper = fmm_ctx->svm.dgpu_alt_aperture; else { - aper = svm.dgpu_aperture; + aper = fmm_ctx->svm.dgpu_aperture; userptr = true; } } @@ -1413,7 +1429,7 @@ no_svm: if (aper) pthread_mutex_unlock(&aper->fmm_mutex); - aper = &cpuvm_aperture; + aper = &fmm_ctx->cpuvm_aperture; pthread_mutex_lock(&aper->fmm_mutex); if (range) @@ -1482,11 +1498,11 @@ static void fmm_release_scratch(HsaKFDContext *ctx, uint32_t gpu_id) pthread_mutex_unlock(&aperture->fmm_mutex); /* release address space */ - pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex); - aperture_release_area(svm.dgpu_aperture, + pthread_mutex_lock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex); + aperture_release_area(fmm_ctx->svm.dgpu_aperture, fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size); - pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex); + pthread_mutex_unlock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex); } else /* release address space */ munmap(fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size); @@ -1533,11 +1549,11 @@ void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx, /* Allocate address space for scratch backing, 64KB aligned */ if (hsakmt_is_dgpu) { - pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex); + pthread_mutex_lock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex); mem = aperture_allocate_area_aligned( - svm.dgpu_aperture, address, + fmm_ctx->svm.dgpu_aperture, address, aligned_size, SCRATCH_ALIGN); - pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex); + pthread_mutex_unlock(&fmm_ctx->svm.dgpu_aperture->fmm_mutex); } else { if (address) return NULL; @@ -1670,6 +1686,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx, uint64_t guard_size; void *mem; int ret; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); dmabuf_fd = -1; memfd = -1; @@ -1694,7 +1711,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx, goto error_release_memfd; } - alignment_size = PAGE_SIZE << svm.alignment_order; + alignment_size = PAGE_SIZE << fmm_ctx->svm.alignment_order; alignment = alignment ? alignment : aperture->align; while (alignment < alignment_size && size >= (alignment << 1)) alignment <<= 1; @@ -1714,7 +1731,7 @@ static void* udmabuf_allocation(HsaKFDContext *ctx, mflags.ui32.NoSubstitute = 1; /* Bind to NUMA node */ /* node_id is gpu id, get closed numa id */ - numa_node_id = hsakmt_get_direct_link_cpu(node_id); + numa_node_id = hsakmt_get_direct_link_cpu(ctx, node_id); if (bind_mem_to_numa(numa_node_id, mem, size, mflags)) goto error_release_aperture; @@ -1801,7 +1818,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags); if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) { - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; if (mflags.ui32.AQLQueueMemory) size = MemorySizeInBytes * 2; } else { @@ -1814,12 +1831,12 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, /* special case for vram allocation without addr */ if(mflags.ui32.NoAddress) - aperture = &mem_handle_aperture; + aperture = &fmm_ctx->mem_handle_aperture; - if (!mflags.ui32.CoarseGrain || svm.disable_cache) + if (!mflags.ui32.CoarseGrain || fmm_ctx->svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - if (mflags.ui32.Uncached || svm.disable_cache) + if (mflags.ui32.Uncached || fmm_ctx->svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; if (mflags.ui32.ExtendedCoherent) @@ -1829,7 +1846,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT; mem = NULL; - if (hsakmt_udmabuf_dev_fd > 0 && aperture == svm.dgpu_aperture && !hsakmt_is_dgpu + if (hsakmt_udmabuf_dev_fd > 0 && aperture == fmm_ctx->svm.dgpu_aperture && !hsakmt_is_dgpu && aperture->ops == &mmap_aperture_ops) { mem = udmabuf_allocation(ctx, gpu_id, node_id, size, aperture, alignment, mflags, &vm_obj); @@ -1871,7 +1888,7 @@ void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, pthread_mutex_lock(&aperture->fmm_mutex); /* Store memory allocation flags, not ioc flags */ vm_obj->mflags = mflags; - hsakmt_gpuid_to_nodeid(gpu_id, &vm_obj->node_id); + hsakmt_gpuid_to_nodeid(ctx, gpu_id, &vm_obj->node_id); pthread_mutex_unlock(&aperture->fmm_mutex); } @@ -1896,7 +1913,7 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx, return NULL; /* Use fine-grained aperture */ - aperture = svm.dgpu_alt_aperture; + aperture = fmm_ctx->svm.dgpu_alt_aperture; ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; @@ -1914,7 +1931,7 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx, pthread_mutex_lock(&aperture->fmm_mutex); vm_obj->mflags = mflags; - hsakmt_gpuid_to_nodeid(gpu_id, &vm_obj->node_id); + hsakmt_gpuid_to_nodeid(ctx, gpu_id, &vm_obj->node_id); pthread_mutex_unlock(&aperture->fmm_mutex); } @@ -1932,12 +1949,13 @@ void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx, return mem; } -static void *fmm_allocate_host_cpu(void *address, uint64_t MemorySizeInBytes, +static void *fmm_allocate_host_cpu(HsaKFDContext *ctx, void *address, uint64_t MemorySizeInBytes, HsaMemFlags mflags) { void *mem = NULL; vm_object_t *vm_obj; int mmap_prot = PROT_READ; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (address) return NULL; @@ -1957,12 +1975,12 @@ static void *fmm_allocate_host_cpu(void *address, uint64_t MemorySizeInBytes, if (mem == MAP_FAILED) return NULL; - pthread_mutex_lock(&cpuvm_aperture.fmm_mutex); - vm_obj = aperture_allocate_object(&cpuvm_aperture, mem, 0, + pthread_mutex_lock(&fmm_ctx->cpuvm_aperture.fmm_mutex); + vm_obj = aperture_allocate_object(&fmm_ctx->cpuvm_aperture, mem, 0, MemorySizeInBytes, mflags); if (vm_obj) vm_obj->node_id = 0; /* APU systems only have one CPU node */ - pthread_mutex_unlock(&cpuvm_aperture.fmm_mutex); + pthread_mutex_unlock(&fmm_ctx->cpuvm_aperture.fmm_mutex); return mem; } @@ -2066,14 +2084,14 @@ static void *fmm_allocate_host_gpu(HsaKFDContext *ctx, size = MemorySizeInBytes; ioc_flags = 0; if (mflags.ui32.CoarseGrain) - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; else - aperture = svm.dgpu_alt_aperture; /* always coherent */ + aperture = fmm_ctx->svm.dgpu_alt_aperture; /* always coherent */ - if (!mflags.ui32.CoarseGrain || svm.disable_cache) + if (!mflags.ui32.CoarseGrain || fmm_ctx->svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - if (mflags.ui32.Uncached || svm.disable_cache) + if (mflags.ui32.Uncached || fmm_ctx->svm.disable_cache) ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; if (mflags.ui32.ExtendedCoherent) @@ -2091,7 +2109,7 @@ static void *fmm_allocate_host_gpu(HsaKFDContext *ctx, /* Paged memory is allocated as a userptr mapping, non-paged * memory is allocated from KFD */ - if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) { + if (!mflags.ui32.NonPaged && fmm_ctx->svm.userptr_for_paged_mem) { int advice = MADV_NORMAL; /* set madvise flags to HUGEPAGE always for 2MB pages */ @@ -2183,7 +2201,7 @@ void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx, return NULL; } - return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags); + return fmm_allocate_host_cpu(ctx, address, MemorySizeInBytes, mflags); } static int __fmm_release(HsaKFDContext *ctx, @@ -2251,12 +2269,12 @@ HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address) HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_MEMORY_NOT_REGISTERED; - if (aperture == &cpuvm_aperture) { + if (aperture == &fmm_ctx->cpuvm_aperture) { /* APU system memory */ uint64_t size = 0; size = object->size; - vm_remove_object(&cpuvm_aperture, object); + vm_remove_object(&fmm_ctx->cpuvm_aperture, object); pthread_mutex_unlock(&aperture->fmm_mutex); munmap(address, size); } else { @@ -2402,7 +2420,8 @@ static HSAKMT_STATUS acquire_vm(HsaKFDContext *ctx, uint32_t gpu_id, int fd) return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS init_mmap_apertures(HSAuint64 base, HSAuint64 limit, +static HSAKMT_STATUS init_mmap_apertures(svm_t *svm, + HSAuint64 base, HSAuint64 limit, HSAuint32 align, HSAuint32 guard_pages) { void *addr; @@ -2417,29 +2436,29 @@ static HSAKMT_STATUS init_mmap_apertures(HSAuint64 base, HSAuint64 limit, } /* Set up one SVM aperture */ - svm.apertures[SVM_DEFAULT].base = (void *)base; - svm.apertures[SVM_DEFAULT].limit = (void *)limit; - svm.apertures[SVM_DEFAULT].align = align; - svm.apertures[SVM_DEFAULT].guard_pages = guard_pages; - svm.apertures[SVM_DEFAULT].is_cpu_accessible = true; - svm.apertures[SVM_DEFAULT].ops = &mmap_aperture_ops; + svm->apertures[SVM_DEFAULT].base = (void *)base; + svm->apertures[SVM_DEFAULT].limit = (void *)limit; + svm->apertures[SVM_DEFAULT].align = align; + svm->apertures[SVM_DEFAULT].guard_pages = guard_pages; + svm->apertures[SVM_DEFAULT].is_cpu_accessible = true; + svm->apertures[SVM_DEFAULT].ops = &mmap_aperture_ops; - svm.apertures[SVM_COHERENT].base = svm.apertures[SVM_COHERENT].limit = + svm->apertures[SVM_COHERENT].base = svm->apertures[SVM_COHERENT].limit = NULL; /* Try to allocate one page. If it fails, we'll fall back to * managing our own reserved address range. */ - addr = aperture_allocate_area(&svm.apertures[SVM_DEFAULT], NULL, PAGE_SIZE); + addr = aperture_allocate_area(&svm->apertures[SVM_DEFAULT], NULL, PAGE_SIZE); if (addr) { - aperture_release_area(&svm.apertures[SVM_DEFAULT], addr, + aperture_release_area(&svm->apertures[SVM_DEFAULT], addr, PAGE_SIZE); - svm.dgpu_aperture = svm.dgpu_alt_aperture = - &svm.apertures[SVM_DEFAULT]; + svm->dgpu_aperture = svm->dgpu_alt_aperture = + &svm->apertures[SVM_DEFAULT]; pr_info("Initialized unreserved SVM apertures: %p - %p\n", - svm.apertures[SVM_DEFAULT].base, - svm.apertures[SVM_DEFAULT].limit); + svm->apertures[SVM_DEFAULT].base, + svm->apertures[SVM_DEFAULT].limit); } else { pr_info("Failed to allocate unreserved SVM address space.\n"); pr_info("Falling back to reserved SVM apertures.\n"); @@ -2470,18 +2489,20 @@ static void *reserve_address(void *addr, unsigned long long int len) #define SVM_MIN_VM_SIZE (4ULL << 30) #define IS_CANONICAL_ADDR(a) ((a) < (1ULL << 47)) -static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit, +static HSAKMT_STATUS init_svm_apertures(struct hsa_kfd_fmm_context *fmm_ctx, + HSAuint64 base, HSAuint64 limit, HSAuint32 align, HSAuint32 guard_pages) { const HSAuint64 ADDR_INC = GPU_HUGE_PAGE_SIZE; HSAuint64 len, map_size, alt_base, alt_size; bool found = false; void *addr, *ret_addr = NULL; + svm_t *svm = &fmm_ctx->svm; /* If we already have an SVM aperture initialized (from a * parent process), keep using it */ - if (dgpu_shared_aperture_limit) + if (fmm_ctx->dgpu_shared_aperture_limit) return HSAKMT_STATUS_SUCCESS; /* Align base and limit to huge page size */ @@ -2495,8 +2516,8 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit, * x86_64) or at least mmap is unlikely to run out of * addresses the GPUs can handle. */ - if (limit >= (1ULL << 47) - 1 && !svm.reserve_svm) { - HSAKMT_STATUS status = init_mmap_apertures(base, limit, align, + if (limit >= (1ULL << 47) - 1 && !svm->reserve_svm) { + HSAKMT_STATUS status = init_mmap_apertures(svm, base, limit, align, guard_pages); if (status == HSAKMT_STATUS_SUCCESS) @@ -2574,57 +2595,54 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit, limit = base + map_size - 1; /* init two apertures for non-coherent and coherent memory */ - svm.apertures[SVM_DEFAULT].base = dgpu_shared_aperture_base = ret_addr; - svm.apertures[SVM_DEFAULT].limit = dgpu_shared_aperture_limit = (void *)limit; - svm.apertures[SVM_DEFAULT].align = align; - svm.apertures[SVM_DEFAULT].guard_pages = guard_pages; - svm.apertures[SVM_DEFAULT].is_cpu_accessible = true; - svm.apertures[SVM_DEFAULT].ops = &reserved_aperture_ops; + svm->apertures[SVM_DEFAULT].base = fmm_ctx->dgpu_shared_aperture_base = ret_addr; + svm->apertures[SVM_DEFAULT].limit = fmm_ctx->dgpu_shared_aperture_limit = (void *)limit; + svm->apertures[SVM_DEFAULT].align = align; + svm->apertures[SVM_DEFAULT].guard_pages = guard_pages; + svm->apertures[SVM_DEFAULT].is_cpu_accessible = true; + svm->apertures[SVM_DEFAULT].ops = &reserved_aperture_ops; /* Use the first 1/4 of the dGPU aperture as * alternate aperture for coherent access. * Base and size must be 64KB aligned. */ - alt_base = (HSAuint64)svm.apertures[SVM_DEFAULT].base; - alt_size = (VOID_PTRS_SUB(svm.apertures[SVM_DEFAULT].limit, - svm.apertures[SVM_DEFAULT].base) + 1) >> 2; + alt_base = (HSAuint64)svm->apertures[SVM_DEFAULT].base; + alt_size = (VOID_PTRS_SUB(svm->apertures[SVM_DEFAULT].limit, + svm->apertures[SVM_DEFAULT].base) + 1) >> 2; alt_base = (alt_base + 0xffff) & ~0xffffULL; alt_size = (alt_size + 0xffff) & ~0xffffULL; - svm.apertures[SVM_COHERENT].base = (void *)alt_base; - svm.apertures[SVM_COHERENT].limit = (void *)(alt_base + alt_size - 1); - svm.apertures[SVM_COHERENT].align = align; - svm.apertures[SVM_COHERENT].guard_pages = guard_pages; - svm.apertures[SVM_COHERENT].is_cpu_accessible = true; - svm.apertures[SVM_COHERENT].ops = &reserved_aperture_ops; + svm->apertures[SVM_COHERENT].base = (void *)alt_base; + svm->apertures[SVM_COHERENT].limit = (void *)(alt_base + alt_size - 1); + svm->apertures[SVM_COHERENT].align = align; + svm->apertures[SVM_COHERENT].guard_pages = guard_pages; + svm->apertures[SVM_COHERENT].is_cpu_accessible = true; + svm->apertures[SVM_COHERENT].ops = &reserved_aperture_ops; - svm.apertures[SVM_DEFAULT].base = VOID_PTR_ADD(svm.apertures[SVM_COHERENT].limit, 1); + svm->apertures[SVM_DEFAULT].base = VOID_PTR_ADD(svm->apertures[SVM_COHERENT].limit, 1); pr_info("SVM alt (coherent): %12p - %12p\n", - svm.apertures[SVM_COHERENT].base, svm.apertures[SVM_COHERENT].limit); + svm->apertures[SVM_COHERENT].base, svm->apertures[SVM_COHERENT].limit); pr_info("SVM (non-coherent): %12p - %12p\n", - svm.apertures[SVM_DEFAULT].base, svm.apertures[SVM_DEFAULT].limit); + svm->apertures[SVM_DEFAULT].base, svm->apertures[SVM_DEFAULT].limit); - svm.dgpu_aperture = &svm.apertures[SVM_DEFAULT]; - svm.dgpu_alt_aperture = &svm.apertures[SVM_COHERENT]; + svm->dgpu_aperture = &svm->apertures[SVM_DEFAULT]; + svm->dgpu_alt_aperture = &svm->apertures[SVM_COHERENT]; return HSAKMT_STATUS_SUCCESS; } static void fmm_init_rbtree(struct hsa_kfd_fmm_context *fmm_ctx) { - static int once; int i = fmm_ctx->gpu_mem_count; - if (once++ == 0) { - rbtree_init(&svm.apertures[SVM_DEFAULT].tree); - rbtree_init(&svm.apertures[SVM_DEFAULT].user_tree); - rbtree_init(&svm.apertures[SVM_COHERENT].tree); - rbtree_init(&svm.apertures[SVM_COHERENT].user_tree); - rbtree_init(&cpuvm_aperture.tree); - rbtree_init(&cpuvm_aperture.user_tree); - rbtree_init(&mem_handle_aperture.tree); - rbtree_init(&mem_handle_aperture.user_tree); - } + rbtree_init(&fmm_ctx->mem_handle_aperture.tree); + rbtree_init(&fmm_ctx->mem_handle_aperture.user_tree); + rbtree_init(&fmm_ctx->cpuvm_aperture.tree); + rbtree_init(&fmm_ctx->cpuvm_aperture.user_tree); + rbtree_init(&fmm_ctx->svm.apertures[SVM_DEFAULT].tree); + rbtree_init(&fmm_ctx->svm.apertures[SVM_DEFAULT].user_tree); + rbtree_init(&fmm_ctx->svm.apertures[SVM_COHERENT].tree); + rbtree_init(&fmm_ctx->svm.apertures[SVM_COHERENT].user_tree); while (i--) { rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.tree); @@ -2638,7 +2656,8 @@ static void *map_mmio(HsaKFDContext *ctx, uint32_t node_id, uint32_t gpu_id, int mmap_fd) { void *mem; - manageable_aperture_t *aperture = svm.dgpu_alt_aperture; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + manageable_aperture_t *aperture = fmm_ctx->svm.dgpu_alt_aperture; uint32_t ioc_flags; vm_object_t *vm_obj = NULL; HsaMemFlags mflags; @@ -2735,35 +2754,35 @@ static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuin { bool found; uint32_t i; - + manageable_aperture_t *mem_handle_aper = &fmm_ctx->mem_handle_aperture; /* init mem_handle_aperture for buffer handler management */ - mem_handle_aperture.align = align; - mem_handle_aperture.guard_pages = guard_pages; - mem_handle_aperture.is_cpu_accessible = false; - mem_handle_aperture.ops = &reserved_aperture_ops; + mem_handle_aper->align = align; + mem_handle_aper->guard_pages = guard_pages; + mem_handle_aper->is_cpu_accessible = false; + mem_handle_aper->ops = &reserved_aperture_ops; - while (PORT_VPTR_TO_UINT64(mem_handle_aperture.base) < END_NON_CANONICAL_ADDR - 1) { + while (PORT_VPTR_TO_UINT64(mem_handle_aper->base) < END_NON_CANONICAL_ADDR - 1) { found = true; for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { if (fmm_ctx->gpu_mem[i].lds_aperture.base && two_apertures_overlap(fmm_ctx->gpu_mem[i].lds_aperture.base, fmm_ctx->gpu_mem[i].lds_aperture.limit, - mem_handle_aperture.base, mem_handle_aperture.limit)) { + mem_handle_aper->base, mem_handle_aper->limit)) { found = false; break; } if (fmm_ctx->gpu_mem[i].scratch_aperture.base && two_apertures_overlap(fmm_ctx->gpu_mem[i].scratch_aperture.base, fmm_ctx->gpu_mem[i].scratch_aperture.limit, - mem_handle_aperture.base, mem_handle_aperture.limit)){ + mem_handle_aper->base, mem_handle_aper->limit)){ found = false; break; } if (fmm_ctx->gpu_mem[i].gpuvm_aperture.base && two_apertures_overlap(fmm_ctx->gpu_mem[i].gpuvm_aperture.base, fmm_ctx->gpu_mem[i].gpuvm_aperture.limit, - mem_handle_aperture.base, mem_handle_aperture.limit)){ + mem_handle_aper->base, mem_handle_aper->limit)){ found = false; break; } @@ -2771,18 +2790,18 @@ static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuin if (found) { pr_info("mem_handle_aperture start %p, mem_handle_aperture limit %p\n", - mem_handle_aperture.base, mem_handle_aperture.limit); + mem_handle_aper->base, mem_handle_aper->limit); return true; } else { /* increase base by 1UL<<47 to check next hole */ - mem_handle_aperture.base = VOID_PTR_ADD(mem_handle_aperture.base, (1UL << 47)); - mem_handle_aperture.limit = VOID_PTR_ADD(mem_handle_aperture.base, (1ULL << 47)); + mem_handle_aper->base = VOID_PTR_ADD(mem_handle_aper->base, (1UL << 47)); + mem_handle_aper->limit = VOID_PTR_ADD(mem_handle_aper->base, (1ULL << 47)); } } /* set invalid aperture if fail locating a hole for it */ - mem_handle_aperture.base = 0; - mem_handle_aperture.limit = 0; + mem_handle_aper->base = 0; + mem_handle_aper->limit = 0; return false; } @@ -2802,30 +2821,31 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, unsigned int guardPages = 1; uint64_t svm_base = 0, svm_limit = 0; uint32_t svm_alignment = 0, mfma_high_precision_mode = 0; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* If HSA_DISABLE_CACHE is set to a non-0 value, disable caching */ disableCache = getenv("HSA_DISABLE_CACHE"); - svm.disable_cache = (disableCache && strcmp(disableCache, "0")); + fmm_ctx->svm.disable_cache = (disableCache && strcmp(disableCache, "0")); /* If HSA_USERPTR_FOR_PAGED_MEM is not set or set to a non-0 * value, enable userptr for all paged memory allocations */ pagedUserptr = getenv("HSA_USERPTR_FOR_PAGED_MEM"); - svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0")); + fmm_ctx->svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0")); if (hsakmt_use_model) - svm.userptr_for_paged_mem = false; + fmm_ctx->svm.userptr_for_paged_mem = false; /* If HSA_CHECK_USERPTR is set to a non-0 value, check all userptrs * when they are registered */ checkUserptr = getenv("HSA_CHECK_USERPTR"); - svm.check_userptr = (checkUserptr && strcmp(checkUserptr, "0")); + fmm_ctx->svm.check_userptr = (checkUserptr && strcmp(checkUserptr, "0")); /* If HSA_RESERVE_SVM is set to a non-0 value, * enable packet capture and replay mode. */ reserveSvm = getenv("HSA_RESERVE_SVM"); - svm.reserve_svm = (reserveSvm && strcmp(reserveSvm, "0")); + fmm_ctx->svm.reserve_svm = (reserveSvm && strcmp(reserveSvm, "0")); /* Specify number of guard pages for SVM apertures, default is 1 */ guardPagesStr = getenv("HSA_SVM_GUARD_PAGES"); @@ -2842,19 +2862,17 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, * ASIC is found in the system, set back to 9(2MB). */ maxVaAlignStr = getenv("HSA_MAX_VA_ALIGN"); - if (!maxVaAlignStr || sscanf(maxVaAlignStr, "%u", &svm.alignment_order) != 1) { - svm.alignment_order = 18; + if (!maxVaAlignStr || sscanf(maxVaAlignStr, "%u", &fmm_ctx->svm.alignment_order) != 1) { + fmm_ctx->svm.alignment_order = 18; for (i = 0; i < NumNodes; i++) { - if (hsakmt_get_gfxv_by_node_id(i) != GFX_VERSION_GFX950) { - svm.alignment_order = 9; + if (hsakmt_get_gfxv_by_node_id(ctx, i) != GFX_VERSION_GFX950) { + fmm_ctx->svm.alignment_order = 9; break; } } } - pr_info("SVM alignment default order is %d.", svm.alignment_order); - - struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + pr_info("SVM alignment default order is %d.", fmm_ctx->svm.alignment_order); /* Trade off - NumNodes includes GPU nodes + CPU Node. So in * systems with CPU node, slightly more memory is allocated than @@ -2874,7 +2892,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, for (i = 0; i < NumNodes; i++) { HsaNodeProperties props; - ret = hsakmt_topology_get_node_props(i, &props); + ret = hsakmt_topology_get_node_props(ctx, i, &props); if (ret != HSAKMT_STATUS_SUCCESS) goto gpu_mem_init_failed; @@ -2932,7 +2950,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, * required since Number of nodes is already known. Kernel will fill in * the apertures in kfd_process_device_apertures_ptr */ - num_of_sysfs_nodes = hsakmt_get_num_sysfs_nodes(); + num_of_sysfs_nodes = hsakmt_get_num_sysfs_nodes(ctx); if (num_of_sysfs_nodes < gpu_mem_count) { ret = HSAKMT_STATUS_ERROR; goto sysfs_parse_failed; @@ -2952,11 +2970,11 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, if (ret != HSAKMT_STATUS_SUCCESS) goto get_aperture_ioctl_failed; - all_gpu_id_array_size = 0; - all_gpu_id_array = NULL; + assert(fmm_ctx->all_gpu_id_array_size == 0); + assert(fmm_ctx->all_gpu_id_array == NULL); if (num_of_sysfs_nodes > 0) { - all_gpu_id_array = malloc(sizeof(uint32_t) * gpu_mem_count); - if (!all_gpu_id_array) { + fmm_ctx->all_gpu_id_array = malloc(sizeof(uint32_t) * gpu_mem_count); + if (!fmm_ctx->all_gpu_id_array) { ret = HSAKMT_STATUS_NO_MEMORY; goto get_aperture_ioctl_failed; } @@ -2975,22 +2993,22 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, if (gpu_mem_id < 0) continue; - if (all_gpu_id_array_size == gpu_mem_count) { + if (fmm_ctx->all_gpu_id_array_size == gpu_mem_count) { ret = HSAKMT_STATUS_ERROR; goto aperture_init_failed; } - all_gpu_id_array[all_gpu_id_array_size++] = process_apertures[i].gpu_id; + fmm_ctx->all_gpu_id_array[fmm_ctx->all_gpu_id_array_size++] = process_apertures[i].gpu_id; /* Add this GPU to the usable_peer_id_arrays of all GPUs that * this GPU has an IO link to. This GPU can map memory * allocated on those GPUs. */ nodeId = gpu_mem[gpu_mem_id].node_id; - ret = hsakmt_topology_get_node_props(nodeId, &nodeProps); + ret = hsakmt_topology_get_node_props(ctx, nodeId, &nodeProps); if (ret != HSAKMT_STATUS_SUCCESS) goto aperture_init_failed; assert(nodeProps.NumIOLinks <= NumNodes); - ret = hsakmt_topology_get_iolink_props(nodeId, nodeProps.NumIOLinks, + ret = hsakmt_topology_get_iolink_props(ctx, nodeId, nodeProps.NumIOLinks, linkProps); if (ret != HSAKMT_STATUS_SUCCESS) goto aperture_init_failed; @@ -3061,13 +3079,13 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, if (ret != HSAKMT_STATUS_SUCCESS) goto aperture_init_failed; } - all_gpu_id_array_size *= sizeof(uint32_t); + fmm_ctx->all_gpu_id_array_size *= sizeof(uint32_t); if (svm_limit) { /* At least one GPU uses GPUVM in canonical address * space. Set up SVM apertures shared by all such GPUs */ - ret = init_svm_apertures(svm_base, svm_limit, svm_alignment, + ret = init_svm_apertures(fmm_ctx, svm_base, svm_limit, svm_alignment, guardPages); if (ret != HSAKMT_STATUS_SUCCESS) goto init_svm_failed; @@ -3081,17 +3099,17 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, continue; /* Set memory policy to match the SVM apertures */ - alt_base = (uintptr_t)svm.dgpu_alt_aperture->base; - alt_size = VOID_PTRS_SUB(svm.dgpu_alt_aperture->limit, - svm.dgpu_alt_aperture->base) + 1; + alt_base = (uintptr_t)fmm_ctx->svm.dgpu_alt_aperture->base; + alt_size = VOID_PTRS_SUB(fmm_ctx->svm.dgpu_alt_aperture->limit, + fmm_ctx->svm.dgpu_alt_aperture->base) + 1; err = fmm_set_memory_policy(ctx, process_apertures[i].gpu_id, - svm.disable_cache ? + fmm_ctx->svm.disable_cache ? KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT, KFD_IOC_CACHE_POLICY_COHERENT, alt_base, alt_size, - hsakmt_get_gfxv_by_node_id(i) == GFX_VERSION_GFX950 ? + hsakmt_get_gfxv_by_node_id(ctx, i) == GFX_VERSION_GFX950 ? mfma_high_precision_mode : 0); if (err) { pr_err("Failed to set mem policy for GPU [0x%x]\n", @@ -3102,8 +3120,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, } } - cpuvm_aperture.align = PAGE_SIZE; - cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */ + fmm_ctx->cpuvm_aperture.align = PAGE_SIZE; + fmm_ctx->cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */ fmm_init_rbtree(fmm_ctx); @@ -3132,8 +3150,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, aperture_init_failed: init_svm_failed: set_memory_policy_failed: - free(all_gpu_id_array); - all_gpu_id_array = NULL; + free(fmm_ctx->all_gpu_id_array); + fmm_ctx->all_gpu_id_array = NULL; get_aperture_ioctl_failed: free(process_apertures); sysfs_parse_failed: @@ -3148,11 +3166,11 @@ void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx) release_mmio(ctx); - if (all_gpu_id_array) { - free(all_gpu_id_array); - all_gpu_id_array = NULL; + if (fmm_ctx->all_gpu_id_array) { + free(fmm_ctx->all_gpu_id_array); + fmm_ctx->all_gpu_id_array = NULL; } - all_gpu_id_array_size = 0; + fmm_ctx->all_gpu_id_array_size = 0; if (fmm_ctx->gpu_mem) { while (fmm_ctx->gpu_mem_count-- > 0) @@ -3207,10 +3225,10 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx, /* Report single SVM aperture, starting at base of * fine-grained, ending at limit of coarse-grained */ - if (aperture_is_valid(svm.dgpu_alt_aperture->base, - svm.dgpu_aperture->limit)) { - *aperture_base = PORT_VPTR_TO_UINT64(svm.dgpu_alt_aperture->base); - *aperture_limit = PORT_VPTR_TO_UINT64(svm.dgpu_aperture->limit); + if (aperture_is_valid(fmm_ctx->svm.dgpu_alt_aperture->base, + fmm_ctx->svm.dgpu_aperture->limit)) { + *aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->svm.dgpu_alt_aperture->base); + *aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->svm.dgpu_aperture->limit); err = HSAKMT_STATUS_SUCCESS; } break; @@ -3349,15 +3367,15 @@ static HSAKMT_STATUS _fmm_map_to_gpu(HsaKFDContext *ctx, /* not specified, not registered: map all GPUs */ int32_t gpu_mem_id = gpu_mem_find_by_node_id(fmm_ctx, obj->node_id); - if (!obj->userptr && hsakmt_get_device_id_by_node_id(obj->node_id) && + if (!obj->userptr && hsakmt_get_device_id_by_node_id(ctx, obj->node_id) && gpu_mem_id >= 0) { args.device_ids_array_ptr = (uint64_t) fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_array; args.n_devices = fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_num; } else { - args.device_ids_array_ptr = (uint64_t)all_gpu_id_array; - args.n_devices = all_gpu_id_array_size / sizeof(uint32_t); + args.device_ids_array_ptr = (uint64_t)fmm_ctx->all_gpu_id_array; + args.n_devices = fmm_ctx->all_gpu_id_array_size / sizeof(uint32_t); } } @@ -3427,7 +3445,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(HsaKFDContext *ctx, VOID_PTR_ADD(address, size - 1) > aperture->limit) return HSAKMT_STATUS_INVALID_PARAMETER; - is_debugger = hsakmt_debug_get_reg_status(fmm_ctx->gpu_mem[gpu_mem_id].node_id); + is_debugger = hsakmt_debug_get_reg_status(ctx, fmm_ctx->gpu_mem[gpu_mem_id].node_id); flags = is_debugger ? KFD_IOC_ALLOC_MEM_FLAGS_GTT : KFD_IOC_ALLOC_MEM_FLAGS_VRAM; flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE; @@ -3463,8 +3481,9 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx, void *svm_addr; HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1); HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; /* Map and return the GPUVM address adjusted by the offset * from the start of the page @@ -3472,8 +3491,8 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx, if (!object && hsakmt_is_svm_api_supported) { svm_addr = (void*)((HSAuint64)addr - page_offset); if (!nodes_to_map) { - nodes_to_map = all_gpu_id_array; - nodes_array_size = all_gpu_id_array_size; + nodes_to_map = fmm_ctx->all_gpu_id_array; + nodes_array_size = fmm_ctx->all_gpu_id_array_size; } pr_debug("%s Mapping Address %p size aligned: %ld offset: %x\n", __func__, svm_addr, PAGE_ALIGN_UP(page_offset + size), page_offset); @@ -3531,12 +3550,12 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx, } /* allocate buffer only, should be mapped by GEM API */ - if (aperture && (aperture == &mem_handle_aperture)) { + if (aperture && (aperture == &fmm_ctx->mem_handle_aperture)) { pthread_mutex_unlock(&aperture->fmm_mutex); return HSAKMT_STATUS_INVALID_PARAMETER; } - if (aperture && (aperture == &cpuvm_aperture)) { + if (aperture && (aperture == &fmm_ctx->cpuvm_aperture)) { /* Prefetch memory on APUs with dummy-reads */ fmm_check_user_memory(address, size); ret = HSAKMT_STATUS_SUCCESS; @@ -3733,7 +3752,7 @@ int hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address) return (!hsakmt_is_dgpu || hsakmt_is_svm_api_supported) ? 0 : -EINVAL; /* Successful vm_find_object returns with the aperture locked */ - if (aperture == &cpuvm_aperture) + if (aperture == &fmm_ctx->cpuvm_aperture) /* On APUs GPU unmapping of system memory is a no-op */ ret = 0; else @@ -3776,12 +3795,12 @@ bool hsakmt_fmm_get_handle(HsaKFDContext *ctx, } if (!aperture) { - if ((address >= svm.dgpu_aperture->base) && - (address <= svm.dgpu_aperture->limit)) { - aperture = svm.dgpu_aperture; - } else if ((address >= svm.dgpu_alt_aperture->base) && - (address <= svm.dgpu_alt_aperture->limit)) { - aperture = svm.dgpu_alt_aperture; + if ((address >= fmm_ctx->svm.dgpu_aperture->base) && + (address <= fmm_ctx->svm.dgpu_aperture->limit)) { + aperture = fmm_ctx->svm.dgpu_aperture; + } else if ((address >= fmm_ctx->svm.dgpu_alt_aperture->base) && + (address <= fmm_ctx->svm.dgpu_alt_aperture->limit)) { + aperture = fmm_ctx->svm.dgpu_alt_aperture; } } @@ -3822,7 +3841,6 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx, vm_object_t **obj_ret, HsaMemFlags flags) { - manageable_aperture_t *aperture = svm.dgpu_aperture; HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1); HSAuint64 aligned_addr = (HSAuint64)addr - page_offset; HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size); @@ -3830,7 +3848,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx, HSAuint32 gpu_id; vm_object_t *obj, *exist_obj; struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - + manageable_aperture_t *aperture = fmm_ctx->svm.dgpu_aperture; /* Find first GPU for creating the userptr BO */ if (!fmm_ctx->first_gpu_mem) return HSAKMT_STATUS_ERROR; @@ -3838,7 +3856,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx, gpu_id = fmm_ctx->first_gpu_mem->gpu_id; /* Optionally check that the CPU mapping is valid */ - if (svm.check_userptr) + if (fmm_ctx->svm.check_userptr) fmm_check_user_memory(addr, size); /* Allocate BO, userptr address is passed in mmap_offset */ @@ -3867,7 +3885,7 @@ static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx, ++exist_obj->registration_count; } else { obj->userptr = addr; - hsakmt_gpuid_to_nodeid(gpu_id, &obj->node_id); + hsakmt_gpuid_to_nodeid(ctx, gpu_id, &obj->node_id); obj->userptr_size = size; obj->registration_count = 1; obj->user_node.key = rbtree_key((unsigned long)addr, size); @@ -3919,7 +3937,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx, return ret; if (gpu_id_array_size == 0) return HSAKMT_STATUS_SUCCESS; - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; pthread_mutex_lock(&aperture->fmm_mutex); /* fall through for registered device ID array setup */ } else if (object->userptr) { @@ -4019,9 +4037,9 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, /* import DMA buffer without VA assigned */ if (!gpu_id_array && gpu_id_array_size == 0 && !RegisterFlags.ui32.requiresVAddr) { - aperture = &mem_handle_aperture; + aperture = &fmm_ctx->mem_handle_aperture; } else if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) { - aperture = svm.dgpu_aperture; + aperture = fmm_ctx->svm.dgpu_aperture; } else { aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture; aperture_base = aperture->base; @@ -4037,7 +4055,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, } /* Import DMA buffer */ - if (aperture == &mem_handle_aperture) + if (aperture == &fmm_ctx->mem_handle_aperture) importArgs.va_addr = 0; else importArgs.va_addr = VOID_PTRS_SUB(mem, aperture_base); @@ -4059,7 +4077,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, obj->metadata = metadata; obj->registered_device_id_array = gpu_id_array; obj->registered_device_id_array_size = gpu_id_array_size; - hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &obj->node_id); + hsakmt_gpuid_to_nodeid(ctx, infoArgs.gpu_id, &obj->node_id); } pthread_mutex_unlock(&aperture->fmm_mutex); if (!obj) @@ -4069,7 +4087,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, GraphicsResourceInfo->SizeInBytes = infoArgs.size; GraphicsResourceInfo->Metadata = (void *)(unsigned long)infoArgs.metadata_ptr; GraphicsResourceInfo->MetadataSizeInBytes = infoArgs.metadata_size; - hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &GraphicsResourceInfo->NodeId); + hsakmt_gpuid_to_nodeid(ctx, infoArgs.gpu_id, &GraphicsResourceInfo->NodeId); return HSAKMT_STATUS_SUCCESS; @@ -4159,7 +4177,7 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx, if (!obj) return HSAKMT_STATUS_INVALID_PARAMETER; - r = hsakmt_validate_nodeid(obj->node_id, &gpu_id); + r = hsakmt_validate_nodeid(ctx, obj->node_id, &gpu_id); if (r != HSAKMT_STATUS_SUCCESS) return r; if (!gpu_id && hsakmt_is_dgpu) { @@ -4310,7 +4328,7 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address) HSAKMT_STATUS_MEMORY_NOT_REGISTERED; /* Successful vm_find_object returns with aperture locked */ - if (aperture == &cpuvm_aperture) { + if (aperture == &fmm_ctx->cpuvm_aperture) { /* API-allocated system memory on APUs, deregistration * is a no-op */ @@ -4382,14 +4400,14 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx, } /* allocates buffer only, should be mapped by GEM API */ - if (aperture == &mem_handle_aperture) { + if (aperture == &fmm_ctx->mem_handle_aperture) { pthread_mutex_unlock(&aperture->fmm_mutex); return HSAKMT_STATUS_INVALID_PARAMETER; } /* APU memory is not supported by this function */ if (aperture && - (aperture == &cpuvm_aperture || !aperture->is_cpu_accessible)) { + (aperture == &fmm_ctx->cpuvm_aperture || !aperture->is_cpu_accessible)) { pthread_mutex_unlock(&aperture->fmm_mutex); return HSAKMT_STATUS_ERROR; } @@ -4403,8 +4421,8 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx, } /* Verify that all nodes to map are registered already */ - registered_node_id_array = all_gpu_id_array; - registered_node_id_array_size = all_gpu_id_array_size; + registered_node_id_array = fmm_ctx->all_gpu_id_array; + registered_node_id_array_size = fmm_ctx->all_gpu_id_array_size; if (object->registered_device_id_array_size > 0 && object->registered_device_id_array) { registered_node_id_array = object->registered_device_id_array; @@ -4518,7 +4536,7 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx, * register to new nodes) or the memory being freed */ for (i = 0; i < info->NRegisteredNodes; i++) - hsakmt_gpuid_to_nodeid(vm_obj->registered_device_id_array[i], + hsakmt_gpuid_to_nodeid(ctx, vm_obj->registered_device_id_array[i], &vm_obj->registered_node_id_array[i]); } info->RegisteredNodes = vm_obj->registered_node_id_array; @@ -4537,7 +4555,7 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx, * to new nodes) or memory being freed */ for (i = 0; i < info->NMappedNodes; i++) - hsakmt_gpuid_to_nodeid(vm_obj->mapped_device_id_array[i], + hsakmt_gpuid_to_nodeid(ctx, vm_obj->mapped_device_id_array[i], &vm_obj->mapped_node_id_array[i]); } info->MappedNodes = vm_obj->mapped_node_id_array; @@ -4681,27 +4699,28 @@ void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx) struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - fmm_clear_aperture(&mem_handle_aperture); - fmm_clear_aperture(&cpuvm_aperture); - fmm_clear_aperture(&svm.apertures[SVM_DEFAULT]); - fmm_clear_aperture(&svm.apertures[SVM_COHERENT]); + fmm_clear_aperture(&fmm_ctx->mem_handle_aperture); + fmm_clear_aperture(&fmm_ctx->cpuvm_aperture); + fmm_clear_aperture(&fmm_ctx->svm.apertures[SVM_DEFAULT]); + fmm_clear_aperture(&fmm_ctx->svm.apertures[SVM_COHERENT]); - if (dgpu_shared_aperture_limit) { + if (fmm_ctx->dgpu_shared_aperture_limit) { /* Use the same dgpu range as the parent. If failed, then set * hsakmt_is_dgpu_mem_init to false. Later on dgpu_mem_init will try * to get a new range */ - map_addr = mmap(dgpu_shared_aperture_base, (HSAuint64)(dgpu_shared_aperture_limit)- - (HSAuint64)(dgpu_shared_aperture_base) + 1, PROT_NONE, + map_addr = mmap(fmm_ctx->dgpu_shared_aperture_base, + (HSAuint64)(fmm_ctx->dgpu_shared_aperture_limit)- + (HSAuint64)(fmm_ctx->dgpu_shared_aperture_base) + 1, PROT_NONE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0); if (map_addr == MAP_FAILED) { - munmap(dgpu_shared_aperture_base, - (HSAuint64)(dgpu_shared_aperture_limit) - - (HSAuint64)(dgpu_shared_aperture_base) + 1); + munmap(fmm_ctx->dgpu_shared_aperture_base, + (HSAuint64)(fmm_ctx->dgpu_shared_aperture_limit) - + (HSAuint64)(fmm_ctx->dgpu_shared_aperture_base) + 1); - dgpu_shared_aperture_base = NULL; - dgpu_shared_aperture_limit = NULL; + fmm_ctx->dgpu_shared_aperture_base = NULL; + fmm_ctx->dgpu_shared_aperture_limit = NULL; } } diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h index b2f04dbcdf..687d960b84 100644 --- a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h +++ b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h @@ -740,6 +740,156 @@ hsaKmtAllocQueueGWSCtx( HSAuint32 *firstGWS //OUT ); +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRuntimeEnableCtx( + HsaKFDContext *ctx, //IN + void* rDebug, //IN + bool setupTtmp //IN + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRuntimeDisableCtx( + HsaKFDContext *ctx //IN + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetRuntimeCapabilitiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 *caps_mask //OUT + ); + +/** + Enable debug trap. +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgEnableCtx( + HsaKFDContext *ctx, //IN + void **runtime_info, //Out + HSAuint32 *data_size //Out + ); + +/** + Disable debug trap. +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgDisableCtx( + HsaKFDContext *ctx //IN + ); + +/** + Get device snapshot. +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgGetDeviceDataCtx( + HsaKFDContext *ctx, //IN + void **data, //Out + HSAuint32 *n_entries, //Out + HSAuint32 *entry_size //Out + ); + +/** + Get queues snapshot. +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgGetQueueDataCtx( + HsaKFDContext *ctx, //IN + void **data, //Out + HSAuint32 *n_entries, //Out + HSAuint32 *entry_size, //Out + bool suspend_queues //In + ); + +/** + Check whether gpu firmware and kernel support debugging +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCheckRuntimeDebugSupportCtx( + HsaKFDContext *ctx //IN + ); + +/** + Debug ops call primarily used for KFD testing + */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDebugTrapIoctlCtx( + HsaKFDContext *ctx, //IN + struct kfd_ioctl_dbg_trap_args *args, //IN/OUT + HSA_QUEUEID *Queues, //IN + HSAuint64 *DebugReturn //OUT + ); + +/** + Gets GPU and CPU clock counters for particular Node +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetClockCountersCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HsaClockCounters *Counters); //OUT + +/** + Retrieves information on the available HSA counters +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcGetCounterPropertiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HsaCounterProperties** CounterProperties //OUT + ); + +/** + Registers a set of (HW) counters to be used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcRegisterTraceCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSAuint32 NumberOfCounters, //IN + HsaCounter* Counters, //IN + HsaPmcTraceRoot* TraceRoot //OUT + ); + +/** + Allows a user mode process to get exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcAcquireTraceAccessCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ); + +/** + Allows a user mode process to release exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcReleaseTraceAccessCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ); + /* Helper functions for calling KFD SVM ioctl */ HSAKMT_STATUS HSAKMTAPI diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c index 7b1c69b76b..6cf1e99762 100644 --- a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c +++ b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c @@ -282,7 +282,7 @@ void model_init(void) for (unsigned node_id = 0; node_id < props.NumNodes; node_id++) { HsaNodeProperties node_props; - result = hsakmt_topology_get_node_props(node_id, &node_props); + result = hsakmt_topology_get_node_props(&hsakmt_primary_kfd_ctx, node_id, &node_props); if (result != HSAKMT_STATUS_SUCCESS) { fprintf(stderr, "model: Failed to get node %u properties\n", node_id); diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c index 981c53eb4a..8db36f8747 100644 --- a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c +++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c @@ -37,9 +37,12 @@ void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx) assert(ctx); ctx->fd = fd; + ctx->topology_context = NULL; ctx->queue_context = NULL; ctx->fmm_context = NULL; ctx->event_context = NULL; + ctx->debug_context = NULL; + ctx->perf_context = NULL; } void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx) @@ -47,6 +50,10 @@ void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx) if (!ctx) return; + if (ctx->topology_context) { + free(ctx->topology_context); + ctx->topology_context = NULL; + } if (ctx->queue_context) { free(ctx->queue_context); ctx->queue_context = NULL; @@ -59,5 +66,13 @@ void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx) free(ctx->event_context); ctx->event_context = NULL; } + if (ctx->debug_context) { + free(ctx->debug_context); + ctx->debug_context = NULL; + } + if (ctx->perf_context) { + free(ctx->perf_context); + ctx->perf_context = NULL; + } ctx->fd = -1; } diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h index 8053e74f7b..1b9d6ccda3 100644 --- a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h +++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h @@ -28,9 +28,12 @@ #include +struct hsa_kfd_topology_context; struct hsa_kfd_queue_context; struct hsa_kfd_fmm_context; struct hsa_kfd_event_context; +struct hsa_kfd_debug_context; +struct hsa_kfd_perf_context; /* * HsaKFDContext @@ -52,6 +55,9 @@ typedef struct _HsaKFDContext /* File descriptor for the KFD device */ int fd; + /* Topology context for managing system topology information */ + struct hsa_kfd_topology_context *topology_context; + /* Queue context for managing user queues */ struct hsa_kfd_queue_context *queue_context; @@ -60,6 +66,12 @@ typedef struct _HsaKFDContext /* Event context for managing events */ struct hsa_kfd_event_context *event_context; + + /* Debug context for managing debug operations */ + struct hsa_kfd_debug_context *debug_context; + + /* perf context for managing perf operations */ + struct hsa_kfd_perf_context *perf_context; } HsaKFDContext; // Initialize a pre-allocated HsaKFDContext with the given file descriptor @@ -67,8 +79,10 @@ void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx); // Release all resources associated with the given KFD context void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx); +struct hsa_kfd_topology_context *hsakmt_kfdcontext_get_topology_context(HsaKFDContext *ctx); struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx); struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx); struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx); - +struct hsa_kfd_debug_context *hsakmt_kfdcontext_get_debug_context(HsaKFDContext *ctx); +struct hsa_kfd_perf_context *hsakmt_kfdcontext_get_perf_context(HsaKFDContext *ctx); #endif /* _KFDCONTEXT_H_ */ diff --git a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h index 7440d55e02..f3bd3a5651 100644 --- a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h +++ b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h @@ -188,23 +188,26 @@ HSAKMT_STATUS hsakmt_init_kfd_version(void); #define IS_SOC15(gfxv) ((gfxv) >= GFX_VERSION_VEGA10) -HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); -HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); -uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id); -bool hsakmt_prefer_ats(HSAuint32 node_id); -uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id); -uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id); -uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node); +HSAKMT_STATUS hsakmt_validate_nodeid(HsaKFDContext *ctx, uint32_t nodeid, uint32_t *gpu_id); +HSAKMT_STATUS hsakmt_gpuid_to_nodeid(HsaKFDContext *ctx, uint32_t gpu_id, uint32_t* node_id); +uint32_t hsakmt_get_gfxv_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id); +bool hsakmt_prefer_ats(HsaKFDContext *ctx, HSAuint32 node_id); +uint16_t hsakmt_get_device_id_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id); +uint16_t hsakmt_get_device_id_by_gpu_id(HsaKFDContext *ctx, HSAuint32 gpu_id); +uint32_t hsakmt_get_direct_link_cpu(HsaKFDContext *ctx, uint32_t gpu_node); int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id); -HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array, +HSAKMT_STATUS hsakmt_validate_nodeid_array(HsaKFDContext *ctx, + uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray); HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, HsaSystemProperties *props); -HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId, +HSAKMT_STATUS hsakmt_topology_get_node_props(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaNodeProperties *NodeProperties); -HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId, - HSAuint32 NumIoLinks, - HsaIoLinkProperties *IoLinkProperties); +HSAKMT_STATUS hsakmt_topology_get_iolink_props(HsaKFDContext *ctx, + HSAuint32 NodeId, + HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties); void hsakmt_topology_setup_is_dgpu_param(HsaNodeProperties *props); bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId); @@ -212,7 +215,7 @@ HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags); HSAuint64 MapDrmPerm(HsaMemoryMapFlags flags); void* hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx, - uint32_t size, uint32_t align, + uint32_t size, uint32_t align, uint32_t gpu_id, uint32_t NodeId, bool NonPaged, bool DeviceLocal, bool Uncached); @@ -221,11 +224,11 @@ void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx, HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, unsigned int NumNodes); void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx); -HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes); -void hsakmt_destroy_device_debugging_memory(void); -bool hsakmt_debug_get_reg_status(uint32_t node_id); -HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes); -void hsakmt_destroy_counter_props(void); +HSAKMT_STATUS hsakmt_init_device_debugging_memory(HsaKFDContext *ctx, unsigned int NumNodes); +void hsakmt_destroy_device_debugging_memory(HsaKFDContext *ctx); +bool hsakmt_debug_get_reg_status(HsaKFDContext *ctx, uint32_t node_id); +HSAKMT_STATUS hsakmt_init_counter_props(HsaKFDContext *ctx, unsigned int NumNodes); +void hsakmt_destroy_counter_props(HsaKFDContext *ctx); uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues); extern int hsakmt_ioctl(int fd, unsigned long request, void *arg); @@ -250,7 +253,7 @@ void hsakmt_clear_events_page(HsaKFDContext *ctx); void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx); void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx); void hsakmt_clear_process_doorbells(HsaKFDContext *ctx); -uint32_t hsakmt_get_num_sysfs_nodes(void); +uint32_t hsakmt_get_num_sysfs_nodes(HsaKFDContext *ctx); bool hsakmt_is_forked_child(void); diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c index 07cfb70fb5..31e2a7551f 100644 --- a/projects/rocr-runtime/libhsakmt/src/memory.c +++ b/projects/rocr-runtime/libhsakmt/src/memory.c @@ -55,11 +55,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicyCtx(HsaKFDContext *ctx, pr_debug("[%s] node %d; default %d; alternate %d\n", __func__, Node, DefaultPolicy, AlternatePolicy); - result = hsakmt_validate_nodeid(Node, &gpu_id); + result = hsakmt_validate_nodeid(ctx, Node, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; - if (hsakmt_get_gfxv_by_node_id(Node) != GFX_VERSION_KAVERI) + if (hsakmt_get_gfxv_by_node_id(ctx, Node) != GFX_VERSION_KAVERI) /* This is a legacy API useful on Kaveri only. On dGPU * the alternate aperture is setup and used * automatically for coherent allocations. Don't let @@ -137,7 +137,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlignCtx(HsaKFDContext *ctx, pr_debug("[%s] node %d\n", __func__, PreferredNode); - result = hsakmt_validate_nodeid(PreferredNode, &gpu_id); + result = hsakmt_validate_nodeid(ctx, PreferredNode, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode); return result; @@ -254,7 +254,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemoryCtx(HsaKFDContext *ctx, pr_debug("[%s] node %d\n", __func__, Node); - result = hsakmt_validate_nodeid(Node, &args.gpu_id); + result = hsakmt_validate_nodeid(ctx, Node, &args.gpu_id); if (result != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, Node); return result; @@ -304,7 +304,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodesCtx(HsaKFDContext *ctx, /* TODO: support mixed APU and dGPU configurations */ return HSAKMT_STATUS_NOT_SUPPORTED; - ret = hsakmt_validate_nodeid_array(&gpu_id_array, + ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array, NumberOfNodes, NodeArray); if (ret == HSAKMT_STATUS_SUCCESS) { @@ -385,7 +385,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExtCtx(HsaKFDContext pr_debug("[%s] number of nodes %lu\n", __func__, NumberOfNodes); if (NodeArray != NULL || NumberOfNodes != 0) { - ret = hsakmt_validate_nodeid_array(&gpu_id_array, + ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array, NumberOfNodes, NodeArray); } @@ -467,7 +467,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodesCtx(HsaKFDContext *ctx, return HSAKMT_STATUS_INVALID_PARAMETER; if (NodeArray) { - ret = hsakmt_validate_nodeid_array(&gpu_id_array, NumberOfNodes, NodeArray); + ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array, NumberOfNodes, NodeArray); if (ret != HSAKMT_STATUS_SUCCESS) goto error; } @@ -567,7 +567,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodesCtx(HsaKFDContext *ctx, return hsaKmtMapMemoryToGPUCtx(ctx, MemoryAddress, MemorySizeInBytes, AlternateVAGPU); - ret = hsakmt_validate_nodeid_array(&gpu_id_array, + ret = hsakmt_validate_nodeid_array(ctx, &gpu_id_array, NumberOfNodes, NodeArray); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -633,7 +633,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfigCtx(HsaKFDContext *ctx, pr_debug("[%s] node %d\n", __func__, NodeId); - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; diff --git a/projects/rocr-runtime/libhsakmt/src/openclose.c b/projects/rocr-runtime/libhsakmt/src/openclose.c index 4d7d428891..abaa705c75 100644 --- a/projects/rocr-runtime/libhsakmt/src/openclose.c +++ b/projects/rocr-runtime/libhsakmt/src/openclose.c @@ -106,7 +106,7 @@ static void clear_after_fork(HsaKFDContext *ctx) hsakmt_clear_process_doorbells(ctx); hsakmt_clear_events_page(ctx); hsakmt_fmm_clear_all_mem(ctx); - hsakmt_destroy_device_debugging_memory(); + hsakmt_destroy_device_debugging_memory(ctx); int fd = ctx->fd; if (fd >= 0) { @@ -226,10 +226,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFDCtx(HsaKFDContext **pCtx) hsakmt_kfd_open_count = 1; - if (hsakmt_init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) + if (hsakmt_init_device_debugging_memory(&hsakmt_primary_kfd_ctx, sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) pr_warn("Insufficient Memory. Debugging unavailable\n"); - hsakmt_init_counter_props(sys_props.NumNodes); + hsakmt_init_counter_props(&hsakmt_primary_kfd_ctx, sys_props.NumNodes); *pCtx = &hsakmt_primary_kfd_ctx; if (!atfork_installed) { @@ -269,8 +269,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFDCtx(void) if (hsakmt_kfd_open_count > 0) { if (--hsakmt_kfd_open_count == 0) { - hsakmt_destroy_counter_props(); - hsakmt_destroy_device_debugging_memory(); + hsakmt_destroy_counter_props(&hsakmt_primary_kfd_ctx); + hsakmt_destroy_device_debugging_memory(&hsakmt_primary_kfd_ctx); hsakmt_fmm_clear_all_aperture(&hsakmt_primary_kfd_ctx); } diff --git a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c index 055a30c0b1..a7a77e499f 100644 --- a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c +++ b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c @@ -52,7 +52,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void CHECK_KFD_OPEN(); CHECK_KFD_MINOR_VERSION(16); - HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id); + HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return ret; @@ -99,7 +99,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, HsaPcSamplingIn CHECK_KFD_OPEN(); *traceId = INVALID_TRACE_ID; - HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id); + HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return ret; @@ -139,7 +139,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, HsaPcSamplingT CHECK_KFD_OPEN(); - HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id); + HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return ret; @@ -171,7 +171,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, HsaPcSamplingTra CHECK_KFD_OPEN(); - HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id); + HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return ret; @@ -210,7 +210,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, HsaPcSamplingTrac CHECK_KFD_OPEN(); - HSAKMT_STATUS ret = hsakmt_validate_nodeid(NodeId, &gpu_id); + HSAKMT_STATUS ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, NodeId, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); return ret; diff --git a/projects/rocr-runtime/libhsakmt/src/perfctr.c b/projects/rocr-runtime/libhsakmt/src/perfctr.c index 0078836bcc..4e1de7956a 100644 --- a/projects/rocr-runtime/libhsakmt/src/perfctr.c +++ b/projects/rocr-runtime/libhsakmt/src/perfctr.c @@ -37,6 +37,7 @@ #include #include #include +#include #define BITS_PER_BYTE CHAR_BIT @@ -75,8 +76,32 @@ struct perf_counts_values { }; }; -static HsaCounterProperties **counter_props; -static unsigned int counter_props_count; +struct hsa_kfd_perf_context +{ + HsaCounterProperties **counter_props; + unsigned int counter_props_count; +}; + +struct hsa_kfd_perf_context *hsakmt_kfdcontext_get_perf_context(HsaKFDContext *ctx) +{ + assert(ctx); + if (!ctx) { + pr_err("Expected a non-null ptr for HsaKFDContext"); + return NULL; + } + + if (ctx->perf_context) + return ctx->perf_context; + + ctx->perf_context = calloc(1, sizeof(struct hsa_kfd_perf_context)); + if (!ctx->perf_context) { + pr_err("Alloc memory failed for struct hsa_kfd_perf_context size %zu\n", + sizeof(struct hsa_kfd_perf_context)); + return NULL; + } + + return ctx->perf_context; +} static ssize_t readn(int fd, void *buf, size_t n) { @@ -99,33 +124,35 @@ static ssize_t readn(int fd, void *buf, size_t n) return n; } -HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes) +HSAKMT_STATUS hsakmt_init_counter_props(HsaKFDContext *ctx, unsigned int NumNodes) { - counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *)); - if (!counter_props) { + struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx); + perf_ctx->counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *)); + if (!perf_ctx->counter_props) { pr_warn("Profiling is not available.\n"); return HSAKMT_STATUS_NO_MEMORY; } - counter_props_count = NumNodes; + perf_ctx->counter_props_count = NumNodes; return HSAKMT_STATUS_SUCCESS; } -void hsakmt_destroy_counter_props(void) +void hsakmt_destroy_counter_props(HsaKFDContext *ctx) { unsigned int i; + struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx); - if (!counter_props) + if (!perf_ctx->counter_props) return; - for (i = 0; i < counter_props_count; i++) - if (counter_props[i]) { - free(counter_props[i]); - counter_props[i] = NULL; + for (i = 0; i < perf_ctx->counter_props_count; i++) + if (perf_ctx->counter_props[i]) { + free(perf_ctx->counter_props[i]); + perf_ctx->counter_props[i] = NULL; } - free(counter_props); + free(perf_ctx->counter_props); } static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid) @@ -211,11 +238,12 @@ static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid) return rc; } -static HSAuint32 get_block_concurrent_limit(uint32_t node_id, +static HSAuint32 get_block_concurrent_limit(struct hsa_kfd_perf_context *perf_ctx, + uint32_t node_id, HSAuint32 block_id) { uint32_t i; - HsaCounterBlockProperties *block = &counter_props[node_id]->Blocks[0]; + HsaCounterBlockProperties *block = &perf_ctx->counter_props[node_id]->Blocks[0]; for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { if (block->Counters[0].BlockIndex == block_id) @@ -254,7 +282,8 @@ static HSAKMT_STATUS query_trace(int fd, uint64_t *buf) return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterPropertiesCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaCounterProperties **CounterProperties) { HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS; @@ -265,23 +294,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, struct perf_counter_block block = {0}; uint32_t total_blocks = 0; HsaCounterBlockProperties *block_prop; + struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx); - if (!counter_props) + if (!perf_ctx->counter_props) return HSAKMT_STATUS_NO_MEMORY; if (!CounterProperties) return HSAKMT_STATUS_INVALID_PARAMETER; - if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) + if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) return HSAKMT_STATUS_INVALID_NODE_UNIT; - if (counter_props[NodeId]) { - *CounterProperties = counter_props[NodeId]; + if (perf_ctx->counter_props[NodeId]) { + *CounterProperties = perf_ctx->counter_props[NodeId]; return HSAKMT_STATUS_SUCCESS; } for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { - rc = hsakmt_get_block_properties(NodeId, i, &block); + rc = hsakmt_get_block_properties(ctx, NodeId, i, &block); if (rc != HSAKMT_STATUS_SUCCESS) return rc; total_concurrent += block.num_of_slots; @@ -295,19 +325,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, sizeof(HsaCounterBlockProperties) * (total_blocks - 1) + sizeof(HsaCounter) * (total_counters - total_blocks); - counter_props[NodeId] = malloc(counter_props_size); - if (!counter_props[NodeId]) + perf_ctx->counter_props[NodeId] = malloc(counter_props_size); + if (!perf_ctx->counter_props[NodeId]) return HSAKMT_STATUS_NO_MEMORY; - counter_props[NodeId]->NumBlocks = total_blocks; - counter_props[NodeId]->NumConcurrent = total_concurrent; + perf_ctx->counter_props[NodeId]->NumBlocks = total_blocks; + perf_ctx->counter_props[NodeId]->NumConcurrent = total_concurrent; - block_prop = &counter_props[NodeId]->Blocks[0]; + block_prop = &perf_ctx->counter_props[NodeId]->Blocks[0]; for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) { - rc = hsakmt_get_block_properties(NodeId, block_id, &block); + rc = hsakmt_get_block_properties(ctx, NodeId, block_id, &block); if (rc != HSAKMT_STATUS_SUCCESS) { - free(counter_props[NodeId]); - counter_props[NodeId] = NULL; + free(perf_ctx->counter_props[NodeId]); + perf_ctx->counter_props[NodeId] = NULL; return rc; } @@ -329,13 +359,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, block_prop = (HsaCounterBlockProperties *)&block_prop->Counters[block_prop->NumCounters]; } - *CounterProperties = counter_props[NodeId]; + *CounterProperties = perf_ctx->counter_props[NodeId]; return HSAKMT_STATUS_SUCCESS; } /* Registers a set of (HW) counters to be used for tracing/profiling */ -HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTraceCtx(HsaKFDContext* ctx, + HSAuint32 NodeId, HSAuint32 NumberOfCounters, HsaCounter *Counters, HsaPmcTraceRoot *TraceRoot) @@ -353,6 +384,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, uint32_t block, num_blocks = 0, total_counters = 0; uint64_t *counter_id_ptr; int *fd_ptr; + struct hsa_kfd_perf_context *perf_ctx = hsakmt_kfdcontext_get_perf_context(ctx); pr_debug("[%s] Number of counters %d\n", __func__, NumberOfCounters); @@ -362,7 +394,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, return HSAKMT_STATUS_NO_MEMORY; } - if (!counter_props) { + if (!perf_ctx->counter_props) { pr_err("Profiling is not available, counter_props is NULL.\n"); goto no_memory_exit; } @@ -370,7 +402,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, if (!Counters || !TraceRoot || NumberOfCounters == 0) goto invalid_parameter_exit; - if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) { + if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) { free(counter_id); return HSAKMT_STATUS_INVALID_NODE_UNIT; } @@ -408,7 +440,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { if (!num_counters[i]) continue; - concurrent_limit = get_block_concurrent_limit(NodeId, i); + concurrent_limit = get_block_concurrent_limit(perf_ctx, NodeId, i); if (!concurrent_limit) { pr_err("Invalid block ID: %d\n", i); goto invalid_parameter_exit; @@ -509,7 +541,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, /* Unregisters a set of (HW) counters used for tracing/profiling */ -HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTraceCtx(HsaKFDContext* ctx, + HSAuint32 NodeId, HSATraceId TraceId) { uint32_t gpu_id; @@ -520,7 +553,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, if (TraceId == 0) return HSAKMT_STATUS_INVALID_PARAMETER; - if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) + if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) return HSAKMT_STATUS_INVALID_NODE_UNIT; trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); @@ -544,7 +577,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccessCtx(HsaKFDContext* ctx, + HSAuint32 NodeId, HSATraceId TraceId) { struct perf_trace *trace; @@ -561,7 +595,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId, if (trace->magic4cc != HSA_PERF_MAGIC4CC) return HSAKMT_STATUS_INVALID_HANDLE; - if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) + if (hsakmt_validate_nodeid(ctx, NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) return HSAKMT_STATUS_INVALID_NODE_UNIT; return ret; @@ -692,3 +726,32 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) return ret; } + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId, + HsaCounterProperties **CounterProperties) +{ + return hsaKmtPmcGetCounterPropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, CounterProperties); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, + HSAuint32 NumberOfCounters, + HsaCounter *Counters, + HsaPmcTraceRoot *TraceRoot) +{ + return hsaKmtPmcRegisterTraceCtx(&hsakmt_primary_kfd_ctx, + NodeId, NumberOfCounters, Counters, TraceRoot); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, + HSATraceId TraceId) +{ + return hsaKmtPmcUnregisterTraceCtx(&hsakmt_primary_kfd_ctx, + NodeId, TraceId); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId, + HSATraceId TraceId) +{ + return hsaKmtPmcAcquireTraceAccessCtx(&hsakmt_primary_kfd_ctx, + NodeId, TraceId); +} \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/src/pmc_table.c b/projects/rocr-runtime/libhsakmt/src/pmc_table.c index 6b9a9e6719..cdcecbe5f4 100644 --- a/projects/rocr-runtime/libhsakmt/src/pmc_table.c +++ b/projects/rocr-runtime/libhsakmt/src/pmc_table.c @@ -1958,12 +1958,13 @@ static struct perf_counter_block navi_blocks[PERFCOUNTER_BLOCKID__MAX] = { }, }; -HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id, +HSAKMT_STATUS hsakmt_get_block_properties(HsaKFDContext *ctx, + uint32_t node_id, enum perf_block_id block_id, struct perf_counter_block *block) { - uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id); - uint16_t dev_id = hsakmt_get_device_id_by_node_id(node_id); + uint32_t gfxv = hsakmt_get_gfxv_by_node_id(ctx, node_id); + uint16_t dev_id = hsakmt_get_device_id_by_node_id(ctx, node_id); if (block_id >= PERFCOUNTER_BLOCKID__MAX || block_id < PERFCOUNTER_BLOCKID__FIRST) diff --git a/projects/rocr-runtime/libhsakmt/src/pmc_table.h b/projects/rocr-runtime/libhsakmt/src/pmc_table.h index 6154a8c559..213b205684 100644 --- a/projects/rocr-runtime/libhsakmt/src/pmc_table.h +++ b/projects/rocr-runtime/libhsakmt/src/pmc_table.h @@ -67,7 +67,8 @@ struct perf_counter_block { uint64_t counter_mask; }; -HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id, +HSAKMT_STATUS hsakmt_get_block_properties(HsaKFDContext *ctx, + uint32_t node_id, enum perf_block_id block_id, struct perf_counter_block *block); diff --git a/projects/rocr-runtime/libhsakmt/src/queues.c b/projects/rocr-runtime/libhsakmt/src/queues.c index 5a92bdbc48..c2a29a9774 100644 --- a/projects/rocr-runtime/libhsakmt/src/queues.c +++ b/projects/rocr-runtime/libhsakmt/src/queues.c @@ -148,14 +148,15 @@ HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, unsigned int Num return ret; } -static void get_doorbell_map_info(uint32_t node_id, +static void get_doorbell_map_info(HsaKFDContext *ctx, + uint32_t node_id, struct process_doorbells *doorbell) { /* * GPUVM doorbell on Tonga requires a workaround for VM TLB ACTIVE bit * lookup bug. Remove ASIC check when this is implemented in amdgpu. */ - uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id); + uint32_t gfxv = hsakmt_get_gfxv_by_node_id(ctx, node_id); doorbell->use_gpuvm = (hsakmt_is_dgpu && gfxv != GFX_VERSION_TONGA); doorbell->size = DOORBELLS_PAGE_SIZE(DOORBELL_SIZE(gfxv)); @@ -272,7 +273,7 @@ static HSAKMT_STATUS map_doorbell(HsaKFDContext *ctx, return HSAKMT_STATUS_SUCCESS; } - get_doorbell_map_info(NodeId, &doorbells[NodeId]); + get_doorbell_map_info(ctx, NodeId, &doorbells[NodeId]); if (doorbells[NodeId].use_gpuvm) { status = map_doorbell_dgpu(ctx, NodeId, gpu_id, doorbell_mmap_offset); @@ -385,7 +386,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx, * nonPaged=0 system memory allocation uses GTT path */ if (!nonPaged) { - cpu_id = hsakmt_get_direct_link_cpu(NodeId); + cpu_id = hsakmt_get_direct_link_cpu(ctx, NodeId); if (cpu_id == INVALID_NODEID) { flags.ui32.NoNUMABind = 1; cpu_id = 0; @@ -460,7 +461,8 @@ static void free_exec_aligned_memory(HsaKFDContext *ctx, munmap(addr, size); } -static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size, +static HSAKMT_STATUS register_svm_range(HsaKFDContext *ctx, + void *mem, uint32_t size, uint32_t gpuNode, uint32_t prefetchNode, uint32_t preferredNode, bool alwaysMapped) { @@ -493,7 +495,7 @@ static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size, attrs[5].type = HSA_SVM_ATTR_GRANULARITY; attrs[5].value = 0xFF; - return hsaKmtSVMSetAttr(mem, size, nattr, attrs); + return hsaKmtSVMSetAttrCtx(ctx, mem, size, nattr, attrs); } static void free_queue(HsaKFDContext *ctx, struct queue *q) @@ -599,7 +601,7 @@ static int handle_concrete_asic(HsaKFDContext *ctx, fill_cwsr_header(q, addr, Event, ErrPayload, node.NumXcc); - HSAKMT_STATUS r = register_svm_range(addr, size, + HSAKMT_STATUS r = register_svm_range(ctx, addr, size, NodeId, NodeId, 0, true); if (r == HSAKMT_STATUS_SUCCESS) { @@ -680,7 +682,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx, Priority > HSA_QUEUE_PRIORITY_MAXIMUM) return HSAKMT_STATUS_INVALID_PARAMETER; - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; @@ -691,7 +693,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx, memset(q, 0, sizeof(*q)); - q->gfxv = hsakmt_get_gfxv_by_node_id(NodeId); + q->gfxv = hsakmt_get_gfxv_by_node_id(ctx, NodeId); q->use_ats = false; if (q->gfxv == GFX_VERSION_TONGA) @@ -932,7 +934,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandlerCtx(HsaKFDContext *ctx, CHECK_KFD_OPEN(); - result = hsakmt_validate_nodeid(Node, &gpu_id); + result = hsakmt_validate_nodeid(ctx, Node, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; diff --git a/projects/rocr-runtime/libhsakmt/src/spm.c b/projects/rocr-runtime/libhsakmt/src/spm.c index ec7f3d2b33..3c83ae3453 100644 --- a/projects/rocr-runtime/libhsakmt/src/spm.c +++ b/projects/rocr-runtime/libhsakmt/src/spm.c @@ -35,7 +35,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) struct kfd_ioctl_spm_args args = {0}; uint32_t gpu_id; - ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id); + ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode); return ret; @@ -61,7 +61,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode, struct kfd_ioctl_spm_args args = {0}; uint32_t gpu_id = 0; - ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id); + ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { return ret; } @@ -87,7 +87,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) struct kfd_ioctl_spm_args args = {0}; uint32_t gpu_id; - ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id); + ret = hsakmt_validate_nodeid(&hsakmt_primary_kfd_ctx, PreferredNode, &gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) { pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode); return ret; diff --git a/projects/rocr-runtime/libhsakmt/src/svm.c b/projects/rocr-runtime/libhsakmt/src/svm.c index 5482dead5c..69dc7cd0e9 100644 --- a/projects/rocr-runtime/libhsakmt/src/svm.c +++ b/projects/rocr-runtime/libhsakmt/src/svm.c @@ -81,7 +81,7 @@ hsaKmtSVMSetAttrCtx(HsaKFDContext *ctx, continue; } - r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value); + r = hsakmt_validate_nodeid(ctx, attrs[i].value, &args->attrs[i].value); if (r != HSAKMT_STATUS_SUCCESS) { pr_debug("invalid node ID: %d\n", attrs[i].value); return r; @@ -141,7 +141,7 @@ hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx, attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS) continue; - r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value); + r = hsakmt_validate_nodeid(ctx, attrs[i].value, &args->attrs[i].value); if (r != HSAKMT_STATUS_SUCCESS) { pr_debug("invalid node ID: %d\n", attrs[i].value); return r; @@ -176,7 +176,7 @@ hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx, attrs[i].value = INVALID_NODEID; break; default: - r = hsakmt_gpuid_to_nodeid(attrs[i].value, &attrs[i].value); + r = hsakmt_gpuid_to_nodeid(ctx, attrs[i].value, &attrs[i].value); if (r != HSAKMT_STATUS_SUCCESS) { pr_debug("invalid GPU ID: %d\n", attrs[i].value); diff --git a/projects/rocr-runtime/libhsakmt/src/time.c b/projects/rocr-runtime/libhsakmt/src/time.c index 9e8b5ec451..222d53f32a 100644 --- a/projects/rocr-runtime/libhsakmt/src/time.c +++ b/projects/rocr-runtime/libhsakmt/src/time.c @@ -26,7 +26,8 @@ #include "libhsakmt.h" #include "hsakmt/linux/kfd_ioctl.h" -HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCountersCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaClockCounters *Counters) { HSAKMT_STATUS result; @@ -36,13 +37,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, CHECK_KFD_OPEN(); - result = hsakmt_validate_nodeid(NodeId, &gpu_id); + result = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id); if (result != HSAKMT_STATUS_SUCCESS) return result; args.gpu_id = gpu_id; - err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args); + err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args); if (err < 0) { result = HSAKMT_STATUS_ERROR; } else { @@ -55,3 +56,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, return result; } + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, + HsaClockCounters *Counters) +{ + return hsaKmtGetClockCountersCtx(&hsakmt_primary_kfd_ctx, NodeId, Counters); +} diff --git a/projects/rocr-runtime/libhsakmt/src/topology.c b/projects/rocr-runtime/libhsakmt/src/topology.c index b33c8590a4..64c536151a 100644 --- a/projects/rocr-runtime/libhsakmt/src/topology.c +++ b/projects/rocr-runtime/libhsakmt/src/topology.c @@ -72,16 +72,6 @@ typedef struct { HsaIoLinkProperties *link; } node_props_t; -static HsaSystemProperties *g_system; -static node_props_t *g_props; - -/* This array caches sysfs based node IDs of CPU nodes + all supported GPU nodes. - * It will be used to map user-node IDs to sysfs-node IDs. - */ -static uint32_t *map_user_to_sysfs_node_id; -static uint32_t map_user_to_sysfs_node_id_size; -static uint32_t num_sysfs_nodes; - static int processor_vendor = -1; /* Supported System Vendors */ enum SUPPORTED_PROCESSOR_VENDORS { @@ -96,8 +86,45 @@ static const char *supported_processor_vendor_name[] = { "\n" // POWER requires a different search method }; +/* + * KFD Topology Context + */ +struct hsa_kfd_topology_context +{ + HsaSystemProperties* system_props; + node_props_t *node_props; + + /* This array caches sysfs based node IDs of CPU nodes + all supported GPU nodes. + * It will be used to map user-node IDs to sysfs-node IDs. + */ + uint32_t *map_user_to_sysfs_node_id; + uint32_t map_user_to_sysfs_node_id_size; + + uint32_t num_sysfs_nodes; +}; + +struct hsa_kfd_topology_context *hsakmt_kfdcontext_get_topology_context(HsaKFDContext *ctx) +{ + assert(ctx); + if (!ctx) { + pr_err("Expected a non-null ptr for HsaKFDContext"); + return NULL; + } + + if (ctx->topology_context) + return ctx->topology_context; + + ctx->topology_context = calloc(1, sizeof(struct hsa_kfd_topology_context)); + if (!ctx->topology_context) { + pr_err("Alloc memory failed for struct hsa_kfd_topology_context size %zu\n", + sizeof(struct hsa_kfd_topology_context)); + return NULL; + } + return ctx->topology_context; +} + static HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx); -static void topology_drop_snapshot(void); +static void topology_drop_snapshot(HsaKFDContext *ctx); static const struct hsa_gfxip_table gfxip_lookup_table[] = { /* Kaveri Family */ @@ -610,12 +637,15 @@ err: return ret; } -static HSAKMT_STATUS topology_sysfs_map_node_id(uint32_t node_id, uint32_t *sys_node_id) +static HSAKMT_STATUS topology_sysfs_map_node_id( + struct hsa_kfd_topology_context *topology_ctx, + uint32_t node_id, uint32_t *sys_node_id) { - if ((!map_user_to_sysfs_node_id) || (node_id >= map_user_to_sysfs_node_id_size)) + if ((!topology_ctx->map_user_to_sysfs_node_id) || + (node_id >= topology_ctx->map_user_to_sysfs_node_id_size)) return HSAKMT_STATUS_NOT_SUPPORTED; - *sys_node_id = map_user_to_sysfs_node_id[node_id]; + *sys_node_id = topology_ctx->map_user_to_sysfs_node_id[node_id]; return HSAKMT_STATUS_SUCCESS; } @@ -737,6 +767,7 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; bool is_node_supported = true; uint32_t num_supported_nodes = 0; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); assert(props); snprintf(path, sizeof(path), KFD_SYSFS_PATH_SYSTEM_PROPERTIES, get_topology_dir()); @@ -779,34 +810,34 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, * which represent the node numbers */ snprintf(path, sizeof(path), KFD_SYSFS_PATH_NODES, get_topology_dir()); - num_sysfs_nodes = num_subdirs(path, ""); + topology_ctx->num_sysfs_nodes = num_subdirs(path, ""); - if (map_user_to_sysfs_node_id == NULL) { + if (topology_ctx->map_user_to_sysfs_node_id == NULL) { /* Trade off - num_sysfs_nodes includes all CPU and GPU nodes. * Slightly more memory is allocated than necessary. */ - map_user_to_sysfs_node_id = calloc(num_sysfs_nodes, sizeof(uint32_t)); - if (map_user_to_sysfs_node_id == NULL) { + topology_ctx->map_user_to_sysfs_node_id = calloc(topology_ctx->num_sysfs_nodes, sizeof(uint32_t)); + if (topology_ctx->map_user_to_sysfs_node_id == NULL) { ret = HSAKMT_STATUS_NO_MEMORY; goto err2; } - map_user_to_sysfs_node_id_size = num_sysfs_nodes; - } else if (num_sysfs_nodes > map_user_to_sysfs_node_id_size) { - free(map_user_to_sysfs_node_id); - map_user_to_sysfs_node_id = calloc(num_sysfs_nodes, sizeof(uint32_t)); - if (map_user_to_sysfs_node_id == NULL) { + topology_ctx->map_user_to_sysfs_node_id_size = topology_ctx->num_sysfs_nodes; + } else if (topology_ctx->num_sysfs_nodes > topology_ctx->map_user_to_sysfs_node_id_size) { + free(topology_ctx->map_user_to_sysfs_node_id); + topology_ctx->map_user_to_sysfs_node_id = calloc(topology_ctx->num_sysfs_nodes, sizeof(uint32_t)); + if (topology_ctx->map_user_to_sysfs_node_id == NULL) { ret = HSAKMT_STATUS_NO_MEMORY; goto err2; } - map_user_to_sysfs_node_id_size = num_sysfs_nodes; + topology_ctx->map_user_to_sysfs_node_id_size = topology_ctx->num_sysfs_nodes; } - for (uint32_t i = 0; i < num_sysfs_nodes; i++) { + for (uint32_t i = 0; i < topology_ctx->num_sysfs_nodes; i++) { ret = topology_sysfs_check_node_supported(ctx, i, &is_node_supported); if (ret != HSAKMT_STATUS_SUCCESS) goto sysfs_parse_failed; if (is_node_supported) - map_user_to_sysfs_node_id[num_supported_nodes++] = i; + topology_ctx->map_user_to_sysfs_node_id[num_supported_nodes++] = i; } props->NumNodes = num_supported_nodes; @@ -815,8 +846,8 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, return ret; sysfs_parse_failed: - free(map_user_to_sysfs_node_id); - map_user_to_sysfs_node_id = NULL; + free(topology_ctx->map_user_to_sysfs_node_id); + topology_ctx->map_user_to_sysfs_node_id = NULL; err2: free(read_buf); err1: @@ -1077,7 +1108,8 @@ err_device_initialize: return ret; } -static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, +static HSAKMT_STATUS topology_sysfs_get_node_props(HsaKFDContext *ctx, + uint32_t node_id, HsaNodeProperties *props, bool *p2p_links, uint32_t *num_p2pLinks) @@ -1097,9 +1129,9 @@ static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, uint32_t simd_arrays_count = 0; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; - + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); assert(props); - ret = topology_sysfs_map_node_id(node_id, &sys_node_id); + ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -1307,7 +1339,9 @@ out: return ret; } -static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, +static HSAKMT_STATUS topology_sysfs_get_mem_props( + struct hsa_kfd_topology_context *topology_ctx, + uint32_t node_id, uint32_t mem_id, HsaMemoryProperties *props) { @@ -1322,7 +1356,7 @@ static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, uint32_t sys_node_id; assert(props); - ret = topology_sysfs_map_node_id(node_id, &sys_node_id); + ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -1541,7 +1575,9 @@ exit: return ret; } -static HSAKMT_STATUS topology_sysfs_get_cache_props(uint32_t node_id, +static HSAKMT_STATUS topology_sysfs_get_cache_props( + struct hsa_kfd_topology_context *topology_ctx, + uint32_t node_id, uint32_t cache_id, HsaCacheProperties *props) { @@ -1556,7 +1592,7 @@ static HSAKMT_STATUS topology_sysfs_get_cache_props(uint32_t node_id, uint32_t sys_node_id; assert(props); - ret = topology_sysfs_map_node_id(node_id, &sys_node_id); + ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -1619,12 +1655,13 @@ err1: return ret; } -static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, uint32_t *user_node_id) +static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(struct hsa_kfd_topology_context *topology_ctx, + uint32_t sys_node_id, uint32_t *user_node_id) { uint32_t node_id; - for (node_id = 0; node_id < map_user_to_sysfs_node_id_size; node_id++) - if (map_user_to_sysfs_node_id[node_id] == sys_node_id) { + for (node_id = 0; node_id < topology_ctx->map_user_to_sysfs_node_id_size; node_id++) + if (topology_ctx->map_user_to_sysfs_node_id[node_id] == sys_node_id) { *user_node_id = node_id; return HSAKMT_STATUS_SUCCESS; } @@ -1652,9 +1689,10 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx, int read_size; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; uint32_t sys_node_id; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); assert(props); - ret = topology_sysfs_map_node_id(node_id, &sys_node_id); + ret = topology_sysfs_map_node_id(topology_ctx, node_id, &sys_node_id); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -1707,7 +1745,7 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx, memset(props, 0, sizeof(*props)); goto err2; } - ret = topology_map_sysfs_to_user_node_id(sysfs_node_id, &props->NodeTo); + ret = topology_map_sysfs_to_user_node_id(topology_ctx, sysfs_node_id, &props->NodeTo); if (ret != HSAKMT_STATUS_SUCCESS) goto err2; } else if (strcmp(prop_name, "weight") == 0) @@ -1974,6 +2012,7 @@ HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx) uint32_t num_ioLinks; bool p2p_links = false; uint32_t num_p2pLinks = 0; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); cpuinfo = calloc(num_procs, sizeof(struct proc_cpuinfo)); if (!cpuinfo) { @@ -1996,7 +2035,7 @@ retry: goto err; } for (i = 0; i < sys_props.NumNodes; i++) { - ret = topology_sysfs_get_node_props(i, + ret = topology_sysfs_get_node_props(ctx, i, &temp_props[i].node, &p2p_links, &num_p2pLinks); if (ret != HSAKMT_STATUS_SUCCESS) { @@ -2016,7 +2055,7 @@ retry: goto err; } for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) { - ret = topology_sysfs_get_mem_props(i, mem_id, &temp_props[i].mem[mem_id]); + ret = topology_sysfs_get_mem_props(topology_ctx, i, mem_id, &temp_props[i].mem[mem_id]); if (ret != HSAKMT_STATUS_SUCCESS) { free_properties(temp_props, i + 1); goto err; @@ -2032,7 +2071,8 @@ retry: goto err; } for (cache_id = 0; cache_id < temp_props[i].node.NumCaches; cache_id++) { - ret = topology_sysfs_get_cache_props(i, cache_id, &temp_props[i].cache[cache_id]); + ret = topology_sysfs_get_cache_props(topology_ctx, + i, cache_id, &temp_props[i].cache[cache_id]); if (ret != HSAKMT_STATUS_SUCCESS) { free_properties(temp_props, i + 1); goto err; @@ -2122,62 +2162,72 @@ retry: goto retry; } - if (!g_system) { - g_system = malloc(sizeof(HsaSystemProperties)); - if (!g_system) { + if (!topology_ctx->system_props) { + topology_ctx->system_props = malloc(sizeof(HsaSystemProperties)); + if (!topology_ctx->system_props) { free_properties(temp_props, sys_props.NumNodes); ret = HSAKMT_STATUS_NO_MEMORY; goto err; } } - *g_system = sys_props; - if (g_props) - free(g_props); - g_props = temp_props; + *topology_ctx->system_props = sys_props; + if (topology_ctx->node_props) + free(topology_ctx->node_props); + topology_ctx->node_props = temp_props; err: free(cpuinfo); return ret; } /* Drop the Snapshot of the HSA topology information. Assume lock is held. */ -void topology_drop_snapshot(void) + void topology_drop_snapshot(HsaKFDContext *ctx) { - if (!!g_system != !!g_props) + struct hsa_kfd_topology_context *topology_ctx = + hsakmt_kfdcontext_get_topology_context(ctx); + + if (!!topology_ctx->system_props != !!topology_ctx->node_props) pr_warn("Probably inconsistency?\n"); - if (g_props) { + if (topology_ctx->node_props) { /* Remove state */ - free_properties(g_props, g_system->NumNodes); - g_props = NULL; + free_properties(topology_ctx->node_props, topology_ctx->system_props->NumNodes); + topology_ctx->node_props = NULL; } - free(g_system); - g_system = NULL; + free(topology_ctx->system_props); + topology_ctx->system_props = NULL; - if (map_user_to_sysfs_node_id) { - free(map_user_to_sysfs_node_id); - map_user_to_sysfs_node_id = NULL; - map_user_to_sysfs_node_id_size = 0; + if (topology_ctx->map_user_to_sysfs_node_id) { + free(topology_ctx->map_user_to_sysfs_node_id); + topology_ctx->map_user_to_sysfs_node_id = NULL; + topology_ctx->map_user_to_sysfs_node_id_size = 0; } } -HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) +HSAKMT_STATUS hsakmt_validate_nodeid(HsaKFDContext *ctx, uint32_t nodeid, uint32_t *gpu_id) { - if (!g_props || !g_system || g_system->NumNodes <= nodeid) + struct hsa_kfd_topology_context *topology_ctx = + hsakmt_kfdcontext_get_topology_context(ctx); + + if (!topology_ctx->node_props || !topology_ctx->system_props || + topology_ctx->system_props->NumNodes <= nodeid) return HSAKMT_STATUS_INVALID_NODE_UNIT; if (gpu_id) - *gpu_id = g_props[nodeid].node.KFDGpuID; + *gpu_id = topology_ctx->node_props[nodeid].node.KFDGpuID; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) +HSAKMT_STATUS hsakmt_gpuid_to_nodeid(HsaKFDContext *ctx, uint32_t gpu_id, uint32_t *node_id) { uint64_t node_idx; - for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) { - if (g_props[node_idx].node.KFDGpuID == gpu_id) { + struct hsa_kfd_topology_context *topology_ctx = + hsakmt_kfdcontext_get_topology_context(ctx); + + for (node_idx = 0; node_idx < topology_ctx->system_props->NumNodes; node_idx++) { + if (topology_ctx->node_props[node_idx].node.KFDGpuID == gpu_id) { *node_id = node_idx; return HSAKMT_STATUS_SUCCESS; } @@ -2193,6 +2243,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx, HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; CHECK_KFD_OPEN(); + struct hsa_kfd_topology_context *topology_ctx = + hsakmt_kfdcontext_get_topology_context(ctx); if (!SystemProperties) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -2202,8 +2254,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx, /* We already have a valid snapshot. Avoid double initialization that * would leak memory. */ - if (g_system) { - *SystemProperties = *g_system; + if (topology_ctx->system_props) { + *SystemProperties = *topology_ctx->system_props; goto out; } @@ -2211,23 +2263,23 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx, if (err != HSAKMT_STATUS_SUCCESS) goto out; - assert(g_system); + assert(topology_ctx->system_props); if (hsakmt_use_model) model_init(); - err = hsakmt_fmm_init_process_apertures(ctx, g_system->NumNodes); + err = hsakmt_fmm_init_process_apertures(ctx, topology_ctx->system_props->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_process_apertures_failed; - err = hsakmt_init_process_doorbells(ctx, g_system->NumNodes); + err = hsakmt_init_process_doorbells(ctx, topology_ctx->system_props->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_doorbells_failed; - *SystemProperties = *g_system; + *SystemProperties = *topology_ctx->system_props; - for (int node = 0; node < g_system->NumNodes; node++) { - if (hsakmt_get_gfxv_by_node_id(node) == GFX_VERSION_GFX1151 && + for (int node = 0; node < topology_ctx->system_props->NumNodes; node++) { + if (hsakmt_get_gfxv_by_node_id(ctx, node) == GFX_VERSION_GFX1151 && hsakmt_kfd_version_info.KernelInterfaceMajorVersion == 1 && hsakmt_kfd_version_info.KernelInterfaceMinorVersion < 20) pr_err_once("WARNING: KFD ABI 1.20+ is recommended for gfx1151. Current KFD ABI is %i.%i. This may result in faults, crashes and other application instability\n", hsakmt_kfd_version_info.KernelInterfaceMajorVersion, hsakmt_kfd_version_info.KernelInterfaceMinorVersion); @@ -2238,7 +2290,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx, init_doorbells_failed: hsakmt_fmm_destroy_process_apertures(ctx); init_process_apertures_failed: - topology_drop_snapshot(); + topology_drop_snapshot(ctx); out: pthread_mutex_unlock(&hsakmt_mutex); @@ -2251,20 +2303,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemPropertiesCtx(HsaKFDContext *ctx) hsakmt_destroy_process_doorbells(ctx); hsakmt_fmm_destroy_process_apertures(ctx); - topology_drop_snapshot(); + topology_drop_snapshot(ctx); pthread_mutex_unlock(&hsakmt_mutex); return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId, +HSAKMT_STATUS hsakmt_topology_get_node_props(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { - if (!g_system || !g_props || NodeId >= g_system->NumNodes) + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + + if (!topology_ctx->system_props || !topology_ctx->node_props || + NodeId >= topology_ctx->system_props->NumNodes) return HSAKMT_STATUS_ERROR; - *NodeProperties = g_props[NodeId].node; + *NodeProperties = topology_ctx->node_props[NodeId].node; return HSAKMT_STATUS_SUCCESS; } @@ -2282,11 +2338,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodePropertiesCtx(HsaKFDContext *ctx, CHECK_KFD_OPEN(); pthread_mutex_lock(&hsakmt_mutex); - err = hsakmt_validate_nodeid(NodeId, &gpu_id); + err = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id); if (err != HSAKMT_STATUS_SUCCESS) goto out; - err = hsakmt_topology_get_node_props(NodeId, NodeProperties); + err = hsakmt_topology_get_node_props(ctx, NodeId, NodeProperties); if (err != HSAKMT_STATUS_SUCCESS) goto out; /* For CPU only node don't add any additional GPU memory banks. */ @@ -2314,6 +2370,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; uint32_t i, gpu_id; HSAuint64 aperture_limit; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + node_props_t *node_props = topology_ctx->node_props; if (!MemoryProperties) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -2321,15 +2379,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, CHECK_KFD_OPEN(); pthread_mutex_lock(&hsakmt_mutex); - err = hsakmt_validate_nodeid(NodeId, &gpu_id); + err = hsakmt_validate_nodeid(ctx, NodeId, &gpu_id); if (err != HSAKMT_STATUS_SUCCESS) goto out; memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties)); - for (i = 0; i < MIN(g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { - assert(g_props[NodeId].mem); - MemoryProperties[i] = g_props[NodeId].mem[i]; + for (i = 0; i < MIN(node_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { + assert(node_props[NodeId].mem); + MemoryProperties[i] = node_props[NodeId].mem[i]; } /* The following memory banks does not apply to CPU only node */ @@ -2341,7 +2399,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_LDS, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; - MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024; + MemoryProperties[i].SizeInBytes = node_props[NodeId].node.LDSSizeInKB * 1024; i++; } @@ -2349,12 +2407,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, * For dGPU the topology node contains Local Memory and it is added by * the for loop above */ - if (hsakmt_get_gfxv_by_node_id(NodeId) == GFX_VERSION_KAVERI && i < NumBanks && - g_props[NodeId].node.LocalMemSize > 0 && + if (hsakmt_get_gfxv_by_node_id(ctx, NodeId) == GFX_VERSION_KAVERI && i < NumBanks && + node_props[NodeId].node.LocalMemSize > 0 && hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_GPUVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; - MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LocalMemSize; + MemoryProperties[i].SizeInBytes = node_props[NodeId].node.LocalMemSize; i++; } @@ -2368,7 +2426,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, } /* Add SVM aperture */ - if (hsakmt_topology_is_svm_needed(g_props[NodeId].node.EngineId) && i < NumBanks && + if (hsakmt_topology_is_svm_needed(node_props[NodeId].node.EngineId) && i < NumBanks && hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_SVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { @@ -2399,6 +2457,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx, { HSAKMT_STATUS err; uint32_t i; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); if (!CacheProperties) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -2407,19 +2466,19 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx, pthread_mutex_lock(&hsakmt_mutex); /* KFD ADD page 18, snapshot protocol violation */ - if (!g_system || NodeId >= g_system->NumNodes) { + if (!topology_ctx->system_props || NodeId >= topology_ctx->system_props->NumNodes) { err = HSAKMT_STATUS_INVALID_NODE_UNIT; goto out; } - if (NumCaches > g_props[NodeId].node.NumCaches) { + if (NumCaches > topology_ctx->node_props[NodeId].node.NumCaches) { err = HSAKMT_STATUS_INVALID_PARAMETER; goto out; } - for (i = 0; i < MIN(g_props[NodeId].node.NumCaches, NumCaches); i++) { - assert(g_props[NodeId].cache); - CacheProperties[i] = g_props[NodeId].cache[i]; + for (i = 0; i < MIN(topology_ctx->node_props[NodeId].node.NumCaches, NumCaches); i++) { + assert(topology_ctx->node_props[NodeId].cache); + CacheProperties[i] = topology_ctx->node_props[NodeId].cache[i]; } err = HSAKMT_STATUS_SUCCESS; @@ -2429,14 +2488,18 @@ out: return err; } -HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId, +HSAKMT_STATUS hsakmt_topology_get_iolink_props(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 NumIoLinks, HsaIoLinkProperties *IoLinkProperties) { - if (!g_system || !g_props || NodeId >= g_system->NumNodes) + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + + if (!topology_ctx->system_props || !topology_ctx->node_props || + NodeId >= topology_ctx->system_props->NumNodes) return HSAKMT_STATUS_ERROR; - memcpy(IoLinkProperties, g_props[NodeId].link, + memcpy(IoLinkProperties, topology_ctx->node_props[NodeId].link, NumIoLinks * sizeof(*IoLinkProperties)); return HSAKMT_STATUS_SUCCESS; @@ -2448,6 +2511,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx, HsaIoLinkProperties *IoLinkProperties) { HSAKMT_STATUS err; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); if (!IoLinkProperties) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -2457,79 +2521,85 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx, pthread_mutex_lock(&hsakmt_mutex); /* KFD ADD page 18, snapshot protocol violation */ - if (!g_system || NodeId >= g_system->NumNodes ) { + if (!topology_ctx->system_props || NodeId >= topology_ctx->system_props->NumNodes ) { err = HSAKMT_STATUS_INVALID_NODE_UNIT; goto out; } - if (NumIoLinks > g_props[NodeId].node.NumIOLinks) { + if (NumIoLinks > topology_ctx->node_props[NodeId].node.NumIOLinks) { err = HSAKMT_STATUS_INVALID_PARAMETER; goto out; } - assert(g_props[NodeId].link); - err = hsakmt_topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); + assert(topology_ctx->node_props[NodeId].link); + err = hsakmt_topology_get_iolink_props(ctx, NodeId, NumIoLinks, IoLinkProperties); out: pthread_mutex_unlock(&hsakmt_mutex); return err; } -uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id) +uint32_t hsakmt_get_gfxv_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id) { - return HSA_GET_GFX_VERSION_FULL(g_props[node_id].node.EngineId.ui32); + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + return HSA_GET_GFX_VERSION_FULL(topology_ctx->node_props[node_id].node.EngineId.ui32); } -uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id) +uint16_t hsakmt_get_device_id_by_node_id(HsaKFDContext *ctx, HSAuint32 node_id) { - if (!g_props || !g_system || g_system->NumNodes <= node_id) + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + + if (!topology_ctx->node_props || !topology_ctx->system_props || + topology_ctx->system_props->NumNodes <= node_id) return 0; - return g_props[node_id].node.DeviceId; + return topology_ctx->node_props[node_id].node.DeviceId; } -bool hsakmt_prefer_ats(HSAuint32 node_id) +bool hsakmt_prefer_ats(HsaKFDContext *ctx, HSAuint32 node_id) { - return g_props[node_id].node.Capability.ui32.HSAMMUPresent - && g_props[node_id].node.NumCPUCores - && g_props[node_id].node.NumFComputeCores; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + return topology_ctx->node_props[node_id].node.Capability.ui32.HSAMMUPresent + && topology_ctx->node_props[node_id].node.NumCPUCores + && topology_ctx->node_props[node_id].node.NumFComputeCores; } -uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id) +uint16_t hsakmt_get_device_id_by_gpu_id(HsaKFDContext *ctx, HSAuint32 gpu_id) { unsigned int i; - - if (!g_props || !g_system) + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + if (!topology_ctx->node_props || !topology_ctx->system_props) return 0; - for (i = 0; i < g_system->NumNodes; i++) { - if (g_props[i].node.KFDGpuID == gpu_id) - return g_props[i].node.DeviceId; + for (i = 0; i < topology_ctx->system_props->NumNodes; i++) { + if (topology_ctx->node_props[i].node.KFDGpuID == gpu_id) + return topology_ctx->node_props[i].node.DeviceId; } return 0; } -uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node) +uint32_t hsakmt_get_direct_link_cpu(HsaKFDContext *ctx, HSAuint32 gpu_node) { HSAuint64 size = 0; int32_t cpu_id; HSAuint32 i; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); - cpu_id = gpu_get_direct_link_cpu(gpu_node, g_props); + cpu_id = gpu_get_direct_link_cpu(gpu_node, topology_ctx->node_props); if (cpu_id == -1) return INVALID_NODEID; - assert(g_props[cpu_id].mem); - - for (i = 0; i < g_props[cpu_id].node.NumMemoryBanks; i++) - size += g_props[cpu_id].mem[i].SizeInBytes; + assert(topology_ctx->node_props[cpu_id].mem); + for (i = 0; i < topology_ctx->node_props[cpu_id].node.NumMemoryBanks; i++) + size += topology_ctx->node_props[cpu_id].mem[i].SizeInBytes; return size ? (uint32_t)cpu_id : INVALID_NODEID; } -HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array, +HSAKMT_STATUS hsakmt_validate_nodeid_array(HsaKFDContext *ctx, + uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray) { HSAKMT_STATUS ret; @@ -2543,7 +2613,7 @@ HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array, if (!(*gpu_id_array)) return HSAKMT_STATUS_NO_MEMORY; for (i = 0; i < NumberOfNodes; i++) { - ret = hsakmt_validate_nodeid(NodeArray[i], *gpu_id_array + i); + ret = hsakmt_validate_nodeid(ctx, NodeArray[i], *gpu_id_array + i); if (ret != HSAKMT_STATUS_SUCCESS) { free(*gpu_id_array); break; @@ -2553,13 +2623,13 @@ HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array, return ret; } -inline uint32_t hsakmt_get_num_sysfs_nodes(void) +uint32_t hsakmt_get_num_sysfs_nodes(HsaKFDContext *ctx) { - return num_sysfs_nodes; + struct hsa_kfd_topology_context *topology_ctx = hsakmt_kfdcontext_get_topology_context(ctx); + return topology_ctx->num_sysfs_nodes; } - HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) { return hsaKmtAcquireSystemPropertiesCtx(&hsakmt_primary_kfd_ctx, SystemProperties);