From 9da1572c423fd6b17fe3b0be21b950f6c57ce428 Mon Sep 17 00:00:00 2001 From: Junhua Shen Date: Mon, 10 Nov 2025 11:19:58 +0800 Subject: [PATCH] libhsakmt: Refactor for Multi-KFD Context Support (Multiple KFD FDs per Process) (#1701) * Introduce HsaKFDContext structure and infrastructure for multiple KFD contexts, enabling independent contexts within a single process. * Refactor core components (queue, event, FMM, topology) to be context-aware, using explicit HsaKFDContext parameters instead of global state. * Replace global hsakmt_kfd_fd with context-specific file descriptors, ensuring full context isolation. * Maintain backward compatibility by redirecting legacy APIs to use the primary context. This refactoring establishes a foundation for multi-context support while preserving existing functionality. Signed-off-by: Junhua Shen --- .../rocr-runtime/libhsakmt/CMakeLists.txt | 3 +- projects/rocr-runtime/libhsakmt/src/ais.c | 4 +- projects/rocr-runtime/libhsakmt/src/debug.c | 28 +- projects/rocr-runtime/libhsakmt/src/events.c | 221 +++-- projects/rocr-runtime/libhsakmt/src/fmm.c | 727 ++++++++------- projects/rocr-runtime/libhsakmt/src/fmm.h | 144 ++- projects/rocr-runtime/libhsakmt/src/globals.c | 2 - .../rocr-runtime/libhsakmt/src/hsakmtctx.h | 827 ++++++++++++++++++ .../rocr-runtime/libhsakmt/src/hsakmtmodel.c | 14 +- .../rocr-runtime/libhsakmt/src/kfdcontext.c | 63 ++ .../rocr-runtime/libhsakmt/src/kfdcontext.h | 74 ++ .../rocr-runtime/libhsakmt/src/libhsakmt.h | 25 +- projects/rocr-runtime/libhsakmt/src/memory.c | 363 ++++++-- .../rocr-runtime/libhsakmt/src/openclose.c | 50 +- .../rocr-runtime/libhsakmt/src/pc_sampling.c | 10 +- projects/rocr-runtime/libhsakmt/src/queues.c | 340 ++++--- projects/rocr-runtime/libhsakmt/src/spm.c | 6 +- projects/rocr-runtime/libhsakmt/src/svm.c | 48 +- projects/rocr-runtime/libhsakmt/src/time.c | 2 +- .../rocr-runtime/libhsakmt/src/topology.c | 107 ++- projects/rocr-runtime/libhsakmt/src/version.c | 2 +- 21 files changed, 2377 
insertions(+), 683 deletions(-) create mode 100644 projects/rocr-runtime/libhsakmt/src/hsakmtctx.h create mode 100644 projects/rocr-runtime/libhsakmt/src/kfdcontext.c create mode 100644 projects/rocr-runtime/libhsakmt/src/kfdcontext.h diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists.txt b/projects/rocr-runtime/libhsakmt/CMakeLists.txt index 44b5dc603e..25b3af4af8 100644 --- a/projects/rocr-runtime/libhsakmt/CMakeLists.txt +++ b/projects/rocr-runtime/libhsakmt/CMakeLists.txt @@ -130,7 +130,8 @@ set ( HSAKMT_SRC "src/debug.c" "src/version.c" "src/svm.c" "src/pc_sampling.c" - "src/ais.c") + "src/ais.c" + "src/kfdcontext.c") ## Declare the library target name add_library (${HSAKMT_TARGET} STATIC "") diff --git a/projects/rocr-runtime/libhsakmt/src/ais.c b/projects/rocr-runtime/libhsakmt/src/ais.c index aca8acc48f..7a46264c86 100644 --- a/projects/rocr-runtime/libhsakmt/src/ais.c +++ b/projects/rocr-runtime/libhsakmt/src/ais.c @@ -47,7 +47,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress, /* Support is only for dGPUs */ - if (!hsakmt_fmm_get_handle(MemoryAddress, &handle, &size_offset)) { + if (!hsakmt_fmm_get_handle(&hsakmt_primary_kfd_ctx, MemoryAddress, &handle, &size_offset)) { pr_err("Address/size out of range: %p/%lu\n", MemoryAddress, MemorySizeInBytes); return HSAKMT_STATUS_INVALID_PARAMETER; } @@ -66,7 +66,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress, } args.in.handle_offset = size_offset; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_AIS_OP, &args); + ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_AIS_OP, &args); if (SizeCopiedInBytes) *SizeCopiedInBytes = args.out.size_copied; diff --git a/projects/rocr-runtime/libhsakmt/src/debug.c b/projects/rocr-runtime/libhsakmt/src/debug.c index 6aad5ea183..7fe450d123 100644 --- a/projects/rocr-runtime/libhsakmt/src/debug.c +++ b/projects/rocr-runtime/libhsakmt/src/debug.c @@ -78,7 +78,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) 
args.gpu_id = gpu_id; - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_REGISTER_DEPRECATED, &args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_REGISTER_DEPRECATED, &args); if (err == 0) result = HSAKMT_STATUS_SUCCESS; @@ -105,7 +105,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) struct kfd_ioctl_dbg_unregister_args args = {0}; args.gpu_id = gpu_id; - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, &args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, &args); if (err) return HSAKMT_STATUS_ERROR; @@ -168,7 +168,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId, run_ptr += sizeof(DbgWaveMsgRing->MemoryVA); /* send to kernel */ - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, args); free(args); @@ -256,7 +256,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId, } /* send to kernel */ - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, args); free(args); @@ -316,7 +316,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, ((setupTtmp) ? 
KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK : 0); args.r_debug = (HSAuint64)rDebug; - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args); if (err) { if (errno == EBUSY) @@ -340,7 +340,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) memset(&args, 0x00, sizeof(args)); args.mode_mask = 0; //Disable - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &args)) + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RUNTIME_ENABLE, &args)) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; @@ -363,7 +363,7 @@ static HSAKMT_STATUS dbg_trap_get_device_data(void *data, args.device_snapshot.entry_size = entry_size; args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; *n_entries = args.device_snapshot.num_devices; @@ -384,7 +384,7 @@ static HSAKMT_STATUS dbg_trap_get_queue_data(void *data, args.queue_snapshot.snapshot_buf_ptr = (uint64_t) data; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; *n_entries = args.queue_snapshot.num_queues; @@ -410,7 +410,7 @@ static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids, args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES; args.pid = getpid(); - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args); + r = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args); if (r < 0) return HSAKMT_STATUS_ERROR; @@ -429,7 +429,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG); *data_size = sizeof(struct kfd_runtime_info); args.enable.rinfo_size = *data_size; - args.enable.dbg_fd = hsakmt_kfd_fd; + args.enable.dbg_fd = 
hsakmt_primary_kfd_ctx.fd; *runtime_info = malloc(args.enable.rinfo_size); if (!*runtime_info) return HSAKMT_STATUS_NO_MEMORY; @@ -437,7 +437,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, args.op = KFD_IOC_DBG_TRAP_ENABLE; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) { + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) { free(*runtime_info); return HSAKMT_STATUS_ERROR; } @@ -450,11 +450,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) CHECK_KFD_OPEN(); CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG); - args.enable.dbg_fd = hsakmt_kfd_fd; + args.enable.dbg_fd = hsakmt_primary_kfd_ctx.fd; args.op = KFD_IOC_DBG_TRAP_DISABLE; args.pid = getpid(); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, &args)) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; @@ -540,7 +540,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *arg free(queue_ids); } - long err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, args); + long err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_DBG_TRAP, args); if (DebugReturn) *DebugReturn = err; diff --git a/projects/rocr-runtime/libhsakmt/src/events.c b/projects/rocr-runtime/libhsakmt/src/events.c index 2421a80dfd..df97cf6c64 100644 --- a/projects/rocr-runtime/libhsakmt/src/events.c +++ b/projects/rocr-runtime/libhsakmt/src/events.c @@ -34,12 +34,36 @@ #include "hsakmt/linux/kfd_ioctl.h" #include "fmm.h" #include "hsakmt/hsakmtmodel.h" +#include <assert.h> -static HSAuint64 *events_page = NULL; -void hsakmt_clear_events_page(void) +struct hsa_kfd_event_context { - events_page = NULL; + HSAuint64 *events_page; +}; + +struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx) +{ + assert(ctx); + + if (ctx->event_context) + return ctx->event_context; + + ctx->event_context = calloc(1, sizeof(struct hsa_kfd_event_context)); + if
(!ctx->event_context) { + pr_err("Alloc memory failed for struct hsa_kfd_event_context size %zu\n", + sizeof(struct hsa_kfd_event_context)); + return NULL; + } + return ctx->event_context; +} + +void hsakmt_clear_events_page(HsaKFDContext *ctx) +{ + struct hsa_kfd_event_context *event_ctx = hsakmt_kfdcontext_get_event_context(ctx); + if (event_ctx) { + event_ctx->events_page = NULL; + } } static bool IsSystemEventType(HSA_EVENTTYPE type) @@ -48,14 +72,18 @@ static bool IsSystemEventType(HSA_EVENTTYPE type) return (type != HSA_EVENTTYPE_SIGNAL && type != HSA_EVENTTYPE_DEBUG_EVENT); } -HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, - bool ManualReset, bool IsSignaled, - HsaEvent **Event) +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEventCtx(HsaKFDContext *ctx, + HsaEventDescriptor *EventDesc, + bool ManualReset, bool IsSignaled, + HsaEvent **Event) { unsigned int event_limit = KFD_SIGNAL_EVENT_LIMIT; CHECK_KFD_OPEN(); + struct hsa_kfd_event_context *event_ctx = NULL; + HSAuint64 *events_page = NULL; + if (EventDesc->EventType >= HSA_EVENTTYPE_MAXID) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -74,9 +102,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, /* dGPU code */ pthread_mutex_lock(&hsakmt_mutex); + event_ctx = hsakmt_kfdcontext_get_event_context(ctx); + events_page = event_ctx->events_page; if (hsakmt_is_dgpu && !events_page) { - events_page = hsakmt_allocate_exec_aligned_memory_gpu( + events_page = hsakmt_allocate_exec_aligned_memory_gpu(ctx, KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, 0, true, false, true); if (!events_page) { free(e); @@ -86,10 +116,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (hsakmt_use_model) model_set_event_page(events_page, KFD_SIGNAL_EVENT_LIMIT); else - hsakmt_fmm_get_handle(events_page, (uint64_t *)&args.event_page_offset, NULL); + hsakmt_fmm_get_handle(ctx, events_page, (uint64_t *)&args.event_page_offset, NULL); } - if (hsakmt_ioctl(hsakmt_kfd_fd, 
AMDKFD_IOC_CREATE_EVENT, &args) != 0) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) { free(e); *Event = NULL; pthread_mutex_unlock(&hsakmt_mutex); @@ -100,17 +130,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (!events_page && args.event_page_offset > 0) { events_page = mmap(NULL, event_limit * 8, PROT_WRITE | PROT_READ, - MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset); + MAP_SHARED, ctx->fd, args.event_page_offset); if (events_page == MAP_FAILED) { /* old kernels only support 256 events */ event_limit = 256; events_page = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, - MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset); + MAP_SHARED, ctx->fd, args.event_page_offset); } if (events_page == MAP_FAILED) { events_page = NULL; pthread_mutex_unlock(&hsakmt_mutex); - hsaKmtDestroyEvent(e); + hsaKmtDestroyEventCtx(ctx, e); return HSAKMT_STATUS_ERROR; } } @@ -118,10 +148,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (args.event_page_offset > 0 && args.event_slot_index < event_limit) e->EventData.HWData2 = (HSAuint64)&events_page[args.event_slot_index]; - pthread_mutex_unlock(&hsakmt_mutex); + pthread_mutex_unlock(&hsakmt_mutex); - e->EventData.EventType = EventDesc->EventType; - e->EventData.HWData1 = args.event_id; + e->EventData.EventType = EventDesc->EventType; + e->EventData.HWData1 = args.event_id; e->EventData.HWData3 = args.event_trigger_data; e->EventData.EventData.SyncVar.SyncVar.UserData = @@ -134,19 +164,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, set_args.event_id = args.event_id; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT, - &set_args) != 0) { - hsaKmtDestroyEvent(e); - return HSAKMT_STATUS_ERROR; - } - } + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_EVENT, &set_args) != 0) { + hsaKmtDestroyEventCtx(ctx, e); + return HSAKMT_STATUS_ERROR; + } + } - *Event = e; + *Event = e; + if (!event_ctx->events_page) + 
event_ctx->events_page = events_page; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEventCtx(HsaKFDContext *ctx, + HsaEvent *Event) { CHECK_KFD_OPEN(); @@ -157,14 +189,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) args.event_id = Event->EventId; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_EVENT, &args) != 0) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DESTROY_EVENT, &args) != 0) return HSAKMT_STATUS_ERROR; free(Event); return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) +HSAKMT_STATUS HSAKMTAPI hsaKmtSetEventCtx(HsaKFDContext *ctx, + HsaEvent *Event) { CHECK_KFD_OPEN(); @@ -181,13 +214,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) args.event_id = Event->EventId; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT, &args) == -1) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_EVENT, &args) == -1) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) +HSAKMT_STATUS HSAKMTAPI hsaKmtResetEventCtx(HsaKFDContext *ctx, + HsaEvent *Event) { CHECK_KFD_OPEN(); @@ -204,13 +238,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) args.event_id = Event->EventId; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RESET_EVENT, &args) == -1) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_RESET_EVENT, &args) == -1) return HSAKMT_STATUS_ERROR; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventStateCtx(HsaKFDContext *ctx, + HsaEvent *Event) { CHECK_KFD_OPEN(); @@ -220,22 +255,25 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, - HSAuint32 Milliseconds) +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEventCtx(HsaKFDContext *ctx, + HsaEvent *Event, + HSAuint32 Milliseconds) { - return 
hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL); + return hsaKmtWaitOnEvent_ExtCtx(ctx, Event, Milliseconds, NULL); } -HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event, - HSAuint32 Milliseconds, uint64_t *event_age) +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_ExtCtx(HsaKFDContext *ctx, + HsaEvent *Event, + HSAuint32 Milliseconds, uint64_t *event_age) { if (!Event) return HSAKMT_STATUS_INVALID_HANDLE; - return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, event_age); + return hsaKmtWaitOnMultipleEvents_ExtCtx(ctx, &Event, + 1, true, Milliseconds, event_age); } -static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id) +static HSAKMT_STATUS get_mem_info_svm_api(HsaKFDContext *ctx, uint64_t address, uint32_t gpu_id) { struct kfd_ioctl_svm_args *args; uint32_t node_id = 0; @@ -258,7 +296,7 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id) args->op = KFD_IOCTL_SVM_OP_GET_ATTR; args->nattr = s_attr / sizeof(*attrs); memcpy(args->attrs, attrs, s_attr); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { pr_debug("op get range attrs failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; } @@ -312,8 +350,8 @@ static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id) return HSAKMT_STATUS_SUCCESS; } //Analysis memory exception data, print debug messages -static void analysis_memory_exception(struct kfd_hsa_memory_exception_data * - memory_exception_data) +static void analysis_memory_exception(HsaKFDContext *ctx, + struct kfd_hsa_memory_exception_data *memory_exception_data) { HSAKMT_STATUS ret; HsaPointerInfo info; @@ -331,9 +369,9 @@ static void analysis_memory_exception(struct kfd_hsa_memory_exception_data * else if (memory_exception_data->failure.NoExecute) pr_err("Execute to none-executable page\n"); - ret = hsakmt_fmm_get_mem_info((const void *)addr, 
&info); + ret = hsakmt_fmm_get_mem_info(ctx, (const void *)addr, &info); if (ret != HSAKMT_STATUS_SUCCESS) { - ret = get_mem_info_svm_api(addr, memory_exception_data->gpu_id); + ret = get_mem_info_svm_api(ctx, addr, memory_exception_data->gpu_id); if (ret != HSAKMT_STATUS_SUCCESS) pr_err("Address does not belong to a known buffer\n"); return; @@ -378,19 +416,22 @@ static void analysis_memory_exception(struct kfd_hsa_memory_exception_data * } } -HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], - HSAuint32 NumEvents, - bool WaitOnAll, - HSAuint32 Milliseconds) +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEventsCtx(HsaKFDContext *ctx, + HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds) { - return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, Milliseconds, NULL); + return hsaKmtWaitOnMultipleEvents_ExtCtx(ctx, Events, + NumEvents, WaitOnAll, Milliseconds, NULL); } -HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], - HSAuint32 NumEvents, - bool WaitOnAll, - HSAuint32 Milliseconds, - uint64_t *event_age) +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_ExtCtx(HsaKFDContext *ctx, + HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds, + uint64_t *event_age) { HSAKMT_STATUS result; CHECK_KFD_OPEN(); @@ -417,7 +458,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], args.num_events = NumEvents; args.events_ptr = (uint64_t)(uintptr_t)event_data; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1) result = HSAKMT_STATUS_ERROR; else if (args.wait_result == KFD_IOC_WAIT_RESULT_TIMEOUT) result = HSAKMT_STATUS_WAIT_TIMEOUT; @@ -438,7 +479,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], Events[i]->EventData.EventData.MemoryAccessFault.Failure.ECC = ((event_data[i].memory_exception_data.ErrorType == 1) || 
(event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0; Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; - analysis_memory_exception(&event_data[i].memory_exception_data); + analysis_memory_exception(ctx, &event_data[i].memory_exception_data); } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && event_data[i].hw_exception_data.gpu_id) { @@ -464,7 +505,7 @@ out: return result; } -HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMICtx(HsaKFDContext *ctx, HSAuint32 NodeId, int *fd) { struct kfd_ioctl_smi_events_args args; HSAKMT_STATUS result; @@ -481,7 +522,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) } args.gpuid = gpuid; - result = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args); + result = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SMI_EVENTS, &args); if (result) { pr_debug("open SMI event fd failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; @@ -490,3 +531,73 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) *fd = args.anon_fd; return HSAKMT_STATUS_SUCCESS; } + + + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, + bool ManualReset, bool IsSignaled, + HsaEvent **Event) +{ + return hsaKmtCreateEventCtx(&hsakmt_primary_kfd_ctx, EventDesc, ManualReset, + IsSignaled, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) +{ + return hsaKmtDestroyEventCtx(&hsakmt_primary_kfd_ctx, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) +{ + return hsaKmtSetEventCtx(&hsakmt_primary_kfd_ctx, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) +{ + return hsaKmtResetEventCtx(&hsakmt_primary_kfd_ctx, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) +{ + return hsaKmtQueryEventStateCtx(&hsakmt_primary_kfd_ctx, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, + HSAuint32 
Milliseconds) +{ + return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event, + HSAuint32 Milliseconds, uint64_t *event_age) +{ + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, + true, Milliseconds, event_age); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds) +{ + return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, + WaitOnAll, Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds, + uint64_t *event_age) +{ + return hsaKmtWaitOnMultipleEvents_ExtCtx(&hsakmt_primary_kfd_ctx, + Events, NumEvents, WaitOnAll, Milliseconds, event_age); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) +{ + return hsaKmtOpenSMICtx(&hsakmt_primary_kfd_ctx, NodeId, fd); +} diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c index 75b9b481de..ea0b43fbd7 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.c +++ b/projects/rocr-runtime/libhsakmt/src/fmm.c @@ -245,12 +245,44 @@ typedef struct { uint32_t alignment_order; } svm_t; -/* The other apertures are specific to each GPU. gpu_mem_t manages GPU - * specific memory apertures. - */ -static gpu_mem_t *gpu_mem; -static unsigned int gpu_mem_count; -static gpu_mem_t *g_first_gpu_mem; +struct hsa_kfd_fmm_context +{ + /* The other apertures are specific to each GPU. gpu_mem_t manages GPU + * specific memory apertures. + */ + gpu_mem_t *gpu_mem; + unsigned int gpu_mem_count; + gpu_mem_t *first_gpu_mem; + +#define DRM_FIRST_RENDER_NODE 128 +#define DRM_LAST_RENDER_NODE 255 + + /* The VMs from DRM render nodes are used by KFD for the lifetime of + * the process. 
Therefore we have to keep using the same FDs for the + * lifetime of the process, even when we close and reopen KFD. There + * are up to 128 render nodes that we cache in this array. + */ + int drm_render_fds[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE]; + + /* amdgpu device handle for each gpu that libdrm uses */ + struct amdgpu_device *amdgpu_handle[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE]; +}; + +struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx) +{ + assert(ctx); + + if (ctx->fmm_context) + return ctx->fmm_context; + + ctx->fmm_context = calloc(1, sizeof(struct hsa_kfd_fmm_context)); + if (!ctx->fmm_context) { + pr_err("Alloc memory failed for struct hsa_kfd_fmm_context size %zu\n", + sizeof(struct hsa_kfd_fmm_context)); + return NULL; + } + return ctx->fmm_context; +} static void *dgpu_shared_aperture_base; static void *dgpu_shared_aperture_limit; @@ -322,10 +354,9 @@ static inline HsaSharedMemoryHandle *to_hsa_shared_memory_handle( return (HsaSharedMemoryHandle *)SharedMemoryStruct; } -static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture); -static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, - manageable_aperture_t *aperture, - void *address); +static int __fmm_release(HsaKFDContext *ctx, vm_object_t *object, manageable_aperture_t *aperture); +static int _fmm_unmap_from_gpu_scratch(HsaKFDContext *ctx, uint32_t gpu_id, + manageable_aperture_t *aperture, void *address); static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size); static vm_area_t *vm_create_and_init_area(void *start, void *end) @@ -927,29 +958,29 @@ static vm_object_t *aperture_allocate_object(manageable_aperture_t *app, return new_object; } -static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id) +static int32_t gpu_mem_find_by_gpu_id(struct hsa_kfd_fmm_context *fmm_ctx, uint32_t gpu_id) { uint32_t i; - for (i = 0 ; i < gpu_mem_count ; i++) - if (gpu_mem[i].gpu_id == gpu_id) + for (i = 0 ; 
i < fmm_ctx->gpu_mem_count ; i++) + if (fmm_ctx->gpu_mem[i].gpu_id == gpu_id) return i; return -1; } -static int32_t gpu_mem_find_by_node_id(uint32_t node_id) +static int32_t gpu_mem_find_by_node_id(struct hsa_kfd_fmm_context *fmm_ctx, uint32_t node_id) { uint32_t i; - for (i = 0 ; i < gpu_mem_count ; i++) - if (gpu_mem[i].node_id == node_id) + for (i = 0 ; i < fmm_ctx->gpu_mem_count ; i++) + if (fmm_ctx->gpu_mem[i].node_id == node_id) return i; return -1; } -static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info) +static manageable_aperture_t *fmm_get_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HsaApertureInfo info) { switch (info.type) { case HSA_APERTURE_DGPU: @@ -957,7 +988,7 @@ static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info) case HSA_APERTURE_DGPU_ALT: return svm.dgpu_alt_aperture; case HSA_APERTURE_GPUVM: - return &gpu_mem[info.idx].gpuvm_aperture; + return &fmm_ctx->gpu_mem[info.idx].gpuvm_aperture; case HSA_APERTURE_CPUVM: return &cpuvm_aperture; case HSA_APERTURE_MEMHANDLE: @@ -967,23 +998,24 @@ static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info) } } -static gpu_mem_t *fmm_is_scratch_aperture(const void *address) +static gpu_mem_t *fmm_is_scratch_aperture(struct hsa_kfd_fmm_context *fmm_ctx, const void *address) { uint32_t i; - for (i = 0; i < gpu_mem_count; i++) { - if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { + if (fmm_ctx->gpu_mem[i].gpu_id == NON_VALID_GPU_ID) continue; - if ((address >= gpu_mem[i].scratch_physical.base) && - (address <= gpu_mem[i].scratch_physical.limit)) - return &gpu_mem[i]; + if ((address >= fmm_ctx->gpu_mem[i].scratch_physical.base) && + (address <= fmm_ctx->gpu_mem[i].scratch_physical.limit)) + return &fmm_ctx->gpu_mem[i]; } return NULL; } -static manageable_aperture_t *fmm_find_aperture(const void *address, +static manageable_aperture_t *fmm_find_aperture(struct hsa_kfd_fmm_context *fmm_ctx, + const void *address, HsaApertureInfo 
*info) { manageable_aperture_t *aperture = NULL; @@ -1001,7 +1033,7 @@ static manageable_aperture_t *fmm_find_aperture(const void *address, if (address >= svm.dgpu_aperture->base && address <= svm.dgpu_aperture->limit) { - gpu_mem_ptr = fmm_is_scratch_aperture(address); + gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address); if (gpu_mem_ptr) { aperture = &gpu_mem_ptr->scratch_physical; } else { @@ -1023,10 +1055,10 @@ static manageable_aperture_t *fmm_find_aperture(const void *address, _info.type = HSA_APERTURE_DGPU; } else { /* gpuvm_aperture */ - for (i = 0; i < gpu_mem_count; i++) { - if ((address >= gpu_mem[i].gpuvm_aperture.base) && - (address <= gpu_mem[i].gpuvm_aperture.limit)) { - aperture = &gpu_mem[i].gpuvm_aperture; + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { + if ((address >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base) && + (address <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit)) { + aperture = &fmm_ctx->gpu_mem[i].gpuvm_aperture; _info.type = HSA_APERTURE_GPUVM; _info.idx = i; } @@ -1060,7 +1092,8 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags) return mflags; } -static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, +static HSAKMT_STATUS fmm_register_mem_svm_api(HsaKFDContext *ctx, + void *address, uint64_t size, HsaMemFlags flags) { struct kfd_ioctl_svm_args *args; @@ -1068,8 +1101,9 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, HSAuint32 page_offset = (HSAuint64)address & (PAGE_SIZE-1); HSAuint64 aligned_addr = (HSAuint64)address - page_offset; HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - if (!g_first_gpu_mem) + if (!fmm_ctx->first_gpu_mem) return HSAKMT_STATUS_ERROR; s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute); @@ -1087,7 +1121,7 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr, aligned_size); /* Driver does one 
copy_from_user, with extra attrs size */ - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { pr_debug("op set range attrs failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; } @@ -1095,7 +1129,8 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS fmm_map_mem_svm_api(void *address, +static HSAKMT_STATUS fmm_map_mem_svm_api(HsaKFDContext *ctx, + void *address, uint64_t size, uint32_t *nodes_to_map, uint32_t nodes_array_size) @@ -1103,8 +1138,9 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address, struct kfd_ioctl_svm_args *args; size_t s_attr; uint32_t i, nattr; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - if (!g_first_gpu_mem) + if (!fmm_ctx->first_gpu_mem) return HSAKMT_STATUS_ERROR; nattr = nodes_array_size; @@ -1120,7 +1156,7 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address, args->attrs[i].value = nodes_to_map[i]; } /* Driver does one copy_from_user, with extra attrs size */ - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) { pr_debug("op set range attrs failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; } @@ -1131,7 +1167,8 @@ static HSAKMT_STATUS fmm_map_mem_svm_api(void *address, /* After allocating the memory, return the vm_object created for this memory. * Return NULL if any failure. 
*/ -static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem, +static vm_object_t *fmm_allocate_memory_object(HsaKFDContext *ctx, + uint32_t gpu_id, void *mem, uint64_t MemorySizeInBytes, manageable_aperture_t *aperture, uint64_t *mmap_offset, @@ -1176,7 +1213,7 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem, do { args.size = size; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args)) goto err_hsakmt_ioctl_failed; /* Allocate object */ @@ -1212,14 +1249,14 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem, err_object_allocation_failed: free_args.handle = args.handle; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) { pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle); } err_hsakmt_ioctl_failed: if (vm_obj) { do { free_args.handle = vm_obj->handles[--vm_obj->handle_num]; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle); } while (vm_obj->handle_num); pthread_mutex_lock(&aperture->fmm_mutex); @@ -1258,19 +1295,20 @@ static void manageable_aperture_print(manageable_aperture_t *app) } } -void hsakmt_fmm_print(uint32_t gpu_id) +void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id) { - int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id >= 0) { /* Found */ pr_info("LDS aperture:\n"); - aperture_print(&gpu_mem[gpu_mem_id].lds_aperture); + aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].lds_aperture); pr_info("GPUVM aperture:\n"); - 
manageable_aperture_print(&gpu_mem[gpu_mem_id].gpuvm_aperture); + manageable_aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture); pr_info("Scratch aperture:\n"); - aperture_print(&gpu_mem[gpu_mem_id].scratch_aperture); + aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].scratch_aperture); pr_info("Scratch backing memory:\n"); - manageable_aperture_print(&gpu_mem[gpu_mem_id].scratch_physical); + manageable_aperture_print(&fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical); } pr_info("dGPU aperture:\n"); @@ -1282,7 +1320,7 @@ void hsakmt_fmm_print(uint32_t gpu_id) manageable_aperture_print(svm.dgpu_alt_aperture); } #else -void hsakmt_fmm_print(uint32_t gpu_id) +void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t gpu_id) { } #endif @@ -1298,7 +1336,8 @@ void hsakmt_fmm_print(uint32_t gpu_id) * object is found, this function returns with the * (*out_aper)->fmm_mutex locked. */ -static vm_object_t *vm_find_object(const void *addr, uint64_t size, +static vm_object_t *vm_find_object(struct hsa_kfd_fmm_context *fmm_ctx, + const void *addr, uint64_t size, manageable_aperture_t **out_aper) { manageable_aperture_t *aper = NULL; @@ -1307,11 +1346,11 @@ static vm_object_t *vm_find_object(const void *addr, uint64_t size, vm_object_t *obj = NULL; uint32_t i; - for (i = 0; i < gpu_mem_count; i++) - if (gpu_mem[i].gpu_id != NON_VALID_GPU_ID && - addr >= gpu_mem[i].gpuvm_aperture.base && - addr <= gpu_mem[i].gpuvm_aperture.limit) { - aper = &gpu_mem[i].gpuvm_aperture; + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) + if (fmm_ctx->gpu_mem[i].gpu_id != NON_VALID_GPU_ID && + addr >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base && + addr <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit) { + aper = &fmm_ctx->gpu_mem[i].gpuvm_aperture; break; } @@ -1409,19 +1448,20 @@ static HSAuint8 fmm_check_user_memory(const void *addr, HSAuint64 size) return sum; } -static void fmm_release_scratch(uint32_t gpu_id) +static void fmm_release_scratch(HsaKFDContext *ctx, uint32_t gpu_id) { int32_t gpu_mem_id; uint64_t 
size; vm_object_t *obj; manageable_aperture_t *aperture; rbtree_node_t *n; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return; - aperture = &gpu_mem[gpu_mem_id].scratch_physical; + aperture = &fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical; size = VOID_PTRS_SUB(aperture->limit, aperture->base) + 1; @@ -1435,7 +1475,7 @@ static void fmm_release_scratch(uint32_t gpu_id) pthread_mutex_unlock(&aperture->fmm_mutex); - _fmm_unmap_from_gpu_scratch(gpu_id, aperture, obj_addr); + _fmm_unmap_from_gpu_scratch(ctx, gpu_id, aperture, obj_addr); pthread_mutex_lock(&aperture->fmm_mutex); } @@ -1444,16 +1484,16 @@ static void fmm_release_scratch(uint32_t gpu_id) /* release address space */ pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex); aperture_release_area(svm.dgpu_aperture, - gpu_mem[gpu_mem_id].scratch_physical.base, + fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size); pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex); } else /* release address space */ - munmap(gpu_mem[gpu_mem_id].scratch_physical.base, size); + munmap(fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base, size); /* invalidate scratch backing aperture */ - gpu_mem[gpu_mem_id].scratch_physical.base = NULL; - gpu_mem[gpu_mem_id].scratch_physical.limit = NULL; + fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.base = NULL; + fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical.limit = NULL; } static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags) @@ -1471,20 +1511,22 @@ static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags) } #define SCRATCH_ALIGN 0x10000 -void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes) +void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx, + uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes) { manageable_aperture_t *aperture_phy; struct 
kfd_ioctl_set_scratch_backing_va_args args = {0}; int32_t gpu_mem_id; void *mem = NULL; uint64_t aligned_size = ALIGN_UP(MemorySizeInBytes, SCRATCH_ALIGN); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return NULL; - aperture_phy = &gpu_mem[gpu_mem_id].scratch_physical; + aperture_phy = &fmm_ctx->gpu_mem[gpu_mem_id].scratch_physical; if (aperture_phy->base || aperture_phy->limit) /* Scratch was already allocated for this GPU */ return NULL; @@ -1515,15 +1557,16 @@ void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t Memor args.gpu_id = gpu_id; args.va_addr = ((uint64_t)mem) >> 16; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, &args)) { - fmm_release_scratch(gpu_id); + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, &args)) { + fmm_release_scratch(ctx, gpu_id); return NULL; } return mem; } -static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes, +static void *__fmm_allocate_device(HsaKFDContext *ctx, + uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes, manageable_aperture_t *aperture, uint64_t *mmap_offset, uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj) { @@ -1545,7 +1588,7 @@ static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t Memo * Now that we have the area reserved, allocate memory in the device * itself */ - obj = fmm_allocate_memory_object(gpu_id, mem, + obj = fmm_allocate_memory_object(ctx, gpu_id, mem, MemorySizeInBytes, aperture, mmap_offset, ioc_flags); if (!obj) { /* @@ -1613,7 +1656,8 @@ static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size, } /* use udmabuf driver to allocate buf */ -static void* udmabuf_allocation(uint32_t gpu_id, uint32_t node_id, uint64_t size, +static void* 
udmabuf_allocation(HsaKFDContext *ctx, + uint32_t gpu_id, uint32_t node_id, uint64_t size, manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags, vm_object_t** vm_obj) { @@ -1699,7 +1743,7 @@ static void* udmabuf_allocation(uint32_t gpu_id, uint32_t node_id, uint64_t size importArgs.gpu_id = gpu_id; importArgs.dmabuf_fd = dmabuf_fd; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs); + ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs); if (ret) { pr_debug("ioctl AMDKFD_IOC_IMPORT_DMABUF failed\n, ret 0x%x", ret); goto error_release_dmabuf; @@ -1732,7 +1776,8 @@ error_release_memfd: return NULL; } -void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, +void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, + uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) { manageable_aperture_t *aperture; @@ -1741,9 +1786,10 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres uint64_t size, mmap_offset; void *mem; vm_object_t *vm_obj = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return NULL; @@ -1754,12 +1800,12 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags); - if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) { + if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) { aperture = svm.dgpu_aperture; if (mflags.ui32.AQLQueueMemory) size = MemorySizeInBytes * 2; } else { - aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; + aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture; } /* special case for va allocation without vram alloc */ @@ -1785,7 +1831,7 @@ 
void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres mem = NULL; if (hsakmt_udmabuf_dev_fd > 0 && aperture == svm.dgpu_aperture && !hsakmt_is_dgpu && aperture->ops == &mmap_aperture_ops) { - mem = udmabuf_allocation(gpu_id, node_id, size, aperture, alignment, + mem = udmabuf_allocation(ctx, gpu_id, node_id, size, aperture, alignment, mflags, &vm_obj); pr_debug("udmabuf_allocation mem %p\n", mem); if (!mem) @@ -1796,24 +1842,25 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres * fall back to use device driver to allocate memory */ if (!mem) { - mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset, + mem = __fmm_allocate_device(ctx, + gpu_id, address, size, aperture, &mmap_offset, ioc_flags, alignment, &vm_obj); /* if alloc vram-only not mmap to cpu vm since no va */ if (mem && !mflags.ui32.NoAddress) { void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes, mflags.ui32.HostAccess, - gpu_mem[gpu_mem_id].drm_render_fd, + fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd, mmap_offset); if (ret == MAP_FAILED) { - __fmm_release(vm_obj, aperture); + __fmm_release(ctx, vm_obj, aperture); return NULL; } #ifdef SANITIZER_AMDGPU if (vm_obj) { vm_obj->mmap_flags = mflags.ui32.HostAccess ? 
PROT_READ | PROT_WRITE : PROT_NONE; - vm_obj->mmap_fd = gpu_mem[gpu_mem_id].drm_render_fd; + vm_obj->mmap_fd = fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd; vm_obj->mmap_offset = mmap_offset; } #endif @@ -1832,7 +1879,8 @@ void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *addres return mem; } -void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, +void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx, + uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_mmap_offset) { manageable_aperture_t *aperture; @@ -1840,9 +1888,10 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint32_t ioc_flags; void *mem; vm_object_t *vm_obj = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return NULL; @@ -1852,8 +1901,8 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - mem = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, aperture, NULL, - ioc_flags, 0, &vm_obj); + mem = __fmm_allocate_device(ctx, gpu_id, NULL, MemorySizeInBytes, + aperture, NULL, ioc_flags, 0, &vm_obj); if (mem && vm_obj) { HsaMemFlags mflags; @@ -1872,10 +1921,10 @@ void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, if (mem) { void *ret = mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd, + MAP_SHARED | MAP_FIXED, ctx->fd, doorbell_mmap_offset); if (ret == MAP_FAILED) { - __fmm_release(vm_obj, aperture); + __fmm_release(ctx, vm_obj, aperture); return NULL; } } @@ -1986,8 +2035,10 @@ static int bind_mem_to_numa(uint32_t numa_node_id, void *mem, return 0; } -static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address, - 
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) +static void *fmm_allocate_host_gpu(HsaKFDContext *ctx, + uint32_t gpu_id, uint32_t node_id, void *address, + uint64_t MemorySizeInBytes, + uint64_t alignment, HsaMemFlags mflags) { manageable_aperture_t *aperture; vm_object_t *vm_obj = NULL; @@ -1995,21 +2046,22 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr int32_t gpu_drm_fd; uint32_t ioc_flags; uint32_t preferred_gpu_id; - int gpu_mem_id = 0; /* default to g_first_gpu_mem */ + int gpu_mem_id = 0; /* default to first_gpu_mem */ uint64_t size; void *mem; - if (!g_first_gpu_mem) + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + if (!fmm_ctx->first_gpu_mem) return NULL; if (gpu_id) { - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return NULL; } - preferred_gpu_id = gpu_mem[gpu_mem_id].gpu_id; - gpu_drm_fd = gpu_mem[gpu_mem_id].drm_render_fd; + preferred_gpu_id = fmm_ctx->gpu_mem[gpu_mem_id].gpu_id; + gpu_drm_fd = fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd; size = MemorySizeInBytes; ioc_flags = 0; @@ -2068,14 +2120,14 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr /* Create userptr BO */ mmap_offset = (uint64_t)mem; ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; - vm_obj = fmm_allocate_memory_object(preferred_gpu_id, mem, size, + vm_obj = fmm_allocate_memory_object(ctx, preferred_gpu_id, mem, size, aperture, &mmap_offset, ioc_flags); if (!vm_obj) goto out_release_area; } else { ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT; - mem = __fmm_allocate_device(preferred_gpu_id, address, size, aperture, + mem = __fmm_allocate_device(ctx, preferred_gpu_id, address, size, aperture, &mmap_offset, ioc_flags, alignment, &vm_obj); if (mem && mflags.ui32.HostAccess) { @@ -2084,7 +2136,7 @@ static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *addr gpu_drm_fd, 
mmap_offset); if (ret == MAP_FAILED) { - __fmm_release(vm_obj, aperture); + __fmm_release(ctx, vm_obj, aperture); return NULL; } } @@ -2119,11 +2171,12 @@ out_release_area: return NULL; } -void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, +void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx, + uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags) { if (hsakmt_is_dgpu) - return fmm_allocate_host_gpu(gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags); + return fmm_allocate_host_gpu(ctx, gpu_id, node_id, address, MemorySizeInBytes, alignment, mflags); if (alignment) {//Alignment not supported on non-dgpu pr_err("Non-default alignment not supported on non-dgpu\n"); @@ -2133,7 +2186,8 @@ void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags); } -static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture) +static int __fmm_release(HsaKFDContext *ctx, + vm_object_t *object, manageable_aperture_t *aperture) { struct kfd_ioctl_free_memory_of_gpu_args args = {0}; int ret = 0; @@ -2161,7 +2215,7 @@ static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture) args.handle = object->handles[i]; if (args.handle == 0) continue; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &args)) ret = -errno; } @@ -2176,20 +2230,21 @@ err_free_mem_failed: return ret; } -HSAKMT_STATUS hsakmt_fmm_release(void *address) +HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address) { manageable_aperture_t *aperture = NULL; vm_object_t *object = NULL; gpu_mem_t *gpu_mem_ptr = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Special handling for scratch memory */ - gpu_mem_ptr = fmm_is_scratch_aperture(address); + gpu_mem_ptr = 
fmm_is_scratch_aperture(fmm_ctx, address); if (gpu_mem_ptr) { - fmm_release_scratch(gpu_mem_ptr->gpu_id); + fmm_release_scratch(ctx, gpu_mem_ptr->gpu_id); return HSAKMT_STATUS_SUCCESS; } - object = vm_find_object(address, 0, &aperture); + object = vm_find_object(fmm_ctx, address, 0, &aperture); if (!object) return hsakmt_is_svm_api_supported ? @@ -2207,14 +2262,15 @@ HSAKMT_STATUS hsakmt_fmm_release(void *address) } else { pthread_mutex_unlock(&aperture->fmm_mutex); - if (__fmm_release(object, aperture)) + if (__fmm_release(ctx, object, aperture)) return HSAKMT_STATUS_ERROR; } return HSAKMT_STATUS_SUCCESS; } -static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_policy, +static int fmm_set_memory_policy(HsaKFDContext *ctx, + uint32_t gpu_id, int default_policy, int alt_policy, uintptr_t alt_base, uint64_t alt_size, uint32_t misc_process_flags) { @@ -2227,7 +2283,7 @@ static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_po args.alternate_aperture_size = alt_size; args.misc_process_flag = misc_process_flags; - return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); + return hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); } static uint32_t get_vm_alignment(uint32_t device_id) @@ -2242,7 +2298,7 @@ static uint32_t get_vm_alignment(uint32_t device_id) return MAX(PAGE_SIZE, page_size); } -static HSAKMT_STATUS get_process_apertures( +static HSAKMT_STATUS get_process_apertures(HsaKFDContext *ctx, struct kfd_process_device_apertures *process_apertures, uint32_t *num_of_nodes) { @@ -2251,7 +2307,7 @@ static HSAKMT_STATUS get_process_apertures( args_new.kfd_process_device_apertures_ptr = (uintptr_t)process_apertures; args_new.num_of_nodes = *num_of_nodes; - if (!hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, + if (!hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, (void *)&args_new)) { *num_of_nodes = args_new.num_of_nodes; return HSAKMT_STATUS_SUCCESS; @@ -2261,7 
+2317,7 @@ static HSAKMT_STATUS get_process_apertures( * a really old kernel */ memset(&args_old, 0, sizeof(args_old)); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void *)&args_old)) return HSAKMT_STATUS_ERROR; @@ -2274,29 +2330,18 @@ static HSAKMT_STATUS get_process_apertures( return HSAKMT_STATUS_SUCCESS; } -/* The VMs from DRM render nodes are used by KFD for the lifetime of - * the process. Therefore we have to keep using the same FDs for the - * lifetime of the process, even when we close and reopen KFD. There - * are up to 128 render nodes that we cache in this array. - */ -#define DRM_FIRST_RENDER_NODE 128 -#define DRM_LAST_RENDER_NODE 255 -static int drm_render_fds[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE]; - -/* amdgpu device handle for each gpu that libdrm uses */ -static struct amdgpu_device *amdgpu_handle[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE]; - -int hsakmt_open_drm_render_device(int minor) +int hsakmt_open_drm_render_device(HsaKFDContext *ctx, int minor) { char path[128]; int index, fd; uint32_t major_drm, minor_drm; struct amdgpu_device **device_handle; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - /* Bypass amdgpu if we're running a model. Return hsakmt_kfd_fd, which is the + /* Bypass amdgpu if we're running a model. Return ctx->fd, which is the * backing for all our "GPU" memory. 
*/ if (hsakmt_use_model) - return hsakmt_kfd_fd; + return ctx->fd; if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) { pr_err("DRM render minor %d out of range [%d, %d]\n", minor, @@ -2306,8 +2351,8 @@ int hsakmt_open_drm_render_device(int minor) index = minor - DRM_FIRST_RENDER_NODE; /* If the render node was already opened, keep using the same FD */ - if (drm_render_fds[index]) - return drm_render_fds[index]; + if (fmm_ctx->drm_render_fds[index]) + return fmm_ctx->drm_render_fds[index]; sprintf(path, "/dev/dri/renderD%d", minor); fd = open(path, O_RDWR | O_CLOEXEC); @@ -2319,9 +2364,9 @@ int hsakmt_open_drm_render_device(int minor) } return -errno; } - drm_render_fds[index] = fd; + fmm_ctx->drm_render_fds[index] = fd; - device_handle = &amdgpu_handle[index]; + device_handle = &fmm_ctx->amdgpu_handle[index]; if (!amdgpu_device_initialize(fd, &major_drm, &minor_drm, device_handle)) { /* if amdgpu_device_get_fd available query render fd that libdrm uses, * then close drm_render_fds above, replace it by fd libdrm uses. 
@@ -2329,8 +2374,8 @@ int hsakmt_open_drm_render_device(int minor) if (hsakmt_fn_amdgpu_device_get_fd) { fd = hsakmt_fn_amdgpu_device_get_fd(*device_handle); if (fd > 0) { - close(drm_render_fds[index]); - drm_render_fds[index] = fd; + close(fmm_ctx->drm_render_fds[index]); + fmm_ctx->drm_render_fds[index] = fd; } else { pr_err("amdgpu_device_get_fd failed: %d\n", fd); amdgpu_device_deinitialize(*device_handle); @@ -2342,14 +2387,14 @@ int hsakmt_open_drm_render_device(int minor) return fd; } -static HSAKMT_STATUS acquire_vm(uint32_t gpu_id, int fd) +static HSAKMT_STATUS acquire_vm(HsaKFDContext *ctx, uint32_t gpu_id, int fd) { struct kfd_ioctl_acquire_vm_args args; args.gpu_id = gpu_id; args.drm_fd = fd; pr_info("acquiring VM for %x using %d\n", gpu_id, fd); - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ACQUIRE_VM, (void *)&args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ACQUIRE_VM, (void *)&args)) { pr_err("AMDKFD_IOC_ACQUIRE_VM failed\n"); return HSAKMT_STATUS_ERROR; } @@ -2565,10 +2610,10 @@ static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit, return HSAKMT_STATUS_SUCCESS; } -static void fmm_init_rbtree(void) +static void fmm_init_rbtree(struct hsa_kfd_fmm_context *fmm_ctx) { static int once; - int i = gpu_mem_count; + int i = fmm_ctx->gpu_mem_count; if (once++ == 0) { rbtree_init(&svm.apertures[SVM_DEFAULT].tree); @@ -2582,14 +2627,15 @@ static void fmm_init_rbtree(void) } while (i--) { - rbtree_init(&gpu_mem[i].scratch_physical.tree); - rbtree_init(&gpu_mem[i].scratch_physical.user_tree); - rbtree_init(&gpu_mem[i].gpuvm_aperture.tree); - rbtree_init(&gpu_mem[i].gpuvm_aperture.user_tree); + rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.tree); + rbtree_init(&fmm_ctx->gpu_mem[i].scratch_physical.user_tree); + rbtree_init(&fmm_ctx->gpu_mem[i].gpuvm_aperture.tree); + rbtree_init(&fmm_ctx->gpu_mem[i].gpuvm_aperture.user_tree); } } -static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) +static void *map_mmio(HsaKFDContext *ctx, 
+ uint32_t node_id, uint32_t gpu_id, int mmap_fd) { void *mem; manageable_aperture_t *aperture = svm.dgpu_alt_aperture; @@ -2603,7 +2649,8 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; - mem = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture, + mem = __fmm_allocate_device(ctx, + gpu_id, NULL, PAGE_SIZE, aperture, &mmap_offset, ioc_flags, 0, &vm_obj); if (!mem || !vm_obj) @@ -2628,36 +2675,39 @@ static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd) MAP_SHARED | MAP_FIXED, mmap_fd, mmap_offset); if (ret == MAP_FAILED) { - __fmm_release(vm_obj, aperture); + __fmm_release(ctx, vm_obj, aperture); return NULL; } /* Map for GPU access*/ - if (hsakmt_fmm_map_to_gpu(mem, PAGE_SIZE, NULL)) { - __fmm_release(vm_obj, aperture); + if (hsakmt_fmm_map_to_gpu(ctx, mem, PAGE_SIZE, NULL)) { + __fmm_release(ctx, vm_obj, aperture); return NULL; } return mem; } -static void release_mmio(void) +static void release_mmio(HsaKFDContext *ctx) { uint32_t gpu_mem_id; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) { - if (!gpu_mem[gpu_mem_id].mmio_aperture.base) + for (gpu_mem_id = 0; gpu_mem_id < fmm_ctx->gpu_mem_count; gpu_mem_id++) { + if (!fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base) continue; - hsakmt_fmm_unmap_from_gpu(gpu_mem[gpu_mem_id].mmio_aperture.base); - munmap(gpu_mem[gpu_mem_id].mmio_aperture.base, PAGE_SIZE); - hsakmt_fmm_release(gpu_mem[gpu_mem_id].mmio_aperture.base); + hsakmt_fmm_unmap_from_gpu(ctx, fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base); + munmap(fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base, PAGE_SIZE); + hsakmt_fmm_release(ctx, fmm_ctx->gpu_mem[gpu_mem_id].mmio_aperture.base); } } -HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id, +HSAKMT_STATUS 
hsakmt_fmm_get_amdgpu_device_handle(HsaKFDContext *ctx, + uint32_t node_id, HsaAMDGPUDeviceHandle *DeviceHandle) { - int32_t i = gpu_mem_find_by_node_id(node_id); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + int32_t i = gpu_mem_find_by_node_id(fmm_ctx, node_id); int index; if (i < 0) @@ -2668,11 +2718,11 @@ HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id, return HSAKMT_STATUS_SUCCESS; } - index = gpu_mem[i].drm_render_minor - DRM_FIRST_RENDER_NODE; - if (!amdgpu_handle[index]) + index = fmm_ctx->gpu_mem[i].drm_render_minor - DRM_FIRST_RENDER_NODE; + if (!fmm_ctx->amdgpu_handle[index]) return HSAKMT_STATUS_INVALID_HANDLE; - *DeviceHandle = amdgpu_handle[index]; + *DeviceHandle = fmm_ctx->amdgpu_handle[index]; return HSAKMT_STATUS_SUCCESS; } @@ -2681,7 +2731,7 @@ static bool two_apertures_overlap(void *start_1, void *limit_1, void *start_2, v return (start_1 >= start_2 && start_1 <= limit_2) || (start_2 >= start_1 && start_2 <= limit_1); } -static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages) +static bool init_mem_handle_aperture(struct hsa_kfd_fmm_context *fmm_ctx, HSAuint32 align, HSAuint32 guard_pages) { bool found; uint32_t i; @@ -2695,24 +2745,24 @@ static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages) while (PORT_VPTR_TO_UINT64(mem_handle_aperture.base) < END_NON_CANONICAL_ADDR - 1) { found = true; - for (i = 0; i < gpu_mem_count; i++) { + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { - if (gpu_mem[i].lds_aperture.base && - two_apertures_overlap(gpu_mem[i].lds_aperture.base, gpu_mem[i].lds_aperture.limit, + if (fmm_ctx->gpu_mem[i].lds_aperture.base && + two_apertures_overlap(fmm_ctx->gpu_mem[i].lds_aperture.base, fmm_ctx->gpu_mem[i].lds_aperture.limit, mem_handle_aperture.base, mem_handle_aperture.limit)) { found = false; break; } - if (gpu_mem[i].scratch_aperture.base && - two_apertures_overlap(gpu_mem[i].scratch_aperture.base, 
gpu_mem[i].scratch_aperture.limit, + if (fmm_ctx->gpu_mem[i].scratch_aperture.base && + two_apertures_overlap(fmm_ctx->gpu_mem[i].scratch_aperture.base, fmm_ctx->gpu_mem[i].scratch_aperture.limit, mem_handle_aperture.base, mem_handle_aperture.limit)){ found = false; break; } - if (gpu_mem[i].gpuvm_aperture.base && - two_apertures_overlap(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit, + if (fmm_ctx->gpu_mem[i].gpuvm_aperture.base && + two_apertures_overlap(fmm_ctx->gpu_mem[i].gpuvm_aperture.base, fmm_ctx->gpu_mem[i].gpuvm_aperture.limit, mem_handle_aperture.base, mem_handle_aperture.limit)){ found = false; break; @@ -2737,10 +2787,13 @@ static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages) return false; } -HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) +HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, + unsigned int NumNodes) { uint32_t i; + uint32_t gpu_mem_count = 0; int32_t gpu_mem_id = 0; + gpu_mem_t *gpu_mem = NULL; struct kfd_process_device_apertures *process_apertures; uint32_t num_of_sysfs_nodes; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; @@ -2801,8 +2854,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) } pr_info("SVM alignment default order is %d.", svm.alignment_order); - gpu_mem_count = 0; - g_first_gpu_mem = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Trade off - NumNodes includes GPU nodes + CPU Node. 
So in * systems with CPU node, slightly more memory is allocated than @@ -2830,7 +2882,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) /* Skip non-GPU nodes */ if (props.KFDGpuID) { - int fd = hsakmt_open_drm_render_device(props.DrmRenderMinor); + int fd = hsakmt_open_drm_render_device(ctx, props.DrmRenderMinor); if (fd <= 0) { ret = HSAKMT_STATUS_ERROR; goto gpu_mem_init_failed; @@ -2867,13 +2919,14 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) gpu_mem[gpu_mem_count].gpuvm_aperture.ops = &reserved_aperture_ops; pthread_mutex_init(&gpu_mem[gpu_mem_count].gpuvm_aperture.fmm_mutex, NULL); - if (!g_first_gpu_mem) - g_first_gpu_mem = &gpu_mem[gpu_mem_count]; - gpu_mem_count++; } } + fmm_ctx->gpu_mem = gpu_mem; + fmm_ctx->gpu_mem_count = gpu_mem_count; + fmm_ctx->first_gpu_mem = gpu_mem; + /* The ioctl will also return Number of Nodes if * args.kfd_process_device_apertures_ptr is set to NULL. This is not * required since Number of nodes is already known. Kernel will fill in @@ -2895,7 +2948,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) * The Kernel driver could be not aware of this. * Get from Kernel driver information of all the nodes and then filter it. 
*/ - ret = get_process_apertures(process_apertures, &num_of_sysfs_nodes); + ret = get_process_apertures(ctx, process_apertures, &num_of_sysfs_nodes); if (ret != HSAKMT_STATUS_SUCCESS) goto get_aperture_ioctl_failed; @@ -2918,7 +2971,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) /* Map Kernel process device data node i <--> gpu_mem_id which * indexes into gpu_mem[] based on gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(process_apertures[i].gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, process_apertures[i].gpu_id); if (gpu_mem_id < 0) continue; @@ -2943,7 +2996,7 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) goto aperture_init_failed; for (j = 0; j < nodeProps.NumIOLinks; j++) { int32_t to_gpu_mem_id = - gpu_mem_find_by_node_id(linkProps[j].NodeTo); + gpu_mem_find_by_node_id(fmm_ctx, linkProps[j].NodeTo); uint32_t peer; if (to_gpu_mem_id < 0) @@ -3002,7 +3055,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) } /* Acquire the VM from the DRM render node for KFD use */ - ret = acquire_vm(gpu_mem[gpu_mem_id].gpu_id, + ret = acquire_vm(ctx, + gpu_mem[gpu_mem_id].gpu_id, gpu_mem[gpu_mem_id].drm_render_fd); if (ret != HSAKMT_STATUS_SUCCESS) goto aperture_init_failed; @@ -3030,7 +3084,8 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) alt_base = (uintptr_t)svm.dgpu_alt_aperture->base; alt_size = VOID_PTRS_SUB(svm.dgpu_alt_aperture->limit, svm.dgpu_alt_aperture->base) + 1; - err = fmm_set_memory_policy(process_apertures[i].gpu_id, + err = fmm_set_memory_policy(ctx, + process_apertures[i].gpu_id, svm.disable_cache ? 
KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT, @@ -3050,18 +3105,18 @@ HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes) cpuvm_aperture.align = PAGE_SIZE; cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */ - fmm_init_rbtree(); + fmm_init_rbtree(fmm_ctx); - if (!init_mem_handle_aperture(PAGE_SIZE, guardPages)) + if (!init_mem_handle_aperture(fmm_ctx, PAGE_SIZE, guardPages)) pr_err("Failed to init mem_handle_aperture\n"); for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) { if (!hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) continue; - gpu_mem[gpu_mem_id].mmio_aperture.base = map_mmio( + gpu_mem[gpu_mem_id].mmio_aperture.base = map_mmio(ctx, gpu_mem[gpu_mem_id].node_id, gpu_mem[gpu_mem_id].gpu_id, - hsakmt_kfd_fd); + ctx->fd); if (gpu_mem[gpu_mem_id].mmio_aperture.base) gpu_mem[gpu_mem_id].mmio_aperture.limit = (void *) ((char *)gpu_mem[gpu_mem_id].mmio_aperture.base + @@ -3083,13 +3138,15 @@ get_aperture_ioctl_failed: free(process_apertures); sysfs_parse_failed: gpu_mem_init_failed: - hsakmt_fmm_destroy_process_apertures(); + hsakmt_fmm_destroy_process_apertures(ctx); return ret; } -void hsakmt_fmm_destroy_process_apertures(void) +void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx) { - release_mmio(); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + + release_mmio(ctx); if (all_gpu_id_array) { free(all_gpu_id_array); @@ -3097,48 +3154,51 @@ void hsakmt_fmm_destroy_process_apertures(void) } all_gpu_id_array_size = 0; - if (gpu_mem) { - while (gpu_mem_count-- > 0) - free(gpu_mem[gpu_mem_count].usable_peer_id_array); - free(gpu_mem); - gpu_mem = NULL; + if (fmm_ctx->gpu_mem) { + while (fmm_ctx->gpu_mem_count-- > 0) + free(fmm_ctx->gpu_mem[fmm_ctx->gpu_mem_count].usable_peer_id_array); + free(fmm_ctx->gpu_mem); + fmm_ctx->gpu_mem = NULL; + fmm_ctx->first_gpu_mem = NULL; } - gpu_mem_count = 0; + fmm_ctx->gpu_mem_count = 0; } -HSAKMT_STATUS 
hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id, +HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx, + aperture_type_e aperture_type, HSAuint32 gpu_id, HSAuint64 *aperture_base, HSAuint64 *aperture_limit) { HSAKMT_STATUS err = HSAKMT_STATUS_ERROR; - int32_t slot = gpu_mem_find_by_gpu_id(gpu_id); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); + int32_t slot = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (slot < 0) return HSAKMT_STATUS_INVALID_PARAMETER; switch (aperture_type) { case FMM_GPUVM: - if (aperture_is_valid(gpu_mem[slot].gpuvm_aperture.base, - gpu_mem[slot].gpuvm_aperture.limit)) { - *aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.base); - *aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.limit); + if (aperture_is_valid(fmm_ctx->gpu_mem[slot].gpuvm_aperture.base, + fmm_ctx->gpu_mem[slot].gpuvm_aperture.limit)) { + *aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].gpuvm_aperture.base); + *aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].gpuvm_aperture.limit); err = HSAKMT_STATUS_SUCCESS; } break; case FMM_SCRATCH: - if (aperture_is_valid(gpu_mem[slot].scratch_aperture.base, - gpu_mem[slot].scratch_aperture.limit)) { - *aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.base); - *aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.limit); + if (aperture_is_valid(fmm_ctx->gpu_mem[slot].scratch_aperture.base, + fmm_ctx->gpu_mem[slot].scratch_aperture.limit)) { + *aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].scratch_aperture.base); + *aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].scratch_aperture.limit); err = HSAKMT_STATUS_SUCCESS; } break; case FMM_LDS: - if (aperture_is_valid(gpu_mem[slot].lds_aperture.base, - gpu_mem[slot].lds_aperture.limit)) { - *aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.base); - *aperture_limit = 
PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.limit); + if (aperture_is_valid(fmm_ctx->gpu_mem[slot].lds_aperture.base, + fmm_ctx->gpu_mem[slot].lds_aperture.limit)) { + *aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].lds_aperture.base); + *aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].lds_aperture.limit); err = HSAKMT_STATUS_SUCCESS; } break; @@ -3156,10 +3216,10 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_ty break; case FMM_MMIO: - if (aperture_is_valid(gpu_mem[slot].mmio_aperture.base, - gpu_mem[slot].mmio_aperture.limit)) { - *aperture_base = PORT_VPTR_TO_UINT64(gpu_mem[slot].mmio_aperture.base); - *aperture_limit = PORT_VPTR_TO_UINT64(gpu_mem[slot].mmio_aperture.limit); + if (aperture_is_valid(fmm_ctx->gpu_mem[slot].mmio_aperture.base, + fmm_ctx->gpu_mem[slot].mmio_aperture.limit)) { + *aperture_base = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].mmio_aperture.base); + *aperture_limit = PORT_VPTR_TO_UINT64(fmm_ctx->gpu_mem[slot].mmio_aperture.limit); err = HSAKMT_STATUS_SUCCESS; } break; @@ -3241,7 +3301,8 @@ static void add_device_ids_to_mapped_array(vm_object_t *obj, /* If nodes_to_map is not NULL, map the nodes specified; otherwise map all. 
*/ -static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture, +static HSAKMT_STATUS _fmm_map_to_gpu(HsaKFDContext *ctx, + manageable_aperture_t *aperture, void *address, uint64_t size, vm_object_t *obj, uint32_t *nodes_to_map, uint32_t nodes_array_size) { @@ -3250,6 +3311,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture, HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; int ret_ioctl; uint32_t i; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (!obj) pthread_mutex_lock(&aperture->fmm_mutex); @@ -3285,14 +3347,14 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture, sizeof(uint32_t); } else { /* not specified, not registered: map all GPUs */ - int32_t gpu_mem_id = gpu_mem_find_by_node_id(obj->node_id); + int32_t gpu_mem_id = gpu_mem_find_by_node_id(fmm_ctx, obj->node_id); if (!obj->userptr && hsakmt_get_device_id_by_node_id(obj->node_id) && gpu_mem_id >= 0) { args.device_ids_array_ptr = (uint64_t) - gpu_mem[gpu_mem_id].usable_peer_id_array; + fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_array; args.n_devices = - gpu_mem[gpu_mem_id].usable_peer_id_num; + fmm_ctx->gpu_mem[gpu_mem_id].usable_peer_id_num; } else { args.device_ids_array_ptr = (uint64_t)all_gpu_id_array; args.n_devices = all_gpu_id_array_size / sizeof(uint32_t); @@ -3303,7 +3365,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture, args.n_success = 0; args.handle = object->handles[i]; - ret_ioctl = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args); + ret_ioctl = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args); if (ret_ioctl) { pr_err("GPU mapping failed (%d) for obj at %p, userptr %p, size %lu", ret_ioctl, object->start, object->userptr, object->size); @@ -3330,7 +3392,7 @@ static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture, err_map_failed: while (ret && i--) { args.handle = object->handles[i]; - hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, 
&args); + hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); } exit_ok: err_object_not_found: @@ -3339,7 +3401,8 @@ err_object_not_found: return ret; } -static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *aperture, +static HSAKMT_STATUS _fmm_map_to_gpu_scratch(HsaKFDContext *ctx, + uint32_t gpu_id, manageable_aperture_t *aperture, void *address, uint64_t size) { int32_t gpu_mem_id; @@ -3349,9 +3412,10 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_apertur void *mmap_ret = NULL; uint64_t mmap_offset = 0; vm_object_t *obj; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -3363,33 +3427,35 @@ static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_apertur VOID_PTR_ADD(address, size - 1) > aperture->limit) return HSAKMT_STATUS_INVALID_PARAMETER; - is_debugger = hsakmt_debug_get_reg_status(gpu_mem[gpu_mem_id].node_id); + is_debugger = hsakmt_debug_get_reg_status(fmm_ctx->gpu_mem[gpu_mem_id].node_id); flags = is_debugger ? 
KFD_IOC_ALLOC_MEM_FLAGS_GTT : KFD_IOC_ALLOC_MEM_FLAGS_VRAM; flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE; /* allocate object within the scratch backing aperture */ - obj = fmm_allocate_memory_object(gpu_id, address, size, + obj = fmm_allocate_memory_object(ctx, + gpu_id, address, size, aperture, &mmap_offset, flags); if (!obj) return HSAKMT_STATUS_INVALID_HANDLE; /* Create a CPU mapping for the debugger */ mmap_ret = fmm_map_to_cpu(address, size, is_debugger, - gpu_mem[gpu_mem_id].drm_render_fd, + fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd, mmap_offset); if (mmap_ret == MAP_FAILED) { - __fmm_release(obj, aperture); + __fmm_release(ctx, obj, aperture); return HSAKMT_STATUS_ERROR; } /* map to GPU */ - ret = _fmm_map_to_gpu(aperture, address, size, NULL, &gpu_id, sizeof(uint32_t)); + ret = _fmm_map_to_gpu(ctx, aperture, address, size, NULL, &gpu_id, sizeof(uint32_t)); if (ret != HSAKMT_STATUS_SUCCESS) - __fmm_release(obj, aperture); + __fmm_release(ctx, obj, aperture); return ret; } -static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size, +static HSAKMT_STATUS _fmm_map_to_gpu_userptr(HsaKFDContext *ctx, + void *addr, uint64_t size, uint64_t *gpuvm_addr, vm_object_t *object, uint32_t *nodes_to_map, uint32_t nodes_array_size) { @@ -3411,14 +3477,14 @@ static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size, } pr_debug("%s Mapping Address %p size aligned: %ld offset: %x\n", __func__, svm_addr, PAGE_ALIGN_UP(page_offset + size), page_offset); - ret = fmm_map_mem_svm_api(svm_addr, + ret = fmm_map_mem_svm_api(ctx, svm_addr, PAGE_ALIGN_UP(page_offset + size), nodes_to_map, nodes_array_size / sizeof(uint32_t)); } else if (object) { svm_addr = object->start; - ret = _fmm_map_to_gpu(aperture, svm_addr, object->size, object, NULL, 0); + ret = _fmm_map_to_gpu(ctx, aperture, svm_addr, object->size, object, NULL, 0); } else { pr_err("Object is null and SVM API is not supported.\n"); return HSAKMT_STATUS_ERROR; @@ -3429,22 +3495,24 @@ static HSAKMT_STATUS 
_fmm_map_to_gpu_userptr(void *addr, uint64_t size, return ret; } -HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address) +HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx, + void *address, uint64_t size, uint64_t *gpuvm_address) { manageable_aperture_t *aperture = NULL; vm_object_t *object; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; gpu_mem_t *gpu_mem_ptr = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Special handling for scratch memory */ - gpu_mem_ptr = fmm_is_scratch_aperture(address); + gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address); if (gpu_mem_ptr) { - return _fmm_map_to_gpu_scratch(gpu_mem_ptr->gpu_id, + return _fmm_map_to_gpu_scratch(ctx, gpu_mem_ptr->gpu_id, &gpu_mem_ptr->scratch_physical, address, size); } - object = vm_find_object(address, size, &aperture); + object = vm_find_object(fmm_ctx, address, size, &aperture); if (!object && !hsakmt_is_svm_api_supported) { if (!hsakmt_is_dgpu) { /* Prefetch memory on APUs with dummy-reads */ @@ -3473,9 +3541,9 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuv fmm_check_user_memory(address, size); ret = HSAKMT_STATUS_SUCCESS; } else if ((hsakmt_is_svm_api_supported && !object) || (object && (object->userptr))) { - ret = _fmm_map_to_gpu_userptr(address, size, gpuvm_address, object, NULL, 0); + ret = _fmm_map_to_gpu_userptr(ctx, address, size, gpuvm_address, object, NULL, 0); } else if (aperture) { - ret = _fmm_map_to_gpu(aperture, address, size, object, NULL, 0); + ret = _fmm_map_to_gpu(ctx, aperture, address, size, object, NULL, 0); /* Update alternate GPUVM address only for * CPU-invisible apertures on old APUs */ @@ -3500,7 +3568,8 @@ static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_ #endif } -static int _fmm_unmap_from_gpu(manageable_aperture_t *aperture, void *address, +static int _fmm_unmap_from_gpu(HsaKFDContext *ctx, + manageable_aperture_t 
*aperture, void *address, uint32_t *device_ids_array, uint32_t device_ids_array_size, vm_object_t *obj) { @@ -3553,7 +3622,7 @@ static int _fmm_unmap_from_gpu(manageable_aperture_t *aperture, void *address, args.handle = object->handles[i]; args.n_success = 0; - tmp_ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); + tmp_ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); if (tmp_ret) ret = tmp_ret; } @@ -3574,7 +3643,8 @@ out: return ret; } -static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, +static int _fmm_unmap_from_gpu_scratch(HsaKFDContext *ctx, + uint32_t gpu_id, manageable_aperture_t *aperture, void *address) { @@ -3582,9 +3652,10 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, vm_object_t *object; struct kfd_ioctl_unmap_memory_from_gpu_args args = {0}; int ret; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Retrieve gpu_mem id according to gpu_id */ - gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id); + gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, gpu_id); if (gpu_mem_id < 0) return -1; @@ -3611,7 +3682,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, args.device_ids_array_ptr = (uint64_t)object->mapped_device_id_array; args.n_devices = object->mapped_device_id_array_size / sizeof(uint32_t); args.n_success = 0; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); + ret = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args); /* unmap from CPU while keeping the address space reserved */ mmap(address, object->size, PROT_NONE, @@ -3632,29 +3703,31 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id, pthread_mutex_unlock(&aperture->fmm_mutex); /* free object in scratch backing aperture */ - return __fmm_release(object, aperture); + return __fmm_release(ctx, object, aperture); err: pthread_mutex_unlock(&aperture->fmm_mutex); return ret; } -int hsakmt_fmm_unmap_from_gpu(void *address) +int 
hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address) { manageable_aperture_t *aperture; vm_object_t *object; int ret; gpu_mem_t *gpu_mem_ptr = NULL; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Special handling for scratch memory */ - gpu_mem_ptr = fmm_is_scratch_aperture(address); + gpu_mem_ptr = fmm_is_scratch_aperture(fmm_ctx, address); if (gpu_mem_ptr) { - return _fmm_unmap_from_gpu_scratch(gpu_mem_ptr->gpu_id, - &gpu_mem_ptr->scratch_physical, - address); + return _fmm_unmap_from_gpu_scratch(ctx, + gpu_mem_ptr->gpu_id, + &gpu_mem_ptr->scratch_physical, + address); } - object = vm_find_object(address, 0, &aperture); + object = vm_find_object(fmm_ctx, address, 0, &aperture); if (!object) /* On APUs GPU unmapping of system memory is a no-op */ return (!hsakmt_is_dgpu || hsakmt_is_svm_api_supported) ? 0 : -EINVAL; @@ -3664,7 +3737,7 @@ int hsakmt_fmm_unmap_from_gpu(void *address) /* On APUs GPU unmapping of system memory is a no-op */ ret = 0; else - ret = _fmm_unmap_from_gpu(aperture, address, NULL, 0, object); + ret = _fmm_unmap_from_gpu(ctx, aperture, address, NULL, 0, object); pthread_mutex_unlock(&aperture->fmm_mutex); @@ -3681,24 +3754,23 @@ int hsakmt_fmm_unmap_from_gpu(void *address) * * Returns true if the handle is found, false otherwise. 
*/ -bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offset) +bool hsakmt_fmm_get_handle(HsaKFDContext *ctx, + void *address, uint64_t *handle, uint64_t *size_offset) { uint32_t i; - manageable_aperture_t *aperture; + manageable_aperture_t *aperture = NULL; vm_object_t *object; - bool found; - - found = false; - aperture = NULL; + bool found = false; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Find the aperture the requested address belongs to */ - for (i = 0; i < gpu_mem_count; i++) { - if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { + if (fmm_ctx->gpu_mem[i].gpu_id == NON_VALID_GPU_ID) continue; - if ((address >= gpu_mem[i].gpuvm_aperture.base) && - (address <= gpu_mem[i].gpuvm_aperture.limit)) { - aperture = &gpu_mem[i].gpuvm_aperture; + if ((address >= fmm_ctx->gpu_mem[i].gpuvm_aperture.base) && + (address <= fmm_ctx->gpu_mem[i].gpuvm_aperture.limit)) { + aperture = &fmm_ctx->gpu_mem[i].gpuvm_aperture; break; } } @@ -3744,7 +3816,8 @@ bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offse return found; } -static HSAKMT_STATUS fmm_register_user_memory(void *addr, +static HSAKMT_STATUS fmm_register_user_memory(HsaKFDContext *ctx, + void *addr, HSAuint64 size, vm_object_t **obj_ret, HsaMemFlags flags) @@ -3756,19 +3829,21 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, void *svm_addr; HSAuint32 gpu_id; vm_object_t *obj, *exist_obj; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Find first GPU for creating the userptr BO */ - if (!g_first_gpu_mem) + if (!fmm_ctx->first_gpu_mem) return HSAKMT_STATUS_ERROR; - gpu_id = g_first_gpu_mem->gpu_id; + gpu_id = fmm_ctx->first_gpu_mem->gpu_id; /* Optionally check that the CPU mapping is valid */ if (svm.check_userptr) fmm_check_user_memory(addr, size); /* Allocate BO, userptr address is passed in mmap_offset */ - svm_addr = 
__fmm_allocate_device(gpu_id, NULL, aligned_size, aperture, + svm_addr = __fmm_allocate_device(ctx, + gpu_id, NULL, aligned_size, aperture, &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | @@ -3801,14 +3876,15 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, pthread_mutex_unlock(&aperture->fmm_mutex); if (exist_obj) - __fmm_release(obj, aperture); + __fmm_release(ctx, obj, aperture); if (obj_ret) *obj_ret = exist_obj ? exist_obj : obj; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, +HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx, + void *address, uint64_t size_in_bytes, uint32_t *gpu_id_array, uint32_t gpu_id_array_size, HsaMemFlags flags) @@ -3816,6 +3892,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, manageable_aperture_t *aperture = NULL; vm_object_t *object = NULL; HSAKMT_STATUS ret; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (gpu_id_array_size > 0 && !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -3823,7 +3900,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, if (flags.ui32.CoarseGrain && flags.ui32.ExtendedCoherent) return HSAKMT_STATUS_INVALID_PARAMETER; - object = vm_find_object(address, size_in_bytes, &aperture); + object = vm_find_object(fmm_ctx, address, size_in_bytes, &aperture); if (!object) { if (!hsakmt_is_dgpu) /* System memory registration on APUs is a no-op */ @@ -3831,12 +3908,12 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, /* Register a new user ptr */ if (hsakmt_is_svm_api_supported) { - ret = fmm_register_mem_svm_api(address, size_in_bytes, flags); + ret = fmm_register_mem_svm_api(ctx, address, size_in_bytes, flags); if (ret == HSAKMT_STATUS_SUCCESS) return ret; pr_debug("SVM failed, falling back to old registration\n"); } - 
ret = fmm_register_user_memory(address, size_in_bytes, &object, flags); + ret = fmm_register_user_memory(ctx, address, size_in_bytes, &object, flags); if (ret != HSAKMT_STATUS_SUCCESS) return ret; @@ -3891,7 +3968,8 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, } #define GRAPHICS_METADATA_DEFAULT_SIZE 64 -HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle, +HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, + HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo *GraphicsResourceInfo, uint32_t *gpu_id_array, uint32_t gpu_id_array_size, @@ -3909,6 +3987,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand int r; HSAKMT_STATUS status = HSAKMT_STATUS_ERROR; static const uint64_t IMAGE_ALIGN = 256*1024; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (gpu_id_array_size > 0 && !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -3919,7 +3998,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand if (!metadata) return HSAKMT_STATUS_NO_MEMORY; infoArgs.metadata_ptr = (uint64_t)metadata; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs); if (r && infoArgs.metadata_size > GRAPHICS_METADATA_DEFAULT_SIZE) { /* Try again with bigger metadata */ free(metadata); @@ -3927,24 +4006,24 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand if (!metadata) return HSAKMT_STATUS_NO_MEMORY; infoArgs.metadata_ptr = (uint64_t)metadata; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs); } if (r) goto error_free_metadata; /* Choose aperture based on GPU and allocate virtual address */ - gpu_mem_id = gpu_mem_find_by_gpu_id(infoArgs.gpu_id); + gpu_mem_id = 
gpu_mem_find_by_gpu_id(fmm_ctx, infoArgs.gpu_id); if (gpu_mem_id < 0) goto error_free_metadata; /* import DMA buffer without VA assigned */ if (!gpu_id_array && gpu_id_array_size == 0 && !RegisterFlags.ui32.requiresVAddr) { aperture = &mem_handle_aperture; - } else if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) { + } else if (hsakmt_topology_is_svm_needed(fmm_ctx->gpu_mem[gpu_mem_id].EngineId)) { aperture = svm.dgpu_aperture; } else { - aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture; + aperture = &fmm_ctx->gpu_mem[gpu_mem_id].gpuvm_aperture; aperture_base = aperture->base; } if (!aperture_is_valid(aperture->base, aperture->limit)) @@ -3965,7 +4044,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand importArgs.gpu_id = infoArgs.gpu_id; importArgs.dmabuf_fd = GraphicsResourceHandle; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs); if (r) { pthread_mutex_unlock(&aperture->fmm_mutex); goto error_release_aperture; @@ -3996,7 +4075,7 @@ HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHand error_release_buffer: freeArgs.handle = importArgs.handle; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) { /* Handle error if memory is not freed properly */ pr_err("Failed to free GPU memory\n"); } @@ -4008,7 +4087,8 @@ error_free_metadata: return status; } -HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress, +HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, int *DMABufFd, HSAuint64 *Offset) @@ -4019,8 +4099,9 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress, vm_object_t *obj; HSAuint64 offset; int r; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - aperture = 
fmm_find_aperture(MemoryAddress, &ApeInfo); + aperture = fmm_find_aperture(fmm_ctx, MemoryAddress, &ApeInfo); if (!aperture) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -4040,7 +4121,7 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress, if (!obj) return HSAKMT_STATUS_INVALID_PARAMETER; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_EXPORT_DMABUF, (void *)&exportArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_EXPORT_DMABUF, (void *)&exportArgs); if (r) return HSAKMT_STATUS_ERROR; @@ -4050,7 +4131,8 @@ HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress, +HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 SizeInBytes, HsaSharedMemoryHandle *SharedMemoryHandle) { @@ -4062,11 +4144,12 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress, HsaApertureInfo ApeInfo; HsaSharedMemoryStruct *SharedMemoryStruct = to_hsa_shared_memory_struct(SharedMemoryHandle); + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (SizeInBytes >= (1ULL << ((sizeof(HSAuint32) * 8) + PAGE_SHIFT))) return HSAKMT_STATUS_INVALID_PARAMETER; - aperture = fmm_find_aperture(MemoryAddress, &ApeInfo); + aperture = fmm_find_aperture(fmm_ctx, MemoryAddress, &ApeInfo); if (!aperture) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -4083,16 +4166,16 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress, /* Sharing non paged system memory. Use first GPU which was * used during allocation. 
See fmm_allocate_host_gpu() */ - if (!g_first_gpu_mem) + if (!fmm_ctx->first_gpu_mem) return HSAKMT_STATUS_ERROR; - gpu_id = g_first_gpu_mem->gpu_id; + gpu_id = fmm_ctx->first_gpu_mem->gpu_id; } exportArgs.handle = obj->handles[0]; exportArgs.gpu_id = gpu_id; exportArgs.flags = obj->mflags.Value; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_EXPORT_HANDLE, (void *)&exportArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IPC_EXPORT_HANDLE, (void *)&exportArgs); if (r) return HSAKMT_STATUS_ERROR; @@ -4105,7 +4188,8 @@ HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle, +HSAKMT_STATUS hsakmt_fmm_register_shared_memory(HsaKFDContext *ctx, + const HsaSharedMemoryHandle *SharedMemoryHandle, HSAuint64 *SizeInBytes, void **MemoryAddress, uint32_t *gpu_id_array, @@ -4122,6 +4206,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha to_const_hsa_shared_memory_struct(SharedMemoryHandle); HSAuint64 SizeInPages = SharedMemoryStruct->SizeInPages; HsaMemFlags mflags; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (gpu_id_array_size > 0 && !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -4130,7 +4215,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha sizeof(importArgs.share_handle)); importArgs.gpu_id = SharedMemoryStruct->ExportGpuId; - aperture = fmm_get_aperture(SharedMemoryStruct->ApeInfo); + aperture = fmm_get_aperture(fmm_ctx, SharedMemoryStruct->ApeInfo); if (!aperture) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -4143,7 +4228,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha } importArgs.va_addr = (uint64_t)reservedMem; - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_IMPORT_HANDLE, (void *)&importArgs); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_IPC_IMPORT_HANDLE, (void *)&importArgs); if (r) { 
err = HSAKMT_STATUS_ERROR; goto err_import; @@ -4158,7 +4243,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha } if (importArgs.mmap_offset) { - int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(importArgs.gpu_id); + int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(fmm_ctx, importArgs.gpu_id); void *ret; if (gpu_mem_id < 0) { @@ -4168,11 +4253,11 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha err = HSAKMT_STATUS_ERROR; goto err_free_mem; } - obj->node_id = gpu_mem[gpu_mem_id].node_id; + obj->node_id = fmm_ctx->gpu_mem[gpu_mem_id].node_id; pthread_mutex_unlock(&aperture->fmm_mutex); ret = fmm_map_to_cpu(reservedMem, (SizeInPages << PAGE_SHIFT), - true, gpu_mem[gpu_mem_id].drm_render_fd, + true, fmm_ctx->gpu_mem[gpu_mem_id].drm_render_fd, importArgs.mmap_offset); if (ret == MAP_FAILED) { @@ -4199,7 +4284,7 @@ HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *Sha return HSAKMT_STATUS_SUCCESS; err_free_mem_handle: freeArgs.handle = importArgs.handle; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) { pr_err("Failed to free GPU memory for handle %llu\n", freeArgs.handle); } err_free_mem: @@ -4209,12 +4294,13 @@ err_import: return err; } -HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address) +HSAKMT_STATUS hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address) { manageable_aperture_t *aperture; vm_object_t *object; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - object = vm_find_object(address, 0, &aperture); + object = vm_find_object(fmm_ctx, address, 0, &aperture); if (!object) /* On APUs we assume it's a random system memory address * where registration and dergistration is a no-op @@ -4238,7 +4324,7 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address) * userptrs means releasing the BO. 
*/ pthread_mutex_unlock(&aperture->fmm_mutex); - __fmm_release(object, aperture); + __fmm_release(ctx, object, aperture); return HSAKMT_STATUS_SUCCESS; } @@ -4268,7 +4354,8 @@ HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address) * and maps nodes_to_map */ -HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, +HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx, + void *address, uint64_t size, uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address) { @@ -4278,11 +4365,12 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, uint32_t *registered_node_id_array, registered_node_id_array_size; HSAKMT_STATUS ret; int retcode = 0; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); if (!num_of_nodes || !nodes_to_map || !address) return HSAKMT_STATUS_INVALID_PARAMETER; - object = vm_find_object(address, size, &aperture); + object = vm_find_object(fmm_ctx, address, size, &aperture); if (!object && !hsakmt_is_svm_api_supported) return HSAKMT_STATUS_ERROR; /* Successful vm_find_object returns with aperture locked */ @@ -4307,7 +4395,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, } if ((hsakmt_is_svm_api_supported && !object) || object->userptr) { - retcode = _fmm_map_to_gpu_userptr(address, size, gpuvm_address, + retcode = _fmm_map_to_gpu_userptr(ctx, address, size, gpuvm_address, object, nodes_to_map, num_of_nodes * sizeof(uint32_t)); if (object) pthread_mutex_unlock(&aperture->fmm_mutex); @@ -4345,7 +4433,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, temp_node_id_array_size *= sizeof(uint32_t); if (temp_node_id_array_size) { - ret = _fmm_unmap_from_gpu(aperture, address, + ret = _fmm_unmap_from_gpu(ctx, aperture, address, temp_node_id_array, temp_node_id_array_size, object); @@ -4371,7 +4459,7 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, } if (map_node_id_array_size) - retcode = 
_fmm_map_to_gpu(aperture, address, size, object, + retcode = _fmm_map_to_gpu(ctx, aperture, address, size, object, map_node_id_array, map_node_id_array_size * sizeof(uint32_t)); @@ -4383,16 +4471,18 @@ HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info) +HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx, + const void *address, HsaPointerInfo *info) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; uint32_t i; manageable_aperture_t *aperture; vm_object_t *vm_obj; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); memset(info, 0, sizeof(HsaPointerInfo)); - vm_obj = vm_find_object(address, UINT64_MAX, &aperture); + vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture); if (!vm_obj) { info->Type = HSA_POINTER_UNKNOWN; return HSAKMT_STATUS_ERROR; @@ -4468,13 +4558,14 @@ HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info) } #ifdef SANITIZER_AMDGPU -HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address) +HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(HsaKFDContext *ctx, void* address) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; manageable_aperture_t* aperture; vm_object_t* vm_obj; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - vm_obj = vm_find_object(address, UINT64_MAX, &aperture); + vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture); if (!vm_obj) return HSAKMT_STATUS_ERROR; /* Successful vm_find_object returns with the aperture locked */ @@ -4495,13 +4586,14 @@ HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address) return ret; } -HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address) +HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(HsaKFDContext *ctx, void* address) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; manageable_aperture_t* aperture; vm_object_t* vm_obj; + struct 
hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - vm_obj = vm_find_object(address, UINT64_MAX, &aperture); + vm_obj = vm_find_object(fmm_ctx, address, UINT64_MAX, &aperture); if (!vm_obj) return HSAKMT_STATUS_ERROR; /* Successful vm_find_object returns with the aperture locked */ @@ -4525,12 +4617,14 @@ HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address) } #endif -HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data) +HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(HsaKFDContext *ctx, + const void *mem, void *usr_data) { manageable_aperture_t *aperture; vm_object_t *vm_obj; + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); - vm_obj = vm_find_object(mem, 0, &aperture); + vm_obj = vm_find_object(fmm_ctx, mem, 0, &aperture); if (!vm_obj) return HSAKMT_STATUS_ERROR; @@ -4560,29 +4654,32 @@ static void fmm_clear_aperture(manageable_aperture_t *app) * after a fork(). This will clear all vm_objects and mmaps duplicated from * the parent. */ -void hsakmt_fmm_clear_all_mem(void) +void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx) { uint32_t i; - + + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); /* Close render node FDs. 
The child process needs to open new ones */ for (i = 0; i <= DRM_LAST_RENDER_NODE - DRM_FIRST_RENDER_NODE; i++) { - if (amdgpu_handle[i]) { - amdgpu_device_deinitialize(amdgpu_handle[i]); - amdgpu_handle[i] = NULL; - } else if (drm_render_fds[i]) { - close(drm_render_fds[i]); + if (fmm_ctx->amdgpu_handle[i]) { + amdgpu_device_deinitialize(fmm_ctx->amdgpu_handle[i]); + fmm_ctx->amdgpu_handle[i] = NULL; + } else if (fmm_ctx->drm_render_fds[i]) { + close(fmm_ctx->drm_render_fds[i]); } - drm_render_fds[i] = 0; + fmm_ctx->drm_render_fds[i] = 0; } - hsakmt_fmm_clear_all_aperture(); + hsakmt_fmm_clear_all_aperture(ctx); } -void hsakmt_fmm_clear_all_aperture(void) +void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx) { uint32_t i; void *map_addr; + + struct hsa_kfd_fmm_context *fmm_ctx = hsakmt_kfdcontext_get_fmm_context(ctx); fmm_clear_aperture(&mem_handle_aperture); fmm_clear_aperture(&cpuvm_aperture); @@ -4609,13 +4706,13 @@ void hsakmt_fmm_clear_all_aperture(void) } /* Nothing is initialized. 
*/ - if (!gpu_mem) + if (!fmm_ctx->gpu_mem) return; - for (i = 0; i < gpu_mem_count; i++) { - fmm_clear_aperture(&gpu_mem[i].gpuvm_aperture); - fmm_clear_aperture(&gpu_mem[i].scratch_physical); + for (i = 0; i < fmm_ctx->gpu_mem_count; i++) { + fmm_clear_aperture(&fmm_ctx->gpu_mem[i].gpuvm_aperture); + fmm_clear_aperture(&fmm_ctx->gpu_mem[i].scratch_physical); } - hsakmt_fmm_destroy_process_apertures(); + hsakmt_fmm_destroy_process_apertures(ctx); } diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.h b/projects/rocr-runtime/libhsakmt/src/fmm.h index f98b129b5d..ebb2ce7900 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.h +++ b/projects/rocr-runtime/libhsakmt/src/fmm.h @@ -45,59 +45,113 @@ typedef struct { void *start_address; } aperture_properties_t; -HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id, HsaAMDGPUDeviceHandle *DeviceHandle); -HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes); -void hsakmt_fmm_destroy_process_apertures(void); +HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(HsaKFDContext *ctx, + uint32_t node_id, HsaAMDGPUDeviceHandle *DeviceHandle); +HSAKMT_STATUS hsakmt_fmm_init_process_apertures(HsaKFDContext *ctx, unsigned int NumNodes); +void hsakmt_fmm_destroy_process_apertures(HsaKFDContext *ctx); /* Memory interface */ -void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes); -void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address, - uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags); -void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset); -void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes, - uint64_t alignment, HsaMemFlags flags); -void hsakmt_fmm_print(uint32_t node); -HSAKMT_STATUS hsakmt_fmm_release(void *address); -HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address); 
-int hsakmt_fmm_unmap_from_gpu(void *address); -bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offset); -HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info); -HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data); +// Memory allocation/free functions +void *hsakmt_fmm_allocate_scratch(HsaKFDContext *ctx, + uint32_t gpu_id, + void *address, + uint64_t MemorySizeInBytes); + +void *hsakmt_fmm_allocate_device(HsaKFDContext *ctx, + uint32_t gpu_id, + uint32_t node_id, + void *address, + uint64_t MemorySizeInBytes, + uint64_t alignment, + HsaMemFlags flags); + +void *hsakmt_fmm_allocate_host(HsaKFDContext *ctx, + uint32_t gpu_id, + uint32_t node_id, + void *address, + uint64_t MemorySizeInBytes, + uint64_t alignment, + HsaMemFlags flags); + +void *hsakmt_fmm_allocate_doorbell(HsaKFDContext *ctx, + uint32_t gpu_id, + uint64_t MemorySizeInBytes, + uint64_t doorbell_offset); + +void hsakmt_fmm_print(HsaKFDContext *ctx, uint32_t node); +HSAKMT_STATUS hsakmt_fmm_release(HsaKFDContext *ctx, void *address); + +// Memory mmap/munmap functions +HSAKMT_STATUS hsakmt_fmm_map_to_gpu(HsaKFDContext *ctx, + void *address, + uint64_t size, + uint64_t *gpuvm_address); + +HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(HsaKFDContext *ctx, + void *address, + uint64_t size, + uint32_t *nodes_to_map, + uint64_t num_of_nodes, + uint64_t *gpuvm_address); + +int hsakmt_fmm_unmap_from_gpu(HsaKFDContext *ctx, void *address); + +// Memory register/deregister functions +HSAKMT_STATUS hsakmt_fmm_register_memory(HsaKFDContext *ctx, + void *address, uint64_t size_in_bytes, + uint32_t *gpu_id_array, + uint32_t gpu_id_array_size, + HsaMemFlags flags); + +HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HsaKFDContext *ctx, + HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + uint32_t *gpu_id_array, + uint32_t gpu_id_array_size, + HSA_REGISTER_MEM_FLAGS RegisterFlags); + +HSAKMT_STATUS 
hsakmt_fmm_deregister_memory(HsaKFDContext *ctx, void *address); + +// Memory export functions +HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(HsaKFDContext *ctx, + void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + int *DMABufFd, + HSAuint64 *Offset); + +HSAKMT_STATUS hsakmt_fmm_share_memory(HsaKFDContext *ctx, + void *MemoryAddress, + HSAuint64 SizeInBytes, + HsaSharedMemoryHandle *SharedMemoryHandle); + +HSAKMT_STATUS hsakmt_fmm_register_shared_memory(HsaKFDContext *ctx, + const HsaSharedMemoryHandle *SharedMemoryHandle, + HSAuint64 *SizeInBytes, + void **MemoryAddress, + uint32_t *gpu_id_array, + uint32_t gpu_id_array_size); + +bool hsakmt_fmm_get_handle(HsaKFDContext *ctx, + void *address, + uint64_t *handle, + uint64_t *size_offset); +HSAKMT_STATUS hsakmt_fmm_get_mem_info(HsaKFDContext *ctx, + const void *address, + HsaPointerInfo *info); +HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(HsaKFDContext *ctx, + const void *mem, + void *usr_data); #ifdef SANITIZER_AMDGPU -HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address); -HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address); +HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(HsaKFDContext *ctx, void* address); +HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(HsaKFDContext *ctx, void* address); #endif /* Topology interface*/ -HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id, +HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(HsaKFDContext *ctx, + aperture_type_e aperture_type, HSAuint32 gpu_id, HSAuint64 *aperture_base, HSAuint64 *aperture_limit); -HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, - uint32_t *gpu_id_array, - uint32_t gpu_id_array_size, - HsaMemFlags flags); -HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle, - HsaGraphicsResourceInfo *GraphicsResourceInfo, - uint32_t *gpu_id_array, - uint32_t gpu_id_array_size, - HSA_REGISTER_MEM_FLAGS RegisterFlags); 
-HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address); -HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress, - HSAuint64 MemorySizeInBytes, - int *DMABufFd, - HSAuint64 *Offset); -HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress, - HSAuint64 SizeInBytes, - HsaSharedMemoryHandle *SharedMemoryHandle); -HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle, - HSAuint64 *SizeInBytes, - void **MemoryAddress, - uint32_t *gpu_id_array, - uint32_t gpu_id_array_size); -HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size, - uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address); - -int hsakmt_open_drm_render_device(int minor); +int hsakmt_open_drm_render_device(HsaKFDContext *ctx, int minor); void *hsakmt_mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align, uint64_t guard_size, void *aper_base, void *aper_limit, int fd); diff --git a/projects/rocr-runtime/libhsakmt/src/globals.c b/projects/rocr-runtime/libhsakmt/src/globals.c index 9a36e6f5cc..fa9799c9d5 100644 --- a/projects/rocr-runtime/libhsakmt/src/globals.c +++ b/projects/rocr-runtime/libhsakmt/src/globals.c @@ -27,10 +27,8 @@ // HSAKMT global data -int hsakmt_kfd_fd = -1; int hsakmt_udmabuf_dev_fd = -1; unsigned long hsakmt_kfd_open_count; -unsigned long hsakmt_system_properties_count; pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER; bool hsakmt_is_dgpu; diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h new file mode 100644 index 0000000000..b2f04dbcdf --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/hsakmtctx.h @@ -0,0 +1,827 @@ +/* + * Copyright © 2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HSAKMTCTX_H_ +#define _HSAKMTCTX_H_ + +#include "hsakmt/hsakmttypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _HsaKFDContext HsaKFDContext; + +/** + The context-aware version for opening the kfd device. + + "Opens" the HSA kernel driver for user-kernel mode communication. + + On Windows, this function gets a handle to the KFD's AMDKFDIO device object that + is responsible for user-kernel communication, this handle is used internally by + the thunk library to send device I/O control to the HSA kernel driver. + No other thunk library function may be called unless the user-kernel communication + channel is opened first. + + On Linux this call opens the "/dev/kfd" device file to establish a communication + path to the kernel. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtOpenKFDCtx( + HsaKFDContext **pCtx //IN/OUT + ); + +/** + The context-aware version for closing the kfd device. + + "Closes" the user-kernel communication path. + + On Windows, the handle obtained by the hsaKmtOpenKFDCtx() function is closed; + no other communication with the kernel driver is possible after the successful + execution of the hsaKmtCloseKFDCtx() function. Depending on the failure reason, + the user-kernel communication path may or may not still be active. + + On Linux the function closes the "/dev/kfd" device file. + No further communication to the kernel driver is allowed until the hsaKmtOpenKFDCtx() + function is called again. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCloseKFDCtx( void ); + +/** + The function takes a "snapshot" of the topology information within the KFD + to avoid any changes during the enumeration process. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAcquireSystemPropertiesCtx( + HsaKFDContext *ctx, //IN + HsaSystemProperties* SystemProperties //OUT + ); + +/** + Releases the topology "snapshot" taken by hsaKmtAcquireSystemPropertiesCtx() +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtReleaseSystemPropertiesCtx( + HsaKFDContext *ctx //IN + ); + +/** + Retrieves the discoverable sub-properties for a given HSA + node. The parameters returned allow the application or runtime to size the + management structures necessary to store the information. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodePropertiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HsaNodeProperties* NodeProperties //OUT + ); + +/** + Retrieves the memory properties of a specific HSA node. + The memory pointer passed as MemoryProperties is sized as + NumBanks * sizeof(HsaMemoryProperties). NumBanks is retrieved with the + hsaKmtGetNodePropertiesCtx() call. + + Some of the data returned is optional. Not all implementations may return all + parameters in MemoryProperties. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeMemoryPropertiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSAuint32 NumBanks, //IN + HsaMemoryProperties* MemoryProperties //OUT + ); + +/** + Retrieves the cache properties of a specific HSA node and processor ID. + ProcessorId refers to either a CPU core or a SIMD unit as enumerated earlier + via the hsaKmtGetNodePropertiesCtx() call. + The memory pointer passed as CacheProperties is sized as + NumCaches * sizeof(HsaCacheProperties). NumCaches is retrieved with the + hsaKmtGetNodePropertiesCtx() call. + + The data returned is optional. Not all implementations may return all + parameters in the CacheProperties. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeCachePropertiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSAuint32 ProcessorId, //IN + HSAuint32 NumCaches, //IN + HsaCacheProperties* CacheProperties //OUT + ); + +/** + Retrieves the HSA IO affinity properties of a specific HSA node. + The memory pointer passed as IoLinkProperties is sized as + NumIoLinks * sizeof(HsaIoLinkProperties). NumIoLinks is retrieved with the + hsaKmtGetNodePropertiesCtx() call. + + The data returned is optional. Not all implementations may return all + parameters in the IoLinkProperties. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeIoLinkPropertiesCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSAuint32 NumIoLinks, //IN + HsaIoLinkProperties* IoLinkProperties //OUT + ); + + +/** + Creates an operating system event associated with a HSA event ID +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateEventCtx( + HsaKFDContext *ctx, //IN + HsaEventDescriptor* EventDesc, //IN + bool ManualReset, //IN + bool IsSignaled, //IN + HsaEvent** Event //OUT + ); + +/** + Destroys an operating system event associated with a HSA event ID +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyEventCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event //IN + ); + +/** + Sets the specified event object to the signaled state +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetEventCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event //IN + ); + +/** + Sets the specified event object to the non-signaled state +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtResetEventCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event //IN + ); + +/** + Queries the state of the specified event object +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtQueryEventStateCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event //IN + ); + +/** + Checks the current state of the event object. If the object's state is + nonsignaled, the calling thread enters the wait state. + + The function returns when one of the following occurs: +- The specified event object is in the signaled state. +- The time-out interval elapses. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnEventCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event, //IN + HSAuint32 Milliseconds //IN + ); + +/** + Checks the current state of the event object. If the object's state is + nonsignaled, the calling thread enters the wait state. event_age can + help avoiding race conditions. + + The function returns when one of the following occurs: +- The specified event object is in the signaled state. +- The time-out interval elapses. 
+- Tracking event age +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnEvent_ExtCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Event, //IN + HSAuint32 Milliseconds, //IN + uint64_t *event_age //IN/OUT + ); + +/** + Checks the current state of multiple event objects. + + The function returns when one of the following occurs: +- Either any one or all of the specified objects are in the signaled state + - if "WaitOnAll" is "true" the function returns when the state of all + objects in array is signaled + - if "WaitOnAll" is "false" the function returns when the state of any + one of the objects is set to signaled +- The time-out interval elapses. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnMultipleEventsCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Events[], //IN + HSAuint32 NumEvents, //IN + bool WaitOnAll, //IN + HSAuint32 Milliseconds //IN + ); + +/** + Checks the current state of multiple event objects. + event_age can help avoiding race conditions. + + The function returns when one of the following occurs: +- Either any one or all of the specified objects are in the signaled state + - if "WaitOnAll" is "true" the function returns when the state of all + objects in array is signaled + - if "WaitOnAll" is "false" the function returns when the state of any + one of the objects is set to signaled +- The time-out interval elapses. 
+- Tracking event age +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnMultipleEvents_ExtCtx( + HsaKFDContext *ctx, //IN + HsaEvent* Events[], //IN + HSAuint32 NumEvents, //IN + bool WaitOnAll, //IN + HSAuint32 Milliseconds, //IN + uint64_t *event_age //IN/OUT + ); + +/** + Creates a GPU queue with user-mode access rights +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateQueueCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSA_QUEUE_TYPE Type, //IN + HSAuint32 QueuePercentage, //IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSizeInBytes, //IN + HsaEvent* Event, //IN + HsaQueueResource* QueueResource //OUT + ); + +/** + Creates a GPU queue with user-mode access rights +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateQueueExtCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSA_QUEUE_TYPE Type, //IN + HSAuint32 QueuePercentage, //IN + HSA_QUEUE_PRIORITY Priority, //IN + HSAuint32 SdmaEngineId, //IN + void* QueueAddress, //IN + HSAuint64 QueueSizeInBytes, //IN + HsaEvent* Event, //IN + HsaQueueResource* QueueResource //OUT + ); + +/** + Updates a queue +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUpdateQueueCtx( + HsaKFDContext *ctx, //IN + HSA_QUEUEID QueueId, //IN + HSAuint32 QueuePercentage, //IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSize, //IN + HsaEvent* Event //IN + ); + +/** + Destroys a queue +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyQueueCtx( + HsaKFDContext *ctx, //IN + HSA_QUEUEID QueueId //IN + ); + +/** + Set cu mask for a queue +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetQueueCUMaskCtx( + HsaKFDContext *ctx, //IN + HSA_QUEUEID QueueId, //IN + HSAuint32 CUMaskCount, //IN + HSAuint32* QueueCUMask //IN + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetQueueInfoCtx( + HsaKFDContext *ctx, //IN + HSA_QUEUEID QueueId, //IN + HsaQueueInfo *QueueInfo //IN + ); + +/** + Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetMemoryPolicyCtx( + HsaKFDContext *ctx, //IN + HSAuint32 Node, //IN + HSAuint32 DefaultPolicy, //IN + HSAuint32 AlternatePolicy, //IN + void* MemoryAddressAlternate, //IN (page-aligned) + HSAuint64 MemorySizeInBytes //IN (page-aligned) + ); + +/** + Allocates a memory buffer that may be accessed by the GPU +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocMemoryCtx( + HsaKFDContext *ctx, //IN + HSAuint32 PreferredNode, //IN + HSAuint64 SizeInBytes, //IN (multiple of page size) + HsaMemFlags MemFlags, //IN + void** MemoryAddress //IN/OUT (page-aligned) + ); + +/** + Allocates a memory buffer with specific alignment that may be accessed by the GPU + If Alignment is 0, the smallest possible alignment will be used +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocMemoryAlignCtx( + HsaKFDContext *ctx, //IN + HSAuint32 PreferredNode, //IN + HSAuint64 SizeInBytes, //IN (multiple of page size) + HSAuint64 Alignment, //IN (power of 2 and >= page size) + HsaMemFlags MemFlags, //IN + void** MemoryAddress //IN/OUT (page-aligned) + ); + +/** + Frees a memory buffer +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtFreeMemoryCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress, //IN (page-aligned) + HSAuint64 SizeInBytes //IN + ); + +/** + Inquires memory available for allocation as a memory buffer +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAvailableMemoryCtx( + HsaKFDContext *ctx, //IN + HSAuint32 Node, //IN + HSAuint64 *AvailableBytes //OUT + ); + +/** + Registers with KFD a memory buffer that may be accessed by the GPU +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterMemoryCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress, //IN (cache-aligned) + HSAuint64 MemorySizeInBytes //IN (cache-aligned) + ); + + +/** + Registers with KFD a memory buffer that may be accessed by specific GPUs +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterMemoryToNodesCtx( + HsaKFDContext *ctx, //IN + void *MemoryAddress, //IN (cache-aligned) + HSAuint64 MemorySizeInBytes, //IN 
(cache-aligned) + HSAuint64 NumberOfNodes, //IN + HSAuint32* NodeArray //IN + ); + + +/** + Registers with KFD a memory buffer with memory attributes +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterMemoryWithFlagsCtx( + HsaKFDContext *ctx, //IN + void *MemoryAddress, //IN (cache-aligned) + HSAuint64 MemorySizeInBytes, //IN (cache-aligned) + HsaMemFlags MemFlags //IN + ); + +/** + Registers with KFD a graphics buffer and returns graphics metadata +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterGraphicsHandleToNodesCtx( + HsaKFDContext *ctx, //IN + HSAuint64 GraphicsResourceHandle, //IN + HsaGraphicsResourceInfo *GraphicsResourceInfo, //OUT + HSAuint64 NumberOfNodes, //IN + HSAuint32* NodeArray //IN + ); + +/** + Similar to hsaKmtRegisterGraphicsHandleToNodes but provides registration + options via RegisterFlags. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterGraphicsHandleToNodesExtCtx( + HsaKFDContext *ctx, //IN + HSAuint64 GraphicsResourceHandle, //IN + HsaGraphicsResourceInfo *GraphicsResourceInfo, //OUT + HSAuint64 NumberOfNodes, //IN + HSAuint32* NodeArray, //IN + HSA_REGISTER_MEM_FLAGS RegisterFlags //IN + ); + +/** + * Export a dmabuf handle and offset for a given memory address + * + * Validates that @MemoryAddress belongs to a valid allocation and that the + * @MemorySizeInBytes doesn't exceed the end of that allocation. Returns a + * dmabuf fd of the allocation and the offset of MemoryAddress within that + * allocation. The memory will remain allocated even after the allocation is + * freed by hsaKmtFreeMemory for as long as a dmabuf fd remains open or any + * importer of that fd maintains an active reference to the memory. 
+ */ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtExportDMABufHandleCtx( + HsaKFDContext *ctx, //IN + void *MemoryAddress, //IN + HSAuint64 MemorySizeInBytes, //IN + int *DMABufFd, //OUT + HSAuint64 *Offset //OUT + ); + +/** + Export a memory buffer for sharing with other processes + + NOTE: for the current revision of the thunk spec, SizeInBytes + must match whole allocation. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtShareMemoryCtx( + HsaKFDContext *ctx, //IN + void *MemoryAddress, //IN + HSAuint64 SizeInBytes, //IN + HsaSharedMemoryHandle *SharedMemoryHandle //OUT +); + +/** + Register shared memory handle +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterSharedHandleCtx( + HsaKFDContext *ctx, //IN + const HsaSharedMemoryHandle *SharedMemoryHandle, //IN + void **MemoryAddress, //OUT + HSAuint64 *SizeInBytes //OUT +); + +/** + Register shared memory handle to specific nodes only +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterSharedHandleToNodesCtx( + HsaKFDContext *ctx, //IN + const HsaSharedMemoryHandle *SharedMemoryHandle, //IN + void **MemoryAddress, //OUT + HSAuint64 *SizeInBytes, //OUT + HSAuint64 NumberOfNodes, //IN + HSAuint32* NodeArray //IN +); + +/** + Unregisters with KFD a memory buffer +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDeregisterMemoryCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress //IN + ); + +/** + Ensures that the memory is resident and can be accessed by GPU +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtMapMemoryToGPUCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes, //IN (page-aligned) + HSAuint64* AlternateVAGPU //OUT (page-aligned) + ); + +/** + Ensures that the memory is resident and can be accessed by GPUs +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtMapMemoryToGPUNodesCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes, //IN (page-aligned) + HSAuint64* AlternateVAGPU, //OUT (page-aligned) + HsaMemMapFlags MemMapFlags, //IN + HSAuint64 NumberOfNodes, 
//IN + HSAuint32* NodeArray //IN + ); + +/** + Releases the residency of the memory +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUnmapMemoryToGPUCtx( + HsaKFDContext *ctx, //IN + void* MemoryAddress //IN (page-aligned) + ); + +/** + Stub for Unmap Graphic Handle +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUnmapGraphicHandleCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HSAuint64 FlatMemoryAddress, //IN + HSAuint64 SizeInBytes //IN + ); + +/** + * Get an AMDGPU device handle for a GPU node + */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetAMDGPUDeviceHandleCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HsaAMDGPUDeviceHandle *DeviceHandle //OUT + ); + +/** + Sets trap handler and trap buffer to be used for all queues + associated with the specified NodeId within this process context +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetTrapHandlerCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + void* TrapHandlerBaseAddress, //IN + HSAuint64 TrapHandlerSizeInBytes, //IN + void* TrapBufferBaseAddress, //IN + HSAuint64 TrapBufferSizeInBytes //IN + ); + +/** + Gets image tile configuration. 
+ */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetTileConfigCtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN + HsaGpuTileConfig *config //IN/OUT + ); + +/** + Returns information about pointers +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtQueryPointerInfoCtx( + HsaKFDContext *ctx, //IN + const void *Pointer, //IN + HsaPointerInfo *PointerInfo //OUT + ); + +/** + Associates user data with a memory allocation +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetMemoryUserDataCtx( + HsaKFDContext *ctx, //IN + const void * Pointer, //IN + void * UserData //IN + ); + +/** + Allocate GWS resource for a queue + */ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocQueueGWSCtx( + HsaKFDContext *ctx, //IN + HSA_QUEUEID QueueId, //IN + HSAuint32 nGWS, //IN + HSAuint32 *firstGWS //OUT + ); + +/* Helper functions for calling KFD SVM ioctl */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMSetAttrCtx( + HsaKFDContext *ctx, //IN + void *start_addr, //IN: Start of the virtual address range (page-aligned) + HSAuint64 size, //IN: size (page-aligned) + unsigned int nattr, //IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs //IN: array of attributes + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSVMGetAttrCtx( + HsaKFDContext *ctx, //IN + void *start_addr, //IN: Start of the virtual address range (page-aligned) + HSAuint64 size, //IN: size (page-aligned) + unsigned int nattr, //IN: number of attributes + HSA_SVM_ATTRIBUTE *attrs //IN/OUT: array of attributes + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetXNACKModeCtx( + HsaKFDContext *ctx, //IN + HSAint32 enable //IN: enable/disable XNACK mode. + ); + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetXNACKModeCtx( + HsaKFDContext *ctx, //IN + HSAint32 *enable //OUT: returns XNACK value. + ); + +/** + Open anonymous file handle to enable events and read SMI events. + + To enable events, write a 64-bit event mask to fd, with the event enums as bit indices; 
+ for example, the event mask (HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1) enables all events. + + Reading events from fd is non-blocking; use poll with a timeout value to check whether an event is available. + An event is dropped if the kernel event FIFO is full. +*/ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtOpenSMICtx( + HsaKFDContext *ctx, //IN + HSAuint32 NodeId, //IN: GPU node_id to receive the SMI event from + int *fd //OUT: anonymous file handle + ); + +/** + If this is GPU Mapped memory, remap the first page at this address to be normal system memory + + This is used in ASAN mode to remap the first page of device memory to share host ASAN logic. + This function is only supported when libhsakmt is compiled in ASAN mode. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtReturnAsanHeaderPageCtx( + HsaKFDContext *ctx, //IN + void *addr //IN: Start of the virtual address page + ); + +#ifdef __cplusplus +} //extern "C" +#endif + +#endif //_HSAKMTCTX_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c index c065c3f0a3..7b1c69b76b 100644 --- a/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c +++ b/projects/rocr-runtime/libhsakmt/src/hsakmtmodel.c @@ -145,8 +145,8 @@ void model_init_env_vars(void) abort(); #endif } - assert(hsakmt_kfd_fd < 0); - hsakmt_kfd_fd = fd; + assert(hsakmt_primary_kfd_ctx.fd < 0); + hsakmt_kfdcontext_init_context(fd, &hsakmt_primary_kfd_ctx); pthread_condattr_t condattr; pthread_condattr_init(&condattr); pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC); @@ -193,7 +193,7 @@ static uint64_t allocate_from_memfd(uint64_t size, uint64_t align) model_memfd_size = (model_memfd_size + align - 1) & ~(align - 1); uint64_t offset = model_memfd_size; model_memfd_size += size; - int ret = ftruncate(hsakmt_kfd_fd, model_memfd_size); + int ret = ftruncate(hsakmt_primary_kfd_ctx.fd, model_memfd_size); if (ret < 0) { fprintf(stderr, "model: ftruncate on memfd failed\n"); @@ -269,7 +269,7 @@ void model_init(void) HSAKMT_STATUS result; HsaSystemProperties props; /* Read the topology to determine nodes. */ - result = hsakmt_topology_sysfs_get_system_props(&props); + result = hsakmt_topology_sysfs_get_system_props(&hsakmt_primary_kfd_ctx, &props); if (result != HSAKMT_STATUS_SUCCESS) { fprintf(stderr, "model: Failed to parse topology\n"); @@ -503,7 +503,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg) // unclear whether the current implementation causes kernel data // structures to grow. But in practice, it almost certainly never // matters. 
- int ret = fallocate(hsakmt_kfd_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + int ret = fallocate(hsakmt_primary_kfd_ctx.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, mem_data->file_offset, mem_data->size); if (ret != 0) { @@ -539,7 +539,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg) pr_debug("MODEL IOCTL: AMDKFD_IOC_MAP_MEMORY_TO_GPU: VA: %lx : Size: %lu, Flags: %x\n", mem_data->va_addr, mem_data->size, mem_data->flags); void *ret = mmap(VOID_PTR_ADD(model_nodes[node_id].aperture, mem_data->va_addr), mem_data->size, prot, - MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd, mem_data->file_offset); + MAP_SHARED | MAP_FIXED, hsakmt_primary_kfd_ctx.fd, mem_data->file_offset); if (ret == MAP_FAILED) { fprintf(stderr, "model: mmap failed\n"); @@ -767,7 +767,7 @@ static int model_kfd_ioctl_locked(unsigned long request, void *arg) model_functions->register_queue(model_nodes[node_id].model, &info); model_queues[queue_id].node_id = node_id; args->queue_id = queue_id; - // Note that strictly speaking, this is the offset into the hsakmt_kfd_fd + // Note that strictly speaking, this is the offset into the hsakmt_primary_kfd_ctx.fd // file, not the DRM fd (but they are the same in our case). args->doorbell_offset = model_nodes[node_id].doorbell_offset + 8 * queue_id; return 0; diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.c b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c new file mode 100644 index 0000000000..981c53eb4a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.c @@ -0,0 +1,63 @@ +/* + * Copyright © 2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "kfdcontext.h" +#include "libhsakmt.h" +#include +#include +#include +#include +#include + +void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx) +{ + assert(fd >= 0); + assert(ctx); + + ctx->fd = fd; + ctx->queue_context = NULL; + ctx->fmm_context = NULL; + ctx->event_context = NULL; +} + +void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx) +{ + if (!ctx) + return; + + if (ctx->queue_context) { + free(ctx->queue_context); + ctx->queue_context = NULL; + } + if (ctx->fmm_context) { + free(ctx->fmm_context); + ctx->fmm_context = NULL; + } + if (ctx->event_context) { + free(ctx->event_context); + ctx->event_context = NULL; + } + ctx->fd = -1; +} diff --git a/projects/rocr-runtime/libhsakmt/src/kfdcontext.h b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h new file mode 100644 index 0000000000..8053e74f7b --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/kfdcontext.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _KFDCONTEXT_H_ +#define _KFDCONTEXT_H_ + +#include + +struct hsa_kfd_queue_context; +struct hsa_kfd_fmm_context; +struct hsa_kfd_event_context; + +/* + * HsaKFDContext + * + * Represents the execution context for a connection to the Kernel Fusion Driver (KFD). + * + * This structure encapsulates all state required to manage a KFD session, including: + * - The file descriptor associated with the open KFD device + * - Related resources tied to this file descriptor + * + * Multiple HsaKFDContext instances can coexist simultaneously, each maintaining its own + * independent set of resources. These contexts are fully isolated from one another and + * must not have their resources mixed. For example, memory resources created in + * context A cannot be used in context B directly. If resources need to be shared between + * contexts, they must be explicitly exported and imported using the appropriate APIs. 
+ */ +typedef struct _HsaKFDContext +{ + /* File descriptor for the KFD device */ + int fd; + + /* Queue context for managing user queues */ + struct hsa_kfd_queue_context *queue_context; + + /* Memory management context for managing memory */ + struct hsa_kfd_fmm_context *fmm_context; + + /* Event context for managing events */ + struct hsa_kfd_event_context *event_context; +} HsaKFDContext; + +// Initialize a pre-allocated HsaKFDContext with the given file descriptor +void hsakmt_kfdcontext_init_context(int fd, HsaKFDContext *ctx); +// Release all resources associated with the given KFD context +void hsakmt_kfdcontext_clear_context(HsaKFDContext *ctx); + +struct hsa_kfd_fmm_context *hsakmt_kfdcontext_get_fmm_context(HsaKFDContext *ctx); +struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx); +struct hsa_kfd_event_context *hsakmt_kfdcontext_get_event_context(HsaKFDContext *ctx); + +#endif /* _KFDCONTEXT_H_ */ diff --git a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h index 6f66d20bf4..4ac445c025 100644 --- a/projects/rocr-runtime/libhsakmt/src/libhsakmt.h +++ b/projects/rocr-runtime/libhsakmt/src/libhsakmt.h @@ -28,11 +28,12 @@ #include "hsakmt/linux/kfd_ioctl.h" #include "hsakmt/hsakmt.h" +#include "kfdcontext.h" +#include "hsakmtctx.h" #include #include #include -extern int hsakmt_kfd_fd; extern int hsakmt_udmabuf_dev_fd; extern unsigned long hsakmt_kfd_open_count; extern bool hsakmt_forked; @@ -42,6 +43,7 @@ extern bool hsakmt_is_svm_api_supported; extern int hsakmt_zfb_support; extern HsaVersionInfo hsakmt_kfd_version_info; +extern HsaKFDContext hsakmt_primary_kfd_ctx; #undef HSAKMTAPI #define HSAKMTAPI __attribute__((visibility ("default"))) @@ -196,7 +198,7 @@ int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id); HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray); -HSAKMT_STATUS 
hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props); +HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, HsaSystemProperties *props); HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId, HsaNodeProperties *NodeProperties); HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId, @@ -207,13 +209,16 @@ bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId); HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags); -void* hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, +void* hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx, + uint32_t size, uint32_t align, uint32_t gpu_id, uint32_t NodeId, bool NonPaged, bool DeviceLocal, bool Uncached); -void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); -HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes); -void hsakmt_destroy_process_doorbells(void); +void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx, + void *addr, uint32_t size, uint32_t align); +HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, + unsigned int NumNodes); +void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx); HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes); void hsakmt_destroy_device_debugging_memory(void); bool hsakmt_debug_get_reg_status(uint32_t node_id); @@ -239,10 +244,10 @@ extern int hsakmt_ioctl(int fd, unsigned long request, void *arg); #define POWER_OF_2(x) ((x && (!(x & (x - 1)))) ? 
1 : 0) -void hsakmt_clear_events_page(void); -void hsakmt_fmm_clear_all_mem(void); -void hsakmt_fmm_clear_all_aperture(void); -void hsakmt_clear_process_doorbells(void); +void hsakmt_clear_events_page(HsaKFDContext *ctx); +void hsakmt_fmm_clear_all_mem(HsaKFDContext *ctx); +void hsakmt_fmm_clear_all_aperture(HsaKFDContext *ctx); +void hsakmt_clear_process_doorbells(HsaKFDContext *ctx); uint32_t hsakmt_get_num_sysfs_nodes(void); bool hsakmt_is_forked_child(void); diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c index fb317f71d6..db71264f6d 100644 --- a/projects/rocr-runtime/libhsakmt/src/memory.c +++ b/projects/rocr-runtime/libhsakmt/src/memory.c @@ -34,7 +34,8 @@ #include #include "fmm.h" -HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node, +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicyCtx(HsaKFDContext *ctx, + HSAuint32 Node, HSAuint32 DefaultPolicy, HSAuint32 AlternatePolicy, void *MemoryAddressAlternate, @@ -86,7 +87,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node, args.alternate_aperture_base = (uintptr_t) MemoryAddressAlternate; args.alternate_aperture_size = MemorySizeInBytes; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); return (err == -1) ? 
HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS; } @@ -104,15 +105,17 @@ HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags) } } -HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryCtx(HsaKFDContext *ctx, + HSAuint32 PreferredNode, HSAuint64 SizeInBytes, HsaMemFlags MemFlags, void **MemoryAddress) { - return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress); + return hsaKmtAllocMemoryAlignCtx(ctx, PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress); } -HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlignCtx(HsaKFDContext *ctx, + HSAuint32 PreferredNode, HSAuint64 SizeInBytes, HSAuint64 Alignment, HsaMemFlags MemFlags, @@ -160,7 +163,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, return HSAKMT_STATUS_NOT_IMPLEMENTED; } - *MemoryAddress = hsakmt_fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes); + *MemoryAddress = hsakmt_fmm_allocate_scratch(ctx, gpu_id, *MemoryAddress, SizeInBytes); if (!(*MemoryAddress)) { pr_err("[%s] failed to allocate %lu bytes from scratch\n", @@ -183,7 +186,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, if (hsakmt_zfb_support && gpu_id && MemFlags.ui32.NonPaged == 1) MemFlags.ui32.CoarseGrain = 1; - *MemoryAddress = hsakmt_fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode, + *MemoryAddress = hsakmt_fmm_allocate_host(ctx, gpu_id, MemFlags.ui32.GTTAccess ? 
0 : PreferredNode, *MemoryAddress, SizeInBytes, Alignment, MemFlags); if (!(*MemoryAddress)) { @@ -204,7 +207,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, return HSAKMT_STATUS_INVALID_PARAMETER; } - *MemoryAddress = hsakmt_fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress, + *MemoryAddress = hsakmt_fmm_allocate_device(ctx, gpu_id, PreferredNode, *MemoryAddress, SizeInBytes, Alignment, MemFlags); if (!(*MemoryAddress)) { @@ -218,7 +221,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, } -HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemoryCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 SizeInBytes) { CHECK_KFD_OPEN(); @@ -230,11 +234,12 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress, return HSAKMT_STATUS_ERROR; } - return hsakmt_fmm_release(MemoryAddress); + return hsakmt_fmm_release(ctx, MemoryAddress); } -HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, - HSAuint64 *AvailableBytes) +HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemoryCtx(HsaKFDContext *ctx, + HSAuint32 Node, + HSAuint64 *AvailableBytes) { struct kfd_ioctl_get_available_memory_args args = {}; HSAKMT_STATUS result; @@ -250,14 +255,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, return result; } - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args)) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args)) return HSAKMT_STATUS_ERROR; *AvailableBytes = args.available; return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes) { CHECK_KFD_OPEN(); @@ -271,11 +277,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, HsaMemFlags flags; flags.ui32.CoarseGrain = 1; flags.ui32.ExtendedCoherent = 0; - return 
hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, + return hsakmt_fmm_register_memory(ctx, + MemoryAddress, MemorySizeInBytes, NULL, 0, flags); } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodesCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) @@ -299,7 +307,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, flags.ui32.CoarseGrain = 1; flags.ui32.ExtendedCoherent = 0; - ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, + ret = hsakmt_fmm_register_memory(ctx, + MemoryAddress, MemorySizeInBytes, gpu_id_array, NumberOfNodes*sizeof(uint32_t), flags); @@ -310,7 +319,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, return ret; } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlagsCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) { @@ -331,21 +341,24 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, /* TODO: support mixed APU and dGPU configurations */ return HSAKMT_STATUS_NOT_SUPPORTED; - ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, + ret = hsakmt_fmm_register_memory(ctx, + MemoryAddress, MemorySizeInBytes, NULL, 0, MemFlags); return ret; } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesCtx(HsaKFDContext *ctx, + HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo *GraphicsResourceInfo, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) { HSA_REGISTER_MEM_FLAGS regFlags; regFlags.Value = 0; - - return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle, + + return hsaKmtRegisterGraphicsHandleToNodesExtCtx(ctx, + GraphicsResourceHandle, 
GraphicsResourceInfo, NumberOfNodes, NodeArray, @@ -353,7 +366,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsRe } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExtCtx(HsaKFDContext *ctx, + HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo *GraphicsResourceInfo, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray, @@ -371,7 +385,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic } if (ret == HSAKMT_STATUS_SUCCESS) { - ret = hsakmt_fmm_register_graphics_handle( + ret = hsakmt_fmm_register_graphics_handle(ctx, GraphicsResourceHandle, GraphicsResourceInfo, gpu_id_array, NumberOfNodes * sizeof(uint32_t), RegisterFlags); if (ret != HSAKMT_STATUS_SUCCESS) @@ -381,7 +395,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 Graphic return ret; } -HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandleCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, int *DMABufFd, HSAuint64 *Offset) @@ -391,11 +406,13 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, pr_debug("[%s] address %p\n", __func__, MemoryAddress); - return hsakmt_fmm_export_dma_buf_fd(MemoryAddress, MemorySizeInBytes, + return hsakmt_fmm_export_dma_buf_fd(ctx, + MemoryAddress, MemorySizeInBytes, DMABufFd, Offset); } -HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemoryCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 SizeInBytes, HsaSharedMemoryHandle *SharedMemoryHandle) { @@ -406,25 +423,28 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress, if (!SharedMemoryHandle) return HSAKMT_STATUS_INVALID_PARAMETER; - return hsakmt_fmm_share_memory(MemoryAddress, SizeInBytes, SharedMemoryHandle); + return hsakmt_fmm_share_memory(ctx, 
MemoryAddress, SizeInBytes, SharedMemoryHandle); } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle, - void **MemoryAddress, - HSAuint64 *SizeInBytes) +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleCtx(HsaKFDContext *ctx, + const HsaSharedMemoryHandle *SharedMemoryHandle, + void **MemoryAddress, + HSAuint64 *SizeInBytes) { CHECK_KFD_OPEN(); pr_debug("[%s] handle %p\n", __func__, SharedMemoryHandle); - return hsaKmtRegisterSharedHandleToNodes(SharedMemoryHandle, + return hsaKmtRegisterSharedHandleToNodesCtx(ctx, + SharedMemoryHandle, MemoryAddress, SizeInBytes, 0, NULL); } -HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryHandle *SharedMemoryHandle, +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodesCtx(HsaKFDContext *ctx, + const HsaSharedMemoryHandle *SharedMemoryHandle, void **MemoryAddress, HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, @@ -447,7 +467,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryH goto error; } - ret = hsakmt_fmm_register_shared_memory(SharedMemoryHandle, + ret = hsakmt_fmm_register_shared_memory(ctx, + SharedMemoryHandle, SizeInBytes, MemoryAddress, gpu_id_array, @@ -487,17 +508,17 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid, return HSAKMT_STATUS_NOT_IMPLEMENTED; } - -HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) +HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemoryCtx(HsaKFDContext *ctx, void *MemoryAddress) { CHECK_KFD_OPEN(); pr_debug("[%s] address %p\n", __func__, MemoryAddress); - return hsakmt_fmm_deregister_memory(MemoryAddress); + return hsakmt_fmm_deregister_memory(ctx, MemoryAddress); } -HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU) { @@ -513,10 +534,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void 
*MemoryAddress, if (AlternateVAGPU) *AlternateVAGPU = 0; - return hsakmt_fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU); + return hsakmt_fmm_map_to_gpu(ctx, MemoryAddress, MemorySizeInBytes, AlternateVAGPU); } -HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress, +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodesCtx(HsaKFDContext *ctx, + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU, HsaMemMapFlags MemMapFlags, @@ -537,16 +559,15 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress, } if (!hsakmt_is_dgpu && NumberOfNodes == 1) - return hsaKmtMapMemoryToGPU(MemoryAddress, - MemorySizeInBytes, - AlternateVAGPU); + return hsaKmtMapMemoryToGPUCtx(ctx, MemoryAddress, + MemorySizeInBytes, AlternateVAGPU); ret = hsakmt_validate_nodeid_array(&gpu_id_array, NumberOfNodes, NodeArray); if (ret != HSAKMT_STATUS_SUCCESS) return ret; - ret = hsakmt_fmm_map_to_gpu_nodes(MemoryAddress, MemorySizeInBytes, + ret = hsakmt_fmm_map_to_gpu_nodes(ctx, MemoryAddress, MemorySizeInBytes, gpu_id_array, NumberOfNodes, AlternateVAGPU); if (gpu_id_array) @@ -555,7 +576,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress, return ret; } -HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPUCtx(HsaKFDContext *ctx, void *MemoryAddress) { CHECK_KFD_OPEN(); @@ -567,7 +588,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) return HSAKMT_STATUS_SUCCESS; } - if (!hsakmt_fmm_unmap_from_gpu(MemoryAddress)) + if (!hsakmt_fmm_unmap_from_gpu(ctx, MemoryAddress)) return HSAKMT_STATUS_SUCCESS; else return HSAKMT_STATUS_ERROR; @@ -588,16 +609,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId, return HSAKMT_STATUS_NOT_IMPLEMENTED; } -HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandleCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint64 
FlatMemoryAddress, HSAuint64 SizeInBytes) { - CHECK_KFD_OPEN(); - - return hsaKmtUnmapMemoryToGPU(PORT_UINT64_TO_VPTR(FlatMemoryAddress)); + return hsaKmtUnmapMemoryToGPUCtx(ctx, PORT_UINT64_TO_VPTR(FlatMemoryAddress)); } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *config) +HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfigCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaGpuTileConfig *config) { struct kfd_ioctl_get_tile_config_args args = {0}; uint32_t gpu_id; @@ -623,7 +644,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig * args.num_tile_configs = config->NumTileConfigs; args.num_macro_tile_configs = config->NumMacroTileConfigs; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_TILE_CONFIG, &args) != 0) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_TILE_CONFIG, &args) != 0) return HSAKMT_STATUS_ERROR; config->NumTileConfigs = args.num_tile_configs; @@ -637,7 +658,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig * return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer, +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfoCtx(HsaKFDContext *ctx, + const void *Pointer, HsaPointerInfo *PointerInfo) { CHECK_KFD_OPEN(); @@ -646,47 +668,264 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer, if (!PointerInfo) return HSAKMT_STATUS_INVALID_PARAMETER; - return hsakmt_fmm_get_mem_info(Pointer, PointerInfo); + return hsakmt_fmm_get_mem_info(ctx, Pointer, PointerInfo); } -HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer, +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserDataCtx(HsaKFDContext *ctx, + const void *Pointer, void *UserData) { CHECK_KFD_OPEN(); pr_debug("[%s] pointer %p\n", __func__, Pointer); - return hsakmt_fmm_set_mem_user_data(Pointer, UserData); + return hsakmt_fmm_set_mem_user_data(ctx, Pointer, UserData); } -HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) +HSAKMT_STATUS HSAKMTAPI 
hsaKmtReplaceAsanHeaderPageCtx(HsaKFDContext *ctx, void *addr) { #ifdef SANITIZER_AMDGPU pr_debug("[%s] address %p\n", __func__, addr); CHECK_KFD_OPEN(); - return hsakmt_fmm_replace_asan_header_page(addr); + return hsakmt_fmm_replace_asan_header_page(ctx, addr); #else return HSAKMT_STATUS_NOT_SUPPORTED; #endif } -HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) +HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPageCtx(HsaKFDContext *ctx, void *addr) { #ifdef SANITIZER_AMDGPU pr_debug("[%s] address %p\n", __func__, addr); CHECK_KFD_OPEN(); - return hsakmt_fmm_return_asan_header_page(addr); + return hsakmt_fmm_return_asan_header_page(ctx, addr); #else return HSAKMT_STATUS_NOT_SUPPORTED; #endif } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle( HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandleCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) { CHECK_KFD_OPEN(); - return hsakmt_fmm_get_amdgpu_device_handle(NodeId, DeviceHandle); + return hsakmt_fmm_get_amdgpu_device_handle(ctx, NodeId, DeviceHandle); +} + + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node, + HSAuint32 DefaultPolicy, + HSAuint32 AlternatePolicy, + void *MemoryAddressAlternate, + HSAuint64 MemorySizeInBytes) +{ + return hsaKmtSetMemoryPolicyCtx(&hsakmt_primary_kfd_ctx, Node, + DefaultPolicy, AlternatePolicy, + MemoryAddressAlternate, MemorySizeInBytes); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HsaMemFlags MemFlags, + void **MemoryAddress) +{ + return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, MemoryAddress); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress) +{ + return hsaKmtAllocMemoryAlignCtx(&hsakmt_primary_kfd_ctx, PreferredNode, + SizeInBytes, Alignment, MemFlags, MemoryAddress); +} + +HSAKMT_STATUS HSAKMTAPI 
hsaKmtFreeMemory(void *MemoryAddress, + HSAuint64 SizeInBytes) +{ + return hsaKmtFreeMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, SizeInBytes); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, + HSAuint64 *AvailableBytes) +{ + return hsaKmtAvailableMemoryCtx(&hsakmt_primary_kfd_ctx, Node, AvailableBytes); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, + HSAuint64 MemorySizeInBytes) +{ + return hsaKmtRegisterMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, MemorySizeInBytes); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) +{ + return hsaKmtRegisterMemoryToNodesCtx(&hsakmt_primary_kfd_ctx, + MemoryAddress, MemorySizeInBytes, + NumberOfNodes, NodeArray); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HsaMemFlags MemFlags) +{ + return hsaKmtRegisterMemoryWithFlagsCtx(&hsakmt_primary_kfd_ctx, + MemoryAddress, MemorySizeInBytes, MemFlags); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) +{ + HSA_REGISTER_MEM_FLAGS regFlags; + regFlags.Value = 0; + + return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle, + GraphicsResourceInfo, + NumberOfNodes, + NodeArray, + regFlags); + +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray, + HSA_REGISTER_MEM_FLAGS RegisterFlags) +{ + return hsaKmtRegisterGraphicsHandleToNodesExtCtx(&hsakmt_primary_kfd_ctx, + GraphicsResourceHandle, + GraphicsResourceInfo, + NumberOfNodes, + NodeArray, + RegisterFlags); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, + HSAuint64 
MemorySizeInBytes, + int *DMABufFd, + HSAuint64 *Offset) +{ + return hsaKmtExportDMABufHandleCtx(&hsakmt_primary_kfd_ctx, + MemoryAddress, MemorySizeInBytes, + DMABufFd, Offset); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress, + HSAuint64 SizeInBytes, + HsaSharedMemoryHandle *SharedMemoryHandle) +{ + return hsaKmtShareMemoryCtx(&hsakmt_primary_kfd_ctx, + MemoryAddress, SizeInBytes, SharedMemoryHandle); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle( + const HsaSharedMemoryHandle *SharedMemoryHandle, + void **MemoryAddress, + HSAuint64 *SizeInBytes) +{ + CHECK_KFD_OPEN(); + + pr_debug("[%s] handle %p\n", __func__, SharedMemoryHandle); + + return hsaKmtRegisterSharedHandleToNodes(SharedMemoryHandle, + MemoryAddress, + SizeInBytes, + 0, + NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryHandle *SharedMemoryHandle, + void **MemoryAddress, + HSAuint64 *SizeInBytes, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) +{ + return hsaKmtRegisterSharedHandleToNodesCtx(&hsakmt_primary_kfd_ctx, + SharedMemoryHandle, + MemoryAddress, + SizeInBytes, + NumberOfNodes, + NodeArray); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) +{ + return hsaKmtDeregisterMemoryCtx(&hsakmt_primary_kfd_ctx, MemoryAddress); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 *AlternateVAGPU) +{ + return hsaKmtMapMemoryToGPUCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, MemorySizeInBytes, AlternateVAGPU); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes( + void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 *AlternateVAGPU, + HsaMemMapFlags MemMapFlags, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) +{ + return hsaKmtMapMemoryToGPUNodesCtx(&hsakmt_primary_kfd_ctx, MemoryAddress, + MemorySizeInBytes, AlternateVAGPU, MemMapFlags, NumberOfNodes, NodeArray); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void 
*MemoryAddress) +{ + return hsaKmtUnmapMemoryToGPUCtx(&hsakmt_primary_kfd_ctx, MemoryAddress); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId, + HSAuint64 FlatMemoryAddress, + HSAuint64 SizeInBytes) +{ + return hsaKmtUnmapMemoryToGPU(PORT_UINT64_TO_VPTR(FlatMemoryAddress)); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *config) +{ + return hsaKmtGetTileConfigCtx(&hsakmt_primary_kfd_ctx, NodeId, config); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer, + HsaPointerInfo *PointerInfo) +{ + return hsaKmtQueryPointerInfoCtx(&hsakmt_primary_kfd_ctx, Pointer, PointerInfo); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer, + void *UserData) +{ + return hsaKmtSetMemoryUserDataCtx(&hsakmt_primary_kfd_ctx, Pointer, UserData); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) +{ + return hsaKmtReplaceAsanHeaderPageCtx(&hsakmt_primary_kfd_ctx, addr); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) +{ + return hsaKmtReturnAsanHeaderPageCtx(&hsakmt_primary_kfd_ctx, addr); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId, + HsaAMDGPUDeviceHandle *DeviceHandle) +{ + CHECK_KFD_OPEN(); + + return hsaKmtGetAMDGPUDeviceHandleCtx(&hsakmt_primary_kfd_ctx, NodeId, DeviceHandle); } diff --git a/projects/rocr-runtime/libhsakmt/src/openclose.c b/projects/rocr-runtime/libhsakmt/src/openclose.c index d738fcc747..4d7d428891 100644 --- a/projects/rocr-runtime/libhsakmt/src/openclose.c +++ b/projects/rocr-runtime/libhsakmt/src/openclose.c @@ -51,6 +51,8 @@ static pid_t parent_pid = -1; int hsakmt_debug_level; bool hsakmt_forked; +HsaKFDContext hsakmt_primary_kfd_ctx = {.fd = -1}; + /* hsakmt_is_forked_child detects when the process has forked since the last * time this function was called. 
We cannot rely on pthread_atfork * because the process can fork without calling the fork function in @@ -99,16 +101,18 @@ static void child_fork_handler(void) * The topology information is duplicated from the parent is valid * in the child process so it is not cleared */ -static void clear_after_fork(void) +static void clear_after_fork(HsaKFDContext *ctx) { - hsakmt_clear_process_doorbells(); - hsakmt_clear_events_page(); - hsakmt_fmm_clear_all_mem(); + hsakmt_clear_process_doorbells(ctx); + hsakmt_clear_events_page(ctx); + hsakmt_fmm_clear_all_mem(ctx); hsakmt_destroy_device_debugging_memory(); - if (hsakmt_kfd_fd) { - close(hsakmt_kfd_fd); - hsakmt_kfd_fd = -1; - } + + int fd = ctx->fd; + if (fd >= 0) { + hsakmt_kfdcontext_clear_context(ctx); + close(fd); + } if (hsakmt_udmabuf_dev_fd > 0) { close(hsakmt_udmabuf_dev_fd); hsakmt_udmabuf_dev_fd = -1; @@ -150,7 +154,7 @@ static HSAKMT_STATUS init_vars_from_env(void) return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFDCtx(HsaKFDContext **pCtx) { HSAKMT_STATUS result; int fd = -1; @@ -166,7 +170,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) * belong to the parent */ if (hsakmt_is_forked_child()) - clear_after_fork(); + clear_after_fork(&hsakmt_primary_kfd_ctx); if (hsakmt_kfd_open_count == 0) { static bool atfork_installed = false; @@ -184,15 +188,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) // Check if we are using the hsakmtmodel and setup initial state model_init_env_vars(); - if (hsakmt_kfd_fd < 0 && !hsakmt_use_model) { + if (hsakmt_primary_kfd_ctx.fd < 0 && !hsakmt_use_model) { fd = open(kfd_device_name, O_RDWR | O_CLOEXEC); if (fd == -1) { result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; goto open_failed; } - - hsakmt_kfd_fd = fd; + hsakmt_kfdcontext_init_context(fd, &hsakmt_primary_kfd_ctx); } init_page_size(); @@ -216,8 +219,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) useSvmStr = getenv("HSA_USE_SVM"); hsakmt_is_svm_api_supported 
= !(useSvmStr && !strcmp(useSvmStr, "0")); if(!hsakmt_use_model) - result = hsakmt_topology_sysfs_get_system_props(&sys_props); - + result = hsakmt_topology_sysfs_get_system_props(&hsakmt_primary_kfd_ctx, &sys_props); + if (result != HSAKMT_STATUS_SUCCESS) goto topology_sysfs_failed; @@ -227,6 +230,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) pr_warn("Insufficient Memory. Debugging unavailable\n"); hsakmt_init_counter_props(sys_props.NumNodes); + *pCtx = &hsakmt_primary_kfd_ctx; if (!atfork_installed) { /* Atfork handlers cannot be uninstalled and @@ -241,6 +245,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) } } else { hsakmt_kfd_open_count++; + *pCtx = &hsakmt_primary_kfd_ctx; result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED; } @@ -256,7 +261,7 @@ open_failed: return result; } -HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) +HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFDCtx(void) { HSAKMT_STATUS result; @@ -266,7 +271,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) if (--hsakmt_kfd_open_count == 0) { hsakmt_destroy_counter_props(); hsakmt_destroy_device_debugging_memory(); - hsakmt_fmm_clear_all_aperture(); + hsakmt_fmm_clear_all_aperture(&hsakmt_primary_kfd_ctx); } result = HSAKMT_STATUS_SUCCESS; @@ -277,3 +282,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) return result; } + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) +{ + HsaKFDContext *pCtx = NULL; + return hsaKmtOpenKFDCtx(&pCtx); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) +{ + return hsaKmtCloseKFDCtx(); +} diff --git a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c index 602d35c1b3..055a30c0b1 100644 --- a/projects/rocr-runtime/libhsakmt/src/pc_sampling.c +++ b/projects/rocr-runtime/libhsakmt/src/pc_sampling.c @@ -65,7 +65,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void args.num_sample_info = sample_info_sz; args.flags = 0; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args); + int err = 
hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args); *size = args.num_sample_info; @@ -111,7 +111,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, HsaPcSamplingIn args.num_sample_info = 1; args.trace_id = INVALID_TRACE_ID; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args); + int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args); if (err) { switch (errno) { case EINVAL: @@ -151,7 +151,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, HsaPcSamplingT args.gpu_id = gpu_id; args.trace_id = traceId; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args); + int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args); if (err) { if (errno == EINVAL) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -181,7 +181,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, HsaPcSamplingTra args.gpu_id = gpu_id; args.trace_id = traceId; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args); + int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args); if (err) { switch (errno) { case EINVAL: @@ -220,7 +220,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, HsaPcSamplingTrac args.gpu_id = gpu_id; args.trace_id = traceId; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args); + int err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_PC_SAMPLE, &args); if (err) { switch (errno) { case EINVAL: diff --git a/projects/rocr-runtime/libhsakmt/src/queues.c b/projects/rocr-runtime/libhsakmt/src/queues.c index 0e3500f5ef..c2a00734ea 100644 --- a/projects/rocr-runtime/libhsakmt/src/queues.c +++ b/projects/rocr-runtime/libhsakmt/src/queues.c @@ -35,6 +35,7 @@ #include #include #include +#include /* 1024 doorbells, 4 or 8 bytes each doorbell depending on ASIC generation */ #define DOORBELL_SIZE(gfxv) (((gfxv) >= 0x90000) ? 
8 : 4) @@ -80,8 +81,28 @@ struct process_doorbells { pthread_mutex_t mutex; }; -static unsigned int num_doorbells; -static struct process_doorbells *doorbells; +struct hsa_kfd_queue_context +{ + unsigned int num_doorbells; + struct process_doorbells *doorbells; +}; + +struct hsa_kfd_queue_context *hsakmt_kfdcontext_get_queue_context(HsaKFDContext *ctx) +{ + assert(ctx); + + if (ctx->queue_context) + return ctx->queue_context; + + ctx->queue_context = calloc(1, sizeof(struct hsa_kfd_queue_context)); + if (!ctx->queue_context) { + pr_err("Alloc memory failed for struct hsa_kfd_queue_context size %zu\n", + sizeof(struct hsa_kfd_queue_context)); + return NULL; + } + + return ctx->queue_context; +} uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv) { @@ -102,26 +123,27 @@ uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv) return vgpr_size; } -HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes) +HSAKMT_STATUS hsakmt_init_process_doorbells(HsaKFDContext *ctx, unsigned int NumNodes) { unsigned int i; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); - /* doorbells[] is accessed using Topology NodeId. This means doorbells[0], + /* queue_ctx->doorbells[] is accessed using Topology NodeId. 
This means doorbells[0], * which corresponds to CPU only Node, might not be used */ - doorbells = malloc(NumNodes * sizeof(struct process_doorbells)); - if (!doorbells) + queue_ctx->doorbells = malloc(NumNodes * sizeof(struct process_doorbells)); + if (!queue_ctx->doorbells) return HSAKMT_STATUS_NO_MEMORY; for (i = 0; i < NumNodes; i++) { - doorbells[i].use_gpuvm = false; - doorbells[i].size = 0; - doorbells[i].mapping = NULL; - pthread_mutex_init(&doorbells[i].mutex, NULL); + queue_ctx->doorbells[i].use_gpuvm = false; + queue_ctx->doorbells[i].size = 0; + queue_ctx->doorbells[i].mapping = NULL; + pthread_mutex_init(&queue_ctx->doorbells[i].mutex, NULL); } - num_doorbells = NumNodes; + queue_ctx->num_doorbells = NumNodes; return ret; } @@ -144,94 +166,105 @@ static void get_doorbell_map_info(uint32_t node_id, return; } -void hsakmt_destroy_process_doorbells(void) +void hsakmt_destroy_process_doorbells(HsaKFDContext *ctx) { unsigned int i; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); + struct process_doorbells *doorbells = queue_ctx->doorbells; if (!doorbells) return; - for (i = 0; i < num_doorbells; i++) { + for (i = 0; i < queue_ctx->num_doorbells; i++) { if (!doorbells[i].size) continue; if (doorbells[i].use_gpuvm) { - hsakmt_fmm_unmap_from_gpu(doorbells[i].mapping); - hsakmt_fmm_release(doorbells[i].mapping); + hsakmt_fmm_unmap_from_gpu(ctx, doorbells[i].mapping); + hsakmt_fmm_release(ctx, doorbells[i].mapping); } else munmap(doorbells[i].mapping, doorbells[i].size); } free(doorbells); - doorbells = NULL; - num_doorbells = 0; + queue_ctx->doorbells = NULL; + queue_ctx->num_doorbells = 0; } /* This is a special funcion that should be called only from the child process * after a fork(). This will clear doorbells duplicated from the parent. 
*/ -void hsakmt_clear_process_doorbells(void) +void hsakmt_clear_process_doorbells(HsaKFDContext *ctx) { unsigned int i; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); - if (!doorbells) + if (!queue_ctx->doorbells) return; - for (i = 0; i < num_doorbells; i++) { - if (!doorbells[i].size) + for (i = 0; i < queue_ctx->num_doorbells; i++) { + if (!queue_ctx->doorbells[i].size) continue; - if (!doorbells[i].use_gpuvm) - munmap(doorbells[i].mapping, doorbells[i].size); + if (!queue_ctx->doorbells[i].use_gpuvm) + munmap(queue_ctx->doorbells[i].mapping, queue_ctx->doorbells[i].size); } - free(doorbells); - doorbells = NULL; - num_doorbells = 0; + free(queue_ctx->doorbells); + queue_ctx->doorbells = NULL; + queue_ctx->num_doorbells = 0; } -static HSAKMT_STATUS map_doorbell_apu(HSAuint32 NodeId, HSAuint32 gpu_id, +static HSAKMT_STATUS map_doorbell_apu(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 gpu_id, HSAuint64 doorbell_mmap_offset) { void *ptr; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); - ptr = mmap(0, doorbells[NodeId].size, PROT_READ|PROT_WRITE, - MAP_SHARED, hsakmt_kfd_fd, doorbell_mmap_offset); + ptr = mmap(0, queue_ctx->doorbells[NodeId].size, PROT_READ|PROT_WRITE, + MAP_SHARED, ctx->fd, doorbell_mmap_offset); if (ptr == MAP_FAILED) return HSAKMT_STATUS_ERROR; - doorbells[NodeId].mapping = ptr; + queue_ctx->doorbells[NodeId].mapping = ptr; return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS map_doorbell_dgpu(HSAuint32 NodeId, HSAuint32 gpu_id, +static HSAKMT_STATUS map_doorbell_dgpu(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 gpu_id, HSAuint64 doorbell_mmap_offset) { void *ptr; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); - ptr = hsakmt_fmm_allocate_doorbell(gpu_id, doorbells[NodeId].size, + ptr = hsakmt_fmm_allocate_doorbell(ctx, + gpu_id, queue_ctx->doorbells[NodeId].size, doorbell_mmap_offset); if (!ptr) return 
HSAKMT_STATUS_ERROR; /* map for GPU access */ - if (hsakmt_fmm_map_to_gpu(ptr, doorbells[NodeId].size, NULL)) { - hsakmt_fmm_release(ptr); + if (hsakmt_fmm_map_to_gpu(ctx, ptr, queue_ctx->doorbells[NodeId].size, NULL)) { + hsakmt_fmm_release(ctx, ptr); return HSAKMT_STATUS_ERROR; } - doorbells[NodeId].mapping = ptr; + queue_ctx->doorbells[NodeId].mapping = ptr; return HSAKMT_STATUS_SUCCESS; } -static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id, +static HSAKMT_STATUS map_doorbell(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 gpu_id, HSAuint64 doorbell_mmap_offset) { HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS; + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); + struct process_doorbells *doorbells = queue_ctx->doorbells; pthread_mutex_lock(&doorbells[NodeId].mutex); if (doorbells[NodeId].size) { @@ -242,16 +275,16 @@ static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id, get_doorbell_map_info(NodeId, &doorbells[NodeId]); if (doorbells[NodeId].use_gpuvm) { - status = map_doorbell_dgpu(NodeId, gpu_id, doorbell_mmap_offset); + status = map_doorbell_dgpu(ctx, NodeId, gpu_id, doorbell_mmap_offset); if (status != HSAKMT_STATUS_SUCCESS) { /* Fall back to the old method if KFD doesn't * support doorbells in GPUVM */ doorbells[NodeId].use_gpuvm = false; - status = map_doorbell_apu(NodeId, gpu_id, doorbell_mmap_offset); + status = map_doorbell_apu(ctx, NodeId, gpu_id, doorbell_mmap_offset); } } else - status = map_doorbell_apu(NodeId, gpu_id, doorbell_mmap_offset); + status = map_doorbell_apu(ctx, NodeId, gpu_id, doorbell_mmap_offset); if (status != HSAKMT_STATUS_SUCCESS) doorbells[NodeId].size = 0; @@ -279,13 +312,13 @@ static void *allocate_exec_aligned_memory_cpu(uint32_t size) } /* The bool return indicate whether the queue needs a context-save-restore area*/ -static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) +static bool update_ctx_save_restore_size(HsaKFDContext *ctx, 
uint32_t nodeid, struct queue *q) { HsaNodeProperties node; if (q->gfxv < GFX_VERSION_CARRIZO) return false; - if (hsaKmtGetNodeProperties(nodeid, &node)) + if (hsaKmtGetNodePropertiesCtx(ctx, nodeid, &node)) return false; if (node.NumFComputeCores && node.NumSIMDPerCU) { uint32_t ctl_stack_size, wg_data_size; @@ -316,7 +349,8 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) return false; } -void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t gpu_id, +void *hsakmt_allocate_exec_aligned_memory_gpu(HsaKFDContext *ctx, + uint32_t size, uint32_t align, uint32_t gpu_id, uint32_t NodeId, bool nonPaged, bool DeviceLocal, bool Uncached) @@ -337,7 +371,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin size = ALIGN_UP(size, align); if (DeviceLocal && !hsakmt_zfb_support) - mem = hsakmt_fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags); + mem = hsakmt_fmm_allocate_device(ctx, gpu_id, NodeId, mem, size, 0, flags); else { /* VRAM under ZFB mode should be supported here without any * additional code @@ -352,7 +386,7 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin cpu_id = 0; } } - mem = hsakmt_fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags); + mem = hsakmt_fmm_allocate_host(ctx, gpu_id, cpu_id, mem, size, 0, flags); } if (!mem) { @@ -366,35 +400,36 @@ void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uin HsaMemMapFlags map_flags = {0}; HSAKMT_STATUS result; - result = hsaKmtMapMemoryToGPUNodes(mem, size, &gpu_va, map_flags, 1, nodes_array); + result = hsaKmtMapMemoryToGPUNodesCtx(ctx, mem, size, &gpu_va, map_flags, 1, nodes_array); if (result != HSAKMT_STATUS_SUCCESS) { - hsaKmtFreeMemory(mem, size); + hsaKmtFreeMemoryCtx(ctx, mem, size); return NULL; } return mem; } - if (hsaKmtMapMemoryToGPU(mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) { - hsaKmtFreeMemory(mem, size); + if (hsaKmtMapMemoryToGPUCtx(ctx, 
mem, size, &gpu_va) != HSAKMT_STATUS_SUCCESS) { + hsaKmtFreeMemoryCtx(ctx, mem, size); return NULL; } return mem; } -void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) +void hsakmt_free_exec_aligned_memory_gpu(HsaKFDContext *ctx, void *addr, uint32_t size, uint32_t align) { size = ALIGN_UP(size, align); - if (hsaKmtUnmapMemoryToGPU(addr) == HSAKMT_STATUS_SUCCESS) - hsaKmtFreeMemory(addr, size); + if (hsaKmtUnmapMemoryToGPUCtx(ctx, addr) == HSAKMT_STATUS_SUCCESS) + hsaKmtFreeMemoryCtx(ctx, addr, size); } /* * Allocates memory aligned to sysconf(_SC_PAGESIZE) */ -static void *allocate_exec_aligned_memory(uint32_t size, +static void *allocate_exec_aligned_memory(HsaKFDContext *ctx, + uint32_t size, bool use_ats, uint32_t gpu_id, uint32_t NodeId, @@ -403,17 +438,19 @@ static void *allocate_exec_aligned_memory(uint32_t size, bool Uncached) { if (!use_ats) - return hsakmt_allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, gpu_id, NodeId, + return hsakmt_allocate_exec_aligned_memory_gpu(ctx, + size, PAGE_SIZE, gpu_id, NodeId, nonPaged, DeviceLocal, Uncached); return allocate_exec_aligned_memory_cpu(size); } -static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align, +static void free_exec_aligned_memory(HsaKFDContext *ctx, + void *addr, uint32_t size, uint32_t align, bool use_ats) { if (!use_ats) - hsakmt_free_exec_aligned_memory_gpu(addr, size, align); + hsakmt_free_exec_aligned_memory_gpu(ctx, addr, size, align); else munmap(addr, size); } @@ -454,20 +491,20 @@ static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size, return hsaKmtSVMSetAttr(mem, size, nattr, attrs); } -static void free_queue(struct queue *q) +static void free_queue(HsaKFDContext *ctx, struct queue *q) { if (q->eop_buffer) - free_exec_aligned_memory(q->eop_buffer, + free_exec_aligned_memory(ctx, q->eop_buffer, q->eop_buffer_size, PAGE_SIZE, q->use_ats); if (q->unified_ctx_save_restore) munmap(q->ctx_save_restore, q->total_mem_alloc_size); else 
if (q->ctx_save_restore) - free_exec_aligned_memory(q->ctx_save_restore, + free_exec_aligned_memory(ctx, q->ctx_save_restore, q->total_mem_alloc_size, PAGE_SIZE, q->use_ats); - free_exec_aligned_memory((void *)q, sizeof(*q), PAGE_SIZE, q->use_ats); + free_exec_aligned_memory(ctx, (void *)q, sizeof(*q), PAGE_SIZE, q->use_ats); } static inline void fill_cwsr_header(struct queue *q, void *addr, @@ -488,7 +525,8 @@ static inline void fill_cwsr_header(struct queue *q, void *addr, } } -static int handle_concrete_asic(struct queue *q, +static int handle_concrete_asic(HsaKFDContext *ctx, + struct queue *q, struct kfd_ioctl_create_queue_args *args, uint32_t gpu_id, uint32_t NodeId, @@ -503,7 +541,8 @@ static int handle_concrete_asic(struct queue *q, if (q->eop_buffer_size > 0) { pr_info("Allocating VRAM for EOP\n"); - q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size, + q->eop_buffer = allocate_exec_aligned_memory(ctx, + q->eop_buffer_size, q->use_ats, gpu_id, NodeId, true, true, /* Unused for VRAM */false); if (!q->eop_buffer) @@ -513,12 +552,12 @@ static int handle_concrete_asic(struct queue *q, args->eop_buffer_size = q->eop_buffer_size; } - ret = update_ctx_save_restore_size(NodeId, q); + ret = update_ctx_save_restore_size(ctx, NodeId, q); if (ret) { HsaNodeProperties node; - if (hsaKmtGetNodeProperties(NodeId, &node)) + if (hsaKmtGetNodePropertiesCtx(ctx, NodeId, &node)) return HSAKMT_STATUS_ERROR; args->ctx_save_restore_size = q->ctx_save_restore_size; @@ -568,7 +607,7 @@ static int handle_concrete_asic(struct queue *q, } if (!q->unified_ctx_save_restore) { - q->ctx_save_restore = allocate_exec_aligned_memory( + q->ctx_save_restore = allocate_exec_aligned_memory(ctx, q->total_mem_alloc_size, q->use_ats, gpu_id, NodeId, false, false, false); @@ -591,24 +630,26 @@ static int handle_concrete_asic(struct queue *q, */ static uint32_t priority_map[] = {0, 3, 5, 7, 9, 11, 15}; -HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, - HSA_QUEUE_TYPE Type, - 
HSAuint32 QueuePercentage, - HSA_QUEUE_PRIORITY Priority, - void *QueueAddress, - HSAuint64 QueueSizeInBytes, - HsaEvent *Event, - HsaQueueResource *QueueResource) +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) { if (Type == HSA_QUEUE_SDMA_BY_ENG_ID) return HSAKMT_STATUS_ERROR; - return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0, + return hsaKmtCreateQueueExtCtx(ctx, NodeId, Type, QueuePercentage, Priority, 0, QueueAddress, QueueSizeInBytes, Event, QueueResource); } -HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExtCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HSA_QUEUE_TYPE Type, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority, @@ -628,6 +669,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, CHECK_KFD_OPEN(); + struct hsa_kfd_queue_context *queue_ctx = hsakmt_kfdcontext_get_queue_context(ctx); + if (Priority < HSA_QUEUE_PRIORITY_MINIMUM || Priority > HSA_QUEUE_PRIORITY_MAXIMUM) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -636,7 +679,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, if (result != HSAKMT_STATUS_SUCCESS) return result; - struct queue *q = allocate_exec_aligned_memory(sizeof(*q), + struct queue *q = allocate_exec_aligned_memory(ctx, sizeof(*q), false, gpu_id, NodeId, true, false, true); if (!q) return HSAKMT_STATUS_NO_MEMORY; @@ -656,7 +699,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, /* By default, CUs are all turned on. Initialize cu_mask to '1 * for all CU bits. 
*/ - if (hsaKmtGetNodeProperties(NodeId, &props)) + if (hsaKmtGetNodePropertiesCtx(ctx, NodeId, &props)) q->cu_mask_count = 0; else { cu_num = props.NumFComputeCores / props.NumSIMDPerCU; @@ -695,9 +738,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, QueueResource->QueueWptrValue = (uintptr_t)&q->wptr; } - err = handle_concrete_asic(q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason); + err = handle_concrete_asic(ctx, q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason); if (err != HSAKMT_STATUS_SUCCESS) { - free_queue(q); + free_queue(ctx, q); return err; } @@ -709,10 +752,10 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, args.queue_priority = priority_map[Priority+3]; args.sdma_engine_id = SdmaEngineId; - err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args); + err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_CREATE_QUEUE, &args); if (err == -1) { - free_queue(q); + free_queue(ctx, q); return HSAKMT_STATUS_ERROR; } @@ -737,20 +780,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, doorbell_offset = q->queue_id * DOORBELL_SIZE(q->gfxv); } - err = map_doorbell(NodeId, gpu_id, doorbell_mmap_offset); + err = map_doorbell(ctx, NodeId, gpu_id, doorbell_mmap_offset); if (err != HSAKMT_STATUS_SUCCESS) { - hsaKmtDestroyQueue(q->queue_id); + hsaKmtDestroyQueueCtx(ctx, q->queue_id); return HSAKMT_STATUS_ERROR; } QueueResource->QueueId = PORT_VPTR_TO_UINT64(q); - QueueResource->Queue_DoorBell = VOID_PTR_ADD(doorbells[NodeId].mapping, + QueueResource->Queue_DoorBell = VOID_PTR_ADD(queue_ctx->doorbells[NodeId].mapping, doorbell_offset); return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId, +HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueueCtx(HsaKFDContext *ctx, + HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority, void *QueueAddress, @@ -774,7 +818,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId, arg.queue_percentage = 
QueuePercentage; arg.queue_priority = priority_map[Priority+3]; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UPDATE_QUEUE, &arg); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_UPDATE_QUEUE, &arg); if (err == -1) return HSAKMT_STATUS_ERROR; @@ -782,7 +826,8 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueueCtx(HsaKFDContext *ctx, + HSA_QUEUEID QueueId) { CHECK_KFD_OPEN(); @@ -794,20 +839,21 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) args.queue_id = q->queue_id; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_QUEUE, &args); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_DESTROY_QUEUE, &args); if (err == -1) { pr_err("Failed to destroy queue: %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; } - free_queue(q); + free_queue(ctx, q); return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, - HSAuint32 CUMaskCount, - HSAuint32 *QueueCUMask) +HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMaskCtx(HsaKFDContext *ctx, + HSA_QUEUEID QueueId, + HSAuint32 CUMaskCount, + HSAuint32 *QueueCUMask) { struct queue *q = PORT_UINT64_TO_VPTR(QueueId); struct kfd_ioctl_set_cu_mask_args args = {0}; @@ -821,7 +867,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, args.num_cu_mask = CUMaskCount; args.cu_mask_ptr = (uintptr_t)QueueCUMask; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_CU_MASK, &args); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_CU_MASK, &args); if (err == -1) return HSAKMT_STATUS_ERROR; @@ -832,12 +878,9 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS -HSAKMTAPI -hsaKmtGetQueueInfo( - HSA_QUEUEID QueueId, - HsaQueueInfo *QueueInfo -) +HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfoCtx(HsaKFDContext *ctx, + HSA_QUEUEID QueueId, + HsaQueueInfo 
*QueueInfo) { struct queue *q = PORT_UINT64_TO_VPTR(QueueId); struct kfd_ioctl_get_queue_wave_state_args args = {0}; @@ -853,7 +896,7 @@ hsaKmtGetQueueInfo( args.queue_id = q->queue_id; args.ctl_stack_address = (uintptr_t)q->ctx_save_restore; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &args) < 0) + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &args) < 0) return HSAKMT_STATUS_ERROR; QueueInfo->ControlStackTop = (void *)(args.ctl_stack_address + @@ -871,7 +914,8 @@ hsaKmtGetQueueInfo( return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, +HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandlerCtx(HsaKFDContext *ctx, + HSAuint32 Node, void *TrapHandlerBaseAddress, HSAuint64 TrapHandlerSizeInBytes, void *TrapBufferBaseAddress, @@ -891,7 +935,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, args.tba_addr = (uintptr_t)TrapHandlerBaseAddress; args.tma_addr = (uintptr_t)TrapBufferBaseAddress; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args); return (err == -1) ? 
HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS; } @@ -921,12 +965,10 @@ uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues) return queue_ids_ptr; } -HSAKMT_STATUS -HSAKMTAPI -hsaKmtAllocQueueGWS( - HSA_QUEUEID QueueId, - HSAuint32 nGWS, - HSAuint32 *firstGWS) +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWSCtx(HsaKFDContext *ctx, + HSA_QUEUEID QueueId, + HSAuint32 nGWS, + HSAuint32 *firstGWS) { struct kfd_ioctl_alloc_queue_gws_args args = {0}; struct queue *q = PORT_UINT64_TO_VPTR(QueueId); @@ -936,7 +978,7 @@ hsaKmtAllocQueueGWS( args.queue_id = (HSAuint32)q->queue_id; args.num_gws = nGWS; - int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &args); + int err = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &args); if (!err && firstGWS) *firstGWS = args.first_gws; @@ -952,3 +994,85 @@ hsaKmtAllocQueueGWS( else return HSAKMT_STATUS_ERROR; } + + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) +{ + if (Type == HSA_QUEUE_SDMA_BY_ENG_ID) + return HSAKMT_STATUS_ERROR; + + return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0, + QueueAddress, QueueSizeInBytes, Event, + QueueResource); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + HSAuint32 SdmaEngineId, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) +{ + + return hsaKmtCreateQueueExtCtx(&hsakmt_primary_kfd_ctx, NodeId, Type, + QueuePercentage, Priority, SdmaEngineId, QueueAddress, + QueueSizeInBytes, Event, QueueResource); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, + HSAuint64 QueueSize, + HsaEvent 
*Event) +{ + return hsaKmtUpdateQueueCtx(&hsakmt_primary_kfd_ctx, QueueId, QueuePercentage, + Priority, QueueAddress, QueueSize, Event); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) +{ + return hsaKmtDestroyQueueCtx(&hsakmt_primary_kfd_ctx, QueueId); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, + HSAuint32 CUMaskCount, + HSAuint32 *QueueCUMask) +{ + return hsaKmtSetQueueCUMaskCtx(&hsakmt_primary_kfd_ctx, QueueId, CUMaskCount, QueueCUMask); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo( + HSA_QUEUEID QueueId, + HsaQueueInfo *QueueInfo) +{ + return hsaKmtGetQueueInfoCtx(&hsakmt_primary_kfd_ctx, QueueId, QueueInfo); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, + void *TrapHandlerBaseAddress, + HSAuint64 TrapHandlerSizeInBytes, + void *TrapBufferBaseAddress, + HSAuint64 TrapBufferSizeInBytes) +{ + return hsaKmtSetTrapHandlerCtx(&hsakmt_primary_kfd_ctx, Node, + TrapHandlerBaseAddress, TrapHandlerSizeInBytes, + TrapBufferBaseAddress, TrapBufferSizeInBytes); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, + HSAuint32 nGWS, + HSAuint32 *firstGWS) +{ + return hsaKmtAllocQueueGWSCtx(&hsakmt_primary_kfd_ctx, QueueId, nGWS, firstGWS); +} diff --git a/projects/rocr-runtime/libhsakmt/src/spm.c b/projects/rocr-runtime/libhsakmt/src/spm.c index 3ad72ccf2f..ec7f3d2b33 100644 --- a/projects/rocr-runtime/libhsakmt/src/spm.c +++ b/projects/rocr-runtime/libhsakmt/src/spm.c @@ -45,7 +45,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) args.op = KFD_IOCTL_SPM_OP_ACQUIRE; args.gpu_id = gpu_id; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args); + ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args); return ret; } @@ -72,7 +72,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode, args.op = KFD_IOCTL_SPM_OP_SET_DEST_BUF; args.gpu_id = gpu_id; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args); + ret = 
hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args); *SizeCopied = args.bytes_copied; *isSPMDataLoss = args.has_data_loss; @@ -96,7 +96,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) args.op = KFD_IOCTL_SPM_OP_RELEASE; args.gpu_id = gpu_id; - ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args); + ret = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_RLC_SPM, &args); return ret; } diff --git a/projects/rocr-runtime/libhsakmt/src/svm.c b/projects/rocr-runtime/libhsakmt/src/svm.c index 441fc00fd7..5482dead5c 100644 --- a/projects/rocr-runtime/libhsakmt/src/svm.c +++ b/projects/rocr-runtime/libhsakmt/src/svm.c @@ -37,7 +37,8 @@ /* Helper functions for calling KFD SVM ioctl */ HSAKMT_STATUS HSAKMTAPI -hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, +hsaKmtSVMSetAttrCtx(HsaKFDContext *ctx, + void *start_addr, HSAuint64 size, unsigned int nattr, HSA_SVM_ATTRIBUTE *attrs) { struct kfd_ioctl_svm_args *args; @@ -94,7 +95,7 @@ hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, } /* Driver does one copy_from_user, with extra attrs size */ - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + r = hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); if (r) { pr_debug("op set range attrs failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; @@ -104,7 +105,8 @@ hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, } HSAKMT_STATUS HSAKMTAPI -hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, +hsaKmtSVMGetAttrCtx(HsaKFDContext *ctx, + void *start_addr, HSAuint64 size, unsigned int nattr, HSA_SVM_ATTRIBUTE *attrs) { struct kfd_ioctl_svm_args *args; @@ -150,7 +152,7 @@ hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, } /* Driver does one copy_from_user, with extra attrs size */ - r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); + r = 
hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args); if (r) { pr_debug("op get range attrs failed %s\n", strerror(errno)); return HSAKMT_STATUS_ERROR; @@ -187,7 +189,7 @@ hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, } static HSAKMT_STATUS -hsaKmtSetGetXNACKMode(HSAint32 * enable) +hsaKmtSetGetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 * enable) { struct kfd_ioctl_set_xnack_mode_args args; @@ -196,7 +198,7 @@ hsaKmtSetGetXNACKMode(HSAint32 * enable) args.xnack_enabled = *enable; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) { + if (hsakmt_ioctl(ctx->fd, AMDKFD_IOC_SET_XNACK_MODE, &args)) { if (errno == EPERM) { pr_debug("set mode not supported %s\n", strerror(errno)); @@ -213,6 +215,40 @@ hsaKmtSetGetXNACKMode(HSAint32 * enable) return HSAKMT_STATUS_SUCCESS; } +HSAKMT_STATUS HSAKMTAPI +hsaKmtSetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 enable) +{ + return hsaKmtSetGetXNACKModeCtx(ctx, &enable); +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetXNACKModeCtx(HsaKFDContext *ctx, HSAint32 * enable) +{ + *enable = -1; + return hsaKmtSetGetXNACKModeCtx(ctx, enable); +} + + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + return hsaKmtSVMSetAttrCtx(&hsakmt_primary_kfd_ctx, start_addr, size, nattr, attrs); +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) +{ + return hsaKmtSVMGetAttrCtx(&hsakmt_primary_kfd_ctx, start_addr, size, nattr, attrs); +} + +static HSAKMT_STATUS +hsaKmtSetGetXNACKMode(HSAint32 * enable) +{ + return hsaKmtSetGetXNACKModeCtx(&hsakmt_primary_kfd_ctx, enable); +} + HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) { diff --git a/projects/rocr-runtime/libhsakmt/src/time.c b/projects/rocr-runtime/libhsakmt/src/time.c index eff9ed1585..9e8b5ec451 100644 --- a/projects/rocr-runtime/libhsakmt/src/time.c +++ 
b/projects/rocr-runtime/libhsakmt/src/time.c @@ -42,7 +42,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, args.gpu_id = gpu_id; - err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args); + err = hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args); if (err < 0) { result = HSAKMT_STATUS_ERROR; } else { diff --git a/projects/rocr-runtime/libhsakmt/src/topology.c b/projects/rocr-runtime/libhsakmt/src/topology.c index 324033c840..6db13f9073 100644 --- a/projects/rocr-runtime/libhsakmt/src/topology.c +++ b/projects/rocr-runtime/libhsakmt/src/topology.c @@ -96,7 +96,7 @@ static const char *supported_processor_vendor_name[] = { "\n" // POWER requires a different search method }; -static HSAKMT_STATUS topology_take_snapshot(void); +static HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx); static void topology_drop_snapshot(void); static const struct hsa_gfxip_table gfxip_lookup_table[] = { @@ -645,7 +645,8 @@ static HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t sysfs_node_id, uint32_t * - if corresponding drm render node is not available. 
* - if node information is not accessible (EPERM) */ -static HSAKMT_STATUS topology_sysfs_check_node_supported(uint32_t sysfs_node_id, bool *is_node_supported) +static HSAKMT_STATUS topology_sysfs_check_node_supported(HsaKFDContext *ctx, + uint32_t sysfs_node_id, bool *is_node_supported) { uint32_t gpu_id; FILE *fd; @@ -711,7 +712,7 @@ static HSAKMT_STATUS topology_sysfs_check_node_supported(uint32_t sysfs_node_id, } /* Open DRM Render device */ - ret_value = hsakmt_open_drm_render_device(drm_render_minor); + ret_value = hsakmt_open_drm_render_device(ctx, drm_render_minor); if (ret_value > 0) *is_node_supported = true; else if (ret_value != -ENOENT && ret_value != -EPERM) @@ -723,7 +724,8 @@ err: return ret; } -HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props) +HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaKFDContext *ctx, + HsaSystemProperties *props) { FILE *fd; char *read_buf, *p; @@ -800,7 +802,7 @@ HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props) } for (uint32_t i = 0; i < num_sysfs_nodes; i++) { - ret = topology_sysfs_check_node_supported(i, &is_node_supported); + ret = topology_sysfs_check_node_supported(ctx, i, &is_node_supported); if (ret != HSAKMT_STATUS_SUCCESS) goto sysfs_parse_failed; if (is_node_supported) @@ -1631,7 +1633,8 @@ static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, ui * If node_to specified by the @iolink_id is not accessible the function returns HSAKMT_STATUS_NOT_SUPPORTED. * If node_to is accessible, then node_to is mapped from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS. 
*/ -static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, +static HSAKMT_STATUS topology_sysfs_get_iolink_props(HsaKFDContext *ctx, + uint32_t node_id, uint32_t iolink_id, HsaIoLinkProperties *props, bool p2pLink) { @@ -1693,7 +1696,7 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, uint32_t sysfs_node_id; sysfs_node_id = (uint32_t)prop_val; - ret = topology_sysfs_check_node_supported(sysfs_node_id, &is_node_supported); + ret = topology_sysfs_check_node_supported(ctx, sysfs_node_id, &is_node_supported); if (!is_node_supported) { ret = HSAKMT_STATUS_NOT_SUPPORTED; memset(props, 0, sizeof(*props)); @@ -1955,7 +1958,7 @@ try_alt_dir: } } -HSAKMT_STATUS topology_take_snapshot(void) +HSAKMT_STATUS topology_take_snapshot(HsaKFDContext *ctx) { uint32_t gen_start, gen_end, i, mem_id, cache_id; HsaSystemProperties sys_props; @@ -1978,7 +1981,7 @@ retry: ret = topology_sysfs_get_generation(&gen_start); if (ret != HSAKMT_STATUS_SUCCESS) goto err; - ret = hsakmt_topology_sysfs_get_system_props(&sys_props); + ret = hsakmt_topology_sysfs_get_system_props(ctx, &sys_props); if (ret != HSAKMT_STATUS_SUCCESS) goto err; if (sys_props.NumNodes > 0) { @@ -2059,7 +2062,7 @@ retry: */ while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) { - ret = topology_sysfs_get_iolink_props(i, sys_link_id++, + ret = topology_sysfs_get_iolink_props(ctx, i, sys_link_id++, &temp_props[i].link[link_id], false); if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { continue; @@ -2080,7 +2083,7 @@ retry: */ while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) { - ret = topology_sysfs_get_iolink_props(i, sys_link_id++, + ret = topology_sysfs_get_iolink_props(ctx, i, sys_link_id++, &temp_props[i].link[link_id], true); if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { continue; @@ -2179,7 +2182,8 @@ HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) } -HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties 
*SystemProperties) +HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemPropertiesCtx(HsaKFDContext *ctx, + HsaSystemProperties *SystemProperties) { HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; @@ -2198,7 +2202,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste goto out; } - err = topology_take_snapshot(); + err = topology_take_snapshot(ctx); if (err != HSAKMT_STATUS_SUCCESS) goto out; @@ -2207,11 +2211,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste if (hsakmt_use_model) model_init(); - err = hsakmt_fmm_init_process_apertures(g_system->NumNodes); + err = hsakmt_fmm_init_process_apertures(ctx, g_system->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_process_apertures_failed; - err = hsakmt_init_process_doorbells(g_system->NumNodes); + err = hsakmt_init_process_doorbells(ctx, g_system->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_doorbells_failed; @@ -2220,7 +2224,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *Syste goto out; init_doorbells_failed: - hsakmt_fmm_destroy_process_apertures(); + hsakmt_fmm_destroy_process_apertures(ctx); init_process_apertures_failed: topology_drop_snapshot(); @@ -2229,12 +2233,12 @@ out: return err; } -HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) +HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemPropertiesCtx(HsaKFDContext *ctx) { pthread_mutex_lock(&hsakmt_mutex); - hsakmt_destroy_process_doorbells(); - hsakmt_fmm_destroy_process_apertures(); + hsakmt_destroy_process_doorbells(ctx); + hsakmt_fmm_destroy_process_apertures(ctx); topology_drop_snapshot(); pthread_mutex_unlock(&hsakmt_mutex); @@ -2252,7 +2256,9 @@ HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId, + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodePropertiesCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { HSAKMT_STATUS 
err; @@ -2278,7 +2284,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId, NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; else NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; - if (hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base, + if (hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_MMIO, gpu_id, &base, &limit) == HSAKMT_STATUS_SUCCESS) NodeProperties->NumMemoryBanks += 1; } @@ -2288,7 +2294,8 @@ out: return err; } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryPropertiesCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 NumBanks, HsaMemoryProperties *MemoryProperties) { @@ -2319,7 +2326,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, /*Add LDS*/ if (i < NumBanks && - hsakmt_fmm_get_aperture_base_and_limit(FMM_LDS, gpu_id, + hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_LDS, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024; @@ -2332,7 +2339,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, */ if (hsakmt_get_gfxv_by_node_id(NodeId) == GFX_VERSION_KAVERI && i < NumBanks && g_props[NodeId].node.LocalMemSize > 0 && - hsakmt_fmm_get_aperture_base_and_limit(FMM_GPUVM, gpu_id, + hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_GPUVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; MemoryProperties[i].SizeInBytes = g_props[NodeId].node.LocalMemSize; @@ -2341,7 +2348,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, /* Add SCRATCH */ if (i < NumBanks && - hsakmt_fmm_get_aperture_base_and_limit(FMM_SCRATCH, gpu_id, + hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_SCRATCH, gpu_id, 
&MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH; MemoryProperties[i].SizeInBytes = (aperture_limit - MemoryProperties[i].VirtualBaseAddress) + 1; @@ -2350,7 +2357,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, /* Add SVM aperture */ if (hsakmt_topology_is_svm_needed(g_props[NodeId].node.EngineId) && i < NumBanks && - hsakmt_fmm_get_aperture_base_and_limit( + hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_SVM, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_DEVICE_SVM; @@ -2360,7 +2367,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, /* Add mmio aperture */ if (i < NumBanks && - hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, + hsakmt_fmm_get_aperture_base_and_limit(ctx, FMM_MMIO, gpu_id, &MemoryProperties[i].VirtualBaseAddress, &aperture_limit) == HSAKMT_STATUS_SUCCESS) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_MMIO_REMAP; MemoryProperties[i].SizeInBytes = (aperture_limit - MemoryProperties[i].VirtualBaseAddress) + 1; @@ -2372,7 +2379,8 @@ out: return err; } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCachePropertiesCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches, HsaCacheProperties *CacheProperties) @@ -2422,7 +2430,8 @@ HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId, return HSAKMT_STATUS_SUCCESS; } -HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkPropertiesCtx(HsaKFDContext *ctx, + HSAuint32 NodeId, HSAuint32 NumIoLinks, HsaIoLinkProperties *IoLinkProperties) { @@ -2536,3 +2545,43 @@ inline uint32_t hsakmt_get_num_sysfs_nodes(void) { return num_sysfs_nodes; } + + + +HSAKMT_STATUS HSAKMTAPI 
hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) +{ + return hsaKmtAcquireSystemPropertiesCtx(&hsakmt_primary_kfd_ctx, SystemProperties); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) +{ + return hsaKmtReleaseSystemPropertiesCtx(&hsakmt_primary_kfd_ctx); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId, + HsaNodeProperties *NodeProperties) +{ + return hsaKmtGetNodePropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NodeProperties); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, + HSAuint32 NumBanks, + HsaMemoryProperties *MemoryProperties) +{ + return hsaKmtGetNodeMemoryPropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NumBanks, MemoryProperties); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(HSAuint32 NodeId, + HSAuint32 ProcessorId, + HSAuint32 NumCaches, + HsaCacheProperties *CacheProperties) +{ + return hsaKmtGetNodeCachePropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, ProcessorId, NumCaches, CacheProperties); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, + HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) +{ + return hsaKmtGetNodeIoLinkPropertiesCtx(&hsakmt_primary_kfd_ctx, NodeId, NumIoLinks, IoLinkProperties); +} diff --git a/projects/rocr-runtime/libhsakmt/src/version.c b/projects/rocr-runtime/libhsakmt/src/version.c index ceda2d2106..2865c81e04 100644 --- a/projects/rocr-runtime/libhsakmt/src/version.c +++ b/projects/rocr-runtime/libhsakmt/src/version.c @@ -43,7 +43,7 @@ HSAKMT_STATUS hsakmt_init_kfd_version(void) { struct kfd_ioctl_get_version_args args = {0}; - if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_VERSION, &args) == -1) + if (hsakmt_ioctl(hsakmt_primary_kfd_ctx.fd, AMDKFD_IOC_GET_VERSION, &args) == -1) return HSAKMT_STATUS_ERROR; hsakmt_kfd_version_info.KernelInterfaceMajorVersion = args.major_version;