From bde07e7baa795bff9a3e040ee5aeb23d5d45d500 Mon Sep 17 00:00:00 2001 From: "Kuricheti, Mythreya" Date: Thu, 26 Jun 2025 11:28:45 -0700 Subject: [PATCH] [SDK] KFD new events API (#321) * Remove page-migration * Add KFD events API * Address review comments * Move assert checks * Update enum-string utils * Update codeowners * Update KFD header * Add perfetto category [ROCm/rocprofiler-sdk commit: 8a461afe208158c9e3cea8dadd6147350f676886] --- projects/rocprofiler-sdk/CODEOWNERS | 4 +- .../samples/api_buffered_tracing/client.cpp | 99 +- .../include/rocprofiler-sdk/buffer_tracing.h | 163 +- .../rocprofiler-sdk/cxx/enum_string.hpp | 144 +- .../include/rocprofiler-sdk/cxx/perfetto.hpp | 10 +- .../cxx/serialization/save.hpp | 211 +-- .../source/include/rocprofiler-sdk/fwd.h | 32 +- .../rocprofiler-sdk/kfd/CMakeLists.txt | 2 +- .../include/rocprofiler-sdk/kfd/kfd_id.h | 159 ++ .../rocprofiler-sdk/kfd/page_migration_args.h | 111 -- .../rocprofiler-sdk/kfd/page_migration_id.h | 82 - .../source/lib/rocprofiler-sdk/CMakeLists.txt | 2 +- .../lib/rocprofiler-sdk/buffer_tracing.cpp | 63 +- .../lib/rocprofiler-sdk/context/domain.cpp | 2 +- .../lib/rocprofiler-sdk/details/kfd_ioctl.h | 59 +- .../lib/rocprofiler-sdk/kfd/CMakeLists.txt | 12 + .../source/lib/rocprofiler-sdk/kfd/abi.cpp | 88 + .../{page_migration => kfd}/defines.hpp | 43 +- .../source/lib/rocprofiler-sdk/kfd/kfd.cpp | 1553 +++++++++++++++++ .../lib/rocprofiler-sdk/kfd/kfd.def.cpp | 87 + .../page_migration.hpp => kfd/kfd.hpp} | 12 +- .../rocprofiler-sdk/kfd/tests/CMakeLists.txt | 48 + .../lib/rocprofiler-sdk/kfd/tests/parser.cpp | 274 +++ .../{page_migration => kfd}/utils.hpp | 128 +- .../page_migration/CMakeLists.txt | 7 - .../rocprofiler-sdk/page_migration/abi.cpp | 107 -- .../page_migration/page_migration.cpp | 937 ---------- .../page_migration/page_migration.def.cpp | 60 - .../lib/rocprofiler-sdk/registration.cpp | 4 +- .../lib/rocprofiler-sdk/tests/CMakeLists.txt | 3 +- .../lib/rocprofiler-sdk/tests/enum_string.cpp | 28 +- .../rocprofiler-sdk/tests/page_migration.cpp | 106 -- projects/rocprofiler-sdk/tests/CMakeLists.txt | 1 - .../rocprofiler-sdk/tests/bin/CMakeLists.txt | 1 - .../tests/bin/page-migration/CMakeLists.txt | 42 - .../bin/page-migration/page-migration.cpp | 300 ---- .../tests/page-migration/CMakeLists.txt | 76 - .../tests/page-migration/conftest.py | 50 - .../tests/page-migration/pytest.ini | 5 - .../tests/page-migration/validate.py | 377 ---- .../rocprofiler-sdk/tests/tools/json-tool.cpp | 400 ++++- 41 files changed, 3132 insertions(+), 2760 deletions(-) create mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/kfd_id.h delete mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_args.h delete mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_id.h create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/abi.cpp rename projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/{page_migration => kfd}/defines.hpp (55%) create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.cpp create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.def.cpp rename projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/{page_migration/page_migration.hpp => kfd/kfd.hpp} (86%) create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/parser.cpp rename projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/{page_migration => kfd}/utils.hpp (58%) delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/CMakeLists.txt delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/abi.cpp delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.def.cpp delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/page_migration.cpp delete mode 100644 projects/rocprofiler-sdk/tests/bin/page-migration/CMakeLists.txt delete mode 100644 projects/rocprofiler-sdk/tests/bin/page-migration/page-migration.cpp delete mode 100644 projects/rocprofiler-sdk/tests/page-migration/CMakeLists.txt delete mode 100644 projects/rocprofiler-sdk/tests/page-migration/conftest.py delete mode 100644 projects/rocprofiler-sdk/tests/page-migration/pytest.ini delete mode 100644 projects/rocprofiler-sdk/tests/page-migration/validate.py diff --git a/projects/rocprofiler-sdk/CODEOWNERS b/projects/rocprofiler-sdk/CODEOWNERS index 41371afae4..96d52cc8d8 100644 --- a/projects/rocprofiler-sdk/CODEOWNERS +++ b/projects/rocprofiler-sdk/CODEOWNERS @@ -8,8 +8,8 @@ source/docs @bgopesh source/include @jrmadsen @bwelton @ammarwa source/include/rocprofiler-sdk/rccl @MythreyaK +source/include/rocprofiler-sdk/kfd @MythreyaK source/include/rocprofiler-sdk/cxx/codeobj @ApoKalipse-V - source/bin @SrirakshaNag @bgopesh source/libexec @SrirakshaNag @@ -17,8 +17,8 @@ source/libexec @SrirakshaNag source/lib/rocprofiler-sdk/counters @bwelton source/lib/rocprofiler-sdk/pc_sampling @vlaindic source/lib/rocprofiler-sdk/thread_trace @ApoKalipse-V -source/lib/rocprofiler-sdk/page_migration @MythreyaK source/lib/rocprofiler-sdk/rccl @MythreyaK +source/lib/rocprofiler-sdk/kfd @MythreyaK source/lib/rocprofiler-sdk-tool @SrirakshaNag diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp index 721e0ac592..2a3322990c 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp @@ -311,99 +311,6 @@ tool_tracing_callback(rocprofiler_context_id_t context, static_cast(user_data)->emplace_back( source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()}); } - else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && - header->kind == ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION) - { - auto* record = - static_cast(header->payload); - - auto info = std::stringstream{}; - - info << "kind=" << record->kind << ", operation=" << record->operation - << ", pid=" << record->pid << ", timestamp=" << record->timestamp - << ", name=" << get_name(record) << std::boolalpha; - - switch(record->operation) - { - case ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_START: - { - const auto& arg = record->args; - info << ", page_migrate_start=(" << as_hex(arg.page_migrate_start.start_addr) - << ", " << as_hex(arg.page_migrate_start.end_addr) << ", " - << arg.page_migrate_start.from_agent.handle << ", " - << arg.page_migrate_start.to_agent.handle << ", " - << arg.page_migrate_start.prefetch_agent.handle << ", " - << arg.page_migrate_start.preferred_agent.handle << ", " - << arg.page_migrate_start.trigger << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END: - { - const auto& arg = record->args; - info << ", page_migrate_end=(" << as_hex(arg.page_migrate_end.start_addr) - << ", " << as_hex(arg.page_migrate_end.end_addr) << ", " - << arg.page_migrate_end.from_agent.handle << ", " - << arg.page_migrate_end.to_agent.handle << ", " - << arg.page_migrate_end.trigger << ", " << arg.page_migrate_end.error_code - << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START: - { - const auto& arg = record->args; - info << ", page_fault_start=(" << arg.page_fault_start.read_fault << ", " - << arg.page_fault_start.agent_id.handle << ", " - << as_hex(arg.page_fault_start.address) << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_END: - { - const auto& arg = record->args; - info << ", page_fault_end=(" << arg.page_fault_end.migrated << ", " - << arg.page_fault_end.agent_id.handle << ", " - << as_hex(arg.page_fault_end.address) << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION: - { - const auto& arg = record->args; - info << ", queue_eviction=(" << arg.queue_eviction.agent_id.handle << ", " - << arg.queue_eviction.trigger << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_QUEUE_RESTORE: - { - const auto& arg = record->args; - info << ", queue_restore=(" << arg.queue_restore.rescheduled << ", " - << arg.queue_restore.agent_id.handle << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU: - { - const auto& arg = record->args; - info << ", unmap_from_gpu=(" << as_hex(arg.unmap_from_gpu.start_addr) << ", " - << as_hex(arg.unmap_from_gpu.end_addr) << ", " - << arg.unmap_from_gpu.agent_id.handle << ", " << arg.unmap_from_gpu.trigger - << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT: - { - const auto& arg = record->args; - info << ", dropped_event=(" << arg.dropped_event.dropped_events_count << ")"; - break; - } - case ROCPROFILER_PAGE_MIGRATION_NONE: - case ROCPROFILER_PAGE_MIGRATION_LAST: - { - throw std::runtime_error{"unexpected page migration value"}; - break; - } - } - - static_cast(user_data)->emplace_back( - source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()}); - } else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) { @@ -546,14 +453,10 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) client_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, client_buffer), "buffer tracing service for memory copy configure"); - // May have incompatible kernel so only emit a warning here - ROCPROFILER_WARN(rocprofiler_configure_buffer_tracing_service( - client_ctx, ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, nullptr, 0, client_buffer)); - ROCPROFILER_CALL( rocprofiler_configure_buffer_tracing_service( client_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0, client_buffer), - "buffer tracing service for page migration configure"); + "buffer tracing service for scratch memory configure"); auto client_thread = rocprofiler_callback_thread_t{}; ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread), diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h index 399cd7b51b..ba4e9dfae2 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -339,17 +339,160 @@ typedef struct rocprofiler_buffer_tracing_kernel_dispatch_record_t } rocprofiler_buffer_tracing_kernel_dispatch_record_t; /** - * @brief ROCProfiler Buffer Page Migration Tracer Record + * @brief ROCProfiler Buffer Page Migration event record from KFD. */ -typedef struct rocprofiler_buffer_tracing_page_migration_record_t +typedef struct rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t { - uint64_t size; ///< size of this struct - rocprofiler_buffer_tracing_kind_t kind; ///< ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION - rocprofiler_page_migration_operation_t operation; - rocprofiler_timestamp_t timestamp; ///< start time in nanoseconds - uint32_t pid; - rocprofiler_page_migration_args_t args; -} rocprofiler_buffer_tracing_page_migration_record_t; + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; + rocprofiler_kfd_event_page_migrate_operation_t operation; + rocprofiler_timestamp_t timestamp; ///< Timestamp of the event as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_address_t start_address; ///< Start address of the memory range being migrated + rocprofiler_address_t end_address; ///< End address of the memory range being migrated + rocprofiler_agent_id_t src_agent; ///< Source agent from which pages were migrated + rocprofiler_agent_id_t dst_agent; ///< Destination agent to which pages were migrated + rocprofiler_agent_id_t prefetch_agent; + rocprofiler_agent_id_t preferred_agent; + int32_t error_code; ///< Non-zero if there was an error at migrate-end + + ///< @var kind + ///< @brief ::ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE + ///< @var operation + ///< @brief @see rocprofiler_kfd_page_migrate_operation_t + ///< @var prefetch_agent + ///< @brief Agent to which memory is to be prefetched. + /// This field should be ignored on a migrate-end event + ///< @var preferred_agent + ///< @brief Preferred location for this memory + /// This field should be ignored on a migrate-end event +} rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t; + +/** + * @brief ROCProfiler Buffer Page Fault event record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_event_page_fault_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT + rocprofiler_kfd_event_page_fault_operation_t operation; + rocprofiler_timestamp_t timestamp; ///< Timestamp of the event as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_agent_id_t agent_id; ///< Agent ID which generated the fault + rocprofiler_address_t address; ///< Memory access that generated the fault event + + ///< @var operation + ///< @brief @see rocprofiler_kfd_page_fault_operation_t +} rocprofiler_buffer_tracing_kfd_event_page_fault_record_t; + +/** + * @brief ROCProfiler Buffer Queue event record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_event_queue_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE + rocprofiler_kfd_event_queue_operation_t operation; ///< @see rocprofiler_kfd_queue_operation_t + rocprofiler_timestamp_t timestamp; ///< Timestamp of the event as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_agent_id_t agent_id; ///< Agent ID on which this event occurred +} rocprofiler_buffer_tracing_kfd_event_queue_record_t; + +/** + * @brief ROCProfiler Buffer Unmap of memory from GPU event record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_event_unmap_from_gpu_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; + rocprofiler_kfd_event_unmap_from_gpu_operation_t operation; + rocprofiler_timestamp_t timestamp; ///< Timestamp of the event as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_agent_id_t agent_id; ///< Agent ID on which memory ranges were unmapped + rocprofiler_address_t start_address; ///< Start address of the memory range being unmapped + rocprofiler_address_t end_address; ///< End address of the memory range being unmapped + + ///< @var kind + ///< @brief ::ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU + ///< @var operation + ///< @brief @see rocprofiler_kfd_unmap_from_gpu_operation_t +} rocprofiler_buffer_tracing_kfd_event_unmap_from_gpu_record_t; + +/** + * @brief ROCProfiler Buffer Dropped events event record, for when KFD reports + * that it has dropped some events. + */ +typedef struct rocprofiler_buffer_tracing_kfd_event_dropped_events_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; + rocprofiler_kfd_event_dropped_events_operation_t operation; + rocprofiler_timestamp_t timestamp; ///< Timestamp of the event as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + uint32_t count; ///< Number of records that KFD dropped + + ///< @var kind + ///< @brief ::ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS + ///< @var operation + ///< @brief @see rocprofiler_kfd_event_dropped_events_operation_t +} rocprofiler_buffer_tracing_kfd_event_dropped_events_record_t; + +/** + * @brief ROCProfiler Buffer Page Migration (paired) record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_page_migrate_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE + rocprofiler_kfd_page_migrate_operation_t operation; + rocprofiler_timestamp_t start_timestamp; ///< Start timestamp as reported by KFD + rocprofiler_timestamp_t end_timestamp; ///< End timestamp as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_address_t start_address; ///< Start address of the memory range being migrated + rocprofiler_address_t end_address; ///< End address of the memory range being migrated + rocprofiler_agent_id_t src_agent; ///< Source agent from which pages were migrated + rocprofiler_agent_id_t dst_agent; ///< Destination agent to which pages were migrated + rocprofiler_agent_id_t prefetch_agent; + rocprofiler_agent_id_t preferred_agent; + int32_t error_code; ///< Non-zero codes are errors, as reported by KFD + ///< @var operation + ///< @brief @see rocprofiler_kfd_page_migrate_operation_t + ///< @var prefetch_agent + ///< @brief Agent to which memory is to be prefetched. + ///< @var preferred_agent + ///< @brief Preferred location for this memory +} rocprofiler_buffer_tracing_kfd_page_migrate_record_t; + +/** + * @brief ROCProfiler Buffer Page Fault (paired) record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_page_fault_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT + rocprofiler_kfd_page_fault_operation_t operation; + rocprofiler_timestamp_t start_timestamp; ///< Start timestamp as reported by KFD + rocprofiler_timestamp_t end_timestamp; ///< End timestamp as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_agent_id_t agent_id; ///< Agent ID which generated the fault + rocprofiler_address_t address; ///< Memory access that generated the page fault + ///< @var operation + ///< @brief @see rocprofiler_kfd_page_fault_operation_t +} rocprofiler_buffer_tracing_kfd_page_fault_record_t; + +/** + * @brief ROCProfiler Buffer Queue suspend (paired) record from KFD. + */ +typedef struct rocprofiler_buffer_tracing_kfd_queue_record_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KFD_QUEUE + rocprofiler_kfd_queue_operation_t operation; ///< @see rocprofiler_kfd_queue_operation_t + rocprofiler_timestamp_t start_timestamp; ///< Start timestamp as reported by KFD + rocprofiler_timestamp_t end_timestamp; ///< End timestamp as reported by KFD + uint32_t pid; ///< PID of the process as reported by KFD + rocprofiler_agent_id_t agent_id; ///< Agent ID on which this event occurred +} rocprofiler_buffer_tracing_kfd_queue_record_t; /** * @brief ROCProfiler Buffer Scratch Memory Tracer Record diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp index 004fd7a0a8..143cd87279 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp @@ -1195,10 +1195,6 @@ ROCPROFILER_ENUM_INFO(rocprofiler_buffer_policy_t, ROCPROFILER_BUFFER_POLICY_NONE, ROCPROFILER_BUFFER_POLICY_LAST, false); -ROCPROFILER_ENUM_INFO(rocprofiler_page_migration_operation_t, - ROCPROFILER_PAGE_MIGRATION_NONE, - ROCPROFILER_PAGE_MIGRATION_LAST, - false); ROCPROFILER_ENUM_INFO(rocprofiler_scratch_memory_operation_t, ROCPROFILER_SCRATCH_MEMORY_NONE, ROCPROFILER_SCRATCH_MEMORY_LAST, @@ -1243,19 +1239,40 @@ ROCPROFILER_ENUM_INFO(rocprofiler_pc_sampling_record_kind_t, ROCPROFILER_PC_SAMPLING_RECORD_LAST, false); -// page-migration -ROCPROFILER_ENUM_INFO(rocprofiler_page_migration_trigger_t, - ROCPROFILER_PAGE_MIGRATION_TRIGGER_NONE, - ROCPROFILER_PAGE_MIGRATION_TRIGGER_LAST, +// kfd/kfd_id.h +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_event_page_migrate_operation_t, + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_NONE, + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_LAST, false); -ROCPROFILER_ENUM_INFO(rocprofiler_page_migration_queue_suspend_trigger_t, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_NONE, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_LAST, +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_event_page_fault_operation_t, + ROCPROFILER_KFD_EVENT_PAGE_FAULT_NONE, + ROCPROFILER_KFD_EVENT_PAGE_FAULT_LAST, false); -ROCPROFILER_ENUM_INFO(rocprofiler_page_migration_unmap_from_gpu_trigger_t, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_NONE, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_LAST, +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_event_queue_operation_t, + ROCPROFILER_KFD_EVENT_QUEUE_NONE, + ROCPROFILER_KFD_EVENT_QUEUE_LAST, false); +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_event_unmap_from_gpu_operation_t, + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_NONE, + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_LAST, + false); +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_event_dropped_events_operation_t, + ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_NONE, + ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_LAST, + false); +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_page_migrate_operation_t, + ROCPROFILER_KFD_PAGE_MIGRATE_NONE, + ROCPROFILER_KFD_PAGE_MIGRATE_LAST, + false); +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_page_fault_operation_t, + ROCPROFILER_KFD_PAGE_FAULT_NONE, + ROCPROFILER_KFD_PAGE_FAULT_LAST, + false); +ROCPROFILER_ENUM_INFO(rocprofiler_kfd_queue_operation_t, + ROCPROFILER_KFD_QUEUE_NONE, + ROCPROFILER_KFD_QUEUE_LAST, + false); + ROCPROFILER_ENUM_INFO(rocprofiler_external_correlation_id_request_kind_t, ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_NONE, ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST, @@ -1404,7 +1421,6 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_RCCL_API); @@ -1417,7 +1433,15 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_HIP_STREAM); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT); -static_assert(ROCPROFILER_BUFFER_TRACING_LAST == 25); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_TRACING_KFD_QUEUE); +static_assert(ROCPROFILER_BUFFER_TRACING_LAST == 32); // rocprofiler_code_object_operation_t ROCPROFILER_ENUM_LABEL(ROCPROFILER_CODE_OBJECT_NONE); @@ -1467,18 +1491,6 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_POLICY_DISCARD); ROCPROFILER_ENUM_LABEL(ROCPROFILER_BUFFER_POLICY_LOSSLESS); static_assert(ROCPROFILER_BUFFER_POLICY_LAST == 3); -// rocprofiler_page_migration_operation_t -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_NONE); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_START); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_END); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_RESTORE); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT); -static_assert(ROCPROFILER_PAGE_MIGRATION_LAST == 9); - // rocprofiler_scratch_memory_operation_t ROCPROFILER_ENUM_LABEL(ROCPROFILER_SCRATCH_MEMORY_NONE); ROCPROFILER_ENUM_LABEL(ROCPROFILER_SCRATCH_MEMORY_ALLOC); @@ -1546,23 +1558,65 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE); ROCPROFILER_ENUM_LABEL(ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE); static_assert(ROCPROFILER_PC_SAMPLING_RECORD_LAST == 4); -// page-migration -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PREFETCH); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_GPU); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_CPU); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION); -static_assert(ROCPROFILER_PAGE_MIGRATION_TRIGGER_LAST == 4); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_USERPTR); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_TTM); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SUSPEND); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_CHECKPOINT); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE); -static_assert(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_LAST == 6); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY_MIGRATE); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU); -static_assert(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_LAST == 3); +// kfd events +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PREFETCH); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_TTM_EVICTION); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END); +static_assert(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_LAST == 5); + +// rocprofiler_kfd_event_page_fault_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_FAULT_START); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED); +static_assert(ROCPROFILER_KFD_EVENT_PAGE_FAULT_LAST == 5); + +// rocprofiler_kfd_event_queue_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SVM); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_CHECKPOINT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_RESTORE); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_QUEUE_RESTORE); +static_assert(ROCPROFILER_KFD_EVENT_QUEUE_LAST == 8); + +// rocprofiler_kfd_event_unmap_from_gpu_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY_MIGRATE); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_UNMAP_FROM_CPU); +static_assert(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_LAST == 3); + +// rocprofiler_kfd_event_dropped_events_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_EVENT_DROPPED_EVENTS); +static_assert(ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_LAST == 1); + +// rocprofiler_kfd_page_migrate_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_MIGRATE_PREFETCH); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_GPU); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION); +static_assert(ROCPROFILER_KFD_PAGE_MIGRATE_LAST == 4); + +// rocprofiler_kfd_page_fault_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_MIGRATED); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_UPDATED); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_MIGRATED); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_UPDATED); +static_assert(ROCPROFILER_KFD_PAGE_FAULT_LAST == 4); + +// rocprofiler_kfd_queue_operation_t +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_SVM); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_USERPTR); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_TTM); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_SUSPEND); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_CRIU_CHECKPOINT); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE); +static_assert(ROCPROFILER_KFD_QUEUE_LAST == 6); // rocprofiler_external_correlation_id_request_kind_t ROCPROFILER_ENUM_LABEL(ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_NONE); diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp index 896b364dad..d9ed56dca6 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -101,6 +101,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation") ROCPROFILER_DEFINE_CATEGORY(category, rocdecode_api, "rocDecode API function") ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function") ROCPROFILER_DEFINE_CATEGORY(category, counter_collection, "Counter Collection") +ROCPROFILER_DEFINE_CATEGORY(category, kfd_events, "KFD events collection") ROCPROFILER_DEFINE_CATEGORY(category, none, "Unknown category") #define ROCPROFILER_PERFETTO_CATEGORIES \ @@ -215,7 +216,6 @@ ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MARKER_NAME_API, marker_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MEMORY_COPY, memory_copy) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MEMORY_ALLOCATION, memory_allocation) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KERNEL_DISPATCH, kernel_dispatch) -ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(PAGE_MIGRATION, none) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(SCRATCH_MEMORY, memory_allocation) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(CORRELATION_ID_RETIREMENT, none) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(RCCL_API, rccl_api) @@ -227,6 +227,14 @@ ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(HIP_STREAM, hip_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(HIP_RUNTIME_API_EXT, hip_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(HIP_COMPILER_API_EXT, hip_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(ROCDECODE_API_EXT, rocdecode_api) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_EVENT_PAGE_MIGRATE, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_EVENT_PAGE_FAULT, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_EVENT_QUEUE, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_EVENT_UNMAP_FROM_GPU, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_EVENT_DROPPED_EVENTS, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_PAGE_MIGRATE, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_PAGE_FAULT, kfd_events) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KFD_QUEUE, kfd_events) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(NONE, none) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(HSA_CORE_API, hsa_api) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp index 23cc0ac751..d3bea2a421 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp @@ -729,140 +729,109 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data) template void -save(ArchiveT& ar, const rocprofiler_page_migration_page_fault_start_t& data) +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t data) { - ROCP_SDK_SAVE_DATA_BITFIELD("read_fault", read_fault); - ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(address); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_page_fault_end_t& data) -{ - ROCP_SDK_SAVE_DATA_BITFIELD("migrated", migrated); - ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(address); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_page_migrate_start_t& data) -{ - ROCP_SDK_SAVE_DATA_FIELD(start_addr); - ROCP_SDK_SAVE_DATA_FIELD(end_addr); - ROCP_SDK_SAVE_DATA_FIELD(from_agent); - ROCP_SDK_SAVE_DATA_FIELD(to_agent); + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(start_address); + ROCP_SDK_SAVE_DATA_FIELD(end_address); + ROCP_SDK_SAVE_DATA_FIELD(src_agent); + ROCP_SDK_SAVE_DATA_FIELD(dst_agent); ROCP_SDK_SAVE_DATA_FIELD(prefetch_agent); ROCP_SDK_SAVE_DATA_FIELD(preferred_agent); - ROCP_SDK_SAVE_DATA_FIELD(trigger); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_page_migrate_end_t& data) -{ - ROCP_SDK_SAVE_DATA_FIELD(start_addr); - ROCP_SDK_SAVE_DATA_FIELD(end_addr); - ROCP_SDK_SAVE_DATA_FIELD(from_agent); - ROCP_SDK_SAVE_DATA_FIELD(to_agent); - ROCP_SDK_SAVE_DATA_FIELD(trigger); ROCP_SDK_SAVE_DATA_FIELD(error_code); } template void -save(ArchiveT& ar, const rocprofiler_page_migration_queue_eviction_t& data) -{ - ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(trigger); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_queue_restore_t& data) -{ - ROCP_SDK_SAVE_DATA_BITFIELD("rescheduled", rescheduled); - ROCP_SDK_SAVE_DATA_FIELD(agent_id); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_unmap_from_gpu_t& data) -{ - ROCP_SDK_SAVE_DATA_FIELD(start_addr); - ROCP_SDK_SAVE_DATA_FIELD(end_addr); - ROCP_SDK_SAVE_DATA_FIELD(agent_id); - ROCP_SDK_SAVE_DATA_FIELD(trigger); -} - -template -void -save(ArchiveT& ar, const rocprofiler_page_migration_dropped_event_t& data) -{ - ROCP_SDK_SAVE_DATA_FIELD(dropped_events_count); -} - -namespace details -{ -template -struct save_page_migration_arg; - -#define ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(ENUM_VALUE, UNION_ARG) \ - template <> \ - struct save_page_migration_arg \ - { \ - static constexpr auto value = ROCPROFILER_PAGE_MIGRATION_##ENUM_VALUE; \ - template \ - void operator()(ArchiveT& ar, rocprofiler_page_migration_args_t args) \ - { \ - ar(make_nvp(#UNION_ARG, args.UNION_ARG)); \ - } \ - }; - -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(NONE, none) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(PAGE_MIGRATE_START, page_migrate_start) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(PAGE_MIGRATE_END, page_migrate_end) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(PAGE_FAULT_START, page_fault_start) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(PAGE_FAULT_END, page_fault_end) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(QUEUE_EVICTION, queue_eviction) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(QUEUE_RESTORE, queue_restore) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(UNMAP_FROM_GPU, unmap_from_gpu) -ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG(DROPPED_EVENT, dropped_event) - -#undef ROCP_SDK_SPECIALIZE_PAGE_MIGRATION_ARG - -template -void -save_page_migration_args(ArchiveT& ar, - rocprofiler_page_migration_operation_t op, - rocprofiler_page_migration_args_t args, - std::index_sequence) -{ - using save_page_migration_type = save_page_migration_arg; - if(op == save_page_migration_type::value) - { - if constexpr(save_page_migration_type::value != ROCPROFILER_PAGE_MIGRATION_NONE) - save_page_migration_type{}(ar, args); - } - else if constexpr(sizeof...(IdxTail) > 0) - { - save_page_migration_args(ar, op, args, std::index_sequence{}); - } -} -} // namespace details - -template -void -save(ArchiveT& ar, const rocprofiler_buffer_tracing_page_migration_record_t& data) +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_event_page_fault_record_t data) { ROCP_SDK_SAVE_DATA_FIELD(size); - ROCP_SDK_SAVE_DATA_FIELD(kind); ROCP_SDK_SAVE_DATA_FIELD(operation); ROCP_SDK_SAVE_DATA_FIELD(timestamp); ROCP_SDK_SAVE_DATA_FIELD(pid); - details::save_page_migration_args( - ar, data.operation, data.args, std::make_index_sequence{}); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(address); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_event_queue_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_event_unmap_from_gpu_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(start_address); + ROCP_SDK_SAVE_DATA_FIELD(end_address); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_event_dropped_events_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(count); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_page_migrate_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(start_address); + ROCP_SDK_SAVE_DATA_FIELD(end_address); + ROCP_SDK_SAVE_DATA_FIELD(src_agent); + ROCP_SDK_SAVE_DATA_FIELD(dst_agent); + ROCP_SDK_SAVE_DATA_FIELD(prefetch_agent); + ROCP_SDK_SAVE_DATA_FIELD(preferred_agent); + ROCP_SDK_SAVE_DATA_FIELD(error_code); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_page_fault_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(address); +} + +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_kfd_queue_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(pid); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); } template diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 3f41fc1050..b9d1ca72a1 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -196,7 +196,6 @@ typedef enum rocprofiler_buffer_tracing_kind_t // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API, ///< @see ::rocprofiler_marker_name_api_id_t ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, ///< Buffer kernel dispatch info - ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, ///< Buffer page migration info ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, ///< Buffer scratch memory reclaimation info ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID in no longer in use ROCPROFILER_BUFFER_TRACING_RCCL_API, ///< RCCL tracing @@ -212,6 +211,20 @@ typedef enum rocprofiler_buffer_tracing_kind_t // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT, ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT, ROCPROFILER_BUFFER_TRACING_ROCDECODE_API_EXT, + + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE, ///< @see + ///< rocprofiler_kfd_event_page_migrate_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT, ///< @see + ///< rocprofiler_kfd_event_page_fault_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE, ///< @see rocprofiler_kfd_event_queue_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU, ///< @see + ///< rocprofiler_kfd_event_unmap_from_gpu_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS, ///< @see + ///< rocprofiler_kfd_event_dropped_events_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE, ///< @see rocprofiler_kfd_page_migrate_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT, ///< @see rocprofiler_kfd_page_fault_operation_t + ROCPROFILER_BUFFER_TRACING_KFD_QUEUE, ///< @see rocprofiler_kfd_queue_operation_t + ROCPROFILER_BUFFER_TRACING_LAST, /// @var ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT @@ -361,23 +374,6 @@ typedef enum rocprofiler_buffer_policy_t // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_POLICY_LAST, } rocprofiler_buffer_policy_t; -/** - * @brief Page migration event. - */ -typedef enum rocprofiler_page_migration_operation_t // NOLINT(performance-enum-size) -{ - ROCPROFILER_PAGE_MIGRATION_NONE = 0, ///< Unknown event - ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_START, - ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END, - ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START, - ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_END, - ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION, - ROCPROFILER_PAGE_MIGRATION_QUEUE_RESTORE, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU, - ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT, - ROCPROFILER_PAGE_MIGRATION_LAST, -} rocprofiler_page_migration_operation_t; - /** * @brief Scratch event kind */ diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/CMakeLists.txt index 2a63111257..bfbd282b07 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/CMakeLists.txt @@ -3,7 +3,7 @@ # Installation of public KFD headers # # -set(ROCPROFILER_KFD_HEADER_FILES page_migration_args.h page_migration_id.h) +set(ROCPROFILER_KFD_HEADER_FILES kfd_id.h) install( FILES ${ROCPROFILER_KFD_HEADER_FILES} diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/kfd_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/kfd_id.h new file mode 100644 index 0000000000..e32c406b59 --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/kfd_id.h @@ -0,0 +1,159 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include +#include +#include +#include + +#include + +ROCPROFILER_EXTERN_C_INIT + +/** + * @brief Page migration event operations. @see + * rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t + */ +typedef enum rocprofiler_kfd_event_page_migrate_operation_t +{ + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_NONE = -1, + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PREFETCH, ///< Migration triggered by a prefetch + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU, ///< Migration triggered by a page fault on + ///< the GPU + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU, ///< Migration triggered by a page fault on + ///< the CPU + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_TTM_EVICTION, ///< Page evicted by linux TTM (Translation + ///< Table Manager) + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END, + ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_LAST, +} rocprofiler_kfd_event_page_migrate_operation_t; + +/** + * @brief Page fault event operations. @see rocprofiler_buffer_tracing_kfd_event_page_fault_record_t + */ +typedef enum rocprofiler_kfd_event_page_fault_operation_t +{ + ROCPROFILER_KFD_EVENT_PAGE_FAULT_NONE = -1, + ROCPROFILER_KFD_EVENT_PAGE_FAULT_START, + ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT, ///< Page fault was due to a read operation + ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT, ///< Page fault was due to a write + ///< operation + ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED, ///< Fault resolved through a migration + ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED, ///< Fault resolved through an update + ROCPROFILER_KFD_EVENT_PAGE_FAULT_LAST, +} rocprofiler_kfd_event_page_fault_operation_t; + +/** + * @brief Queue evict/restore event operations. @see + * rocprofiler_buffer_tracing_kfd_event_queue_record_t + */ +typedef enum rocprofiler_kfd_event_queue_operation_t +{ + ROCPROFILER_KFD_EVENT_QUEUE_NONE = -1, + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SVM, ///< SVM Buffer migration + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR, ///< userptr movement + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM, ///< TTM move buffer + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND, ///< GPU suspend + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_CHECKPOINT, ///< Queues evicted due to process + ///< checkpoint by CRIU + ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_RESTORE, ///< Queues restored during process restore by + ///< CRIU + + ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED, ///< Queue was not restored; will be restored + ///< later + ROCPROFILER_KFD_EVENT_QUEUE_RESTORE, ///< Queue was restored + ROCPROFILER_KFD_EVENT_QUEUE_LAST, +} rocprofiler_kfd_event_queue_operation_t; + +/** + * @brief Memory unmap from GPU event operations. @see + * rocprofiler_buffer_tracing_kfd_event_unmap_from_gpu_record_t + */ +typedef enum rocprofiler_kfd_event_unmap_from_gpu_operation_t +{ + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_NONE = -1, + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY, ///< MMU notifier CPU buffer movement + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY_MIGRATE, ///< MMU notifier page migration + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_UNMAP_FROM_CPU, ///< Unmap to free the buffer + ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_LAST, +} rocprofiler_kfd_event_unmap_from_gpu_operation_t; + +/** + * @brief KFD dropped event operations. @see + * rocprofiler_buffer_tracing_kfd_event_dropped_events_record_t + */ +typedef enum rocprofiler_kfd_event_dropped_events_operation_t +{ + ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_NONE = -1, + ROCPROFILER_KFD_EVENT_DROPPED_EVENTS, + ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_LAST, +} rocprofiler_kfd_event_dropped_events_operation_t; + +/** + * @brief Operation kinds for @see rocprofiler_buffer_tracing_kfd_page_migrate_record_t + */ +typedef enum rocprofiler_kfd_page_migrate_operation_t +{ + ROCPROFILER_KFD_PAGE_MIGRATE_NONE = -1, + ROCPROFILER_KFD_PAGE_MIGRATE_PREFETCH, ///< Migration triggered by a prefetch + ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_GPU, ///< Migration triggered by a page fault on the + ///< GPU + ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU, ///< Migration triggered by a page fault on the + ///< CPU + ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION, ///< Page evicted by linux TTM (Translation Table + ///< Manager) + ROCPROFILER_KFD_PAGE_MIGRATE_LAST, +} rocprofiler_kfd_page_migrate_operation_t; + +/** + * @brief Operation kinds for @see rocprofiler_buffer_tracing_kfd_page_fault_record_t + */ +typedef enum rocprofiler_kfd_page_fault_operation_t +{ + ROCPROFILER_KFD_PAGE_FAULT_NONE = -1, + ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_MIGRATED, ///< read fault resolved with a migrate + ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_UPDATED, ///< read fault resolved with an update + ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_MIGRATED, ///< write fault resolved with an migrate + ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_UPDATED, ///< write fault resolved with an update + ROCPROFILER_KFD_PAGE_FAULT_LAST, +} rocprofiler_kfd_page_fault_operation_t; + +/** + * @brief Operation kinds for @see rocprofiler_buffer_tracing_kfd_queue_record_t + */ +typedef enum rocprofiler_kfd_queue_operation_t +{ + ROCPROFILER_KFD_QUEUE_NONE = -1, + ROCPROFILER_KFD_QUEUE_EVICT_SVM, ///< SVM Buffer migration + ROCPROFILER_KFD_QUEUE_EVICT_USERPTR, ///< userptr movement + ROCPROFILER_KFD_QUEUE_EVICT_TTM, ///< TTM move buffer + ROCPROFILER_KFD_QUEUE_EVICT_SUSPEND, ///< GPU suspend + ROCPROFILER_KFD_QUEUE_EVICT_CRIU_CHECKPOINT, ///< Queues evicted due to process checkpoint by + ///< CRIU + ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE, ///< Queues restored during process restore by CRIU + + ROCPROFILER_KFD_QUEUE_LAST, +} rocprofiler_kfd_queue_operation_t; + +ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_args.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_args.h deleted file mode 100644 index dfc9e8f424..0000000000 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_args.h +++ /dev/null @@ -1,111 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#pragma once - -#include -#include - -#include - -ROCPROFILER_EXTERN_C_INIT - -typedef struct rocprofiler_page_migration_none_t -{ - char empty; -} rocprofiler_page_migration_none_t; - -typedef struct rocprofiler_page_migration_page_migrate_start_t -{ - uint64_t start_addr; - uint64_t end_addr; - rocprofiler_agent_id_t from_agent; - rocprofiler_agent_id_t to_agent; - rocprofiler_agent_id_t prefetch_agent; - rocprofiler_agent_id_t preferred_agent; - rocprofiler_page_migration_trigger_t trigger; -} rocprofiler_page_migration_page_migrate_start_t; - -typedef struct rocprofiler_page_migration_page_migrate_end_t -{ - uint64_t start_addr; - uint64_t end_addr; - rocprofiler_agent_id_t from_agent; - rocprofiler_agent_id_t to_agent; - rocprofiler_page_migration_trigger_t trigger; - int32_t error_code; -} rocprofiler_page_migration_page_migrate_end_t; - -typedef struct rocprofiler_page_migration_page_fault_start_t -{ - uint32_t read_fault : 1; - rocprofiler_agent_id_t agent_id; - uint64_t address; -} rocprofiler_page_migration_page_fault_start_t; - -typedef struct rocprofiler_page_migration_page_fault_end_t -{ - uint32_t migrated : 1; - rocprofiler_agent_id_t agent_id; - uint64_t address; -} rocprofiler_page_migration_page_fault_end_t; - -typedef struct rocprofiler_page_migration_queue_eviction_t -{ - rocprofiler_agent_id_t agent_id; - rocprofiler_page_migration_queue_suspend_trigger_t trigger; -} rocprofiler_page_migration_queue_eviction_t; - -typedef struct rocprofiler_page_migration_queue_restore_t -{ - uint32_t rescheduled : 1; - rocprofiler_agent_id_t agent_id; -} rocprofiler_page_migration_queue_restore_t; - -typedef struct rocprofiler_page_migration_unmap_from_gpu_t -{ - uint64_t start_addr; - uint64_t end_addr; - rocprofiler_agent_id_t agent_id; - rocprofiler_page_migration_unmap_from_gpu_trigger_t trigger; -} rocprofiler_page_migration_unmap_from_gpu_t; - -typedef struct rocprofiler_page_migration_dropped_event_t -{ - uint32_t dropped_events_count; -} rocprofiler_page_migration_dropped_event_t; - -typedef union rocprofiler_page_migration_args_t -{ - rocprofiler_page_migration_none_t none; - rocprofiler_page_migration_page_migrate_start_t page_migrate_start; - rocprofiler_page_migration_page_migrate_end_t page_migrate_end; - rocprofiler_page_migration_page_fault_start_t page_fault_start; - rocprofiler_page_migration_page_fault_end_t page_fault_end; - rocprofiler_page_migration_queue_eviction_t queue_eviction; - rocprofiler_page_migration_queue_restore_t queue_restore; - rocprofiler_page_migration_unmap_from_gpu_t unmap_from_gpu; - rocprofiler_page_migration_dropped_event_t dropped_event; - uint64_t reserved[16]; -} rocprofiler_page_migration_args_t; - -ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_id.h deleted file mode 100644 index 53f3b480b4..0000000000 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/kfd/page_migration_id.h +++ /dev/null @@ -1,82 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#pragma once - -#include -#include -#include -#include - -#include - -ROCPROFILER_EXTERN_C_INIT - -/** - * @brief Page migration triggers - * - */ -typedef enum rocprofiler_page_migration_trigger_t -{ - ROCPROFILER_PAGE_MIGRATION_TRIGGER_NONE = -1, - ROCPROFILER_PAGE_MIGRATION_TRIGGER_PREFETCH, ///< Migration triggered by a prefetch - ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_GPU, ///< Triggered by a page fault on the GPU - ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_CPU, ///< Triggered by a page fault on the CPU - ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION, ///< Page evicted by linux TTM (Translation - ///< Table Manager) - ROCPROFILER_PAGE_MIGRATION_TRIGGER_LAST, -} rocprofiler_page_migration_trigger_t; - -/** - * @brief Page migration triggers causing the queue to suspend - * - */ -typedef enum rocprofiler_page_migration_queue_suspend_trigger_t -{ - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_NONE = -1, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_USERPTR, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_TTM, ///< Queue suspended by TTM (Translation - ///< Table Manager) operation - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SUSPEND, - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_CHECKPOINT, ///< Queues evicted due to - ///< process save - ///< (checkpoint) by CRIU - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE, ///< Queues restored during - ///< process restore by CRIU - ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_LAST, -} rocprofiler_page_migration_queue_suspend_trigger_t; - -/** - * @brief Page migration triggers causing an unmap from the GPU - * - */ -typedef enum rocprofiler_page_migration_unmap_from_gpu_trigger_t -{ - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_NONE = -1, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY_MIGRATE, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_LAST, -} rocprofiler_page_migration_unmap_from_gpu_trigger_t; - -ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt index c4945d3914..1fc1ddfe65 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt @@ -49,7 +49,7 @@ add_subdirectory(marker) add_subdirectory(thread_trace) add_subdirectory(tracing) add_subdirectory(kernel_dispatch) -add_subdirectory(page_migration) +add_subdirectory(kfd) add_subdirectory(rccl) add_subdirectory(rocdecode) add_subdirectory(rocjpeg) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp index df5efb5284..e680cd1693 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -30,9 +30,9 @@ #include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp" #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp" +#include "lib/rocprofiler-sdk/kfd/kfd.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" #include "lib/rocprofiler-sdk/ompt/ompt.hpp" -#include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" #include "lib/rocprofiler-sdk/rccl/rccl.hpp" #include "lib/rocprofiler-sdk/registration.hpp" #include "lib/rocprofiler-sdk/rocdecode/rocdecode.hpp" @@ -52,6 +52,7 @@ #include #include #include +#include #include #define RETURN_STATUS_ON_FAIL(...) \ @@ -90,7 +91,6 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_NAME_API) ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_COPY) ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_ALLOCATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(KERNEL_DISPATCH) -ROCPROFILER_BUFFER_TRACING_KIND_STRING(PAGE_MIGRATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(SCRATCH_MEMORY) ROCPROFILER_BUFFER_TRACING_KIND_STRING(CORRELATION_ID_RETIREMENT) ROCPROFILER_BUFFER_TRACING_KIND_STRING(RCCL_API) @@ -102,6 +102,14 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(HIP_STREAM) ROCPROFILER_BUFFER_TRACING_KIND_STRING(HIP_RUNTIME_API_EXT) ROCPROFILER_BUFFER_TRACING_KIND_STRING(HIP_COMPILER_API_EXT) ROCPROFILER_BUFFER_TRACING_KIND_STRING(ROCDECODE_API_EXT) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_EVENT_PAGE_MIGRATE) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_EVENT_PAGE_FAULT) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_EVENT_QUEUE) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_EVENT_UNMAP_FROM_GPU) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_EVENT_DROPPED_EVENTS) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_PAGE_MIGRATE) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_PAGE_FAULT) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(KFD_QUEUE) template std::pair @@ -172,8 +180,21 @@ rocprofiler_configure_buffer_tracing_service(rocprofiler_context_id_t ctx->buffered_tracer->domains, kind, operations[i])); } - if(kind == ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION) - RETURN_STATUS_ON_FAIL(rocprofiler::page_migration::init()); + { + static constexpr auto kfd_events = + std::array{ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT, + ROCPROFILER_BUFFER_TRACING_KFD_QUEUE}; + + if(std::find(kfd_events.begin(), kfd_events.end(), kind) != kfd_events.end()) + { + RETURN_STATUS_ON_FAIL(rocprofiler::kfd::init()); + } + } return ROCPROFILER_STATUS_SUCCESS; } @@ -282,11 +303,6 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ val = rocprofiler::kernel_dispatch::name_by_id(operation); break; } - case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: - { - val = rocprofiler::page_migration::name_by_id(operation); - break; - } case ROCPROFILER_BUFFER_TRACING_OMPT: { val = rocprofiler::ompt::name_by_id(operation); @@ -318,6 +334,18 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ val = rocprofiler::hip::stream::name_by_id(operation); break; } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS: + case ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE: + case ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT: + case ROCPROFILER_BUFFER_TRACING_KFD_QUEUE: + { + val = rocprofiler::kfd::name_by_id(kind, operation); + break; + } }; if(!val) @@ -432,11 +460,6 @@ rocprofiler_iterate_buffer_tracing_kind_operations( ops = rocprofiler::kernel_dispatch::get_ids(); break; } - case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: - { - ops = rocprofiler::page_migration::get_ids(); - break; - } case ROCPROFILER_BUFFER_TRACING_OMPT: { ops = rocprofiler::ompt::get_ids(); @@ -467,6 +490,18 @@ rocprofiler_iterate_buffer_tracing_kind_operations( ops = rocprofiler::hip::stream::get_ids(); break; } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU: + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS: + case ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE: + case ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT: + case ROCPROFILER_BUFFER_TRACING_KFD_QUEUE: + { + ops = rocprofiler::kfd::get_ids(kind); + break; + } } for(const auto& itr : ops) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/domain.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/domain.cpp index e29b5e9e92..d0a97e45ec 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/domain.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/domain.cpp @@ -59,7 +59,7 @@ template rocprofiler_status_t add_domain(domain_context& _cfg, DomainT _domain) { - static_assert((1 << domain_info::last) < std::numeric_limits::max(), + static_assert((1UL << domain_info::last) < std::numeric_limits::max(), "uint64_t cannot handle all the domains"); if(_domain <= domain_info::none) return ROCPROFILER_STATUS_ERROR_KIND_NOT_FOUND; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/details/kfd_ioctl.h b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/details/kfd_ioctl.h index 7acc4f88f7..161760d5c2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/details/kfd_ioctl.h +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/details/kfd_ioctl.h @@ -42,9 +42,11 @@ * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl * - 1.16 - Add contiguous VRAM allocation flag + * - 1.17 - Add SDMA queue creation with target SDMA engine ID + * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 16 +#define KFD_IOCTL_MINOR_VERSION 18 struct kfd_ioctl_get_version_args { @@ -53,14 +55,17 @@ struct kfd_ioctl_get_version_args }; /* For kfd_ioctl_create_queue_args.queue_type. */ -#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 -#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 -#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 -#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 +#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 +#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 +#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 +#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 +#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID 0x4 #define KFD_MAX_QUEUE_PERCENTAGE 100 #define KFD_MAX_QUEUE_PRIORITY 15 +#define KFD_MIN_QUEUE_RING_SIZE 1024 + struct kfd_ioctl_create_queue_args { __u64 ring_base_address; /* to KFD */ @@ -80,6 +85,8 @@ struct kfd_ioctl_create_queue_args __u64 ctx_save_restore_address; /* to KFD */ __u32 ctx_save_restore_size; /* to KFD */ __u32 ctl_stack_size; /* to KFD */ + __u32 sdma_engine_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_destroy_queue_args @@ -152,15 +159,18 @@ struct kfd_dbg_device_info_entry #define KFD_IOC_CACHE_POLICY_COHERENT 0 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 +/* Misc. per process flags */ +#define KFD_PROC_FLAG_MFMA_HIGH_PRECISION (1 << 0) + struct kfd_ioctl_set_memory_policy_args { __u64 alternate_aperture_base; /* to KFD */ __u64 alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 misc_process_flag; /* to KFD */ }; /* @@ -578,7 +588,9 @@ enum kfd_smi_event KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, - KFD_SMI_EVENT_DROPPED_EVENT = 12, + KFD_SMI_EVENT_PROCESS_START = 12, + KFD_SMI_EVENT_PROCESS_END = 13, + KFD_SMI_EVENT_DROPPED_EVENT = 14, /* * max event number, as a flag bit to get events from all processes, @@ -589,14 +601,16 @@ enum kfd_smi_event KFD_SMI_EVENT_ALL_PROCESS = 64 }; +/* The reason of the page migration event */ enum KFD_MIGRATE_TRIGGERS { - KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU */ + KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU VRAM or system memory */ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recover */ KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recover */ KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */ }; +/* The reason of user queue evition event */ enum KFD_QUEUE_EVICTION_TRIGGERS { KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */ @@ -607,6 +621,7 @@ enum KFD_QUEUE_EVICTION_TRIGGERS KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */ }; +/* The reason of unmap buffer from GPU event */ enum KFD_SVM_UNMAP_TRIGGERS { KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */ @@ -647,9 +662,8 @@ enum kfd_ioctl_spm_op * @buf_size[in]: size of the destination buffer * @timeout[in/out]: [in]: timeout in milliseconds, [out]: amount of time left * `in the timeout window - * @bytes_copied[out]: amount of data that was copied to the previous dest_buf - * @has_data_loss: boolean indicating whether data was lost - * (e.g. due to a ring-buffer overflow) + * @bytes_copied[out]: total amount of data that was copied to the previous dest_buf + * @has_data_loss: total count for sub-block which has data loss * * This ioctl performs different functions depending on the @op parameter. * @@ -700,6 +714,22 @@ struct kfd_ioctl_spm_args __u32 has_data_loss; }; +/** + * kfd_ioctl_spm_buffer_header - SPM Buffer header for kfd_ioctl_spm_args->dest_buf + * + * @version [out]: spm versiom + * @bytes_copied [out]: amount of data for each sub-block + * @has_data_loss: [out]: boolean indicating whether data was lost for each sub-block + * (e.g. due to a ring-buffer overflow) + */ +struct kfd_ioctl_spm_buffer_header +{ + __u32 version; /* 0-23: minor 24-31: major */ + __u32 bytes_copied; + __u32 has_data_loss; + __u32 reserved[5]; +}; + /* * SVM event tracing via SMI system management interface * @@ -760,6 +790,7 @@ struct kfd_ioctl_spm_args "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled) #define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger) \ "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size), (node), (unmap_trigger) +#define KFD_EVENT_FMT_PROCESS(pid, task_name) "%x %s\n", (pid), (task_name) #define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count) "%lld -%d %d\n", (ns), (pid), (drop_count) /************************************************************************************************** diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/CMakeLists.txt new file mode 100644 index 0000000000..aa20ccbd2e --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/CMakeLists.txt @@ -0,0 +1,12 @@ +# +# +set(ROCPROFILER_LIB_KFD_EVENT_SOURCES abi.cpp kfd.cpp) +set(ROCPROFILER_LIB_KFD_EVENT_HEADERS defines.hpp kfd.hpp utils.hpp) + +target_sources( + rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_KFD_EVENT_SOURCES} + ${ROCPROFILER_LIB_KFD_EVENT_HEADERS}) + +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/abi.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/abi.cpp new file mode 100644 index 0000000000..3752a357f8 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/abi.cpp @@ -0,0 +1,88 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/common/container/small_vector.hpp" +#include "lib/common/defines.hpp" +#include "lib/common/mpl.hpp" +#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" +#include "lib/rocprofiler-sdk/kfd/defines.hpp" +#include "lib/rocprofiler-sdk/kfd/utils.hpp" + +#include + +#define ASSERT_SAME(A, B) static_assert(static_cast(A) == static_cast(B)) + +namespace rocprofiler +{ +namespace kfd +{ +static_assert(KFD_SMI_EVENT_NONE == 0); +static_assert(KFD_SMI_EVENT_MIGRATE_START == 5); +static_assert(KFD_SMI_EVENT_MIGRATE_END == 6); +static_assert(KFD_SMI_EVENT_PAGE_FAULT_START == 7); +static_assert(KFD_SMI_EVENT_PAGE_FAULT_END == 8); +static_assert(KFD_SMI_EVENT_QUEUE_EVICTION == 9); +static_assert(KFD_SMI_EVENT_QUEUE_RESTORE == 10); +static_assert(KFD_SMI_EVENT_UNMAP_FROM_GPU == 11); +static_assert(KFD_SMI_EVENT_DROPPED_EVENT == 14); +static_assert(KFD_SMI_EVENT_ALL_PROCESS == 64); + +// If any of these changes, we can no longer static_cast the new triggers +// from the KFD trigger to the record operation type +static_assert(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_LAST == 5); +static_assert(ROCPROFILER_KFD_PAGE_MIGRATE_LAST == 4); +static_assert(ROCPROFILER_KFD_EVENT_PAGE_FAULT_LAST == 5); +static_assert(ROCPROFILER_KFD_PAGE_FAULT_LAST == 4); +static_assert(ROCPROFILER_KFD_EVENT_QUEUE_LAST == 8); +static_assert(ROCPROFILER_KFD_QUEUE_LAST == 6); +static_assert(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_LAST == 3); +static_assert(ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED == 6); +static_assert(ROCPROFILER_KFD_EVENT_QUEUE_RESTORE == 7); + +// clang-format off +ASSERT_SAME(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PREFETCH, KFD_MIGRATE_TRIGGER_PREFETCH ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_TTM_EVICTION, KFD_MIGRATE_TRIGGER_TTM_EVICTION ); +ASSERT_SAME(ROCPROFILER_KFD_PAGE_MIGRATE_PREFETCH, KFD_MIGRATE_TRIGGER_PREFETCH ); +ASSERT_SAME(ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_GPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU ); +ASSERT_SAME(ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU ); +ASSERT_SAME(ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION, KFD_MIGRATE_TRIGGER_TTM_EVICTION ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SVM, KFD_QUEUE_EVICTION_TRIGGER_SVM ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR, KFD_QUEUE_EVICTION_TRIGGER_USERPTR ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM, KFD_QUEUE_EVICTION_TRIGGER_TTM ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_CHECKPOINT, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_RESTORE, KFD_QUEUE_EVICTION_CRIU_RESTORE ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_SVM, KFD_QUEUE_EVICTION_TRIGGER_SVM ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_USERPTR, KFD_QUEUE_EVICTION_TRIGGER_USERPTR ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_TTM, KFD_QUEUE_EVICTION_TRIGGER_TTM ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_SUSPEND, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_CRIU_CHECKPOINT, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT ); +ASSERT_SAME(ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE, KFD_QUEUE_EVICTION_CRIU_RESTORE ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY, KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY ); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY_MIGRATE, KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE); +ASSERT_SAME(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_UNMAP_FROM_CPU, KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU ); +// clang-format on + +} // namespace kfd +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/defines.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/defines.hpp similarity index 55% rename from projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/defines.hpp rename to projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/defines.hpp index 52163a1c81..d51a370074 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/defines.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/defines.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -44,20 +44,47 @@ } \ } while(0) -#define SPECIALIZE_PAGE_MIGRATION_INFO(ROCPROF_NAME, KFD_NAME, FORMAT_STRING) \ +#define SPECIALIZE_KFD_EVENT_INFO(EVENT_NAME, KFD_NAME, BUFFER_KIND, FORMAT_STRING) \ template <> \ - struct page_migration_info \ + struct kfd_event_info \ { \ - static constexpr auto name = "PAGE_MIGRATION_" #ROCPROF_NAME; \ - static constexpr size_t operation = ROCPROFILER_PAGE_MIGRATION_##ROCPROF_NAME; \ - static constexpr size_t kfd_operation = KFD_SMI_EVENT_##KFD_NAME; \ - static constexpr size_t kfd_bitmask = bitmask(KFD_SMI_EVENT_##KFD_NAME); \ - static constexpr std::string_view format_str = FORMAT_STRING; \ + static constexpr size_t kind = KFD_EVENT_##EVENT_NAME; \ + static constexpr size_t buffer_kind = ROCPROFILER_BUFFER_TRACING_KFD_##BUFFER_KIND; \ + static constexpr size_t kfd_id = KFD_SMI_EVENT_##KFD_NAME; \ + static constexpr size_t kfd_bitmask = bitmask(KFD_SMI_EVENT_##KFD_NAME); \ + static constexpr std::string_view format_str = FORMAT_STRING; \ }; +#define SPECIALIZE_KFD_KIND_INFO(KIND_NAME, LAST_OP) \ + template <> \ + struct kfd_kind_info \ + { \ + static constexpr auto kind = ROCPROFILER_BUFFER_TRACING_KFD_##KIND_NAME; \ + static constexpr auto last = ::LAST_OP; \ + } + +#define SPECIALIZE_KFD_KIND_NAME(KIND_NAME_SUFFIX, OPERATION) \ + template <> \ + struct kfd_operation_info \ + { \ + static constexpr auto kind = ROCPROFILER_BUFFER_TRACING_KFD_##KIND_NAME_SUFFIX; \ + static constexpr auto operation = ::OPERATION; \ + static constexpr auto name = #OPERATION; \ + } + #define SPECIALIZE_KFD_IOC_IOCTL(STRUCT, ARG_IOC) \ template <> \ struct IOC_event \ { \ static constexpr auto value = ARG_IOC; \ } + +#define ASSERT_SAME_AND_COPY(member) \ + if(end.member == start.member) \ + { \ + ret.member = end.member; \ + } \ + else \ + ROCP_ERROR << fmt::format("Expected member " #member " to be same in events. {} vs {}", \ + end.member, \ + start.member); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.cpp new file mode 100644 index 0000000000..b29d229d1d --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.cpp @@ -0,0 +1,1553 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "lib/rocprofiler-sdk/kfd/kfd.hpp" +#include "include/rocprofiler-sdk/kfd/kfd_id.h" +#include "lib/common/logging.hpp" +#include "lib/common/mpl.hpp" +#include "lib/common/static_object.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" +#include "lib/rocprofiler-sdk/internal_threading.hpp" +#include "lib/rocprofiler-sdk/kfd/defines.hpp" +#include "lib/rocprofiler-sdk/kfd/utils.hpp" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace kfd +{ +template +using small_vector = common::container::small_vector; + +using context_t = context::context; +using context_array_t = common::container::small_vector; + +using page_migrate_event_record_t = rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t; +using page_fault_event_record_t = rocprofiler_buffer_tracing_kfd_event_page_fault_record_t; +using queue_event_record_t = rocprofiler_buffer_tracing_kfd_event_queue_record_t; +using page_migrate_record_t = rocprofiler_buffer_tracing_kfd_page_migrate_record_t; +using page_fault_record_t = rocprofiler_buffer_tracing_kfd_page_fault_record_t; +using queue_record_t = rocprofiler_buffer_tracing_kfd_queue_record_t; + +#define ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL 1 +#include "kfd.def.cpp" +#undef ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL + +// enum / info checks +namespace +{ +using kfd_seq_t = std::make_index_sequence; + +static_assert(kfd_bitmask(std::index_sequence()) == + (KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_START) | + KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END) | + KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_UNMAP_FROM_GPU))); + +static_assert((kfd_event_info::kfd_bitmask | + kfd_event_info::kfd_bitmask | + kfd_event_info::kfd_bitmask) == + (KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END) | + KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_QUEUE_EVICTION) | + KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_UNMAP_FROM_GPU))); + +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_MIGRATE_START, kfd_seq_t{}) == + KFD_EVENT_PAGE_MIGRATE_START); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_MIGRATE_END, kfd_seq_t{}) == + KFD_EVENT_PAGE_MIGRATE_END); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_PAGE_FAULT_START, kfd_seq_t{}) == + KFD_EVENT_PAGE_FAULT_START); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_PAGE_FAULT_END, kfd_seq_t{}) == + KFD_EVENT_PAGE_FAULT_END); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_QUEUE_EVICTION, kfd_seq_t{}) == + KFD_EVENT_QUEUE_EVICTION); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_QUEUE_RESTORE, kfd_seq_t{}) == + KFD_EVENT_QUEUE_RESTORE); +static_assert(to_rocprofiler_kfd_event_id_func(KFD_SMI_EVENT_UNMAP_FROM_GPU, kfd_seq_t{}) == + KFD_EVENT_UNMAP_FROM_GPU); +} // namespace + +// Parsing and utilities +namespace +{ +using page_migrate_start_ops_t = + std::index_sequence; +using page_migrate_end_ops_t = std::index_sequence; +using page_fault_start_ops_t = + std::index_sequence; +using page_fault_end_ops_t = std::index_sequence; +using queue_evict_ops_t = std::index_sequence; + +using queue_restore_ops_t = std::index_sequence; + +constexpr auto +page_to_bytes(size_t val) +{ + // each page is 4KB = 4096 bytes + return val << 12; +} + +template +kfd_event_record +parse_event(const agent_id_map_t&, std::string_view) +{ + ROCP_FATAL_IF(false) << "Invalid KFD event"; + return {}; +} + +auto +get_node_map() +{ + static auto*& _data = static_object::construct([]() { + auto _v = agent_id_map_t{}; + for(const auto* agent : agent::get_agents()) + _v.emplace(agent->gpu_id, agent->id); + return _v; + }()); + + return *_data; +} + +auto +get_node_agent_id(const agent_id_map_t& agents, uint32_t _node_id) +{ + ROCP_FATAL_IF(agents.count(_node_id) == 0) << "kfd_events: unknown node id: " << _node_id; + return agents.at(_node_id); +} + +constexpr char READ_FAULT_CHAR = 'R'; +constexpr char WRITE_FAULT_CHAR = 'W'; +constexpr char FAULT_MIGRATE_CHAR = 'M'; // Fault resolved with a migration +constexpr char FAULT_UPDATE_CHAR = 'U'; // Fault resolved with an update +// Queue was not restored, will be restored later +constexpr char QUEUE_RESTORE_RESCHEDULED_CHAR = 'R'; + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.page_migrate_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE; + + uint32_t _kind = 0; + uint32_t _operation = 0; + uint64_t _start_address = 0; + uint64_t _size = 0; + uint32_t _from_node = 0; + uint32_t _to_node = 0; + uint32_t _prefetch_node = 0; + uint32_t _preferred_node = 0; + + const auto scan_count = + std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_start_address, + &_size, + &_from_node, + &_to_node, + &_prefetch_node, + &_preferred_node, + &_operation); + + if(scan_count != 10) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 10, scan_count); + return {}; + } + + e.operation = static_cast(_operation); + e.start_address.value = page_to_bytes(_start_address); + e.end_address.value = page_to_bytes(_start_address + _size); + e.src_agent = get_node_agent_id(agents, _from_node); + e.dst_agent = get_node_agent_id(agents, _to_node); + e.prefetch_agent = get_node_agent_id(agents, _prefetch_node); + e.preferred_agent = get_node_agent_id(agents, _preferred_node); + e.error_code = 0; + + ROCP_INFO << fmt::format( + "Page migrate start [ ts: {} pid: {} addr s: 0x{:X} addr " + "e: 0x{:X} size: {}B from node: {} to node: {} prefetch node: {} preferred node: {} " + "trigger: {} ] \n", + e.timestamp, + e.pid, + e.start_address.value, + e.end_address.value, + (e.end_address.value - e.start_address.value), + e.src_agent.handle, + e.dst_agent.handle, + e.prefetch_agent.handle, + e.preferred_agent.handle, + _operation); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.page_migrate_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE; + + uint32_t _kind = 0; + uint32_t _operation = 0; + uint64_t _start_address = 0; + uint64_t _size = 0; + uint32_t _from_node = 0; + uint32_t _to_node = 0; + + const auto scan_count = + std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_start_address, + &_size, + &_from_node, + &_to_node, + &_operation, + &e.error_code); + + if(scan_count == 8) + { + // KFD version was not bumped when this value was added, + // so older versions may not output an error code + e.error_code = 0; + } + else if(scan_count != 9) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 9, scan_count); + return {}; + } + + // e.operation = static_cast(operation); + e.operation = ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END; + e.start_address.value = page_to_bytes(_start_address); + e.end_address.value = page_to_bytes(_start_address + _size); + e.src_agent = get_node_agent_id(agents, _from_node); + e.dst_agent = get_node_agent_id(agents, _to_node); + + ROCP_INFO << fmt::format("Page migrate end [ ts: {} pid: {} addr s: 0x{:X} addr e: " + "0x{:X} from node: {} to node: {} trigger: {} error code: {}] \n", + e.timestamp, + e.pid, + e.start_address.value, + e.end_address.value, + e.src_agent.handle, + e.dst_agent.handle, + _operation, + e.error_code); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.page_fault_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT; + + uint32_t _kind = 0; + uint32_t _node_id = 0; + uint64_t _address = 0; + + char _fault; + const auto scan_count = + std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_address, + &_node_id, + &_fault); + + if(scan_count != 6) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 6, scan_count); + return {}; + } + + e.address.value = page_to_bytes(_address); + e.agent_id = get_node_agent_id(agents, _node_id); + + if(_fault == READ_FAULT_CHAR) + { + e.operation = ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT; + } + else if(_fault == WRITE_FAULT_CHAR) + { + e.operation = ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT; + } + else + { + ROCP_CI_LOG(WARNING) << "Unknown PAGE_FAULT_START fault type. Expected read or write fault"; + } + + ROCP_INFO << fmt::format("Page fault start [ ts: {} pid: {} addr: 0x{:X} node: {} ] \n", + e.timestamp, + e.pid, + e.address.value, + e.agent_id.handle, + _fault); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.page_fault_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT; + + uint32_t _kind = 0; + uint32_t _node_id = 0; + uint64_t _address = 0; + + // How the fault was resolved: 'M'igrate / 'U'pdate + char _resolve_kind; + const auto scan_count = std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_address, + &_node_id, + &_resolve_kind); + if(scan_count != 6) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 6, scan_count); + return {}; + } + + e.address.value = page_to_bytes(_address); + e.agent_id = get_node_agent_id(agents, _node_id); + + if(_resolve_kind == FAULT_MIGRATE_CHAR) + { + e.operation = ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED; + } + else if(_resolve_kind == FAULT_UPDATE_CHAR) + { + e.operation = ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED; + } + else + { + ROCP_CI_LOG(WARNING) << "Unknown PAGE_FAULT_END migrated/updated state"; + } + + ROCP_INFO << fmt::format( + "Page fault end [ ts: {} pid: {} addr: 0x{:X} node: {} resolution: {} ] \n", + e.timestamp, + e.pid, + e.address.value, + e.agent_id.handle, + _resolve_kind); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.queue_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE; + + uint32_t _kind = 0; + uint32_t _operation = 0; + uint32_t _node_id = 0; + + const auto scan_count = std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_node_id, + &_operation); + + if(scan_count != 5) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 5, scan_count); + return {}; + } + + e.operation = static_cast(_operation); + e.agent_id = get_node_agent_id(agents, _node_id); + + ROCP_INFO << fmt::format("Queue evict [ ts: {} pid: {} node: {} trigger: {} ] \n", + e.timestamp, + e.pid, + e.agent_id.handle, + _operation); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.queue_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE; + + uint32_t _kind = 0; + uint32_t _node_id = 0; + char _rescheduled = 0; + + const auto scan_count = std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_node_id, + &_rescheduled); + + e.agent_id = get_node_agent_id(agents, _node_id); + + if(scan_count == 5 && _rescheduled == QUEUE_RESTORE_RESCHEDULED_CHAR) + { + e.operation = ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED; + } + else if(scan_count == 5 && _rescheduled != QUEUE_RESTORE_RESCHEDULED_CHAR) + { + ROCP_CI_LOG(WARNING) << "kfd: parse_event: Expected rescheduled with 5 items parsed"; + return {}; + } + else if(scan_count == 4) + { + e.operation = ROCPROFILER_KFD_EVENT_QUEUE_RESTORE; + } + else + { + ROCP_CI_LOG(WARNING) << fmt::format("kfd: parse_event: Expected 4 or 5, scanned {}", + scan_count); + return {}; + } + + ROCP_INFO << fmt::format("Queue restore [ ts: {} pid: {} node: {} rescheduled: {} ] \n", + e.timestamp, + e.pid, + e.agent_id.handle, + e.operation == ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t& agents, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.unmap_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU; + + uint32_t _kind = 0; + uint32_t _operation = 0; + uint64_t _start_address = 0; + uint64_t _size = 0; + uint32_t _node_id = 0; + + const auto scan_count = std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &_start_address, + &_size, + &_node_id, + &_operation); + if(scan_count != 7) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 7, scan_count); + return {}; + } + + e.operation = static_cast(_operation); + e.start_address.value = page_to_bytes(_start_address); + e.end_address.value = page_to_bytes(_start_address + _size); + e.agent_id = get_node_agent_id(agents, _node_id); + + ROCP_INFO << fmt::format("Unmap from GPU [ ts: {} pid: {} start addr: 0x{:X} end addr: 0x{:X} " + "node: {} trigger {} ] \n", + e.timestamp, + e.pid, + e.start_address.value, + e.end_address.value, + e.agent_id.handle, + _operation); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template <> +kfd_event_record +parse_event(const agent_id_map_t&, std::string_view str) +{ + auto rec = kfd_event_record{}; + auto& e = rec.data.dropped_event; + + common::init_public_api_struct(e); + e.kind = ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS; + e.operation = ROCPROFILER_KFD_EVENT_DROPPED_EVENTS; + + uint32_t _kind = 0; + + const auto scan_count = std::sscanf(str.data(), + kfd_event_info::format_str.data(), + &_kind, + &e.timestamp, + &e.pid, + &e.count); + + if(scan_count != 4) + { + ROCP_CI_LOG(WARNING) << fmt::format( + "kfd: parse_event: Expected {}, scanned {}", 4, scan_count); + return {}; + } + + ROCP_TRACE << fmt::format( + "Dropped events [ ts: {} pid: {} dropped count: {} ] \n", e.timestamp, e.pid, e.count); + + rec.kind = e.kind; + rec.operation = e.operation; + return rec; +} + +template +kfd_event_record +parse_event(size_t event_id, + const agent_id_map_t& agents, + std::string_view strn, + std::index_sequence) +{ + if(OpInx == static_cast(event_id)) + { + return parse_event(agents, strn); + } + + if constexpr(sizeof...(OpInxs) > 0) + return parse_event(event_id, agents, strn, std::index_sequence{}); + + return kfd_event_record{}; +} + +size_t +to_rocprofiler_kfd_event_id(const std::string_view event_data) +{ + size_t kfd_id{std::numeric_limits::max()}; + const auto scan_count = std::sscanf(event_data.data(), "%lx ", &kfd_id); + + ROCP_CI_LOG_IF(WARNING, scan_count != 1) + << fmt::format("kfd: parse_event: Expected {}, scanned {}", 1, scan_count); + + auto event_id = + to_rocprofiler_kfd_event_id_func(kfd_id, std::make_index_sequence{}); + + ROCP_CI_LOG_IF(WARNING, event_id == std::numeric_limits::max()) + << fmt::format("Failed to parse KFD event ID {}. Parsed ID: {}, kfd_event_id ID: {}\n", + event_data[0], + kfd_id, + event_id); + + return event_id; +} + +} // namespace + +// For use in tests +kfd_event_record +parse_event(size_t event_id, const agent_id_map_t& agents, std::string_view strn) +{ + return parse_event(event_id, agents, strn, std::make_index_sequence{}); +} + +void +kfd_readlines(const std::string_view str, void(handler)(std::string_view)) +{ + const auto find_newline = [&](auto b) { return std::find(b, str.cend(), '\n'); }; + const auto* cursor = str.cbegin(); + + for(const auto* pos = find_newline(cursor); pos != str.cend(); pos = find_newline(cursor)) + { + size_t char_count = pos - cursor; + assert(char_count > 0); + std::string_view event_str{cursor, char_count}; + + ROCP_INFO << fmt::format("KFD event: [{}]", event_str); + handler(event_str); + + cursor = pos + 1; + } +} + +// Event capture and reporting +namespace +{ +constexpr auto kfd_ioctl_version = (1000 * KFD_IOCTL_MAJOR_VERSION) + KFD_IOCTL_MINOR_VERSION; +// Support has been added in kfdv >= 1.10+ +static_assert(kfd_ioctl_version >= 1010, "KFD SMI support missing in kfd_ioctl.h"); + +bool +kfd_context_kinds(const context::context* ctx) +{ + return ctx->is_tracing_one_of(ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT, + ROCPROFILER_BUFFER_TRACING_KFD_QUEUE); +} + +auto +get_contexts(rocprofiler_buffer_tracing_kind_t kind, int operation) +{ + auto active_contexts = context::get_active_contexts( + [](const auto* ctx) { return (ctx->buffered_tracer && kfd_context_kinds(ctx)); }); + auto operation_ctxs = context::context_array_t{}; + + for(const auto* itr : active_contexts) + { + // if the given domain + op is not enabled, skip this context + if(itr->buffered_tracer->domains(kind, operation)) + { + operation_ctxs.emplace_back(itr); + } + } + + return operation_ctxs; +} + +void poll_events(small_vector); + +} // namespace + +// KFD utils +namespace kfd +{ +using fd_flags_t = decltype(EFD_NONBLOCK); +using fd_t = decltype(pollfd::fd); +constexpr auto KFD_DEVICE_PATH{"/dev/kfd"}; + +SPECIALIZE_KFD_IOC_IOCTL(kfd_ioctl_get_version_args, AMDKFD_IOC_GET_VERSION); +SPECIALIZE_KFD_IOC_IOCTL(kfd_ioctl_smi_events_args, AMDKFD_IOC_SMI_EVENTS); + +namespace +{ +template +auto +ioctl(int kfd_fd, T& args) +{ + // from hsaKmt library (hsakmt/src/libhsakmt.c) + int exit_code{}; + + do + { + exit_code = ::ioctl(kfd_fd, IOC_event::value, static_cast(&args)); + } while(exit_code == -1 && (errno == EINTR || errno == EAGAIN)); + + if(exit_code == -1 && errno == EBADF) + { + /* In case pthread_atfork didn't catch it, this will + * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. + */ + CHECK(true && "KFD file descriptor not valid in this process\n"); + } + return exit_code; +} + +struct kfd_device_fd +{ + fd_t fd{-1}; + + kfd_device_fd() + { + fd = ::open(KFD_DEVICE_PATH, O_RDWR | O_CLOEXEC); + ROCP_FATAL_IF(fd == -1) << "Error opening KFD handle @ " << KFD_DEVICE_PATH; + } + + ~kfd_device_fd() + { + if(fd >= 0) close(fd); + } +}; + +const kfd_ioctl_get_version_args +get_version() +{ + static kfd_ioctl_get_version_args version = [&]() { + auto args = kfd_ioctl_get_version_args{0, 0}; + kfd_device_fd kfd_fd{}; + + if(ioctl(kfd_fd.fd, args) != -1) + ROCP_INFO << fmt::format("KFD v{}.{}", args.major_version, args.minor_version); + else + ROCP_ERROR << fmt::format("Could not determine KFD version"); + return args; + }(); + + return version; +} + +struct poll_kfd_t +{ + static constexpr auto DEFAULT_FLAGS{EFD_CLOEXEC}; + + struct gpu_fd_t + { + unsigned int node_id = 0; + fd_t fd = {}; + const rocprofiler_agent_t* agent = nullptr; + }; + + kfd_device_fd kfd_fd = {}; + pollfd thread_notify = {}; + std::thread bg_thread = {}; + bool active = {false}; + + poll_kfd_t() = default; + + poll_kfd_t(const small_vector& rprof_ev) + : kfd_fd{kfd_device_fd{}} + { + small_vector file_handles = {}; + + const auto kfd_flags = kfd_bitmask(rprof_ev, std::make_index_sequence{}); + + ROCP_INFO << fmt::format("Setting KFD flags to [0b{:b}] \n", kfd_flags); + + // Create fd for notifying thread when we want to wake it up, and an eventfd for any events + // to this thread + file_handles.emplace_back( + pollfd{.fd = eventfd(0, DEFAULT_FLAGS), .events = 0, .revents = 0}); + + fd_t thread_pipes[2]{}; + + [&]() { + const auto retcode = pipe2(&thread_pipes[0], DEFAULT_FLAGS); + const auto _err = errno; + ROCP_FATAL_IF(retcode != 0) + << fmt::format("Pipe creation for page-migration thread notify returned {} :: {}\n", + retcode, + strerror(_err)); + }(); + + thread_notify = pollfd{ + .fd = thread_pipes[1], + .events = POLLIN, + .revents = 0, + }; + + // add pipe listening end to fds to watch + file_handles.emplace_back(pollfd{thread_pipes[0], POLLIN, 0}); + + // get FD, start thread, and then enable events + for(const auto& agent : agent::get_agents()) + { + if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) + { + auto gpu_event_fd = get_node_fd(agent->gpu_id); + file_handles.emplace_back(pollfd{gpu_event_fd, POLLIN, 0}); + ROCP_INFO << fmt::format( + "GPU node {} with fd {} added\n", agent->gpu_id, gpu_event_fd); + } + } + + // Enable KFD masked events by writing flags to kfd fd + for(size_t i = 2; i < file_handles.size(); ++i) + { + auto& fd = file_handles[i]; + auto write_size = write(fd.fd, &kfd_flags, sizeof(kfd_flags)); + ROCP_INFO << fmt::format( + "Writing {} to GPU fd {} ({} bytes)\n", kfd_flags, fd.fd, write_size); + CHECK(write_size == sizeof(kfd_flags)); + } + + // start bg thread + internal_threading::notify_pre_internal_thread_create(ROCPROFILER_LIBRARY); + bg_thread = std::thread{poll_events, file_handles}; + internal_threading::notify_post_internal_thread_create(ROCPROFILER_LIBRARY); + + active = true; + } + + poll_kfd_t(const poll_kfd_t&) = delete; + poll_kfd_t& operator=(const poll_kfd_t&) = delete; + + poll_kfd_t(poll_kfd_t&&) noexcept = default; + poll_kfd_t& operator=(poll_kfd_t&&) noexcept = default; + + ~poll_kfd_t() + { + ROCP_INFO << fmt::format("Terminating poll_kfd\n"); + if(!active) return; + + // wake thread up + auto bytes_written{-1}; + do + { + bytes_written = write(thread_notify.fd, "E", 1); + } while(bytes_written == -1 && (errno == EINTR || errno == EAGAIN)); + + ROCP_INFO << fmt::format("Background thread signalled\n"); + + bg_thread.join(); + + close(thread_notify.fd); + } + + node_fd_t get_node_fd(int gpu_node_id) const + { + kfd_ioctl_smi_events_args args{}; + args.gpuid = gpu_node_id; + + if(auto ret = ioctl(kfd_fd.fd, args); ret == -1) + ROCP_ERROR << fmt::format( + "Could not get GPU node {} file descriptor (exit code: {})", gpu_node_id, ret); + return args.anon_fd; + } +}; +} // namespace +} // namespace kfd + +// for all contexts +struct config +{ +private: + kfd::poll_kfd_t kfd_handle{}; + + static inline config* _config{nullptr}; + + config(const small_vector& _event_ids) + : kfd_handle{_event_ids} + {} + +public: + static void init(const small_vector& event_ids) { _config = new config{event_ids}; } + + static void reset() + { + config* ptr = nullptr; + std::swap(ptr, _config); + delete ptr; + } + + static void reset_on_fork() { _config = nullptr; } +}; + +namespace +{ +rocprofiler_kfd_page_migrate_operation_t +get_page_migrate_record_op(const page_migrate_event_record_t& start, + const page_migrate_event_record_t& end) +{ + ROCP_ERROR_IF(end.operation != ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END) + << fmt::format("Expected end to be operation {}, got vs {}", + static_cast(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END), + static_cast(end.operation)); + + if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PREFETCH) + { + return ROCPROFILER_KFD_PAGE_MIGRATE_PREFETCH; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU) + { + return ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_GPU; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU) + { + return ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_TTM_EVICTION) + { + return ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION; + } + else + { + ROCP_ERROR << fmt::format("Invalid operation for pairing page_migrate (start {}, end {})", + static_cast(start.operation), + static_cast(end.operation)); + return ROCPROFILER_KFD_PAGE_MIGRATE_NONE; + } +} + +rocprofiler_kfd_page_fault_operation_t +get_page_fault_record_op(const page_fault_event_record_t& start, + const page_fault_event_record_t& end) +{ + if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT && + end.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED) + { + return ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_MIGRATED; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT && + end.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED) + { + return ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_UPDATED; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT && + end.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED) + { + return ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_MIGRATED; + } + else if(start.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT && + end.operation == ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED) + { + return ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_UPDATED; + } + else + { + ROCP_ERROR << fmt::format("Invalid operation for pairing page_fault (start {}, end {})", + static_cast(start.operation), + static_cast(end.operation)); + return ROCPROFILER_KFD_PAGE_FAULT_NONE; + } +} + +rocprofiler_kfd_queue_operation_t +get_queue_record_op(const queue_event_record_t& start, const queue_event_record_t& end) +{ + ROCP_ERROR_IF(end.operation != ROCPROFILER_KFD_EVENT_QUEUE_RESTORE && + end.operation != ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED) + << "Expected end operation for queue end event"; + + if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SVM) + return ROCPROFILER_KFD_QUEUE_EVICT_SVM; + else if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR) + return ROCPROFILER_KFD_QUEUE_EVICT_USERPTR; + else if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM) + return ROCPROFILER_KFD_QUEUE_EVICT_TTM; + else if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND) + return ROCPROFILER_KFD_QUEUE_EVICT_SUSPEND; + else if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_CHECKPOINT) + return ROCPROFILER_KFD_QUEUE_EVICT_CRIU_CHECKPOINT; + else if(start.operation == ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_RESTORE) + return ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE; + else + { + ROCP_ERROR << fmt::format("Invalid operation for pairing queue_suspend (start {}, end {})", + static_cast(start.operation), + static_cast(end.operation)); + return ROCPROFILER_KFD_QUEUE_NONE; + } +} + +template +struct kfd_event_hash_t; + +template +struct kfd_event_compare_t; + +template +uint64_t +bitshift(T&& val, uint32_t lshift) +{ + return static_cast(val) << lshift; +} + +template <> +struct kfd_event_hash_t +{ + std::size_t operator()(const page_migrate_event_record_t& data) const noexcept + { + return data.start_address.handle ^ bitshift(data.src_agent.handle, 32) ^ + bitshift(data.dst_agent.handle, 32); + } +}; + +template <> +struct kfd_event_hash_t +{ + std::size_t operator()(const page_fault_event_record_t& data) const noexcept + { + return data.address.handle ^ bitshift(data.agent_id.handle, 32); + } +}; + +template <> +struct kfd_event_hash_t +{ + std::size_t operator()(const queue_event_record_t& data) const noexcept + { + return bitshift(data.pid, 32) ^ data.agent_id.handle; + } +}; + +template <> +struct kfd_event_compare_t +{ + bool operator()(const page_migrate_event_record_t& lhs, + const page_migrate_event_record_t& rhs) const noexcept + { + return std::tie(lhs.start_address.handle, + lhs.end_address.handle, + lhs.src_agent.handle, + lhs.dst_agent.handle) == std::tie(rhs.start_address.handle, + rhs.end_address.handle, + rhs.src_agent.handle, + rhs.dst_agent.handle); + } +}; + +template <> +struct kfd_event_compare_t +{ + bool operator()(const page_fault_event_record_t& lhs, + const page_fault_event_record_t& rhs) const noexcept + { + return std::tie(lhs.address.handle, lhs.agent_id.handle) == + std::tie(rhs.address.handle, rhs.agent_id.handle); + } +}; + +template <> +struct kfd_event_compare_t +{ + bool operator()(const queue_event_record_t& lhs, const queue_event_record_t& rhs) const noexcept + { + return std::tie(lhs.pid, lhs.agent_id.handle) == std::tie(rhs.pid, rhs.agent_id.handle); + } +}; + +template +using events_unordered_set = std::unordered_set, kfd_event_compare_t>; + +template +bool +is_one_of(int op, std::index_sequence) +{ + return ((op == Ops) || ...); +} + +void +check_paired_events(buffer::instance* buffer, const kfd_event_record& rec) +{ + thread_local static events_unordered_set page_migrate_events{}; + thread_local static events_unordered_set page_fault_events{}; + thread_local static events_unordered_set queue_events{}; + + if(rec.kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE) + { + const auto& end = rec.data.page_migrate_event; + + bool is_start_event = is_one_of(rec.operation, page_migrate_start_ops_t{}); + bool is_end_event = is_one_of(rec.operation, page_migrate_end_ops_t{}); + + if(is_start_event) + { + // start event, insert + page_migrate_events.insert(rec.data.page_migrate_event); + return; + } + else if(is_end_event) + { + // end event: pair and emplace into buffer + auto ret = common::init_public_api_struct(page_migrate_record_t{}); + if(auto found = page_migrate_events.find(end); found != page_migrate_events.end()) + { + const auto& start = *found; + ret.kind = ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE; + ret.operation = get_page_migrate_record_op(start, end); + ret.start_timestamp = start.timestamp; + ret.end_timestamp = end.timestamp; + ASSERT_SAME_AND_COPY(pid); + ASSERT_SAME_AND_COPY(start_address.handle); + ASSERT_SAME_AND_COPY(end_address.handle); + ASSERT_SAME_AND_COPY(src_agent.handle); + ASSERT_SAME_AND_COPY(dst_agent.handle); + ret.prefetch_agent = start.prefetch_agent; + ret.preferred_agent = start.preferred_agent; + ASSERT_SAME_AND_COPY(error_code); + // Create a paired record and insert into buffer + CHECK_NOTNULL(buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, ret.kind, ret); + // Remove the item from the buffer + page_migrate_events.erase(found); + } + } + else + { + // This is not a valid operation + ROCP_ERROR << fmt::format( + "kfd_events: Invalid operation {} for paring page_migrate events", rec.operation); + } + } + else if(rec.kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT) + { + const auto& end = rec.data.page_fault_event; + + bool is_start_event = is_one_of(rec.operation, page_fault_start_ops_t{}); + bool is_end_event = is_one_of(rec.operation, page_fault_end_ops_t{}); + + if(is_start_event) + { + // start event, insert + page_fault_events.insert(rec.data.page_fault_event); + return; + } + else if(is_end_event) + { + // end event: pair and emplace into buffer + auto ret = common::init_public_api_struct(page_fault_record_t{}); + if(auto found = page_fault_events.find(end); found != page_fault_events.end()) + { + const auto& start = *found; + ret.kind = ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT; + ret.operation = get_page_fault_record_op(start, end); + ret.start_timestamp = start.timestamp; + ret.end_timestamp = end.timestamp; + ASSERT_SAME_AND_COPY(pid); + ASSERT_SAME_AND_COPY(agent_id.handle); + ASSERT_SAME_AND_COPY(address.handle); + // Create a paired record and insert into buffer + CHECK_NOTNULL(buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, ret.kind, ret); + // Remove the item from the buffer + page_fault_events.erase(found); + } + } + else + { + // This is not a valid operation + ROCP_ERROR << fmt::format( + "kfd_events: Invalid operation {} for paring page_fault events", rec.operation); + } + } + else if(rec.kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE) + { + const auto& end = rec.data.queue_event; + + bool is_start_event = is_one_of(rec.operation, queue_evict_ops_t{}); + bool is_end_event = is_one_of(rec.operation, queue_restore_ops_t{}); + + if(is_start_event) + { + // start event, insert + queue_events.insert(rec.data.queue_event); + return; + } + else if(is_end_event) + { + // end event: pair and emplace into buffer + auto ret = common::init_public_api_struct(queue_record_t{}); + if(auto found = queue_events.find(end); found != queue_events.end()) + { + const auto& start = *found; + ret.kind = ROCPROFILER_BUFFER_TRACING_KFD_QUEUE; + ret.operation = get_queue_record_op(start, end); + ret.start_timestamp = start.timestamp; + ret.end_timestamp = end.timestamp; + ASSERT_SAME_AND_COPY(pid); + ASSERT_SAME_AND_COPY(agent_id.handle); + // Create a paired record and insert into buffer + CHECK_NOTNULL(buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, ret.kind, ret); + // Remove the item from the buffer + queue_events.erase(found); + } + } + else if(rec.operation == ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED) + { + // If event is ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED we should not attempt to + // pair it. It is an instantaneous event. + // It is handled in handle_reporting -> emplace_buffer_record. + } + else + { + // Else, it is an error. + ROCP_ERROR << fmt::format("kfd_events: Invalid operation {} for paring events", + rec.operation); + } + } +} + +void +emplace_buffer_record(buffer::instance* buffer, const kfd_event_record& rec) +{ + switch(rec.kind) + { + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE: + { + CHECK_NOTNULL(buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, rec.kind, rec.data.page_migrate_event); + break; + } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT: + { + CHECK_NOTNULL(buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, rec.kind, rec.data.page_fault_event); + break; + } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE: + { + CHECK_NOTNULL(buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, rec.kind, rec.data.queue_event); + break; + } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU: + { + CHECK_NOTNULL(buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, rec.kind, rec.data.unmap_event); + break; + } + case ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS: + { + CHECK_NOTNULL(buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, rec.kind, rec.data.dropped_event); + break; + } + default: + { + ROCP_ERROR << fmt::format("Invalid Kind {} for record", static_cast(rec.kind)); + } + } +} + +void +handle_reporting(std::string_view event_data) +{ + // We can check the operation only after parsing the event + const auto kfd_event = to_rocprofiler_kfd_event_id(event_data); + auto event = parse_event( + kfd_event, get_node_map(), event_data, std::make_index_sequence{}); + + ROCP_ERROR_IF(event.kind < ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE || + event.kind > ROCPROFILER_BUFFER_TRACING_KFD_QUEUE) + << fmt::format("kfd_events: Invalid record kind {}", static_cast(event.kind)); + + ROCP_ERROR_IF(event.operation == -1) + << fmt::format("kfd_events: Invalid record operation: ({}, {})", + static_cast(event.kind), + event.operation); + + auto buffered_contexts = get_contexts(event.kind, event.operation); + if(buffered_contexts.empty()) return; + + for(const auto& itr : buffered_contexts) + { + auto* buffer = buffer::get_buffer(itr->buffered_tracer->buffer_data.at(event.kind)); + + check_paired_events(buffer, event); + emplace_buffer_record(buffer, event); + } +} + +void +poll_events(small_vector file_handles) +{ + // storage to write records to, 1MB + constexpr size_t PREALLOCATE_ELEMENT_COUNT{1024 * 128}; + std::string scratch_buffer(PREALLOCATE_ELEMENT_COUNT, '\0'); + auto& exitfd = file_handles[1]; + + // Wait or spin on events. + // 0 -> return immediately even if no events + // -1 -> wait indefinitely + + pthread_setname_np(pthread_self(), "bg:poll-kfd"); + + for(auto& fd : file_handles) + { + ROCP_INFO << fmt::format( + "Handle = {}, events = {}, revents = {}\n", fd.fd, fd.events, fd.revents); + } + + while(true) + { + auto poll_ret = poll(file_handles.data(), file_handles.size(), -1); + + if(poll_ret == -1) + { + ROCP_CI_LOG(WARNING) + << "Background thread file descriptors for page-migration are invalid"; + return; + } + + if((exitfd.revents & POLLIN) != 0) + { + for(const auto& f : file_handles) + { + close(f.fd); + } + ROCP_INFO << "Terminating background thread\n"; + return; + } + + using namespace std::chrono_literals; + + // 0 and 1 are for generic and pipe-notify handles + for(size_t i = 2; i < file_handles.size(); ++i) + { + auto& fd = file_handles[i]; + + // We have data to read, perhaps multiple events + if((fd.revents & POLLIN) != 0) + { + size_t status_size = read(fd.fd, scratch_buffer.data(), scratch_buffer.size()); + auto event_strings = std::string_view{scratch_buffer.data(), status_size}; + kfd_readlines(event_strings, handle_reporting); + } + fd.revents = 0; + } + } +} + +template +const char* +name_by_id(const uint32_t id, std::index_sequence) +{ + if(Idx == id) return kfd_operation_info::name; + if constexpr(sizeof...(IdxTail) > 0) + return name_by_id(id, std::index_sequence{}); + else + return nullptr; +} + +template +void +get_ids(std::vector& _id_list, std::index_sequence) +{ + auto _emplace = [](auto& _vec, uint32_t _v) { + if(_v < static_cast(kfd_kind_info::last)) _vec.emplace_back(_v); + }; + + (_emplace(_id_list, kfd_operation_info::operation), ...); +} + +bool +context_filter(const context::context* ctx) +{ + return (ctx->buffered_tracer && kfd_context_kinds(ctx)); +} + +template +rocprofiler_status_t init(std::index_sequence) +{ + static const small_vector event_ids{Inxs...}; + // Check if version is more than 1.11 + auto ver = kfd::get_version(); + if(ver.major_version * 1000 + ver.minor_version > 1011) + { + if(!context::get_registered_contexts(context_filter).empty()) + { + config::init(event_ids); + } + return ROCPROFILER_STATUS_SUCCESS; + } + else + { + // Add a buffer record with this info + ROCP_ERROR << fmt::format( + "KFD does not support SVM event reporting in v{}.{} (requires v1.11)", + ver.major_version, + ver.minor_version); + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } +} + +using kfd_buffer_tracing_ids_t = + std::index_sequence; + +template +const char* +name_by_id(uint32_t kind, uint32_t op, std::index_sequence) +{ + if(kind == Kind) + { + return name_by_id(op, std::make_index_sequence::last>{}); + } + else if constexpr(sizeof...(Kinds) > 0) + return name_by_id(kind, op, std::index_sequence{}); + + ROCP_CI_LOG(WARNING) << fmt::format("KFD events name_by_id: Unknown Kind {} {}", kind, op); + return "KFD events: Unknown Kind"; +} + +template +std::vector +get_ids(int kind, std::index_sequence) +{ + if(kind == Kind) + { + auto _data = std::vector{}; + _data.reserve(kfd_kind_info::last); + get_ids(_data, std::make_index_sequence::last>{}); + return _data; + } + else if constexpr(sizeof...(Kinds) > 0) + return get_ids(kind, std::index_sequence{}); + + ROCP_CI_LOG(WARNING) << fmt::format("KFD events get_ids: Unknown Kind {}", kind); +} + +} // namespace + +} // namespace kfd +} // namespace rocprofiler + +namespace rocprofiler +{ +namespace kfd +{ +rocprofiler_status_t +init() +{ + // Prevent re-init for different buffer op kinds + static bool init_done{false}; + static rocprofiler_status_t retcode{ROCPROFILER_STATUS_ERROR}; + + if(!init_done) + { + pthread_atfork(nullptr, nullptr, []() { + // null out child's copy on fork and reinitialize + // otherwise all children wait on the same thread to join + config::reset_on_fork(); + init(std::make_index_sequence{}); + }); + + retcode = init(std::make_index_sequence{}); + init_done = true; + } + + return retcode; +} + +void +finalize() +{ + config::reset(); +} + +const char* +name_by_id(uint32_t kind, uint32_t id) +{ + return name_by_id(kind, id, kfd_buffer_tracing_ids_t{}); +} + +std::vector +get_ids(uint32_t kind) +{ + return get_ids(kind, kfd_buffer_tracing_ids_t{}); +} +} // namespace kfd +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.def.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.def.cpp new file mode 100644 index 0000000000..828d4a52ba --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.def.cpp @@ -0,0 +1,87 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#if defined(ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL) && \ + ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL == 1 + +// clang-format off +// Mapping for kfd_event_id (utils.h) | ID from kfd_ioctl.h | buffer_tracing_kind_t | format string +SPECIALIZE_KFD_EVENT_INFO(PAGE_MIGRATE_START, MIGRATE_START, EVENT_PAGE_MIGRATE, "%x %ld -%d @%lx(%lx) %x->%x %x:%x %d\n"); +SPECIALIZE_KFD_EVENT_INFO(PAGE_MIGRATE_END, MIGRATE_END, EVENT_PAGE_MIGRATE, "%x %ld -%d @%lx(%lx) %x->%x %d %d\n" ); +SPECIALIZE_KFD_EVENT_INFO(PAGE_FAULT_START, PAGE_FAULT_START, EVENT_PAGE_FAULT, "%x %ld -%d @%lx(%x) %c\n" ); +SPECIALIZE_KFD_EVENT_INFO(PAGE_FAULT_END, PAGE_FAULT_END, EVENT_PAGE_FAULT, "%x %ld -%d @%lx(%x) %c\n" ); +SPECIALIZE_KFD_EVENT_INFO(QUEUE_EVICTION, QUEUE_EVICTION, EVENT_QUEUE, "%x %ld -%d %x %d\n" ); +SPECIALIZE_KFD_EVENT_INFO(QUEUE_RESTORE, QUEUE_RESTORE, EVENT_QUEUE, "%x %ld -%d %x %c\n" ); +SPECIALIZE_KFD_EVENT_INFO(UNMAP_FROM_GPU, UNMAP_FROM_GPU, EVENT_UNMAP_FROM_GPU, "%x %ld -%d @%lx(%lx) %x %d\n" ); +SPECIALIZE_KFD_EVENT_INFO(DROPPED_EVENT, DROPPED_EVENT, EVENT_DROPPED_EVENTS, "%x %ld -%d %d\n" ); + +SPECIALIZE_KFD_KIND_INFO(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_LAST); +SPECIALIZE_KFD_KIND_INFO(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_LAST); +SPECIALIZE_KFD_KIND_INFO(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_LAST); +SPECIALIZE_KFD_KIND_INFO(EVENT_UNMAP_FROM_GPU, ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_LAST); +SPECIALIZE_KFD_KIND_INFO(EVENT_DROPPED_EVENTS, ROCPROFILER_KFD_EVENT_DROPPED_EVENTS_LAST); +SPECIALIZE_KFD_KIND_INFO(PAGE_MIGRATE, ROCPROFILER_KFD_PAGE_MIGRATE_LAST); +SPECIALIZE_KFD_KIND_INFO(PAGE_FAULT, ROCPROFILER_KFD_PAGE_FAULT_LAST); +SPECIALIZE_KFD_KIND_INFO(QUEUE, ROCPROFILER_KFD_QUEUE_LAST); + +// Events +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PREFETCH); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_TTM_EVICTION); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_MIGRATE, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_START); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_WRITE_FAULT); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED); +SPECIALIZE_KFD_KIND_NAME(EVENT_PAGE_FAULT, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SVM); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_CHECKPOINT); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_CRIU_RESTORE); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED); +SPECIALIZE_KFD_KIND_NAME(EVENT_QUEUE, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE); +SPECIALIZE_KFD_KIND_NAME(EVENT_UNMAP_FROM_GPU, ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY); +SPECIALIZE_KFD_KIND_NAME(EVENT_UNMAP_FROM_GPU, ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_MMU_NOTIFY_MIGRATE); +SPECIALIZE_KFD_KIND_NAME(EVENT_UNMAP_FROM_GPU, ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_UNMAP_FROM_CPU); +SPECIALIZE_KFD_KIND_NAME(EVENT_DROPPED_EVENTS, ROCPROFILER_KFD_EVENT_DROPPED_EVENTS); + +// Paired records +SPECIALIZE_KFD_KIND_NAME(PAGE_MIGRATE, ROCPROFILER_KFD_PAGE_MIGRATE_PREFETCH); +SPECIALIZE_KFD_KIND_NAME(PAGE_MIGRATE, ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_GPU); +SPECIALIZE_KFD_KIND_NAME(PAGE_MIGRATE, ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU); +SPECIALIZE_KFD_KIND_NAME(PAGE_MIGRATE, ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION); +SPECIALIZE_KFD_KIND_NAME(PAGE_FAULT, ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_MIGRATED); +SPECIALIZE_KFD_KIND_NAME(PAGE_FAULT, ROCPROFILER_KFD_PAGE_FAULT_READ_FAULT_UPDATED); +SPECIALIZE_KFD_KIND_NAME(PAGE_FAULT, ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_MIGRATED); +SPECIALIZE_KFD_KIND_NAME(PAGE_FAULT, ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_UPDATED); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_SVM); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_USERPTR); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_TTM); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_SUSPEND); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_CRIU_CHECKPOINT); +SPECIALIZE_KFD_KIND_NAME(QUEUE, ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE); +// clang-format on + +#endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.hpp similarity index 86% rename from projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.hpp rename to projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.hpp index 946e77f611..79877af1fa 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/kfd.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,22 +24,22 @@ #include "lib/common/container/small_vector.hpp" -#include +#include namespace rocprofiler { -namespace page_migration +namespace kfd { const char* -name_by_id(uint32_t id); +name_by_id(uint32_t kind, uint32_t id); std::vector -get_ids(); +get_ids(uint32_t kind); rocprofiler_status_t init(); void finalize(); -} // namespace page_migration +} // namespace kfd } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/CMakeLists.txt new file mode 100644 index 0000000000..8918a43247 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/CMakeLists.txt @@ -0,0 +1,48 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +rocprofiler_deactivate_clang_tidy() + +include(GoogleTest) + +set(ROCPROFILER_LIB_KFD_EVENTS_TEST_SOURCES parser.cpp) + +add_executable(kfd-events-parser) + +target_sources(kfd-events-parser PRIVATE ${ROCPROFILER_LIB_KFD_EVENTS_TEST_SOURCES}) + +target_link_libraries( + kfd-events-parser + PRIVATE rocprofiler-sdk::rocprofiler-sdk-common-library + rocprofiler-sdk::rocprofiler-sdk-static-library GTest::gtest + GTest::gtest_main) + +gtest_add_tests( + TARGET kfd-events-parser + SOURCES ${ROCPROFILER_LIB_COUNTER_TEST_SOURCES} + TEST_LIST kfd-events-parsers_TESTS + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties( + ${kfd-events-parsers_TESTS} + PROPERTIES TIMEOUT 45 LABELS "unittests;kfd-events" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/parser.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/parser.cpp new file mode 100644 index 0000000000..0516513fc6 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/tests/parser.cpp @@ -0,0 +1,274 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include + +#include "lib/rocprofiler-sdk/kfd/defines.hpp" +#include "lib/rocprofiler-sdk/kfd/utils.hpp" +#include "rocprofiler-sdk/kfd/kfd_id.h" + +#include +#include +#include + +namespace rocprofiler +{ +namespace kfd +{ +#define ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL 1 +#include "lib/rocprofiler-sdk/kfd/kfd.def.cpp" +#undef ROCPROFILER_LIB_ROCPROFILER_SDK_KFD_CPP_IMPL + +} // namespace kfd +} // namespace rocprofiler + +namespace +{ +using namespace rocprofiler::kfd; + +// Internally, get_node_map is called for a given gpu ID +// We are testing the parser, so use an identity mapping +const agent_id_map_t agent_map = { + {0, {0}}, + {0x1ea0, {0x1ea0}}, + {0xfc7d, {0xfc7d}}, + {0xad5c, {0xad5c}}, +}; + +kfd_event_record +parse(std::string_view arg) +{ + size_t kfd_id = std::numeric_limits::max(); + + const auto scan_count = std::sscanf(arg.data(), "%lx ", &kfd_id); + + EXPECT_EQ(scan_count, 1); + EXPECT_GE(kfd_id, KFD_SMI_EVENT_MIGRATE_START); + EXPECT_LE(kfd_id, KFD_SMI_EVENT_DROPPED_EVENT); + + auto event_id = to_rocprofiler_kfd_event_id_func(static_cast(kfd_id), + std::make_index_sequence{}); + + EXPECT_GE(event_id, KFD_EVENT_PAGE_MIGRATE_START); + EXPECT_LE(event_id, KFD_EVENT_DROPPED_EVENT); + + return parse_event(event_id, agent_map, arg); +} + +} // namespace + +/* The following tests ensure that we can correctly parse and convert an event to the structured + equivalent +*/ + +TEST(rocprofiler_lib, parse_kfd_event_page_migrate_start) +{ + using namespace rocprofiler::kfd; + + const auto event = parse("5 14601738586508 -152990 @7f0f15200(2c02) 0->1ea0 1ea0:0 1"); + const auto migrate_start = event.data.page_migrate_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU); + EXPECT_EQ(migrate_start.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE); + EXPECT_EQ(migrate_start.operation, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_GPU); + EXPECT_EQ(migrate_start.timestamp, 14601738586508); + EXPECT_EQ(migrate_start.pid, 152990); + EXPECT_EQ(migrate_start.start_address.handle, 0x7f0f15200 << 12); + EXPECT_EQ(migrate_start.end_address.handle, (0x7f0f15200 + 0x2c02) << 12); + EXPECT_EQ(migrate_start.src_agent.handle, 0); + EXPECT_EQ(migrate_start.dst_agent.handle, 0x1ea0); + EXPECT_EQ(migrate_start.prefetch_agent.handle, 0x1ea0); + EXPECT_EQ(migrate_start.preferred_agent.handle, 0); + EXPECT_EQ(migrate_start.error_code, 0); +} + +TEST(rocprofiler_lib, parse_kfd_event_page_migrate_end) +{ + using namespace rocprofiler::kfd; + + const auto event = parse("6 15202910515905 -152990 @7f0f15200(2c02) 0->1ea0 1 75"); + const auto migrate_end = event.data.page_migrate_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END); + EXPECT_EQ(migrate_end.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE); + EXPECT_EQ(migrate_end.operation, ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_END); + EXPECT_EQ(migrate_end.timestamp, 15202910515905); + EXPECT_EQ(migrate_end.pid, 152990); + EXPECT_EQ(migrate_end.start_address.handle, 0x7f0f15200 << 12); + EXPECT_EQ(migrate_end.end_address.handle, (0x7f0f15200 + 0x2c02) << 12); + EXPECT_EQ(migrate_end.src_agent.handle, 0); + EXPECT_EQ(migrate_end.dst_agent.handle, 0x1ea0); + EXPECT_EQ(migrate_end.prefetch_agent.handle, 0); // Not generated for end event + EXPECT_EQ(migrate_end.preferred_agent.handle, 0); // Not generated for end event + EXPECT_EQ(migrate_end.error_code, 75); +} + +TEST(rocprofiler_lib, parse_kfd_event_page_fault_start) +{ + using namespace rocprofiler::kfd; + + const auto event = parse("7 40507688414784 -156893 @7fa608127(ad5c) R"); + const auto page_fault = event.data.page_fault_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT); + EXPECT_EQ(page_fault.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(page_fault.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_START_READ_FAULT); + EXPECT_EQ(page_fault.timestamp, 40507688414784); + EXPECT_EQ(page_fault.pid, 156893); + EXPECT_EQ(page_fault.agent_id.handle, 0xad5c); + EXPECT_EQ(page_fault.address.handle, 0x7fa608127 << 12); +} + +TEST(rocprofiler_lib, parse_kfd_event_page_fault_end) +{ + using namespace rocprofiler::kfd; + + { + const auto event = parse("8 40507688514376 -156893 @7fa608127(ad5c) U"); + const auto page_fault = event.data.page_fault_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED); + EXPECT_EQ(page_fault.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(page_fault.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_UPDATED); + EXPECT_EQ(page_fault.timestamp, 40507688514376); + EXPECT_EQ(page_fault.pid, 156893); + EXPECT_EQ(page_fault.agent_id.handle, 0xad5c); + EXPECT_EQ(page_fault.address.handle, 0x7fa608127 << 12); + } + + { + const auto event = parse("8 40507688516386 -156898 @7fa698127(ad5c) M"); + const auto page_fault = event.data.page_fault_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED); + EXPECT_EQ(page_fault.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT); + EXPECT_EQ(page_fault.operation, ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED); + EXPECT_EQ(page_fault.timestamp, 40507688516386); + EXPECT_EQ(page_fault.pid, 156898); + EXPECT_EQ(page_fault.agent_id.handle, 0xad5c); + EXPECT_EQ(page_fault.address.handle, 0x7fa698127 << 12); + } +} + +TEST(rocprofiler_lib, parse_kfd_event_queue_eviction) +{ + using namespace rocprofiler::kfd; + + { + const auto event = parse("9 38086928279363 -125752 1ea0 1"); + + const auto queue_event = event.data.queue_event; + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR); + EXPECT_EQ(queue_event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(queue_event.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_USERPTR); + EXPECT_EQ(queue_event.timestamp, 38086928279363); + EXPECT_EQ(queue_event.pid, 125752); + EXPECT_EQ(queue_event.agent_id.handle, 0x1ea0); + } + { + const auto event = parse("9 38086928279363 -125752 1ea0 3"); + + const auto queue_event = event.data.queue_event; + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND); + EXPECT_EQ(queue_event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(queue_event.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND); + EXPECT_EQ(queue_event.timestamp, 38086928279363); + EXPECT_EQ(queue_event.pid, 125752); + EXPECT_EQ(queue_event.agent_id.handle, 0x1ea0); + } +} + +TEST(rocprofiler_lib, parse_kfd_event_queue_restore) +{ + using namespace rocprofiler::kfd; + { + const auto event = parse("a 38652512365099 -131057 fc7d"); + + const auto queue_event = event.data.queue_event; + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE); + EXPECT_EQ(queue_event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(queue_event.operation, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE); + EXPECT_EQ(queue_event.timestamp, 38652512365099); + EXPECT_EQ(queue_event.pid, 131057); + EXPECT_EQ(queue_event.agent_id.handle, 0xfc7d); + } + + { + const auto event = parse("a 40082605896929 -148516 fc7d R"); + + const auto queue_event = event.data.queue_event; + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED); + EXPECT_EQ(queue_event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE); + EXPECT_EQ(queue_event.operation, ROCPROFILER_KFD_EVENT_QUEUE_RESTORE_RESCHEDULED); + EXPECT_EQ(queue_event.timestamp, 40082605896929); + EXPECT_EQ(queue_event.pid, 148516); + EXPECT_EQ(queue_event.agent_id.handle, 0xfc7d); + } +} + +TEST(rocprofiler_lib, parse_kfd_event_unmap_from_gpu) +{ + using namespace rocprofiler::kfd; + + const auto event = parse("b 15203214457186 -152990 @7f93ac200(200) 1ea0 2"); + const auto unmap = event.data.unmap_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM); + EXPECT_EQ(unmap.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU); + EXPECT_EQ(unmap.operation, ROCPROFILER_KFD_EVENT_QUEUE_EVICT_TTM); + EXPECT_EQ(unmap.timestamp, 15203214457186); + EXPECT_EQ(unmap.pid, 152990); + EXPECT_EQ(unmap.start_address.handle, 0x7f93ac200 << 12); + EXPECT_EQ(unmap.end_address.handle, (0x7f93ac200 + 0x200) << 12); + EXPECT_EQ(unmap.agent_id.handle, 0x1ea0); +} + +TEST(rocprofiler_lib, parse_kfd_event_dropped_event) +{ + using namespace rocprofiler::kfd; + + const auto event = parse("e 18203445217186 -323990 5"); + const auto dropped = event.data.dropped_event; + + EXPECT_EQ(event.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS); + EXPECT_EQ(event.operation, ROCPROFILER_KFD_EVENT_DROPPED_EVENTS); + EXPECT_EQ(dropped.kind, ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS); + EXPECT_EQ(dropped.operation, ROCPROFILER_KFD_EVENT_DROPPED_EVENTS); + EXPECT_EQ(dropped.timestamp, 18203445217186); + EXPECT_EQ(dropped.pid, 323990); + EXPECT_EQ(dropped.count, 5); +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/utils.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/utils.hpp similarity index 58% rename from projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/utils.hpp rename to projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/utils.hpp index 0b7c98be39..92c97c0a1d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/utils.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kfd/utils.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -30,94 +30,57 @@ #include #include +#include #include #include #include namespace rocprofiler { -namespace page_migration +namespace kfd { -/* serves as an overview of what events we capture and report - -struct event_page_fault_start_t +enum rocprofiler_kfd_event_id { - int kind; - uint64_t timestamp; - int pid; - int node_id; - uint64_t address; - fault_t fault; + KFD_EVENT_NONE = -1, + KFD_EVENT_PAGE_MIGRATE_START, + KFD_EVENT_PAGE_MIGRATE_END, + KFD_EVENT_PAGE_FAULT_START, + KFD_EVENT_PAGE_FAULT_END, + KFD_EVENT_QUEUE_EVICTION, + KFD_EVENT_QUEUE_RESTORE, + KFD_EVENT_UNMAP_FROM_GPU, + KFD_EVENT_DROPPED_EVENT, + KFD_EVENT_LAST, }; -struct event_page_fault_end_t +struct kfd_event_record { - int kind; - uint64_t timestamp; - uint32_t pid; - int node_id; - uint64_t address; - bool migrated; + rocprofiler_buffer_tracing_kind_t kind{ROCPROFILER_BUFFER_TRACING_NONE}; + int operation{-1}; + union + { + rocprofiler_buffer_tracing_kfd_event_page_migrate_record_t page_migrate_event; + rocprofiler_buffer_tracing_kfd_event_page_fault_record_t page_fault_event; + rocprofiler_buffer_tracing_kfd_event_queue_record_t queue_event; + rocprofiler_buffer_tracing_kfd_event_unmap_from_gpu_record_t unmap_event; + rocprofiler_buffer_tracing_kfd_event_dropped_events_record_t dropped_event; + + rocprofiler_buffer_tracing_kfd_page_migrate_record_t page_migrate_record; + rocprofiler_buffer_tracing_kfd_page_fault_record_t page_fault_record; + rocprofiler_buffer_tracing_kfd_queue_record_t queue_record; + } data; }; -struct event_migrate_start_t -{ - int kind; - uint64_t timestamp; - uint32_t pid; - uint64_t start; - uint64_t end_offset; - uint32_t from; - uint32_t to; - uint32_t prefetch_node; // last prefetch location, 0 for CPU, or GPU id - uint32_t preferred_node; // perferred location, 0 for CPU, or GPU id - uint32_t trigger; -}; +using agent_id_map_t = std::unordered_map; -struct event_migrate_end_t -{ - int kind; - uint64_t timestamp; - uint32_t pid; - uint64_t start; - uint64_t end_offset; - uint32_t from; - uint32_t to; - uint32_t trigger; -}; +template +struct kfd_event_info; -struct event_queue_eviction_t -{ - int kind; - uint64_t timestamp; - uint32_t pid; - int node_id; - uint32_t trigger; -}; +template +struct kfd_kind_info; -struct event_queue_restore_t -{ - int kind; - uint64_t timestamp; - uint32_t pid; - int node_id; - bool rescheduled; -}; - -struct event_unmap_from_gpu_t -{ - int kind; - uint64_t timestamp; - uint32_t pid; - uint64_t address; - uint64_t size; - int node_id; - uint32_t trigger; -}; -*/ - -template -struct page_migration_info; +template +struct kfd_operation_info; using namespace rocprofiler::common; @@ -145,14 +108,14 @@ constexpr size_t bitmask(std::index_sequence) template constexpr size_t kfd_bitmask(std::index_sequence) { - return (page_migration_info::kfd_bitmask | ...); + return (kfd_event_info::kfd_bitmask | ...); } template constexpr size_t kfd_bitmask_impl(size_t rocprof_op, std::index_sequence) { - if(rocprof_op == OpInx) return page_migration_info::kfd_bitmask; + if(rocprof_op == OpInx) return kfd_event_info::kfd_bitmask; if constexpr(sizeof...(OpInxs) > 0) return kfd_bitmask_impl(rocprof_op, std::index_sequence{}); else @@ -174,24 +137,25 @@ kfd_bitmask(const container::small_vector& rocprof_event_ids, template constexpr size_t -kfd_to_rocprof_op(size_t kfd_id, std::index_sequence) +to_rocprofiler_kfd_event_id_func(size_t kfd_id, std::index_sequence) { - if(kfd_id == page_migration_info::kfd_operation) return OpInx; + if(kfd_id == kfd_event_info::kfd_id) return OpInx; if constexpr(sizeof...(OpInxs) > 0) - return kfd_to_rocprof_op(kfd_id, std::index_sequence{}); + return to_rocprofiler_kfd_event_id_func(kfd_id, std::index_sequence{}); else - return 0; + return std::numeric_limits::max(); } +kfd_event_record +parse_event(size_t event_id, const agent_id_map_t& agents, std::string_view strn); + size_t get_rocprof_op(const std::string_view event_data); void kfd_readlines(const std::string_view str, void(handler)(std::string_view)); -using rocprof_buffer_op_t = rocprofiler_page_migration_operation_t; - using node_fd_t = int; -} // namespace page_migration +} // namespace kfd } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/CMakeLists.txt deleted file mode 100644 index a789aeac64..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# -set(ROCPROFILER_LIB_UVM_SOURCES abi.cpp page_migration.cpp) -set(ROCPROFILER_LIB_UVM_HEADERS defines.hpp page_migration.hpp utils.hpp) - -target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_UVM_SOURCES} - ${ROCPROFILER_LIB_UVM_HEADERS}) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/abi.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/abi.cpp deleted file mode 100644 index e038b15196..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/abi.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include - -#include "lib/common/container/small_vector.hpp" -#include "lib/common/defines.hpp" -#include "lib/common/mpl.hpp" -#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" -#include "lib/rocprofiler-sdk/page_migration/utils.hpp" - -#define ASSERT_SAME(A, B) static_assert(static_cast(A) == static_cast(B)) - -#define ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL 1 -#include "lib/rocprofiler-sdk/page_migration/page_migration.def.cpp" -#undef ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL - -namespace rocprofiler -{ -namespace page_migration -{ -using namespace rocprofiler::page_migration; -using namespace rocprofiler::common::container; -using rocprofiler_page_migration_seq_t = std::make_index_sequence; - -static_assert(KFD_SMI_EVENT_NONE == 0); -static_assert(KFD_SMI_EVENT_MIGRATE_START == 5); -static_assert(KFD_SMI_EVENT_MIGRATE_END == 6); -static_assert(KFD_SMI_EVENT_PAGE_FAULT_START == 7); -static_assert(KFD_SMI_EVENT_PAGE_FAULT_END == 8); -static_assert(KFD_SMI_EVENT_QUEUE_EVICTION == 9); -static_assert(KFD_SMI_EVENT_QUEUE_RESTORE == 10); -static_assert(KFD_SMI_EVENT_UNMAP_FROM_GPU == 11); -static_assert(KFD_SMI_EVENT_DROPPED_EVENT == 12); -static_assert(KFD_SMI_EVENT_ALL_PROCESS == 64); - -// Update page_migration.def.cpp with event mappings -// Update page_migration.cpp to parse and report new event -static_assert(ROCPROFILER_PAGE_MIGRATION_LAST == 9, - "New event added, update KFD to ROCPROFILER mappings"); - -// clang-format off -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_GPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_CPU, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION, KFD_MIGRATE_TRIGGER_TTM_EVICTION ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM, KFD_QUEUE_EVICTION_TRIGGER_SVM ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_USERPTR, KFD_QUEUE_EVICTION_TRIGGER_USERPTR ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_TTM, KFD_QUEUE_EVICTION_TRIGGER_TTM ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SUSPEND, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_CHECKPOINT, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE, KFD_QUEUE_EVICTION_CRIU_RESTORE ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY, KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY ); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY_MIGRATE, KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE); -ASSERT_SAME(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU, KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU ); -// clang-format on - -static_assert(kfd_bitmask(std::index_sequence()) == - (KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_START) | - KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END) | - KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_UNMAP_FROM_GPU))); - -static_assert((page_migration_info::kfd_bitmask | - page_migration_info::kfd_bitmask | - page_migration_info::kfd_bitmask) == - (KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END) | - KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_QUEUE_EVICTION) | - KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_UNMAP_FROM_GPU))); - -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_MIGRATE_START, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_START); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_MIGRATE_END, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_PAGE_FAULT_START, - rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_PAGE_FAULT_END, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_END); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_QUEUE_EVICTION, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_QUEUE_RESTORE, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_QUEUE_RESTORE); -static_assert(kfd_to_rocprof_op(KFD_SMI_EVENT_UNMAP_FROM_GPU, rocprofiler_page_migration_seq_t{}) == - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU); - -} // namespace page_migration -} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp deleted file mode 100644 index f1a1d61a2f..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.cpp +++ /dev/null @@ -1,937 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" -#include "lib/common/logging.hpp" -#include "lib/common/mpl.hpp" -#include "lib/common/static_object.hpp" -#include "lib/common/utility.hpp" -#include "lib/rocprofiler-sdk/agent.hpp" -#include "lib/rocprofiler-sdk/buffer.hpp" -#include "lib/rocprofiler-sdk/context/context.hpp" -#include "lib/rocprofiler-sdk/details/kfd_ioctl.h" -#include "lib/rocprofiler-sdk/internal_threading.hpp" -#include "lib/rocprofiler-sdk/page_migration/utils.hpp" - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL 1 -#include "page_migration.def.cpp" -#undef ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL - -namespace rocprofiler -{ -namespace page_migration -{ -template -using small_vector = common::container::small_vector; - -using context_t = context::context; -using context_array_t = common::container::small_vector; - -template -struct page_migration_info; - -template -struct kfd_event_info; - -template -struct page_migration_enum_info; - -template -struct page_migration_bounds; - -// Parsing and utilities -namespace -{ -constexpr auto -page_to_bytes(size_t val) -{ - // each page is 4KB = 4096 bytes - return val << 12; -} - -template -page_migration_record_t parse_event(std::string_view) -{ - ROCP_FATAL_IF(false) << page_migration_info::format_str; - return {}; -} - -auto -get_node_agent_id(uint32_t _node_id) -{ - using agent_id_map_t = std::unordered_map; - static auto*& _data = static_object::construct([]() { - auto _v = std::unordered_map{}; - for(const auto* agent : agent::get_agents()) - _v.emplace(agent->gpu_id, agent->id); - return _v; - }()); - - CHECK(_data != nullptr); - ROCP_FATAL_IF(_data->count(_node_id) == 0) << "page_migration: unknown node id: " << _node_id; - return _data->at(_node_id); -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.page_fault_start; - uint32_t kind{}; - uint32_t _node_id = 0; - - char fault; - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.address, - &_node_id, - &fault); - - e.read_fault = (fault == 'R'); - e.address = page_to_bytes(e.address); - e.agent_id = get_node_agent_id(_node_id); - - ROCP_TRACE << fmt::format("Page fault start [ ts: {} pid: {} addr: 0x{:X} node: {} ] \n", - rec.timestamp, - rec.pid, - e.address, - e.agent_id.handle); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.page_fault_end; - uint32_t kind{}; - uint32_t _node_id = 0; - - char migrated; - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.address, - &_node_id, - &migrated); - - // M or U -> migrated / unmigrated? - if(migrated == 'M') - e.migrated = true; - else if(migrated == 'U') - e.migrated = false; - else - ROCP_WARNING << "Unknown PAGE_FAULT_END migrated/unmigrated state"; - - e.address = page_to_bytes(e.address); - e.agent_id = get_node_agent_id(_node_id); - - ROCP_TRACE << fmt::format( - "Page fault end [ ts: {} pid: {} addr: 0x{:X} node: {} migrated: {} ] \n", - rec.timestamp, - rec.pid, - e.address, - e.agent_id.handle, - migrated); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.page_migrate_start; - uint32_t kind{}; - uint32_t trigger{}; - uint32_t _from_node = 0; - uint32_t _to_node = 0; - uint32_t _prefetch_node = 0; - uint32_t _preferred_node = 0; - - std::sscanf( - str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.start_addr, - &e.end_addr, - &_from_node, - &_to_node, - &_prefetch_node, - &_preferred_node, - &trigger); - - e.end_addr += e.start_addr; - e.trigger = static_cast(trigger); - e.start_addr = page_to_bytes(e.start_addr); - e.end_addr = page_to_bytes(e.end_addr) - 1; - e.from_agent = get_node_agent_id(_from_node); - e.to_agent = get_node_agent_id(_to_node); - e.prefetch_agent = get_node_agent_id(_prefetch_node); - e.preferred_agent = get_node_agent_id(_preferred_node); - - ROCP_TRACE << fmt::format( - "Page migrate start [ ts: {} pid: {} addr s: 0x{:X} addr " - "e: 0x{:X} size: {}B from node: {} to node: {} prefetch node: {} preferred node: {} " - "trigger: {} ] \n", - rec.timestamp, - rec.pid, - e.start_addr, - e.end_addr, - (e.end_addr - e.start_addr), - e.from_agent.handle, - e.to_agent.handle, - e.prefetch_agent.handle, - e.preferred_agent.handle, - trigger); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.page_migrate_end; - uint32_t kind{}; - uint32_t trigger{}; - uint32_t _from_node = 0; - uint32_t _to_node = 0; - int32_t _error_code = 0; - - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.start_addr, - &e.end_addr, - &_from_node, - &_to_node, - &trigger, - &_error_code); - - e.end_addr += e.start_addr; - e.trigger = static_cast(trigger); - e.start_addr = page_to_bytes(e.start_addr); - e.end_addr = page_to_bytes(e.end_addr) - 1; - e.from_agent = get_node_agent_id(_from_node); - e.to_agent = get_node_agent_id(_to_node); - - // For older kernel versions, no event is generated in the error case, - // so the default value of 0 is correct for any given end event. - e.error_code = _error_code; - - ROCP_TRACE << fmt::format("Page migrate end [ ts: {} pid: {} addr s: 0x{:X} addr e: " - "0x{:X} from node: {} to node: {} trigger: {} error code: {}] \n", - rec.timestamp, - rec.pid, - e.start_addr, - e.end_addr, - e.from_agent.handle, - e.to_agent.handle, - trigger, - _error_code); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.queue_eviction; - uint32_t kind{}; - uint32_t trigger{}; - uint32_t _node_id = 0; - - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &_node_id, - &trigger); - - e.trigger = static_cast(trigger); - e.agent_id = get_node_agent_id(_node_id); - - ROCP_TRACE << fmt::format("Queue evict [ ts: {} pid: {} node: {} trigger: {} ] \n", - rec.timestamp, - rec.pid, - e.agent_id.handle, - trigger); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.queue_restore; - uint32_t kind{}; - uint32_t _node_id = 0; - - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &_node_id); - // check if we have a valid char at the end. -1 has \0 - if(str[str.size() - 2] == 'R') - e.rescheduled = true; - else - e.rescheduled = false; - e.agent_id = get_node_agent_id(_node_id); - - ROCP_TRACE << fmt::format( - "Queue restore [ ts: {} pid: {} node: {} ] \n", rec.timestamp, rec.pid, e.agent_id.handle); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.unmap_from_gpu; - uint32_t kind{}; - uint32_t trigger{}; - uint32_t _node_id = 0; - - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.start_addr, - &e.end_addr, - &_node_id, - &trigger); - - e.end_addr += e.start_addr; - e.trigger = static_cast(trigger); - e.start_addr = page_to_bytes(e.start_addr); - e.end_addr = page_to_bytes(e.end_addr); - e.agent_id = get_node_agent_id(_node_id); - - ROCP_TRACE << fmt::format( - "Unmap from GPU [ ts: {} pid: {} start addr: 0x{:X} end addr: 0x{:X} " - "node: {} trigger {} ] \n", - rec.timestamp, - rec.pid, - e.start_addr, - e.end_addr, - e.agent_id.handle, - trigger); - - return rec; -} - -template <> -page_migration_record_t -parse_event(std::string_view str) -{ - auto rec = page_migration_record_t{}; - auto& e = rec.args.dropped_event; - uint32_t kind{}; - - std::sscanf(str.data(), - page_migration_info::format_str.data(), - &kind, - &rec.timestamp, - &rec.pid, - &e.dropped_events_count); - - ROCP_TRACE << fmt::format("Dropped events [ ts: {} pid: {} dropped count: {} ] \n", - rec.timestamp, - rec.pid, - e.dropped_events_count); - - return rec; -} - -template <> -page_migration_record_t parse_event(std::string_view) -{ - ROCP_CI_LOG(WARNING) - << "ROCPROFILER_PAGE_MIGRATION_NONE for parsing page migration events should not happen"; - return common::init_public_api_struct(page_migration_record_t{}); -} - -template -page_migration_record_t -parse_event(size_t event_id, std::string_view strn, std::index_sequence) -{ - if(OpInx == static_cast(event_id)) - { - auto rec = parse_event(strn); - rec.size = sizeof(page_migration_record_t); - rec.kind = ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION; - rec.operation = static_cast(OpInx); - return rec; - } - - if constexpr(sizeof...(OpInxs) > 0) - return parse_event(event_id, strn, std::index_sequence{}); - - return page_migration_record_t{}; -} - -/* -----------------------------------------------------------------------------------*/ - -} // namespace - -size_t -get_rocprof_op(const std::string_view event_data) -{ - uint32_t kfd_event_id{}; - std::sscanf(event_data.data(), "%x ", &kfd_event_id); - - auto rocprof_id = - kfd_to_rocprof_op(static_cast(kfd_event_id), - std::make_index_sequence{}); - - ROCP_CI_LOG_IF(WARNING, rocprof_id == 0) - << fmt::format("Failed to parse KFD event ID {}. Parsed ID: {}, SDK ID: {}\n", - event_data[0], - kfd_event_id, - rocprof_id); - - return rocprof_id; -} - -void -kfd_readlines(const std::string_view str, void(handler)(std::string_view)) -{ - const auto find_newline = [&](auto b) { return std::find(b, str.cend(), '\n'); }; - const auto* cursor = str.cbegin(); - - for(const auto* pos = find_newline(cursor); pos != str.cend(); pos = find_newline(cursor)) - { - size_t char_count = pos - cursor; - assert(char_count > 0); - std::string_view event_str{cursor, char_count}; - - ROCP_INFO << fmt::format("KFD event: [{}]", event_str); - handler(event_str); - - cursor = pos + 1; - } -} - -// Event capture and reporting -namespace -{ -constexpr auto kfd_ioctl_version = (1000 * KFD_IOCTL_MAJOR_VERSION) + KFD_IOCTL_MINOR_VERSION; -// Support has been added in kfdv >= 1.10+ -static_assert(kfd_ioctl_version >= 1010, "KFD SMI support missing in kfd_ioctl.h"); - -auto -get_contexts(int operation) -{ - auto active_contexts = context::get_active_contexts([](const auto* ctx) { - return (ctx->buffered_tracer && - ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION)); - }); - auto operation_ctxs = context::context_array_t{}; - - for(const auto* itr : active_contexts) - { - // if the given domain + op is not enabled, skip this context - if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, operation)) - { - operation_ctxs.emplace_back(itr); - } - } - - return operation_ctxs; -} - -void -handle_reporting(std::string_view event_data) -{ - const auto op_inx = get_rocprof_op(event_data); - auto buffered_contexts = get_contexts(op_inx); - if(buffered_contexts.empty()) return; - - // Parse and process the event - auto record = parse_event( - op_inx, event_data, std::make_index_sequence{}); - - for(const auto& itr : buffered_contexts) - { - auto* buffer = buffer::get_buffer( - itr->buffered_tracer->buffer_data.at(ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION)); - CHECK_NOTNULL(buffer)->emplace( - ROCPROFILER_BUFFER_CATEGORY_TRACING, ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, record); - } -} - -} // namespace - -namespace -{ -void poll_events(small_vector); -} - -// KFD utils -namespace kfd -{ -using fd_flags_t = decltype(EFD_NONBLOCK); -using fd_t = decltype(pollfd::fd); -constexpr auto KFD_DEVICE_PATH{"/dev/kfd"}; - -SPECIALIZE_KFD_IOC_IOCTL(kfd_ioctl_get_version_args, AMDKFD_IOC_GET_VERSION); -SPECIALIZE_KFD_IOC_IOCTL(kfd_ioctl_smi_events_args, AMDKFD_IOC_SMI_EVENTS); - -namespace -{ -template -auto -ioctl(int kfd_fd, T& args) -{ - // from hsaKmt library (hsakmt/src/libhsakmt.c) - int exit_code{}; - - do - { - exit_code = ::ioctl(kfd_fd, IOC_event::value, static_cast(&args)); - } while(exit_code == -1 && (errno == EINTR || errno == EAGAIN)); - - if(exit_code == -1 && errno == EBADF) - { - /* In case pthread_atfork didn't catch it, this will - * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. - */ - CHECK(true && "KFD file descriptor not valid in this process\n"); - } - return exit_code; -} - -struct kfd_device_fd -{ - fd_t fd{-1}; - - kfd_device_fd() - { - fd = ::open(KFD_DEVICE_PATH, O_RDWR | O_CLOEXEC); - ROCP_FATAL_IF(fd == -1) << "Error opening KFD handle @ " << KFD_DEVICE_PATH; - } - - ~kfd_device_fd() - { - if(fd >= 0) close(fd); - } -}; - -const kfd_ioctl_get_version_args -get_version() -{ - static kfd_ioctl_get_version_args version = [&]() { - auto args = kfd_ioctl_get_version_args{0, 0}; - kfd_device_fd kfd_fd{}; - - if(ioctl(kfd_fd.fd, args) != -1) - ROCP_INFO << fmt::format("KFD v{}.{}", args.major_version, args.minor_version); - else - ROCP_ERROR << fmt::format("Could not determine KFD version"); - return args; - }(); - - return version; -} - -struct poll_kfd_t -{ - static constexpr auto DEFAULT_FLAGS{EFD_CLOEXEC}; - - struct gpu_fd_t - { - unsigned int node_id = 0; - fd_t fd = {}; - const rocprofiler_agent_t* agent = nullptr; - }; - - kfd_device_fd kfd_fd = {}; - pollfd thread_notify = {}; - std::thread bg_thread = {}; - bool active = {false}; - - poll_kfd_t() = default; - - poll_kfd_t(const small_vector& rprof_ev) - : kfd_fd{kfd_device_fd{}} - { - small_vector file_handles = {}; - - const auto kfd_flags = - kfd_bitmask(rprof_ev, std::make_index_sequence{}); - - ROCP_TRACE << fmt::format("Setting KFD flags to [0b{:b}] \n", kfd_flags); - - // Create fd for notifying thread when we want to wake it up, and an eventfd for any events - // to this thread - file_handles.emplace_back( - pollfd{.fd = eventfd(0, DEFAULT_FLAGS), .events = 0, .revents = 0}); - - fd_t thread_pipes[2]{}; - - [&]() { - const auto retcode = pipe2(&thread_pipes[0], DEFAULT_FLAGS); - const auto _err = errno; - ROCP_FATAL_IF(retcode != 0) - << fmt::format("Pipe creation for page-migration thread notify returned {} :: {}\n", - retcode, - strerror(_err)); - }(); - - thread_notify = pollfd{ - .fd = thread_pipes[1], - .events = POLLIN, - .revents = 0, - }; - - // add pipe listening end to fds to watch - file_handles.emplace_back(pollfd{thread_pipes[0], POLLIN, 0}); - - // get FD, start thread, and then enable events - for(const auto& agent : agent::get_agents()) - { - if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) - { - auto gpu_event_fd = get_node_fd(agent->gpu_id); - file_handles.emplace_back(pollfd{gpu_event_fd, POLLIN, 0}); - ROCP_TRACE << fmt::format( - "GPU node {} with fd {} added\n", agent->gpu_id, gpu_event_fd); - } - } - - // Enable KFD masked events by writing flags to kfd fd - for(size_t i = 2; i < file_handles.size(); ++i) - { - auto& fd = file_handles[i]; - auto write_size = write(fd.fd, &kfd_flags, sizeof(kfd_flags)); - ROCP_TRACE << fmt::format( - "Writing {} to GPU fd {} ({} bytes)\n", kfd_flags, fd.fd, write_size); - CHECK(write_size == sizeof(kfd_flags)); - } - - // start bg thread - internal_threading::notify_pre_internal_thread_create(ROCPROFILER_LIBRARY); - bg_thread = std::thread{poll_events, file_handles}; - internal_threading::notify_post_internal_thread_create(ROCPROFILER_LIBRARY); - - active = true; - } - - static auto get_event_id(const std::string_view& strn) - { - uint32_t event_id{std::numeric_limits::max()}; - std::sscanf(strn.data(), "%x ", &event_id); - - CHECK(event_id <= KFD_SMI_EVENT_ALL_PROCESS); - } - - poll_kfd_t(const poll_kfd_t&) = delete; - poll_kfd_t& operator=(const poll_kfd_t&) = delete; - - poll_kfd_t(poll_kfd_t&&) noexcept = default; - poll_kfd_t& operator=(poll_kfd_t&&) noexcept = default; - - ~poll_kfd_t() - { - ROCP_TRACE << fmt::format("Terminating poll_kfd\n"); - if(!active) return; - - // wake thread up - auto bytes_written{-1}; - do - { - bytes_written = write(thread_notify.fd, "E", 1); - } while(bytes_written == -1 && (errno == EINTR || errno == EAGAIN)); - - bg_thread.join(); - - close(thread_notify.fd); - - ROCP_TRACE << fmt::format("Background thread signalled\n"); - } - - node_fd_t get_node_fd(int gpu_node_id) const - { - kfd_ioctl_smi_events_args args{}; - args.gpuid = gpu_node_id; - - if(auto ret = ioctl(kfd_fd.fd, args); ret == -1) - ROCP_ERROR << fmt::format( - "Could not get GPU node {} file descriptor (exit code: {})", gpu_node_id, ret); - return args.anon_fd; - } -}; -} // namespace -} // namespace kfd - -// for all contexts -struct config -{ -private: - kfd::poll_kfd_t kfd_handle{}; - - static inline config* _config{nullptr}; - - config(const small_vector& _event_ids) - : kfd_handle{_event_ids} - {} - -public: - static void init(const small_vector& event_ids) { _config = new config{event_ids}; } - - static void reset() - { - config* ptr = nullptr; - std::swap(ptr, _config); - delete ptr; - } - - static void reset_on_fork() { _config = nullptr; } -}; - -namespace -{ -void -poll_events(small_vector file_handles) -{ - // storage to write records to, 1MB - constexpr size_t PREALLOCATE_ELEMENT_COUNT{1024 * 128}; - std::string scratch_buffer(PREALLOCATE_ELEMENT_COUNT, '\0'); - auto& exitfd = file_handles[1]; - - // Wait or spin on events. - // 0 -> return immediately even if no events - // -1 -> wait indefinitely - - pthread_setname_np(pthread_self(), "bg:pagemigr"); - - for(auto& fd : file_handles) - { - ROCP_TRACE << fmt::format( - "Handle = {}, events = {}, revents = {}\n", fd.fd, fd.events, fd.revents); - } - - while(true) - { - auto poll_ret = poll(file_handles.data(), file_handles.size(), -1); - - if(poll_ret == -1) - { - ROCP_CI_LOG(WARNING) - << "Background thread file descriptors for page-migration are invalid"; - return; - } - - if((exitfd.revents & POLLIN) != 0) - { - for(const auto& f : file_handles) - { - close(f.fd); - } - ROCP_INFO << "Terminating background thread\n"; - return; - } - - using namespace std::chrono_literals; - - // 0 and 1 are for generic and pipe-notify handles - for(size_t i = 2; i < file_handles.size(); ++i) - { - auto& fd = file_handles[i]; - - // We have data to read, perhaps multiple events - if((fd.revents & POLLIN) != 0) - { - size_t status_size = read(fd.fd, scratch_buffer.data(), scratch_buffer.size()); - auto event_strings = std::string_view{scratch_buffer.data(), status_size}; - kfd_readlines(event_strings, handle_reporting); - } - fd.revents = 0; - } - } -} - -template -const char* -name_by_id(const uint32_t id, std::index_sequence) -{ - if(Idx == id) return page_migration_info::name; - if constexpr(sizeof...(IdxTail) > 0) - return name_by_id(id, std::index_sequence{}); - else - return nullptr; -} - -template -void -get_ids(std::vector& _id_list, std::index_sequence) -{ - auto _emplace = [](auto& _vec, uint32_t _v) { - if(_v < static_cast(ROCPROFILER_HSA_AMD_EXT_API_ID_LAST)) _vec.emplace_back(_v); - }; - - (_emplace(_id_list, page_migration_info::operation), ...); -} - -bool -context_filter(const context::context* ctx) -{ - return (ctx->buffered_tracer && - (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION))); -} - -template -void -to_bitmask(small_vector& _id_list, std::index_sequence) -{ - auto _emplace = [](auto& _vec, uint32_t _v) { - if(_v < static_cast(ROCPROFILER_HSA_AMD_EXT_API_ID_LAST)) _vec.emplace_back(_v); - }; - - (_emplace(_id_list, page_migration_info::operation), ...); -} - -template -rocprofiler_status_t init(std::index_sequence) -{ - static const small_vector event_ids{Inxs...}; - // Check if version is more than 1.11 - auto ver = kfd::get_version(); - if(ver.major_version * 1000 + ver.minor_version > 1011) - { - if(!context::get_registered_contexts(context_filter).empty()) - { - config::init(event_ids); - } - return ROCPROFILER_STATUS_SUCCESS; - } - else - { - // Add a buffer record with this info - ROCP_ERROR << fmt::format( - "KFD does not support SVM event reporting in v{}.{} (requires v1.11)", - ver.major_version, - ver.minor_version); - return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; - } -} -} // namespace - -} // namespace page_migration -} // namespace rocprofiler - -namespace rocprofiler::page_migration -{ -rocprofiler_status_t -init() -{ - pthread_atfork(nullptr, nullptr, []() { - // null out child's copy on fork and reinitialize - // otherwise all children wait on the same thread to join - config::reset_on_fork(); - init(std::make_index_sequence{}); - }); - - return init(std::make_index_sequence{}); -} - -void -finalize() -{ - config::reset(); -} - -const char* -name_by_id(uint32_t id) -{ - return name_by_id(id, std::make_index_sequence{}); -} - -std::vector -get_ids() -{ - auto _data = std::vector{}; - _data.reserve(ROCPROFILER_PAGE_MIGRATION_LAST); - get_ids(_data, std::make_index_sequence{}); - return _data; -} -} // namespace rocprofiler::page_migration diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.def.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.def.cpp deleted file mode 100644 index 1af3ea1d43..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/page_migration/page_migration.def.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include "lib/rocprofiler-sdk/page_migration/defines.hpp" -#include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" - -#if defined(ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL) && \ - ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL == 1 - -namespace rocprofiler -{ -namespace page_migration -{ -using namespace rocprofiler::common; - -using kfd_event_id_t = decltype(KFD_SMI_EVENT_NONE); -using migrate_trigger_t = rocprofiler_page_migration_trigger_t; -using page_migration_record_t = rocprofiler_buffer_tracing_page_migration_record_t; -using queue_suspend_trigger_t = rocprofiler_page_migration_queue_suspend_trigger_t; -using unmap_from_gpu_trigger_t = rocprofiler_page_migration_unmap_from_gpu_trigger_t; - -using trigger_type_list_t = common::mpl::type_list; -// clang-format off -// Map ROCPROF UVM enums to KFD enums -SPECIALIZE_PAGE_MIGRATION_INFO(NONE, NONE, "Error: Invalid UVM event from KFD" ); -SPECIALIZE_PAGE_MIGRATION_INFO(PAGE_MIGRATE_START, MIGRATE_START, "%x %ld -%d @%lx(%lx) %x->%x %x:%x %d\n"); -SPECIALIZE_PAGE_MIGRATION_INFO(PAGE_MIGRATE_END, MIGRATE_END, "%x %ld -%d @%lx(%lx) %x->%x %d %d\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(PAGE_FAULT_START, PAGE_FAULT_START, "%x %ld -%d @%lx(%x) %c\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(PAGE_FAULT_END, PAGE_FAULT_END, "%x %ld -%d @%lx(%x) %c\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(QUEUE_EVICTION, QUEUE_EVICTION, "%x %ld -%d %x %d\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(QUEUE_RESTORE, QUEUE_RESTORE, "%x %ld -%d %x\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(UNMAP_FROM_GPU, UNMAP_FROM_GPU, "%x %ld -%d @%lx(%lx) %x %d\n" ); -SPECIALIZE_PAGE_MIGRATION_INFO(DROPPED_EVENT, DROPPED_EVENT, "%x %ld -%d %d\n" ); -#undef SPECIALIZE_PAGE_MIGRATION_INFO -// clang-format on - -} // namespace page_migration -} // namespace rocprofiler -#endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index e02f14c502..b62338df33 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -43,9 +43,9 @@ #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/intercept_table.hpp" #include "lib/rocprofiler-sdk/internal_threading.hpp" +#include "lib/rocprofiler-sdk/kfd/kfd.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" #include "lib/rocprofiler-sdk/ompt.hpp" -#include "lib/rocprofiler-sdk/page_migration/page_migration.hpp" #include "lib/rocprofiler-sdk/pc_sampling/code_object.hpp" #include "lib/rocprofiler-sdk/pc_sampling/service.hpp" #include "lib/rocprofiler-sdk/rccl/rccl.hpp" @@ -757,7 +757,7 @@ finalize() hsa::queue_controller_fini(); thread_trace::finalize(); ompt::finalize_ompt(); - page_migration::finalize(); + kfd::finalize(); #if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0 // WARNING: this must precede `code_object::finalize()` pc_sampling::code_object::finalize(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/CMakeLists.txt index 4d5ef6f322..d71aa7c297 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/CMakeLists.txt @@ -20,8 +20,7 @@ set(rocprofiler_lib_sources naming.cpp timestamp.cpp version.cpp - hsa_barrier.cpp - page_migration.cpp) + hsa_barrier.cpp) add_executable(rocprofiler-sdk-lib-tests) target_sources(rocprofiler-sdk-lib-tests PRIVATE ${rocprofiler_lib_sources} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/enum_string.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/enum_string.cpp index 0ef5de57a3..59b361c5d5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/enum_string.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/enum_string.cpp @@ -91,9 +91,6 @@ TEST(enum_string, fwd_h) TEST_STR(ROCPROFILER_STATUS_ERROR_BUFFER_BUSY); TEST_STR(ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION); - // rocprofiler_buffer_category_t TEST_STR(ROCPROFILER_BUFFER_CATEGORY_TRACING); TEST_STR(ROCPROFILER_BUFFER_CATEGORY_COUNTERS); @@ -116,6 +113,7 @@ TEST(enum_string, fwd_h) TEST_STR(ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API); TEST_STR(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY); TEST_STR(ROCPROFILER_BUFFER_TRACING_ROCDECODE_API); + TEST_STR(ROCPROFILER_BUFFER_TRACING_KFD_QUEUE); // rocprofiler_code_object_operation_t TEST_STR(ROCPROFILER_CODE_OBJECT_NONE); @@ -152,10 +150,6 @@ TEST(enum_string, fwd_h) TEST_STR(ROCPROFILER_BUFFER_POLICY_DISCARD); TEST_STR(ROCPROFILER_BUFFER_POLICY_LOSSLESS); - // rocprofiler_page_migration_operation_t - TEST_STR(ROCPROFILER_PAGE_MIGRATION_NONE); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT); - // rocprofiler_scratch_memory_operation_t TEST_STR(ROCPROFILER_SCRATCH_MEMORY_NONE); TEST_STR(ROCPROFILER_SCRATCH_MEMORY_ALLOC); @@ -265,16 +259,18 @@ TEST(enum_string, roctx_api_id) TEST_API_ID_STR(ROCPROFILER_MARKER_NAME_API_ID, roctxNameHipStream); } -TEST(enum_string, page_migration_triggers) +TEST(enum_string, kfd_events) { - TEST_STR(ROCPROFILER_PAGE_MIGRATION_TRIGGER_PREFETCH); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION); - - TEST_STR(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE); - - TEST_STR(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY); - TEST_STR(ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU); + TEST_STR(ROCPROFILER_KFD_EVENT_PAGE_MIGRATE_PAGEFAULT_CPU); + TEST_STR(ROCPROFILER_KFD_EVENT_PAGE_FAULT_END_PAGE_MIGRATED); + TEST_STR(ROCPROFILER_KFD_EVENT_QUEUE_EVICT_SUSPEND); + TEST_STR(ROCPROFILER_KFD_EVENT_QUEUE_RESTORE); + TEST_STR(ROCPROFILER_KFD_EVENT_UNMAP_FROM_GPU_UNMAP_FROM_CPU); + TEST_STR(ROCPROFILER_KFD_EVENT_DROPPED_EVENTS); + TEST_STR(ROCPROFILER_KFD_PAGE_MIGRATE_PAGEFAULT_CPU); + TEST_STR(ROCPROFILER_KFD_PAGE_MIGRATE_TTM_EVICTION); + TEST_STR(ROCPROFILER_KFD_PAGE_FAULT_WRITE_FAULT_UPDATED); + TEST_STR(ROCPROFILER_KFD_QUEUE_EVICT_CRIU_RESTORE); } TEST(enum_string, rccl_api_id) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/page_migration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/page_migration.cpp deleted file mode 100644 index 2bcc7fff2f..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/page_migration.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "lib/common/container/small_vector.hpp" -#include "lib/common/defines.hpp" -#include "lib/common/mpl.hpp" -#include "lib/rocprofiler-sdk/page_migration/utils.hpp" - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#define ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL 1 -#include "lib/rocprofiler-sdk/page_migration/page_migration.def.cpp" -#undef ROCPROFILER_LIB_ROCPROFILER_SDK_PAGE_MIGRATION_PAGE_MIGRATION_CPP_IMPL - -namespace -{ -constexpr std::string_view MULTILINE_STRING = "This is 0 Line 0\n" - "This is 10 Line 1\n" - "This is 20 Line 2\n" - "This is 30 Line 3\n" - "This is 40 Line 4\n"; -} - -void -return_line(const std::string_view line) -{ - static int line_no = 0; - std::stringstream strs{}; - strs << fmt::format("This is {} Line {}", line_no * 10, line_no); - EXPECT_EQ(strs.str(), line); - line_no++; -} - -auto -parse_lines() -{ - rocprofiler::page_migration::kfd_readlines(MULTILINE_STRING, return_line); -} - -TEST(page_migration, readlines) -{ - // Ensure all lines are read - parse_lines(); -} - -TEST(page_migration, rocprof_kfd_map) -{ - using namespace rocprofiler::page_migration; - using namespace rocprofiler::common::container; - - using rocprofiler_page_migration_seq_t = - std::make_index_sequence; - - const small_vector vec{ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END, - ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION, - ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU}; - - EXPECT_EQ((page_migration_info::kfd_bitmask | - page_migration_info::kfd_bitmask | - page_migration_info::kfd_bitmask), - kfd_bitmask(vec, rocprofiler_page_migration_seq_t{})); - - const auto to_kfd_str = [](kfd_smi_event e) { - std::string str = fmt::format("{:x} ", static_cast(e)); - return rocprofiler::page_migration::get_rocprof_op({str}); - }; - - // clang-format off - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_MIGRATE_START), ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_START); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_MIGRATE_END), ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE_END); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_PAGE_FAULT_START), ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_START); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_PAGE_FAULT_END), ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT_END); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_QUEUE_EVICTION), ROCPROFILER_PAGE_MIGRATION_QUEUE_EVICTION); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_QUEUE_RESTORE), ROCPROFILER_PAGE_MIGRATION_QUEUE_RESTORE); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_UNMAP_FROM_GPU), ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU); - EXPECT_EQ(to_kfd_str(KFD_SMI_EVENT_DROPPED_EVENT), ROCPROFILER_PAGE_MIGRATION_DROPPED_EVENT); - // clang-format on -} diff --git a/projects/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/tests/CMakeLists.txt index 4a1b051556..988379f596 100644 --- a/projects/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/CMakeLists.txt @@ -71,7 +71,6 @@ add_subdirectory(async-copy-tracing) add_subdirectory(hsa-memory-allocation) add_subdirectory(scratch-memory-tracing) add_subdirectory(c-tool) -add_subdirectory(page-migration) add_subdirectory(thread-trace) add_subdirectory(pc_sampling) add_subdirectory(hip-graph-tracing) diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index 1acd978738..7104eea016 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -27,7 +27,6 @@ add_subdirectory(multistream) add_subdirectory(vector-operations) add_subdirectory(hip-in-libraries) add_subdirectory(scratch-memory) -add_subdirectory(page-migration) add_subdirectory(hsa-queue-dependency) add_subdirectory(hip-graph) add_subdirectory(hsa-memory-allocation) diff --git a/projects/rocprofiler-sdk/tests/bin/page-migration/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/page-migration/CMakeLists.txt deleted file mode 100644 index c7da350e91..0000000000 --- a/projects/rocprofiler-sdk/tests/bin/page-migration/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# -# -# -cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) - -if(NOT CMAKE_HIP_COMPILER) - find_program( - amdclangpp_EXECUTABLE - NAMES amdclang++ - HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm - PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm - PATH_SUFFIXES bin llvm/bin NO_CACHE) - mark_as_advanced(amdclangpp_EXECUTABLE) - - if(amdclangpp_EXECUTABLE) - set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") - endif() -endif() - -project(rocprofiler-sdk-tests-bin-page-migration LANGUAGES CXX HIP) - -foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) - if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") - set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") - endif() -endforeach() - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_HIP_STANDARD 17) -set(CMAKE_HIP_EXTENSIONS OFF) -set(CMAKE_HIP_STANDARD_REQUIRED ON) - -set_source_files_properties(page-migration.cpp PROPERTIES LANGUAGE HIP) -add_executable(page-migration) -target_sources(page-migration PRIVATE page-migration.cpp) -target_compile_options(page-migration PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow - -Werror) - -find_package(Threads REQUIRED) -target_link_libraries(page-migration PRIVATE Threads::Threads) diff --git a/projects/rocprofiler-sdk/tests/bin/page-migration/page-migration.cpp b/projects/rocprofiler-sdk/tests/bin/page-migration/page-migration.cpp deleted file mode 100644 index ecece5ba6f..0000000000 --- a/projects/rocprofiler-sdk/tests/bin/page-migration/page-migration.cpp +++ /dev/null @@ -1,300 +0,0 @@ -// MIT License -// -// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define HIP_API_CALL(CALL) \ - { \ - hipError_t error_ = (CALL); \ - if(error_ != hipSuccess) \ - { \ - auto _hip_api_print_lk = auto_lock_t{print_lock}; \ - fprintf(stderr, \ - "%s:%d :: HIP error : %s\n", \ - __FILE__, \ - __LINE__, \ - hipGetErrorString(error_)); \ - throw std::runtime_error("hip_api_call"); \ - } \ - } - -using auto_lock_t = std::unique_lock; -auto print_lock = std::mutex{}; - -__global__ void -kernel(size_t* __restrict__ data, int size) -{ - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int stride = hipBlockDim_x * hipGridDim_x; - - for(int i = x; i < size; i += stride) - { - data[i] *= 2; - } -} - -struct mmap_allocator -{ - explicit mmap_allocator(size_t num_pages) - { - m_size = num_pages * sysconf(_SC_PAGE_SIZE); - void* ret = mmap(nullptr, // addr: null. Kernel gives us page-aligned memory - m_size, // length: num bytes to allocate - PROT_WRITE | PROT_READ, // mem_prot: Allow read/write - MAP_ANONYMOUS | MAP_PRIVATE, // flags: No file handle - -1, // no fd, use memory "MAP_ANONYMOUS" - 0); // offset into fd - if(ret == ((void*) -1)) // NOLINT(performance-no-int-to-ptr) - { - auto ecode = errno; - fprintf(stderr, "mmap error %d: %s", ecode, strerror(ecode)); - throw std::runtime_error("mmap failed"); - } - else - { - m_addr = ret; - ::memset(m_addr, 0, m_size); - } - } - - ~mmap_allocator() - { - auto ret = munmap(m_addr, m_size); - if(ret != 0) perror("munmap failed"); - } - - mmap_allocator(const mmap_allocator&) = delete; - mmap_allocator(mmap_allocator&&) noexcept = default; - mmap_allocator& operator=(const mmap_allocator&) = delete; - mmap_allocator& operator=(mmap_allocator&&) noexcept = default; - - template - Up* get() const - { - static_assert(!std::is_pointer::value, "must not be pointer type"); - return static_cast(m_addr); - } - -private: - size_t m_size = 0; - void* m_addr = nullptr; -}; - -int -run_test(int dev_idx, int num_iter) -{ - using namespace std::chrono_literals; - - constexpr size_t NUM_PAGES = 512; - const size_t PAGE_SIZE_BYTES = ::sysconf(_SC_PAGE_SIZE); - - const size_t elem_count = (NUM_PAGES * PAGE_SIZE_BYTES) / sizeof(size_t); - - auto alloc = mmap_allocator(NUM_PAGES); - void* data_v = alloc.get(); - auto* data = alloc.get(); - - for(size_t i = 0; i < elem_count; ++i) - if(data[i] != 0) throw std::runtime_error{"bad init"}; - - printf("Allocated size: %lu bytes (%lu KB), (%lu MB), %zu elements @ %p\n", - sizeof(size_t) * elem_count, - sizeof(size_t) * elem_count / 1024, - sizeof(size_t) * elem_count / 1024 / 1024, - elem_count, - data_v); - - HIP_API_CALL(hipSetDevice(dev_idx)); - HIP_API_CALL(hipHostRegister(data, elem_count * sizeof(size_t), hipHostRegisterDefault)); - - constexpr size_t MAPS_BUFFER_SIZE = 1024 * 1024; - - char maps[MAPS_BUFFER_SIZE]; - std::memset(maps, '\0', MAPS_BUFFER_SIZE); - auto fd = open("/proc/self/maps", O_RDONLY | O_CLOEXEC); - - if(fd == -1) - { - auto ecode = errno; - fprintf(stderr, "mmap error %d: %s", ecode, strerror(ecode)); - exit(-1); - } - - auto bytes = read(fd, maps, MAPS_BUFFER_SIZE - 1); - if(bytes == -1) - { - auto ecode = errno; - fprintf(stderr, "mmap error %d: %s", ecode, strerror(ecode)); - exit(-1); - } - close(fd); - - std::string_view maps_data{maps, static_cast(bytes)}; - std::cout << "------------\n"; - std::cout << maps_data; - std::cout << "------------\n"; - std::istringstream maps_stream{maps_data.data()}; - std::string line(1024, '\0'); - - while(std::getline(maps_stream, line)) - { - char __[1024]; - int _{}; - - void* start{}; - void* end{}; - - auto ret = - std::sscanf(line.data(), "%p-%p %s %d %d:%d %d\n", &start, &end, __, &_, &_, &_, &_); - if(ret > 0 && (start == data_v)) - { - size_t ptr_diff = ((size_t) end - (size_t) start); - printf("Found match: %zu %zu KB, %zu 4K > %s\n", - ptr_diff, - ptr_diff / 1024, - ptr_diff / 4096, - line.data()); - } - } - - hipStream_t stream{}; - HIP_API_CALL(hipStreamCreate(&stream)); - - for(int iter = 0; iter < num_iter; ++iter) - { - for(size_t i = 0; i < elem_count; ++i) - { - data[i] = i; - } - - // std::cout << "launching..." << std::endl; - hipLaunchKernelGGL(kernel, 1024, 1024, 0, stream, data, elem_count); - - HIP_API_CALL(hipStreamSynchronize(stream)); - - for(size_t i = 0; i < elem_count; ++i) - { - const auto data_i = data[i]; - if(data_i != (i * 2)) - { - auto msg = std::stringstream{}; - msg << "GPU computed value at " << i << " in iteration " << iter - << " is incorrect. Expected " << (i * 2) << ", found " << data_i; - throw std::runtime_error{msg.str()}; - } - } - } - - HIP_API_CALL(hipStreamSynchronize(stream)); - HIP_API_CALL(hipStreamDestroy(stream)); - HIP_API_CALL(hipHostUnregister(data)); - - return 0; -} - -int -main(int argc, const char** argv) -{ - using namespace std::chrono_literals; - - const auto usage_msg = [](const char** _argv) { - fprintf(stderr, "usage: %s \n", _argv[0]); - }; - - if(argc != 3) - { - usage_msg(argv); - exit(EXIT_FAILURE); - } - - for(int i = 1; i < argc; ++i) - { - auto _arg = std::string{argv[i]}; - if(_arg == "?" || _arg == "-h" || _arg == "--help") - { - usage_msg(argv); - exit(EXIT_SUCCESS); - } - } - - const auto num_threads = std::atoi(argv[1]); - if(num_threads < 1) - { - fprintf(stderr, "Error: Invalid value %d for num_threads (min 1)\n", num_threads); - exit(EXIT_FAILURE); - } - - const auto num_iter = std::atoi(argv[2]); - if(num_iter < 1) - { - fprintf(stderr, "Error: Invalid value %d for num_iter (min 1)\n", num_iter); - exit(EXIT_FAILURE); - } - - int ndevice = 0; - HIP_API_CALL(hipGetDeviceCount(&ndevice)); - - run_test(0, num_iter); - - std::vector threads; - threads.reserve(num_threads); - - std::cerr << "Running " << num_iter << " iterations/thread on " << num_threads << " threads\n"; - for(auto i = 0; i < num_threads; ++i) - { - auto _dev_idx = i % ndevice; - threads.emplace_back([_dev_idx, _num_iter = num_iter]() { run_test(_dev_idx, _num_iter); }); - } - - std::cerr << "Waiting for threads\n"; - for(auto& t : threads) - { - t.join(); - } - - for(int i = 0; i < ndevice; ++i) - { - HIP_API_CALL(hipSetDevice(i)); - HIP_API_CALL(hipDeviceSynchronize()); - HIP_API_CALL(hipDeviceReset()); - } - - return 0; -} diff --git a/projects/rocprofiler-sdk/tests/page-migration/CMakeLists.txt b/projects/rocprofiler-sdk/tests/page-migration/CMakeLists.txt deleted file mode 100644 index 17ec36abb8..0000000000 --- a/projects/rocprofiler-sdk/tests/page-migration/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -# -# -# -cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) - -project( - rocprofiler-sdk-tests-page-migration - LANGUAGES CXX - VERSION 0.0.0) - -find_package(rocprofiler-sdk REQUIRED) - -if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) - set(PRELOAD_ENV - "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") -else() - set(PRELOAD_ENV "LD_PRELOAD=$") -endif() - -list(GET rocprofiler-sdk-tests-gfx-info 0 page-migration-gpu-0-gfx-info) - -# disabled on all architectures -if("${page-migration-gpu-0-gfx-info}" MATCHES "^gfx([0-9a-fA-F]+)$") - set(IS_DISABLED ON) -endif() - -add_test(NAME test-page-migration-execute COMMAND $ 4 1024) - -set(page-migration-env - "${PRELOAD_ENV}" - "ROCPROFILER_DISABLE_PERFETTO_ANNOTATIONS=1" - "ROCPROFILER_TOOL_OUTPUT_FILE=page-migration-test.json" - "ROCPROFILER_TOOL_CONTEXTS_EXCLUDE=HSA_API_BUFFERED,HSA_API_CALLBACK" - "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}" - ) - -set_tests_properties( - test-page-migration-execute - PROPERTIES TIMEOUT - 60 - LABELS - "integration-tests" - ENVIRONMENT - "${page-migration-env}" - FAIL_REGULAR_EXPRESSION - "${ROCPROFILER_DEFAULT_FAIL_REGEX}" - SKIP_REGULAR_EXPRESSION - "KFD does not support SVM event reporting" - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DISABLED - "${IS_DISABLED}") - -# copy to binary directory -rocprofiler_configure_pytest_files(COPY validate.py conftest.py CONFIG pytest.ini) - -add_test(NAME test-page-migration-validate - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input - ${CMAKE_CURRENT_BINARY_DIR}/page-migration-test.json) - -set_tests_properties( - test-page-migration-validate - PROPERTIES TIMEOUT - 45 - LABELS - "integration-tests" - DEPENDS - test-page-migration-execute - FAIL_REGULAR_EXPRESSION - "${ROCPROFILER_DEFAULT_FAIL_REGEX}" - SKIP_REGULAR_EXPRESSION - "KFD does not support SVM event reporting" - WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} - DISABLED - "${IS_DISABLED}") diff --git a/projects/rocprofiler-sdk/tests/page-migration/conftest.py b/projects/rocprofiler-sdk/tests/page-migration/conftest.py deleted file mode 100644 index 003469543c..0000000000 --- a/projects/rocprofiler-sdk/tests/page-migration/conftest.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -# MIT License -# -# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -import json -import pytest -from rocprofiler_sdk.pytest_utils.dotdict import dotdict - - -def pytest_addoption(parser): - parser.addoption( - "--input", - action="store", - default="page-migration-test.json", - help="Input JSON", - ) - - -@pytest.fixture -def input_data(request): - filename = request.config.getoption("--input") - data = None - with open(filename, "r") as inp: - data = json.load(inp) - - if data["rocprofiler-sdk-json-tool"]["metadata"]["validate_page_migration"] is False: - return pytest.skip( - "Skipping test because KFD does not support SVM event reporting" - ) - return dotdict(data) diff --git a/projects/rocprofiler-sdk/tests/page-migration/pytest.ini b/projects/rocprofiler-sdk/tests/page-migration/pytest.ini deleted file mode 100644 index 5e1e1c14a0..0000000000 --- a/projects/rocprofiler-sdk/tests/page-migration/pytest.ini +++ /dev/null @@ -1,5 +0,0 @@ - -[pytest] -addopts = --durations=20 -rA -s -vv -testpaths = validate.py -pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/page-migration/validate.py b/projects/rocprofiler-sdk/tests/page-migration/validate.py deleted file mode 100644 index 7d4470950b..0000000000 --- a/projects/rocprofiler-sdk/tests/page-migration/validate.py +++ /dev/null @@ -1,377 +0,0 @@ -#!/usr/bin/env python3 - -# MIT License -# -# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -from collections import defaultdict -import sys -import pytest - - -test_api_traces = [ - "hsa_api_traces", - "marker_api_traces", - "hip_api_traces", - "rccl_api_traces", - "scratch_memory_traces", -] - - -# helper function -def node_exists(name, data, min_len=1): - assert name in data - assert data[name] is not None - if isinstance(data[name], (list, tuple, dict, set)): - assert len(data[name]) >= min_len - - -def to_dict(key_values): - a = defaultdict() - for kv in key_values: - a[kv["key"]] = kv["value"] - return a - - -def get_operation(record, kind_name, op_name=None): - for idx, itr in enumerate(record["names"]): - if kind_name == itr["kind"]: - if op_name is None: - return idx, itr["operations"] - else: - for oidx, oname in enumerate(itr["operations"]): - if op_name == oname: - return oidx - - return None - - -def dict_from_value_key(d): - ret_d = defaultdict() - - for k, v in d.items(): - assert v not in ret_d - ret_d[v] = k - return ret_d - - -def sort_by_timestamp(lines): - timestamp_line_map = {} - - for log_line in lines: - timestamp = log_line.split(" ")[1] - timestamp_line_map[timestamp] = log_line - - timestamps_sorted = sorted([l.split(" ")[1] for l in lines]) - return timestamps_sorted, timestamp_line_map - - -# ------------------------------ Tests ------------------------------ # - - -def test_data_structure(input_data): - """verify minimum amount of expected data is present""" - data = input_data - - node_exists("rocprofiler-sdk-json-tool", data) - - sdk_data = data["rocprofiler-sdk-json-tool"] - - node_exists("metadata", sdk_data) - node_exists("pid", sdk_data["metadata"]) - node_exists("main_tid", sdk_data["metadata"]) - node_exists("init_time", sdk_data["metadata"]) - node_exists("fini_time", sdk_data["metadata"]) - node_exists("validate_page_migration", sdk_data["metadata"]) - - assert sdk_data["metadata"]["validate_page_migration"] is True - - node_exists("agents", sdk_data) - node_exists("call_stack", sdk_data) - node_exists("callback_records", sdk_data) - node_exists("buffer_records", sdk_data) - - node_exists("names", sdk_data["callback_records"]) - node_exists("code_objects", sdk_data["callback_records"]) - node_exists("kernel_symbols", sdk_data["callback_records"]) - node_exists("host_functions", sdk_data["callback_records"]) - node_exists("hsa_api_traces", sdk_data["callback_records"], 0) - node_exists("hip_api_traces", sdk_data["callback_records"]) - node_exists("marker_api_traces", sdk_data["callback_records"], 0) - - node_exists("names", sdk_data["buffer_records"]) - node_exists("kernel_dispatch", sdk_data["buffer_records"]) - node_exists("memory_copies", sdk_data["buffer_records"], 0) - node_exists("hsa_api_traces", sdk_data["buffer_records"], 0) - node_exists("hip_api_traces", sdk_data["buffer_records"]) - node_exists("marker_api_traces", sdk_data["buffer_records"], 0) - node_exists("retired_correlation_ids", sdk_data["buffer_records"]) - node_exists("page_migration", sdk_data["buffer_records"]) - - -def test_timestamps(input_data): - data = input_data - sdk_data = data["rocprofiler-sdk-json-tool"] - - cb_start = {} - cb_end = {} - for titr in test_api_traces: - for itr in sdk_data["callback_records"][titr]: - cid = itr["correlation_id"]["internal"] - phase = itr["phase"] - if phase == 1: - cb_start[cid] = itr["timestamp"] - elif phase == 2: - cb_end[cid] = itr["timestamp"] - assert cb_start[cid] <= itr["timestamp"] - else: - assert phase == 1 or phase == 2 - - for itr in sdk_data["buffer_records"][titr]: - assert itr["start_timestamp"] <= itr["end_timestamp"] - - for titr in ["kernel_dispatch", "memory_copies"]: - for itr in sdk_data["buffer_records"][titr]: - assert itr["start_timestamp"] < itr["end_timestamp"] - assert itr["correlation_id"]["internal"] > 0 - assert itr["correlation_id"]["external"] > 0 - assert sdk_data["metadata"]["init_time"] < itr["start_timestamp"] - assert sdk_data["metadata"]["init_time"] < itr["end_timestamp"] - assert sdk_data["metadata"]["fini_time"] > itr["start_timestamp"] - assert sdk_data["metadata"]["fini_time"] > itr["end_timestamp"] - - # api_start = cb_start[itr["correlation_id"]["internal"]] - # api_end = cb_end[itr["correlation_id"]["internal"]] - # assert api_start < itr["start_timestamp"] - # assert api_end <= itr["end_timestamp"] - - -def test_internal_correlation_ids(input_data): - data = input_data - sdk_data = data["rocprofiler-sdk-json-tool"] - - api_corr_ids = [] - for titr in test_api_traces: - for itr in sdk_data["callback_records"][titr]: - api_corr_ids.append(itr["correlation_id"]["internal"]) - - for itr in sdk_data["buffer_records"][titr]: - api_corr_ids.append(itr["correlation_id"]["internal"]) - - api_corr_ids_sorted = sorted(api_corr_ids) - api_corr_ids_unique = list(set(api_corr_ids)) - - for itr in sdk_data["buffer_records"]["kernel_dispatch"]: - assert itr["correlation_id"]["internal"] in api_corr_ids_unique - - for itr in sdk_data["buffer_records"]["memory_copies"]: - assert itr["correlation_id"]["internal"] in api_corr_ids_unique - - len_corr_id_unq = len(api_corr_ids_unique) - assert len(api_corr_ids) != len_corr_id_unq - assert max(api_corr_ids_sorted) == len_corr_id_unq - - -def test_external_correlation_ids(input_data): - data = input_data - sdk_data = data["rocprofiler-sdk-json-tool"] - - extern_corr_ids = [] - for titr in test_api_traces: - for itr in sdk_data["callback_records"][titr]: - assert itr["correlation_id"]["external"] > 0 - assert itr["thread_id"] == itr["correlation_id"]["external"] - extern_corr_ids.append(itr["correlation_id"]["external"]) - - extern_corr_ids = list(set(sorted(extern_corr_ids))) - for titr in test_api_traces: - for itr in sdk_data["buffer_records"][titr]: - assert itr["correlation_id"]["external"] > 0 - assert itr["thread_id"] == itr["correlation_id"]["external"] - assert itr["thread_id"] in extern_corr_ids - assert itr["correlation_id"]["external"] in extern_corr_ids - - for itr in sdk_data["buffer_records"]["kernel_dispatch"]: - assert itr["correlation_id"]["external"] > 0 - assert itr["correlation_id"]["external"] in extern_corr_ids - - for itr in sdk_data["buffer_records"]["memory_copies"]: - assert itr["correlation_id"]["external"] > 0 - assert itr["correlation_id"]["external"] in extern_corr_ids - - -def test_kernel_ids(input_data): - data = input_data - sdk_data = data["rocprofiler-sdk-json-tool"] - - symbol_info = {} - for itr in sdk_data["callback_records"]["kernel_symbols"]: - phase = itr["phase"] - payload = itr["payload"] - kern_id = payload["kernel_id"] - - assert phase == 1 or phase == 2 - assert kern_id > 0 - if phase == 1: - assert len(payload["kernel_name"]) > 0 - symbol_info[kern_id] = payload - elif phase == 2: - assert payload["kernel_id"] in symbol_info.keys() - assert payload["kernel_name"] == symbol_info[kern_id]["kernel_name"] - - for itr in sdk_data["buffer_records"]["kernel_dispatch"]: - assert itr["dispatch_info"]["kernel_id"] in symbol_info.keys() - - for itr in sdk_data["callback_records"]["kernel_dispatch"]: - assert itr["payload"]["dispatch_info"]["kernel_id"] in symbol_info.keys() - - -def get_allocated_pages(callback_records): - # Get how many pages we allocated - op_idx = get_operation(callback_records, "HIP_RUNTIME_API", "hipHostRegister") - rt_idx, rt_data = get_operation(callback_records, "HIP_RUNTIME_API") - - assert op_idx is not None, f"{rt_idx}:\n{rt_data}" - - host_register_record = [] - for itr in callback_records["hip_api_traces"]: - if itr["kind"] == rt_idx and itr["operation"] == op_idx and itr["phase"] == 2: - assert "sizeBytes" in itr["args"].keys(), f"{itr}" - assert "hostPtr" in itr["args"].keys(), f"{itr}" - host_register_record.append(itr) - - num_host_register_calls = len(host_register_record) - assert num_host_register_calls == 5, "Expected 5 hipHostRegister calls in test" - - ret = [] - for i in range(num_host_register_calls): - alloc_size = int(host_register_record[0]["args"]["sizeBytes"], 10) - start_addr = int(host_register_record[0]["args"]["hostPtr"], 16) - end_addr = start_addr + alloc_size - ret.append((start_addr, end_addr)) - return ret - - -def validate_node(id, nodes): - assert id.handle in nodes - - -def test_page_migration_data(input_data): - data = input_data - sdk_data = data["rocprofiler-sdk-json-tool"] - buffer_records = sdk_data.buffer_records - callback_records = sdk_data.callback_records - page_migtation_buffers = buffer_records.page_migration - - _, bf_op_names = get_operation(buffer_records, "PAGE_MIGRATION") - assert bf_op_names[0] == "PAGE_MIGRATION_NONE" - - for op_name in bf_op_names: - assert "PAGE_MIGRATION" in op_name - - assert len(bf_op_names) == 9 - - nodes = set(x.id.handle for x in sdk_data.agents) - allocations = get_allocated_pages(callback_records) - - for start_addr, end_addr in allocations: - - assert ( - start_addr < end_addr - ), "Expected start address less than end address for mmap range" - alloc_size = end_addr - start_addr - assert int(alloc_size) == 512 * 4096 # We allocated 512 pages in the test - - # PID must be same - assert len(set(r.pid for r in page_migtation_buffers)) == 1 - - for r in page_migtation_buffers: - op = r.operation - - assert r.size == 160 - assert op != 0 and bf_op_names[op] != "PAGE_MIGRATION_NONE" - assert bf_op_names[op].lower().replace("page_migration_", "") in r.keys() - - if "page_fault_start" in r: - arg = r.page_fault_start - assert arg.read_fault < 2 - validate_node(arg.agent_id, nodes) - assert arg.address > 0 - - if "page_fault_end" in r: - arg = r.page_fault_end - assert arg.migrated < 2 - validate_node(arg.agent_id, nodes) - assert arg.address > 0 - - if "page_migrate_start" in r: - arg = r.page_migrate_start - assert ( - 0 < arg.start_addr < arg.end_addr - ), "Expected start addr to be less than end addr" - if arg.start_addr == start_addr: - assert arg.end_addr == end_addr - validate_node(arg.from_agent, nodes) - validate_node(arg.to_agent, nodes) - validate_node(arg.prefetch_agent, nodes) - validate_node(arg.preferred_agent, nodes) - assert 0 <= arg.trigger < 4 - - if "page_migrate_end" in r: - arg = r.page_migrate_end - assert ( - 0 < arg.start_addr < arg.end_addr - ), "Expected start addr to be less than end addr" - if arg.start_addr == start_addr: - assert arg.end_addr == end_addr - validate_node(arg.from_agent, nodes) - validate_node(arg.to_agent, nodes) - assert 0 <= arg.trigger < 4 - - if "queue_eviction" in r: - arg = r.queue_eviction - validate_node(arg.agent_id, nodes) - assert 0 <= arg.trigger < 6 - - if "queue_restore" in r: - arg = r.queue_restore - assert arg.rescheduled < 2 - validate_node(arg.agent_id, nodes) - - if "unmap_from_gpu" in r: - arg = r.unmap_from_gpu - assert ( - 0 < arg.start_addr < arg.end_addr - ), "Expected start addr to be less than end addr" - if arg.start_addr == start_addr: - assert arg.end_addr == end_addr - validate_node(arg.agent_id, nodes) - assert 0 <= arg.trigger < 3 - - if "dropped_event" in r: - arg = r.dropped_event - # We shouldn't get any dropped events. If we do, our test needs to be redesigned. - assert arg.dropped_events_count == 0 - - -if __name__ == "__main__": - exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) - sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index 22c3954c42..24d3e344bb 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -903,7 +903,6 @@ auto memory_copy_bf_records = std::deque{}; auto scratch_memory_records = std::deque{}; -auto page_migration_records = std::deque{}; auto corr_id_retire_records = std::deque{}; auto rccl_api_bf_records = std::deque{}; @@ -912,6 +911,18 @@ auto rocdecode_api_ext_bf_records = std::deque{}; auto rocjpeg_api_bf_records = std::deque{}; auto ompt_bf_records = std::deque{}; +auto kfd_page_migrate_event_records = + std::deque{}; +auto kfd_page_fault_event_records = + std::deque{}; +auto kfd_queue_event_records = std::deque{}; +auto kfd_unmap_from_gpu_event_records = + std::deque{}; +auto kfd_dropped_events_event_records = + std::deque{}; +auto kfd_page_migrate_records = std::deque{}; +auto kfd_page_fault_records = std::deque{}; +auto kfd_queue_records = std::deque{}; void tool_tracing_buffered(rocprofiler_context_id_t /*context*/, @@ -1002,13 +1013,6 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, scratch_memory_records.emplace_back(*record); } - else if(header->kind == ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION) - { - auto* record = static_cast( - header->payload); - - page_migration_records.emplace_back(*record); - } else if(header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT) { auto* record = @@ -1060,6 +1064,66 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, rocjpeg_api_bf_records.emplace_back(*record); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE) + { + auto* record = + static_cast( + header->payload); + + kfd_page_migrate_event_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT) + { + auto* record = + static_cast( + header->payload); + + kfd_page_fault_event_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE) + { + auto* record = static_cast( + header->payload); + + kfd_queue_event_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU) + { + auto* record = + static_cast( + header->payload); + + kfd_unmap_from_gpu_event_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS) + { + auto* record = + static_cast( + header->payload); + + kfd_dropped_events_event_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE) + { + auto* record = static_cast( + header->payload); + + kfd_page_migrate_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT) + { + auto* record = static_cast( + header->payload); + + kfd_page_fault_records.emplace_back(*record); + } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_KFD_QUEUE) + { + auto* record = + static_cast(header->payload); + + kfd_queue_records.emplace_back(*record); + } else { throw std::runtime_error{ @@ -1155,7 +1219,6 @@ rocprofiler_context_id_t scratch_memory_ctx = {0}; rocprofiler_context_id_t corr_id_retire_ctx = {0}; rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0}; rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0}; -rocprofiler_context_id_t page_migration_ctx = {0}; rocprofiler_context_id_t runtime_init_callback_ctx = {}; rocprofiler_context_id_t runtime_init_buffered_ctx = {}; rocprofiler_context_id_t rocdecode_api_callback_ctx = {0}; @@ -1163,24 +1226,39 @@ rocprofiler_context_id_t rocdecode_api_buffered_ctx = {0}; rocprofiler_context_id_t rocdecode_api_ext_buffered_ctx = {0}; rocprofiler_context_id_t rocjpeg_api_callback_ctx = {0}; rocprofiler_context_id_t rocjpeg_api_buffered_ctx = {0}; +rocprofiler_context_id_t page_migrate_event_ctx = {0}; +rocprofiler_context_id_t kfd_page_fault_event_ctx = {0}; +rocprofiler_context_id_t kfd_queue_event_ctx = {0}; +rocprofiler_context_id_t kfd_unmap_from_gpu_event_ctx = {0}; +rocprofiler_context_id_t kfd_droped_events_event_ctx = {0}; +rocprofiler_context_id_t kfd_page_migrate_records_ctx = {0}; +rocprofiler_context_id_t kfd_page_fault_records_ctx = {0}; +rocprofiler_context_id_t kfd_queue_records_ctx = {0}; // buffers -rocprofiler_buffer_id_t runtime_init_buffered_buffer = {}; -rocprofiler_buffer_id_t hsa_api_buffered_buffer = {}; -rocprofiler_buffer_id_t hip_api_buffered_buffer = {}; -rocprofiler_buffer_id_t marker_api_buffered_buffer = {}; -rocprofiler_buffer_id_t kernel_dispatch_buffer = {}; -rocprofiler_buffer_id_t memory_copy_buffer = {}; -rocprofiler_buffer_id_t memory_allocation_buffer = {}; -rocprofiler_buffer_id_t page_migration_buffer = {}; -rocprofiler_buffer_id_t counter_collection_buffer = {}; -rocprofiler_buffer_id_t scratch_memory_buffer = {}; -rocprofiler_buffer_id_t corr_id_retire_buffer = {}; -rocprofiler_buffer_id_t rccl_api_buffered_buffer = {}; -rocprofiler_buffer_id_t rocdecode_api_buffer = {}; -rocprofiler_buffer_id_t rocdecode_api_ext_buffer = {}; -rocprofiler_buffer_id_t rocjpeg_api_buffer = {}; -rocprofiler_buffer_id_t ompt_buffered_buffer = {}; +rocprofiler_buffer_id_t runtime_init_buffered_buffer = {}; +rocprofiler_buffer_id_t hsa_api_buffered_buffer = {}; +rocprofiler_buffer_id_t hip_api_buffered_buffer = {}; +rocprofiler_buffer_id_t marker_api_buffered_buffer = {}; +rocprofiler_buffer_id_t kernel_dispatch_buffer = {}; +rocprofiler_buffer_id_t memory_copy_buffer = {}; +rocprofiler_buffer_id_t memory_allocation_buffer = {}; +rocprofiler_buffer_id_t counter_collection_buffer = {}; +rocprofiler_buffer_id_t scratch_memory_buffer = {}; +rocprofiler_buffer_id_t corr_id_retire_buffer = {}; +rocprofiler_buffer_id_t rccl_api_buffered_buffer = {}; +rocprofiler_buffer_id_t rocdecode_api_buffer = {}; +rocprofiler_buffer_id_t rocdecode_api_ext_buffer = {}; +rocprofiler_buffer_id_t rocjpeg_api_buffer = {}; +rocprofiler_buffer_id_t ompt_buffered_buffer = {}; +rocprofiler_buffer_id_t page_migrate_event_buffer = {}; +rocprofiler_buffer_id_t kfd_page_fault_event_buffer = {}; +rocprofiler_buffer_id_t kfd_queue_event_buffer = {}; +rocprofiler_buffer_id_t kfd_unmap_from_gpu_event_buffer = {}; +rocprofiler_buffer_id_t kfd_droped_events_event_buffer = {}; +rocprofiler_buffer_id_t kfd_page_migrate_records_buffer = {}; +rocprofiler_buffer_id_t kfd_page_fault_records_buffer = {}; +rocprofiler_buffer_id_t kfd_queue_records_buffer = {}; auto contexts = std::unordered_map{ {"RUNTIME_INIT_CALLBACK", &runtime_init_callback_ctx}, @@ -1200,7 +1278,6 @@ auto contexts = std::unordered_map{ {"KERNEL_DISPATCH_BUFFERED", &kernel_dispatch_buffered_ctx}, {"MEMORY_COPY_BUFFERED", &memory_copy_buffered_ctx}, {"MEMORY_ALLOCATION_BUFFERED", &memory_allocation_buffered_ctx}, - {"PAGE_MIGRATION", &page_migration_ctx}, {"COUNTER_COLLECTION", &counter_collection_ctx}, {"SCRATCH_MEMORY", &scratch_memory_ctx}, {"CORRELATION_ID_RETIREMENT", &corr_id_retire_ctx}, @@ -1211,9 +1288,17 @@ auto contexts = std::unordered_map{ {"ROCJPEG_API_CALLBACK", &rocjpeg_api_callback_ctx}, {"ROCJPEG_API_BUFFERED", &rocjpeg_api_buffered_ctx}, {"OMPT_BUFFERED", &ompt_buffered_ctx}, + {"KFD_EVENT_PAGE_MIGRATE", &page_migrate_event_ctx}, + {"KFD_EVENT_PAGE_FAULT", &kfd_page_fault_event_ctx}, + {"KFD_EVENT_QUEUE", &kfd_queue_event_ctx}, + {"KFD_EVENT_UNMAP_FROM_GPU", &kfd_unmap_from_gpu_event_ctx}, + {"KFD_DROPPED_EVENTS", &kfd_droped_events_event_ctx}, + {"KFD_PAGE_MIGRATE", &kfd_page_migrate_records_ctx}, + {"KFD_PAGE_FAULT", &kfd_page_fault_records_ctx}, + {"KFD_QUEUE", &kfd_queue_records_ctx}, }; -auto buffers = std::array{&runtime_init_buffered_buffer, +auto buffers = std::array{&runtime_init_buffered_buffer, &hsa_api_buffered_buffer, &hip_api_buffered_buffer, &marker_api_buffered_buffer, @@ -1221,22 +1306,28 @@ auto buffers = std::array{&runtime_init_buffered_b &memory_copy_buffer, &memory_allocation_buffer, &scratch_memory_buffer, - &page_migration_buffer, &counter_collection_buffer, &corr_id_retire_buffer, &rccl_api_buffered_buffer, &ompt_buffered_buffer, &rocdecode_api_buffer, &rocdecode_api_ext_buffer, - &rocjpeg_api_buffer}; + &rocjpeg_api_buffer, + &kfd_page_fault_event_buffer, + &kfd_queue_event_buffer, + &kfd_unmap_from_gpu_event_buffer, + &kfd_droped_events_event_buffer, + &kfd_page_migrate_records_buffer, + &kfd_page_fault_records_buffer, + &kfd_queue_records_buffer}; auto agents = std::vector{}; auto agents_map = std::unordered_map{}; -rocprofiler_timestamp_t init_time = 0; -rocprofiler_timestamp_t fini_time = 0; -rocprofiler_thread_id_t main_tid = 0; -auto page_migration_status = ROCPROFILER_STATUS_SUCCESS; +rocprofiler_timestamp_t init_time = 0; +rocprofiler_timestamp_t fini_time = 0; +rocprofiler_thread_id_t main_tid = 0; +auto kfd_configure_status = ROCPROFILER_STATUS_SUCCESS; int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) @@ -1505,15 +1596,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &scratch_memory_buffer), "buffer creation"); - ROCPROFILER_CALL(rocprofiler_create_buffer(page_migration_ctx, - buffer_size, - watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - tool_tracing_buffered, - tool_data, - &page_migration_buffer), - "buffer creation"); - ROCPROFILER_CALL(rocprofiler_create_buffer(corr_id_retire_ctx, buffer_size, watermark, @@ -1576,6 +1658,78 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &ompt_buffered_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(page_migrate_event_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &page_migrate_event_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_page_fault_event_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_page_fault_event_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_queue_event_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_queue_event_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_unmap_from_gpu_event_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_unmap_from_gpu_event_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_droped_events_event_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_droped_events_event_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_page_migrate_records_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_page_migrate_records_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_page_fault_records_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_page_fault_records_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(kfd_queue_records_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &kfd_queue_records_buffer), + "buffer creation"); + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( runtime_init_buffered_ctx, ROCPROFILER_BUFFER_TRACING_RUNTIME_INITIALIZATION, @@ -1667,20 +1821,139 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "buffer tracing service for scratch memory configure"); { - page_migration_status = - rocprofiler_configure_buffer_tracing_service(page_migration_ctx, - ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, - nullptr, - 0, - page_migration_buffer); + kfd_configure_status = rocprofiler_configure_buffer_tracing_service( + page_migrate_event_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_MIGRATE, + nullptr, + 0, + page_migrate_event_buffer); constexpr auto message = "buffer tracing service for page migration configure"; - if(page_migration_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) std::cerr << message - << " failed: " << rocprofiler_get_status_string(page_migration_status) + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) << std::endl; else - ROCPROFILER_CALL(page_migration_status, message); + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = rocprofiler_configure_buffer_tracing_service( + kfd_page_fault_event_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_PAGE_FAULT, + nullptr, + 0, + kfd_page_fault_event_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = + rocprofiler_configure_buffer_tracing_service(kfd_queue_event_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_QUEUE, + nullptr, + 0, + kfd_queue_event_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = rocprofiler_configure_buffer_tracing_service( + kfd_unmap_from_gpu_event_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_UNMAP_FROM_GPU, + nullptr, + 0, + kfd_unmap_from_gpu_event_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = rocprofiler_configure_buffer_tracing_service( + kfd_droped_events_event_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_EVENT_DROPPED_EVENTS, + nullptr, + 0, + kfd_droped_events_event_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = rocprofiler_configure_buffer_tracing_service( + kfd_page_migrate_records_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_MIGRATE, + nullptr, + 0, + kfd_page_migrate_records_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = + rocprofiler_configure_buffer_tracing_service(kfd_page_fault_records_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_PAGE_FAULT, + nullptr, + 0, + kfd_page_fault_records_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); + } + + { + kfd_configure_status = + rocprofiler_configure_buffer_tracing_service(kfd_queue_records_ctx, + ROCPROFILER_BUFFER_TRACING_KFD_QUEUE, + nullptr, + 0, + kfd_queue_records_buffer); + + constexpr auto message = "buffer tracing service for page migration configure"; + if(kfd_configure_status == ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL) + std::cerr << message + << " failed: " << rocprofiler_get_status_string(kfd_configure_status) + << std::endl; + else + ROCPROFILER_CALL(kfd_configure_status, message); } ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( @@ -1883,7 +2156,14 @@ tool_fini(void* tool_data) << ", memory_copy_bf_records=" << memory_copy_bf_records.size() << ", memory_allocation_bf_records=" << memory_allocation_bf_records.size() << ", scratch_memory_records=" << scratch_memory_records.size() - << ", page_migration=" << page_migration_records.size() + << ", kfd_page_migrate_event=" << kfd_page_migrate_event_records.size() + << ", kfd_page_fault_event=" << kfd_page_fault_event_records.size() + << ", kfd_queue_event=" << kfd_queue_event_records.size() + << ", kfd_unmap_from_gpu_event=" << kfd_unmap_from_gpu_event_records.size() + << ", kfd_droped_events_event=" << kfd_dropped_events_event_records.size() + << ", kfd_page_migrate_record=" << kfd_page_migrate_records.size() + << ", kfd_page_fault_record=" << kfd_page_fault_records.size() + << ", kfd_queue_record=" << kfd_queue_records.size() << ", runtime_init_bf_records=" << runtime_init_bf_records.size() << ", hsa_api_bf_records=" << hsa_api_bf_records.size() << ", hip_api_bf_records=" << hip_api_bf_records.size() @@ -1955,8 +2235,8 @@ write_json(call_stack_t* _call_stack) auto json_ar = JSONOutputArchive{*ofs, json_opts}; auto buffer_names = sdk::get_buffer_tracing_names(); auto callbk_names = sdk::get_callback_tracing_names(); - auto validate_page_migration = - (page_migration_status != ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL); + auto validate_kfd_events = + (kfd_configure_status != ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL); json_ar.setNextName("rocprofiler-sdk-json-tool"); json_ar.startNode(); @@ -1967,7 +2247,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("main_tid", main_tid)); json_ar(cereal::make_nvp("init_time", init_time)); json_ar(cereal::make_nvp("fini_time", fini_time)); - json_ar(cereal::make_nvp("validate_page_migration", validate_page_migration)); + json_ar(cereal::make_nvp("validate_kfd_events", validate_kfd_events)); json_ar.finishNode(); json_ar(cereal::make_nvp("agents", agents)); @@ -2012,7 +2292,15 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records)); json_ar(cereal::make_nvp("memory_allocations", memory_allocation_bf_records)); json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records)); - json_ar(cereal::make_nvp("page_migration", page_migration_records)); + json_ar(cereal::make_nvp("kfd_page_migrate_events", kfd_page_migrate_event_records)); + json_ar(cereal::make_nvp("kfd_page_fault_events", kfd_page_fault_event_records)); + json_ar(cereal::make_nvp("kfd_queue_events", kfd_queue_event_records)); + json_ar( + cereal::make_nvp("kfd_unmap_from_gpu_events", kfd_unmap_from_gpu_event_records)); + json_ar(cereal::make_nvp("kfd_droped_events", kfd_dropped_events_event_records)); + json_ar(cereal::make_nvp("kfd_page_migrate_records", kfd_page_migrate_records)); + json_ar(cereal::make_nvp("kfd_page_fault_records", kfd_page_fault_records)); + json_ar(cereal::make_nvp("kfd_queue_records", kfd_queue_records)); json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records)); json_ar(cereal::make_nvp("hip_api_traces", hip_api_bf_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_bf_records));