Page migration reporting (#651)
* Page migration reporting support * Page migration: Update parser and reporting Container does not have latest KFD header, so CI might fail * Add kfd_ioctl.h * Formatting * Update get_key - get key was not used (and shouldn't be), so delete it * clang-tidy fixes * Tests for page migration * Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update tests/bin/page-migration/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update page-migration test app - add hipHostRegister to register mmap'ed allocation with HIP - misc cleanup and reorg - remove HSA_XNACK=1 from test env * Update lib/rocprofiler-sdk/tests/page_migration.cpp - fix compilation error * Minor updates (reorg, rename) * Page migration reporting support * Page migration: Update parser and reporting Container does not have latest KFD header, so CI might fail * Update page migration tests, fix trigger types * Page Migration Tracing Support Refactoring (#753) * Reorganization * Update page migration init/fini * Formatting * Update page_migration.cpp - change logging severity * Skip test if KFD does not support page migration reporting * Rework skipping test if KFD does not support page migration * Fix event trigger enum values * Fix clang-diagnostic-unused-const-variable --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com> Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com>
Этот коммит содержится в:
@@ -26,6 +26,8 @@
|
||||
#include <rocprofiler-sdk/defines.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
ROCPROFILER_EXTERN_C_INIT
|
||||
|
||||
/**
|
||||
@@ -35,6 +37,49 @@ ROCPROFILER_EXTERN_C_INIT
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
 * @brief Page migration triggers.
 *
 * Identifies the cause of a page migration. The "none" value is pinned to -1, so the
 * remaining enumerators count up from 0 in declaration order.
 *
 * NOTE(review): the ordering presumably mirrors the trigger values reported by KFD --
 * confirm against kfd_ioctl.h before inserting or reordering entries.
 */
typedef enum
{
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_NONE = -1,      ///< No trigger (value -1)
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_PREFETCH,       ///< Prefetch (value 0)
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_GPU,  ///< GPU page fault (value 1)
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_CPU,  ///< CPU page fault (value 2)
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION,   ///< TTM eviction (value 3)
    ROCPROFILER_PAGE_MIGRATION_TRIGGER_LAST,  ///< Sentinel: one past the last valid trigger
} rocprofiler_page_migration_trigger_t;
|
||||
|
||||
/**
 * @brief Page migration triggers causing the queue to suspend.
 *
 * The "none" value is pinned to -1, so the remaining enumerators count up from 0 in
 * declaration order.
 *
 * NOTE(review): the ordering presumably mirrors the trigger values reported by KFD --
 * confirm against kfd_ioctl.h before inserting or reordering entries.
 */
typedef enum
{
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_NONE = -1,        ///< No trigger (value -1)
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM,              ///< value 0
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_USERPTR,          ///< value 1
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_TTM,              ///< value 2
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SUSPEND,          ///< value 3
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_CHECKPOINT,  ///< value 4
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE,     ///< value 5
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_LAST,  ///< Sentinel: one past the last
                                                            ///< valid trigger
} rocprofiler_page_migration_queue_suspend_trigger_t;
|
||||
|
||||
/**
 * @brief Page migration triggers causing an unmap from the GPU.
 *
 * The "none" value is pinned to -1, so the remaining enumerators count up from 0 in
 * declaration order.
 *
 * NOTE(review): the ordering presumably mirrors the trigger values reported by KFD --
 * confirm against kfd_ioctl.h before inserting or reordering entries.
 */
typedef enum
{
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_NONE = -1,           ///< No trigger (-1)
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY,          ///< value 0
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY_MIGRATE,  ///< value 1
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU,      ///< value 2
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_LAST,  ///< Sentinel: one past the last
                                                             ///< valid trigger
} rocprofiler_page_migration_unmap_from_gpu_trigger_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Buffer HSA API Tracer Record.
|
||||
*/
|
||||
@@ -150,17 +195,63 @@ typedef struct rocprofiler_buffer_tracing_kernel_dispatch_record_t
|
||||
/// @brief runtime grid size
|
||||
} rocprofiler_buffer_tracing_kernel_dispatch_record_t;
|
||||
|
||||
/**
 * @brief Page fault payload of a page migration tracing record.
 *
 * NOTE(review): the two flags are bit-fields declared with a `uint8_t` base type, which is
 * an implementation-defined bit-field type in ISO C. The layout is left untouched here
 * because changing it could alter the ABI.
 */
typedef struct
{
    uint8_t  read_fault : 1;  ///< Is the fault due to a read or a write
    uint8_t  migrated : 1;    ///< NOTE(review): presumably set when the fault led to a
                              ///< migration -- confirm against the producer
    uint32_t node_id;         ///< GPU or CPU node ID which reports a page fault
    uint64_t address;         ///< Address access that caused the page fault
} rocprofiler_buffer_tracing_page_migration_page_fault_record_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint64_t start_addr; ///< Start address of the page being migrated
|
||||
uint64_t end_addr; ///< End address of the page being migrated
|
||||
uint32_t from_node; ///< Source node
|
||||
uint32_t to_node; ///< Destination node
|
||||
uint32_t prefetch_node; ///< Node from which page was prefetched
|
||||
uint32_t preferred_node; ///< Preferred destinaion node
|
||||
rocprofiler_page_migration_trigger_t trigger; ///< Cause of migration
|
||||
} rocprofiler_buffer_tracing_page_migration_page_migrate_record_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint8_t rescheduled : 1;
|
||||
uint32_t node_id; ///< GPU node from which the queue was suspended
|
||||
rocprofiler_page_migration_queue_suspend_trigger_t trigger; ///< Cause of queue suspension
|
||||
} rocprofiler_buffer_tracing_page_migration_queue_suspend_record_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t node_id; ///< Node ID from which page was unmapped
|
||||
uint64_t start_addr; ///< Start address of unmapped page
|
||||
uint64_t end_addr; ///< End address of unmapped page
|
||||
rocprofiler_page_migration_unmap_from_gpu_trigger_t trigger; ///< Cause of unmap
|
||||
} rocprofiler_buffer_tracing_page_migration_unmap_from_gpu_record_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Buffer Page Migration Tracer Record. Not implemented.
|
||||
* @brief ROCProfiler Buffer Page Migration Tracer Record
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_buffer_tracing_kind_t kind; ///< ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION
|
||||
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
|
||||
rocprofiler_tracing_operation_t operation;
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
// Not Sure What is the info needed here?
|
||||
uint32_t pid;
|
||||
|
||||
union
|
||||
{
|
||||
rocprofiler_buffer_tracing_page_migration_page_fault_record_t page_fault;
|
||||
rocprofiler_buffer_tracing_page_migration_page_migrate_record_t page_migrate;
|
||||
rocprofiler_buffer_tracing_page_migration_queue_suspend_record_t queue_suspend;
|
||||
rocprofiler_buffer_tracing_page_migration_unmap_from_gpu_record_t unmap_from_gpu;
|
||||
struct
|
||||
{
|
||||
uint64_t reserved[12];
|
||||
};
|
||||
};
|
||||
} rocprofiler_buffer_tracing_page_migration_record_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -89,6 +89,10 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_STATUS_ERROR_AST_GENERATION_FAILED, ///< AST could not be generated correctly
|
||||
ROCPROFILER_STATUS_ERROR_AST_NOT_FOUND, ///< AST was not found
|
||||
ROCPROFILER_STATUS_ERROR_AQL_NO_EVENT_COORD, ///< Event coordinate was not found by AQL profile
|
||||
ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL, ///< A service depends on a newer version of KFD
|
||||
///< (amdgpu kernel driver). Check logs for
|
||||
///< service that report incompatibility
|
||||
|
||||
ROCPROFILER_STATUS_LAST,
|
||||
} rocprofiler_status_t;
|
||||
|
||||
@@ -202,6 +206,21 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_MEMORY_COPY_LAST,
|
||||
} rocprofiler_memory_copy_operation_t;
|
||||
|
||||
/**
 * @brief Page migration event.
 *
 * Enumerators count up from 0 in declaration order; the final entry is a sentinel.
 */
typedef enum  // NOLINT(performance-enum-size)
{
    ROCPROFILER_PAGE_MIGRATION_NONE = 0,        ///< Unknown event
    ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE,    ///< value 1
    ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT,      ///< value 2
    ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND,   ///< value 3
    ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU,  ///< value 4
    // Not currently exposed: any and all events, from all processes. Requires superuser.
    // ROCPROFILER_PAGE_MIGRATION_ANY_ALL_PROCESSES,
    ROCPROFILER_PAGE_MIGRATION_LAST,  ///< Sentinel: one past the last valid event
} rocprofiler_page_migration_operation_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Kernel Dispatch Tracing Operation Types.
|
||||
*/
|
||||
|
||||
Ссылка в новой задаче
Block a user