* Page migration reporting support

* Page migration: Update parser and reporting

Container does not lave latest KFD header, so CI might fail

* Add kfd_ioctl.h

* Formatting

* Update get_key

- get key was not used (and shouldn't be), so delete it

* clang-tidy fixes

* Tests for page migration

* Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update tests/bin/page-migration/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update page-migration test app

- add hipHostRegister to register mmap'ed allocation with HIP
- misc cleanup and reorg
- remove HSA_XNACK=1 from test env

* Update lib/rocprofiler-sdk/tests/page_migration.cpp

- fix compilation error

* Minor updates (reorg, rename)

* Page migration reporting support

* Page migration: Update parser and reporting

Container does not lave latest KFD header, so CI might fail

* Update page migration tests, fix trigger types

* Page Migration Tracing Support Refactoring (#753)

* Reorganization

* Update page migration init/fini

* Formatting

* Update page_migration.cpp

- change logging severity

* Skip test if KFD does not support page migration reporting

* Rework skipping test if KFD does not support page migration

* Fix event trigger enum values

* Fix clang-diagnostic-unused-const-variable

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com>
Этот коммит содержится в:
Mythreya
2024-04-12 13:51:44 -07:00
коммит произвёл GitHub
родитель 0e83f48cd5
Коммит fd3d97287c
28 изменённых файлов: 4455 добавлений и 13 удалений
+94 -3
Просмотреть файл
@@ -26,6 +26,8 @@
#include <rocprofiler-sdk/defines.h>
#include <rocprofiler-sdk/fwd.h>
#include <stdint.h>
ROCPROFILER_EXTERN_C_INIT
/**
@@ -35,6 +37,49 @@ ROCPROFILER_EXTERN_C_INIT
* @{
*/
/**
* @brief Page migration triggers
*
*/
typedef enum
{
ROCPROFILER_PAGE_MIGRATION_TRIGGER_NONE = -1,
ROCPROFILER_PAGE_MIGRATION_TRIGGER_PREFETCH,
ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_GPU,
ROCPROFILER_PAGE_MIGRATION_TRIGGER_PAGEFAULT_CPU,
ROCPROFILER_PAGE_MIGRATION_TRIGGER_TTM_EVICTION,
ROCPROFILER_PAGE_MIGRATION_TRIGGER_LAST,
} rocprofiler_page_migration_trigger_t;
/**
* @brief Page migration triggers causing the queue to suspend
*
*/
typedef enum
{
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_NONE = -1,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SVM,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_USERPTR,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_TTM,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_SUSPEND,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_CHECKPOINT,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_CRIU_RESTORE,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND_TRIGGER_LAST,
} rocprofiler_page_migration_queue_suspend_trigger_t;
/**
* @brief Page migration triggers causing an unmap from the GPU
*
*/
typedef enum
{
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_NONE = -1,
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY,
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_MMU_NOTIFY_MIGRATE,
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_UNMAP_FROM_CPU,
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU_TRIGGER_LAST,
} rocprofiler_page_migration_unmap_from_gpu_trigger_t;
/**
* @brief ROCProfiler Buffer HSA API Tracer Record.
*/
@@ -150,17 +195,63 @@ typedef struct rocprofiler_buffer_tracing_kernel_dispatch_record_t
/// @brief runtime grid size
} rocprofiler_buffer_tracing_kernel_dispatch_record_t;
typedef struct
{
uint8_t read_fault : 1; ///< Is the fault due to a read or a write
uint8_t migrated : 1;
uint32_t node_id; ///< GPU or CPU node ID which reports a page fault
uint64_t address; ///< Address access that caused the page fault
} rocprofiler_buffer_tracing_page_migration_page_fault_record_t;
typedef struct
{
uint64_t start_addr; ///< Start address of the page being migrated
uint64_t end_addr; ///< End address of the page being migrated
uint32_t from_node; ///< Source node
uint32_t to_node; ///< Destination node
uint32_t prefetch_node; ///< Node from which page was prefetched
uint32_t preferred_node; ///< Preferred destinaion node
rocprofiler_page_migration_trigger_t trigger; ///< Cause of migration
} rocprofiler_buffer_tracing_page_migration_page_migrate_record_t;
typedef struct
{
uint8_t rescheduled : 1;
uint32_t node_id; ///< GPU node from which the queue was suspended
rocprofiler_page_migration_queue_suspend_trigger_t trigger; ///< Cause of queue suspension
} rocprofiler_buffer_tracing_page_migration_queue_suspend_record_t;
typedef struct
{
uint32_t node_id; ///< Node ID from which page was unmapped
uint64_t start_addr; ///< Start address of unmapped page
uint64_t end_addr; ///< End address of unmapped page
rocprofiler_page_migration_unmap_from_gpu_trigger_t trigger; ///< Cause of unmap
} rocprofiler_buffer_tracing_page_migration_unmap_from_gpu_record_t;
/**
* @brief ROCProfiler Buffer Page Migration Tracer Record. Not implemented.
* @brief ROCProfiler Buffer Page Migration Tracer Record
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind; ///< ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_tracing_operation_t operation;
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
// Not Sure What is the info needed here?
uint32_t pid;
union
{
rocprofiler_buffer_tracing_page_migration_page_fault_record_t page_fault;
rocprofiler_buffer_tracing_page_migration_page_migrate_record_t page_migrate;
rocprofiler_buffer_tracing_page_migration_queue_suspend_record_t queue_suspend;
rocprofiler_buffer_tracing_page_migration_unmap_from_gpu_record_t unmap_from_gpu;
struct
{
uint64_t reserved[12];
};
};
} rocprofiler_buffer_tracing_page_migration_record_t;
/**
+19
Просмотреть файл
@@ -89,6 +89,10 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_STATUS_ERROR_AST_GENERATION_FAILED, ///< AST could not be generated correctly
ROCPROFILER_STATUS_ERROR_AST_NOT_FOUND, ///< AST was not found
ROCPROFILER_STATUS_ERROR_AQL_NO_EVENT_COORD, ///< Event coordinate was not found by AQL profile
ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL, ///< A service depends on a newer version of KFD
///< (amdgpu kernel driver). Check logs for
///< service that report incompatibility
ROCPROFILER_STATUS_LAST,
} rocprofiler_status_t;
@@ -202,6 +206,21 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_MEMORY_COPY_LAST,
} rocprofiler_memory_copy_operation_t;
/**
* @brief Page migration event.
*/
typedef enum // NOLINT(performance-enum-size)
{
ROCPROFILER_PAGE_MIGRATION_NONE = 0, ///< Unknown event
ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE,
ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT,
ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND,
ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU,
// Any and all events, from all processes. Requires superuser
// ROCPROFILER_PAGE_MIGRATION_ANY_ALL_PROCESSES,
ROCPROFILER_PAGE_MIGRATION_LAST,
} rocprofiler_page_migration_operation_t;
/**
* @brief ROCProfiler Kernel Dispatch Tracing Operation Types.
*/