From 5ef0b3c34d8db5d0717e0776ff39115b0e12a41e Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Wed, 1 Oct 2025 15:46:10 -0500 Subject: [PATCH] [SWDEV-547088] Dynamic GPU Metrics Implementation (#692) * Added ability to format gpu_metrics v1_9 * New gpu_metrics format from the driver should allow amd-smi to parse with future compatibility guaranteed --------- Signed-off-by: Oliveira, Daniel Signed-off-by: adapryor Co-authored-by: Oliveira, Daniel --- CMakeLists.txt | 2 + amdsmi_cli/amdsmi_commands.py | 2 +- rocm_smi/include/rocm_smi/rocm_smi_device.h | 5 + .../rocm_smi/rocm_smi_dyn_gpu_metrics.h | 1253 +++++++++++++++++ .../include/rocm_smi/rocm_smi_gpu_metrics.h | 44 +- rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc | 333 +++++ rocm_smi/src/rocm_smi_gpu_metrics.cc | 550 +++++++- 7 files changed, 2165 insertions(+), 24 deletions(-) create mode 100644 rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h create mode 100644 rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index e2e24085e3..1b7375ff0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,7 @@ set(CMN_SRC_LIST "${ROCM_SRC_DIR}/rocm_smi_kfd.cc" "${ROCM_SRC_DIR}/rocm_smi_io_link.cc" "${ROCM_SRC_DIR}/rocm_smi_gpu_metrics.cc" + "${ROCM_SRC_DIR}/rocm_smi_dyn_gpu_metrics.cc" "${ROCM_SRC_DIR}/rocm_smi.cc" "${ROCM_SRC_DIR}/rocm_smi_logger.cc" "${SHR_MUTEX_DIR}/shared_mutex.cc" @@ -271,6 +272,7 @@ set(CMN_INC_LIST "${ROCM_INC_DIR}/rocm_smi_kfd.h" "${ROCM_INC_DIR}/rocm_smi_io_link.h" "${ROCM_INC_DIR}/rocm_smi_gpu_metrics.h" + "${ROCM_INC_DIR}/rocm_smi_dyn_gpu_metrics.h" "${ROCM_INC_DIR}/rocm_smi.h" "${ROCM_INC_DIR}/rocm_smi_logger.h" "${SHR_MUTEX_DIR}/shared_mutex.h" diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 6a3c68ffbe..776cf04924 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1698,7 +1698,7 @@ class AMDSMICommands(): partition_id = "N/A" num_partition = gpu_metric['num_partition'] - if 
num_partition == "N/A" and isinstance(partition_id, int) and partition_id > 0: + if num_partition == "N/A": num_partition = 1 # Workaround for XCP metrics not providing num_partition in v1.0 logging.debug(f"num_partition is N/A and partition_id: {partition_id} (greater > 0).\nModified num_partition: {num_partition} to adjust for XCP metrics.") diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index 64b1b763ba..73f6140836 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -271,6 +271,8 @@ class Device { rsmi_status_t get_smi_device_identifiers(uint32_t device_id, rsmi_device_identifiers_t *device_identifiers); + auto is_dynamic_gpu_metrics_supported() const -> bool { return m_is_dynamic_gpu_metrics_supported; } + private: std::shared_ptr monitor_; std::shared_ptr power_monitor_; @@ -308,6 +310,9 @@ class Device { uint64_t m_gpu_metrics_updated_timestamp; uint32_t m_device_id; uint32_t m_partition_id; + + // New dynamic GPU metrics support + bool m_is_dynamic_gpu_metrics_supported = false; }; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h b/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h new file mode 100644 index 0000000000..ec923a0cc2 --- /dev/null +++ b/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h @@ -0,0 +1,1253 @@ +/* + * MIT License + * + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Developed by: + * + * AMD ML Software Engineering + * + * Advanced Micro Devices, Inc. 
+ * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of Advanced Micro Devices, Inc, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + * + */ + + +#ifndef ROCM_SMI_ROCM_SMI_DYN_GPU_METRICS_H_ +#define ROCM_SMI_ROCM_SMI_DYN_GPU_METRICS_H_ + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace amd::smi +{ + + +/* + * NOTES: + * + * For the new dynamic metrics implementation, we need to define a `schema`. + * The `schema` defines the `types` of the `attributes` (or `properties`, much like a syntax) it + * defines the type of data that can be stored in an attribute. It acts as a blueprint. + * + * If we think of the metrics system as a database, the schema is like the table structure. + * It defines the fields (attributes) that can be stored, their types, and any constraints on them. + * This allows for a flexible and extensible system where new metrics can be added without + * needing to change the underlying codebase significantly. + * + */ + +namespace details +{ +/* + * NOTE: + * Namespace for internal details of 'dynamic gpu metrics'. + * This namespace contains implementation details that are not intended for public use. + * It is used to encapsulate the internal workings of the dynamic GPU metrics system. + * This allows for better organization of code and separation of concerns. + * The public API will interact with this namespace, but the details will be hidden from the user. + * This is a common practice in C++ to keep the public interface clean and maintainable. + * + * Guidelines for using namespace details: + * ----------------------------------------- + * * Use namespace details in headers for: + * - Implementation-specific types (used in templates or PImpl). + * - SFINAE helpers, type traits, or metaprogramming utilities. + * - Internal functions needed for templates/inline functions. + * - Internal constants or enums that are not part of the public API. 
+ * - Internal classes or structs that are not meant for public use. + * + * * Use namespace details in implementation files for: + * - Helper functions/constants not meant for public use. + * - Internal state management (e.g., PImpl details). + * * Avoid exposing namespace details in documentation or public API. + * + * This improves encapsulation and prevents users from relying on internal details that may change. + * + */ + +/* + * Data types for the attributes + */ +enum class AMDGpuMetricAttributeType_t +{ + TYPE_UINT8, + TYPE_INT8, + TYPE_UINT16, + TYPE_INT16, + TYPE_UINT32, + TYPE_INT32, + TYPE_UINT64, + TYPE_INT64 +}; + +/* + * Attribute IDs for the GPU metrics + */ +enum class AMDGpuMetricAttributeId_t +{ + TEMPERATURE_HOTSPOT, + TEMPERATURE_MEM, + TEMPERATURE_VRSOC, + CURR_SOCKET_POWER, + AVERAGE_GFX_ACTIVITY, + AVERAGE_UMC_ACTIVITY, + MEM_MAX_BANDWIDTH, + ENERGY_ACCUMULATOR, + SYSTEM_CLOCK_COUNTER, + ACCUMULATION_COUNTER, + PROCHOT_RESIDENCY_ACC, + PPT_RESIDENCY_ACC, + SOCKET_THM_RESIDENCY_ACC, + VR_THM_RESIDENCY_ACC, + HBM_THM_RESIDENCY_ACC, + GFXCLK_LOCK_STATUS, + PCIE_LINK_WIDTH, + PCIE_LINK_SPEED, + XGMI_LINK_WIDTH, + XGMI_LINK_SPEED, + GFX_ACTIVITY_ACC, + MEM_ACTIVITY_ACC, + PCIE_BANDWIDTH_ACC, + PCIE_BANDWIDTH_INST, + PCIE_L0_TO_RECOV_COUNT_ACC, + PCIE_REPLAY_COUNT_ACC, + PCIE_REPLAY_ROVER_COUNT_ACC, + PCIE_NAK_SENT_COUNT_ACC, + PCIE_NAK_RCVD_COUNT_ACC, + XGMI_READ_DATA_ACC, + XGMI_WRITE_DATA_ACC, + XGMI_LINK_STATUS, + FIRMWARE_TIMESTAMP, + CURRENT_GFXCLK, + CURRENT_SOCCLK, + CURRENT_VCLK0, + CURRENT_DCLK0, + CURRENT_UCLK, + NUM_PARTITION, + PCIE_LC_PERF_OTHER_END_RECOVERY, + GFX_BUSY_INST, + JPEG_BUSY, + VCN_BUSY, + GFX_BUSY_ACC, + GFX_BELOW_HOST_LIMIT_PPT_ACC, + GFX_BELOW_HOST_LIMIT_THM_ACC, + GFX_LOW_UTILIZATION_ACC, + GFX_BELOW_HOST_LIMIT_TOTAL_ACC, +}; + +struct AMDGpuDynamicTranslationTextInfo_t +{ + public: + std::string m_short_info; + std::string m_long_info; + + private: + +}; +using AMDGpuMetricAttributeIdTranslationTable_t = 
std::unordered_map; + +static const auto AMDGpuMetricAttributeIdToString = AMDGpuMetricAttributeIdTranslationTable_t { + {AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT, {"TEMPERATURE_HOTSPOT", "Temperature of the GPU hotspot"}}, + {AMDGpuMetricAttributeId_t::TEMPERATURE_MEM, {"TEMPERATURE_MEM", "Temperature of the GPU memory"}}, + {AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC, {"TEMPERATURE_VRSOC", "Temperature of the VR SOC"}}, + {AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER, {"CURR_SOCKET_POWER", "Current power consumption of the socket"}}, + {AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY, {"AVERAGE_GFX_ACTIVITY", "Average GPU activity percentage"}}, + {AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY, {"AVERAGE_UMC_ACTIVITY", "Average UMC activity percentage"}}, + {AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH, {"MEM_MAX_BANDWIDTH", "Maximum memory bandwidth in GB/s"}}, + {AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR, {"ENERGY_ACCUMULATOR", "Energy consumed in Joules"}}, + {AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER, {"SYSTEM_CLOCK_COUNTER", "System clock counter in nanoseconds"}}, + {AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER, {"ACCUMULATION_COUNTER", "Counter for accumulated metrics"}}, + {AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC, {"PROCHOT_RESIDENCY_ACC", "Accumulator for 'Processor Hot' residency time"}}, + {AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC, {"PPT_RESIDENCY_ACC", "Accumulator for 'Package Power Tracking' residency time"}}, + {AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC, {"SOCKET_THM_RESIDENCY_ACC", "Accumulator for socket thermal residency time"}}, + {AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC, {"VR_THM_RESIDENCY_ACC", "Accumulator for 'Voltage Regulator' thermal residency time"}}, + {AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC, {"HBM_THM_RESIDENCY_ACC", "Accumulator for 'High Bandwidth Memory' thermal residency time"}}, + {AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS, {"GFXCLK_LOCK_STATUS", "Status of GFX 
clock lock"}}, + {AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH, {"PCIE_LINK_WIDTH", "Width of the PCIe link"}}, + {AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED, {"PCIE_LINK_SPEED", "Speed of the PCIe link"}}, + {AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH, {"XGMI_LINK_WIDTH", "Width of the XGMI link"}}, + {AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED, {"XGMI_LINK_SPEED", "Speed of the XGMI link"}}, + {AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC, {"GFX_ACTIVITY_ACC", "Accumulator for GFX activity"}}, + {AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC, {"MEM_ACTIVITY_ACC", "Accumulator for memory activity"}}, + {AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC, {"PCIE_BANDWIDTH_ACC", "Accumulator for PCIe bandwidth"}}, + {AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST, {"PCIE_BANDWIDTH_INST", "Instantaneous PCIe bandwidth"}}, + {AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC, {"PCIE_L0_TO_RECOV_COUNT_ACC", "Accumulator for PCIe L0 to recovery count"}}, + {AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC, {"PCIE_REPLAY_COUNT_ACC", "Accumulator for PCIe replay count"}}, + {AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC, {"PCIE_REPLAY_ROVER_COUNT_ACC", "Accumulator for PCIe replay rover count"}}, + {AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC, {"PCIE_NAK_SENT_COUNT_ACC", "Accumulator for PCIe NAK sent count"}}, + {AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC, {"PCIE_NAK_RCVD_COUNT_ACC", "Accumulator for PCIe NAK received count"}}, + {AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC, {"XGMI_READ_DATA_ACC", "Accumulator for XGMI read data"}}, + {AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC, {"XGMI_WRITE_DATA_ACC", "Accumulator for XGMI write data"}}, + {AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS, {"XGMI_LINK_STATUS", "Status of the XGMI link"}}, + {AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP, {"Firmware Timestamp", "Timestamp from the firmware"}}, /* NOTE(review): short name is not spelled like the enumerator, unlike the other entries of this table - confirm this is intended */ + {AMDGpuMetricAttributeId_t::CURRENT_GFXCLK, {"CURRENT_GFXCLK", "Current GFX clock frequency in MHz"}}, +
{AMDGpuMetricAttributeId_t::CURRENT_SOCCLK, {"CURRENT_SOCCLK", "Current SOC clock frequency in MHz"}}, + {AMDGpuMetricAttributeId_t::CURRENT_VCLK0, {"CURRENT_VCLK0", "Current VCLK0 frequency in MHz"}}, + {AMDGpuMetricAttributeId_t::CURRENT_DCLK0, {"CURRENT_DCLK0", "Current DCLK0 frequency in MHz"}}, + {AMDGpuMetricAttributeId_t::CURRENT_UCLK, {"CURRENT_UCLK", "Current UCLK frequency in MHz"}}, + {AMDGpuMetricAttributeId_t::NUM_PARTITION, {"NUM_PARTITION", "Number of GPU partitions"}}, + {AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY, {"PCIE_LC_PERF_OTHER_END_RECOVERY", "PCIe link controller performance other end recovery"}}, + {AMDGpuMetricAttributeId_t::GFX_BUSY_INST, {"GFX_BUSY_INST", "Instantaneous GFX busy percentage"}}, + {AMDGpuMetricAttributeId_t::JPEG_BUSY, {"JPEG_BUSY", "JPEG engine busy percentage"}}, + {AMDGpuMetricAttributeId_t::VCN_BUSY, {"VCN_BUSY", "Video Core Next engine busy percentage"}}, + {AMDGpuMetricAttributeId_t::GFX_BUSY_ACC, {"GFX_BUSY_ACC", "Accumulator for GFX busy percentage"}}, + {AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC, {"GFX_BELOW_HOST_LIMIT_PPT_ACC", "Accumulator for GFX below host limit due to PPT"}}, + {AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC, {"GFX_BELOW_HOST_LIMIT_THM_ACC", "Accumulator for GFX below host limit due to thermal"}}, + {AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC, {"GFX_LOW_UTILIZATION_ACC", "Accumulator for GFX low utilization"}}, + {AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC, {"GFX_BELOW_HOST_LIMIT_TOTAL_ACC", "Total accumulator for GFX below host limit"}}, +}; + + +/* + * Unit types used by attribute instances + */ +enum class AMDGpuMetricUnitType_t +{ + NONE, + + /* + * Temperature units + */ + CELSIUS, + CELSIUS_ACCUMULATOR, + + /* + * Bandwidth/Data Rate units + */ + BIT_PER_SECOND, + BYTE_PER_SECOND, + KILOBYTE_PER_SECOND, + KILOBYTE_PER_SECOND_ACCUMULATOR, + GIGABYTE_PER_SECOND, + GIGABYTE_PER_SECOND_ACCUMULATOR, + + /* + * Power/Energy units 
+ */ + WATT, + JOULE, + + /* + * Electrical units + */ + VOLTAGE, + + /* + * Time/Frequency units + */ + TIMESTAMP_NANOSECONDS, + CLOCK_MEGAHERTZ, + + /* + * Unitless or generic units + */ + PERCENT, + COUNT_ACCUMULATOR, + QUANTITY, + STATUS_FLAG +}; +using AMDGpuMetricUnitTypeTranslationTable_t = std::unordered_map; + +static const auto AMDGpuMetricUnitTypeToString = AMDGpuMetricUnitTypeTranslationTable_t { + {AMDGpuMetricUnitType_t::NONE, {"NONE", "No unit"}}, + {AMDGpuMetricUnitType_t::CELSIUS, {"CELSIUS", "Temperature (°C)"}}, + {AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR, {"CELSIUS_ACCUMULATOR", "Accumulated temperature counter (°C)"}}, + {AMDGpuMetricUnitType_t::BIT_PER_SECOND, {"BIT_PER_SECOND", "Throughput (bit/s)"}}, + {AMDGpuMetricUnitType_t::BYTE_PER_SECOND, {"BYTE_PER_SECOND", "Throughput (B/s)"}}, + {AMDGpuMetricUnitType_t::KILOBYTE_PER_SECOND, {"KILOBYTE_PER_SECOND", "Throughput (KB/s)"}}, + {AMDGpuMetricUnitType_t::KILOBYTE_PER_SECOND_ACCUMULATOR, {"KILOBYTE_PER_SECOND_ACCUMULATOR", "Accumulated KB/s counter"}}, + {AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND, {"GIGABYTE_PER_SECOND", "Throughput (GB/s)"}}, + {AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND_ACCUMULATOR, {"GIGABYTE_PER_SECOND_ACCUMULATOR", "Accumulated GB/s counter"}}, + {AMDGpuMetricUnitType_t::WATT, {"WATT", "Power (W)"}}, + {AMDGpuMetricUnitType_t::JOULE, {"JOULE", "Energy (J)"}}, + {AMDGpuMetricUnitType_t::VOLTAGE, {"VOLTAGE", "Voltage (V)"}}, + {AMDGpuMetricUnitType_t::TIMESTAMP_NANOSECONDS, {"TIMESTAMP_NANOSECONDS", "Timestamp / time (ns)"}}, + {AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ, {"CLOCK_MEGAHERTZ", "Frequency (MHz)"}}, + {AMDGpuMetricUnitType_t::PERCENT, {"PERCENT", "Percentage (%)"}}, + {AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR, {"COUNT_ACCUMULATOR", "Monotonic count"}}, + {AMDGpuMetricUnitType_t::QUANTITY, {"QUANTITY", "Unitless Quantity"}}, + {AMDGpuMetricUnitType_t::STATUS_FLAG, {"STATUS_FLAG", "Status bit/flag (bitmask)"}}, +}; + +/* + * Header structure for dynamic GPU 
metrics + */ +struct AMDGpuDynamicMetricsHeader_v1_t +{ + public: + uint16_t m_structure_size; + uint8_t m_format_revision; + uint8_t m_content_revision; + + static constexpr auto get_size() -> std::size_t + { + return sizeof(AMDGpuDynamicMetricsHeader_v1_t); + } + + + private: + +}; + +using AMDGpuDynamicMetricsVersion_t = std::set>; + + +/* + * Forward declaration: returns the size in bytes of the data type selected by `attrib_type` + */ +constexpr auto get_metric_data_type_size(AMDGpuMetricAttributeType_t attrib_type) -> std::size_t; + +struct AMDGpuMetricAttributeInstance_t +{ + public: + std::string m_name; + std::string m_description; + AMDGpuMetricAttributeId_t m_attribute_id{}; /* value-initialized so a default-constructed instance never holds an indeterminate enum */ + AMDGpuMetricAttributeType_t m_attribute_type{}; + AMDGpuMetricUnitType_t m_unit_type{}; + + AMDGpuMetricAttributeInstance_t() = default; + + AMDGpuMetricAttributeInstance_t(const std::string& name, + const std::string& description, + AMDGpuMetricAttributeId_t attribute_id, + AMDGpuMetricAttributeType_t attribute_type, + AMDGpuMetricUnitType_t unit_type) + : m_name(name), + m_description(description), + m_attribute_id(attribute_id), + m_attribute_type(attribute_type), + m_unit_type(unit_type) + { + m_unique_id = get_unique_attribute_id(attribute_id, attribute_type); + + /* + * The availability version is a set of pairs representing the major and minor version. + * This allows for tracking the availability of the metric across different versions. + * For now, we initialize it to the single placeholder version {0, 0}. NOTE(review): presumably this stands for 'available in all versions' - confirm.
+ */ + m_availability_version = {{0, 0}}; + } + + AMDGpuMetricAttributeInstance_t(const std::string& name, + const std::string& description, + AMDGpuMetricAttributeId_t attribute_id, + AMDGpuMetricAttributeType_t attribute_type, + AMDGpuMetricUnitType_t unit_type, + const AMDGpuDynamicMetricsVersion_t& availability_version) + : m_name(name), + m_description(description), + m_attribute_id(attribute_id), + m_attribute_type(attribute_type), + m_unit_type(unit_type), + m_availability_version(availability_version) + { + m_unique_id = get_unique_attribute_id(attribute_id, attribute_type); + } + + + /* + * Compute the unique ID for an attribute ID / attribute type pair (does not read instance state). + */ + constexpr auto get_unique_attribute_id(AMDGpuMetricAttributeId_t attribute_id, AMDGpuMetricAttributeType_t attribute_type) -> std::uint64_t + { + /* + * The unique ID is calculated based on the attribute ID and type. + * This allows for a unique identifier for each metric instance. + * + * Example: + * If attribute_id is TEMPERATURE_MEM (1) and attribute_type is TYPE_INT32 (5), + * then m_unique_id will be 1 * 100 + 5 = 105. + * + * We might need to revisit this, but for now, it serves as a unique identifier.
+ */ + return (static_cast(attribute_id) * 100 + static_cast(attribute_type)); + } + + constexpr auto get_type_size() const -> std::size_t + { + return get_metric_data_type_size(m_attribute_type); + } + + + private: + std::uint64_t m_unique_id{0}; /* stays 0 for a default-constructed instance; the other constructors overwrite it */ + AMDGpuDynamicMetricsVersion_t m_availability_version; + +}; + + +/* + * Based on supported value types in `AMDGpuMetricAttributeType_t` + */ +using AMDGpuMetricAttributeValue_t = std::variant, std::vector, + std::vector, std::vector, + std::vector, std::vector, + std::vector, std::vector>; + + +struct AMDGpuMetricValueDataSizeVisitor_t +{ + public: + /* + * Helper to check if Tp is a std::vector + */ + template + struct is_std_vector : std::false_type {}; + + template + struct is_std_vector> : std::true_type {}; + + /* + * Scalar alternatives yield sizeof(Tp); std::vector alternatives yield element count times element size + */ + template + constexpr auto operator()(const Tp& value) const -> std::size_t + { + if constexpr (!is_std_vector::value) { + return sizeof(Tp); + } else { + return value.size() * sizeof(typename Tp::value_type); + } + } + + private: + +}; + + +struct AMDGpuMetricAttributeData_t +{ + public: + AMDGpuMetricAttributeInstance_t m_instance; + AMDGpuMetricAttributeValue_t m_value; + + AMDGpuMetricAttributeData_t() = default; + AMDGpuMetricAttributeData_t(const AMDGpuMetricAttributeInstance_t& metric_instance, + const AMDGpuMetricAttributeValue_t& metric_value) + : m_instance(metric_instance), + m_value(metric_value) + { } + + auto is_multivalued() const -> bool + { + return (std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value) || + std::holds_alternative>(m_value)); + } + + constexpr auto get_metric_serialized_data_size() const -> std::size_t + { + return std::visit(AMDGpuMetricValueDataSizeVisitor_t{}, m_value); + } + + + private: + +}; + +// Hash for enum-class keys +struct
AttributeIdHash_t { + size_t operator()(AMDGpuMetricAttributeId_t id) const noexcept { + using U = std::underlying_type_t; + return std::hash{}(static_cast(id)); + } +}; + +using AMDGpuMetricSchemaType_t = std::vector; + +using AMDGpuMetricSchemaMapType_t = + std::unordered_map; + +// Check if type Tp has callable 0 arg member function named "is_multivalued()" +template +struct is_multivalued_attribute : std::false_type { }; + +template +struct is_multivalued_attribute< + Tp, + std::void_t().is_multivalued())> +> : std::true_type { }; + +constexpr auto get_metric_data_type_size(AMDGpuMetricAttributeType_t attrib_type) -> std::size_t +{ + switch (attrib_type) { + case (AMDGpuMetricAttributeType_t::TYPE_UINT8): + return sizeof(std::uint8_t); + + case (AMDGpuMetricAttributeType_t::TYPE_INT8): + return sizeof(std::int8_t); + + case (AMDGpuMetricAttributeType_t::TYPE_UINT16): + return sizeof(std::uint16_t); + + case (AMDGpuMetricAttributeType_t::TYPE_INT16): + return sizeof(std::int16_t); + + case (AMDGpuMetricAttributeType_t::TYPE_UINT32): + return sizeof(std::uint32_t); + + case (AMDGpuMetricAttributeType_t::TYPE_INT32): + return sizeof(std::int32_t); + + case (AMDGpuMetricAttributeType_t::TYPE_UINT64): + return sizeof(std::uint64_t); + + case (AMDGpuMetricAttributeType_t::TYPE_INT64): + return sizeof(std::int64_t); + + default: + throw std::runtime_error("Error: Metric attribute type unknown... 
"); + } +} + +enum class AMDGpuMetricAttributeTypeFlag_t : std::uint32_t +{ + ATTRIBUTE_FLAG_TYPE_NONE = (0x0), + ATTRIBUTE_FLAG_TYPE8 = (0x1 << 0), + ATTRIBUTE_FLAG_TYPE16 = (0x1 << 1), + ATTRIBUTE_FLAG_TYPE32 = (0x1 << 2), + ATTRIBUTE_FLAG_TYPE64 = (0x1 << 3), +}; + +// Used to determine how far to skip when parsing gpu_metrics file +constexpr auto get_metric_bytes(AMDGpuMetricAttributeType_t attrib_type) -> std::size_t +{ + using Flag = AMDGpuMetricAttributeTypeFlag_t; + + switch (attrib_type) { + case AMDGpuMetricAttributeType_t::TYPE_UINT8: + case AMDGpuMetricAttributeType_t::TYPE_INT8: + return static_cast(Flag::ATTRIBUTE_FLAG_TYPE8); + + case AMDGpuMetricAttributeType_t::TYPE_UINT16: + case AMDGpuMetricAttributeType_t::TYPE_INT16: + return static_cast(Flag::ATTRIBUTE_FLAG_TYPE16); + + case AMDGpuMetricAttributeType_t::TYPE_UINT32: + case AMDGpuMetricAttributeType_t::TYPE_INT32: + return static_cast(Flag::ATTRIBUTE_FLAG_TYPE32); + + case AMDGpuMetricAttributeType_t::TYPE_UINT64: + case AMDGpuMetricAttributeType_t::TYPE_INT64: + return static_cast(Flag::ATTRIBUTE_FLAG_TYPE64); + } + return 0; // Unreachable +} + +constexpr auto ATTR_INST_BITS = std::uint8_t(10); // ATTR_INST_MASK (0x000003FF) +constexpr auto ATTR_ID_BITS = std::uint8_t(10); // ATTR_ID_MASK (0x000FFC00) +constexpr auto ATTR_TYPE_BITS = std::uint8_t(4); // ATTR_TYPE_MASK (0x00F00000) +constexpr auto ATTR_UNIT_BITS = std::uint8_t(8); // ATTR_UNIT_MASK (0xFF000000) + +/* + * Bit shifting are constant, derived from bit sizes + */ +constexpr auto ATTR_ID_SHIFT = (ATTR_INST_BITS); // 10 +constexpr auto ATTR_TYPE_SHIFT = (ATTR_ID_SHIFT + ATTR_ID_BITS); // 20 +constexpr auto ATTR_UNIT_SHIFT = (ATTR_TYPE_SHIFT + ATTR_TYPE_BITS); // 24 + +/* + * Masks are constant and used for decoding values safely + * - They are derived from bit sizes and shifts + * - They help in isolating specific fields when encoding/decoding + */ +constexpr auto ATTR_INST_MASK = static_cast((1ULL << ATTR_INST_BITS) - 1); +constexpr 
auto ATTR_ID_MASK = static_cast(((1ULL << ATTR_ID_BITS) - 1) << ATTR_ID_SHIFT); +constexpr auto ATTR_TYPE_MASK = static_cast(((1ULL << ATTR_TYPE_BITS) - 1) << ATTR_TYPE_SHIFT); +constexpr auto ATTR_UNIT_MASK = static_cast(((1ULL << ATTR_UNIT_BITS) - 1) << ATTR_UNIT_SHIFT); + + +struct AMDGpuMetricAttributeDecode_t +{ + public: + uint64_t m_attr_unit; // Unit type, currently unused + uint64_t m_attr_type; // Type (e.g., U8, S16, U32, etc.) + uint64_t m_attr_id; // Attribute ID (enumerated in `AMDGpuMetricAttributeId_t`) + uint64_t m_attr_instance; // Instance count (number of values for this attribute) + + constexpr auto operator==(const AMDGpuMetricAttributeDecode_t& other) const noexcept -> bool + { + return ((m_attr_unit == other.m_attr_unit) && + (m_attr_type == other.m_attr_type) && + (m_attr_id == other.m_attr_id) && + (m_attr_instance == other.m_attr_instance)); + } + + private: + +}; + +/* + * Function to encode the attribute unit, type, ID, and instance into a single std::uint64_t value. + * So we can do something like: + * auto attribute1 = amdgpu_metrics_encode_attr(attr_unit, attr_type, attr_id, attr_instance); + * The arguments are shifted and OR-ed together without masking, so out-of-range values spill into neighbouring fields. + * +*/ +[[nodiscard]] +constexpr auto amdgpu_metrics_encode_attr(std::uint64_t attr_unit, + std::uint64_t attr_type, + std::uint64_t attr_id, + std::uint64_t attr_instance) noexcept -> std::uint64_t +{ + return ((attr_unit << ATTR_UNIT_SHIFT) | + (attr_type << ATTR_TYPE_SHIFT) | + (attr_id << ATTR_ID_SHIFT) | + (attr_instance)); +} + +/* Inverse of amdgpu_metrics_encode_attr: masks and shifts each field out of `encoded_attr`. NOTE(review): designated initializers are standard only from C++20 - confirm the project's language standard. */ +[[nodiscard]] +constexpr auto amdgpu_metrics_decode_attr(std::uint64_t encoded_attr) noexcept -> AMDGpuMetricAttributeDecode_t +{ + return AMDGpuMetricAttributeDecode_t { + .m_attr_unit = ((encoded_attr & ATTR_UNIT_MASK) >> ATTR_UNIT_SHIFT), + .m_attr_type = ((encoded_attr & ATTR_TYPE_MASK) >> ATTR_TYPE_SHIFT), + .m_attr_id = ((encoded_attr & ATTR_ID_MASK) >> ATTR_ID_SHIFT), + .m_attr_instance = (encoded_attr & ATTR_INST_MASK) + }; +} + +} // namespace details + + +/* + * The
AMDGpuMetricsBaseSchema is a predefined schema for GPU metrics. + * It contains a list of metric instances with their respective attributes and initial values. + * This schema is used to define the structure of the GPU metrics that can be collected. + */ +static const auto AMDGpuMetricsBaseSchema = details::AMDGpuMetricSchemaMapType_t{ + { details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Temperature Hotspot", + "Temperature of the GPU hotspot", + details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CELSIUS), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::TEMPERATURE_MEM, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Temperature Memory", + "Temperature of the GPU memory", + details::AMDGpuMetricAttributeId_t::TEMPERATURE_MEM, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CELSIUS), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Temperature VR SOC", + "Temperature of the VR SOC", + details::AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CELSIUS), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current Socket Power", + "Current power consumption of the socket", + details::AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::WATT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY, + details::AMDGpuMetricAttributeData_t{ + 
details::AMDGpuMetricAttributeInstance_t("Average GFX Activity", + "Average GPU activity percentage", + details::AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Average UMC Activity", + "Average UMC activity percentage", + details::AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Memory Max Bandwidth", + "Maximum memory bandwidth in GB/s", + details::AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Energy Accumulator", + "Energy consumed in Joules", + details::AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::JOULE), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("System Clock Counter", + "System clock counter in nanoseconds", + details::AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::TIMESTAMP_NANOSECONDS), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER, + details::AMDGpuMetricAttributeData_t{ + 
details::AMDGpuMetricAttributeInstance_t("Accumulation Counter", + "Counter for accumulated metrics", + details::AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("ProcHot Residency Accumulator", + "Accumulator for 'Processor Hot' residency time", + details::AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PPT Residency Accumulator", + "Accumulator for 'Package Power Tracking' residency time", + details::AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Socket Thermal Residency Accumulator", + "Accumulator for socket thermal residency time", + details::AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("VR Thermal Residency Accumulator", + "Accumulator for 'Voltage Regulator' thermal residency time", + details::AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR), 
+ static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("HBM Thermal Residency Accumulator", + "Accumulator for 'High Bandwidth Memory' thermal residency time", + details::AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::CELSIUS_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFXCLK Lock Status", + "Status of GFX clock lock", + details::AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::STATUS_FLAG), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Link Width", + "Current PCIe link width", + details::AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::QUANTITY), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Link Speed", + "Current PCIe link speed in GT/s", + details::AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("XGMI Link Width", + "Current XGMI link width", + details::AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::QUANTITY), + static_cast(0) + }}, + + { 
details::AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("XGMI Link Speed", + "Current XGMI link speed in GT/s", + details::AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Activity Accumulator", + "Accumulator for GFX activity percentage", + details::AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Memory Activity Accumulator", + "Accumulator for memory activity percentage", + details::AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Bandwidth Accumulator", + "Accumulator for PCIe bandwidth in GB/s", + details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Bandwidth Instantaneous", + "Instantaneous PCIe bandwidth in GB/s", + details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::GIGABYTE_PER_SECOND), + static_cast(0) 
+ }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe L0 to Recovery Count Accumulator", + "Accumulator for PCIe L0 to recovery count", + details::AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Replay Count Accumulator", + "Accumulator for PCIe replay count", + details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe Replay Rover Count Accumulator", + "Accumulator for PCIe replay rover count", + details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe NAK Sent Count Accumulator", + "Accumulator for PCIe NAK sent count", + details::AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe NAK Received Count Accumulator", + "Accumulator for PCIe NAK received count", + 
details::AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("XGMI Read Data Accumulator", + "Accumulator for XGMI read data in bytes", + details::AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::KILOBYTE_PER_SECOND_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("XGMI Write Data Accumulator", + "Accumulator for XGMI write data in bytes", + details::AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::KILOBYTE_PER_SECOND_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("XGMI Link Status", + "Status of the XGMI link", + details::AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::STATUS_FLAG), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Firmware Timestamp", + "Timestamp of the firmware in nanoseconds", + details::AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::TIMESTAMP_NANOSECONDS), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current GFX Clock", + "Current GFX 
clock frequency in MHz", + details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURRENT_SOCCLK, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current SOC Clock", + "Current SOC clock frequency in MHz", + details::AMDGpuMetricAttributeId_t::CURRENT_SOCCLK, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURRENT_VCLK0, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current VCLK0", + "Current VCLK0 frequency in MHz", + details::AMDGpuMetricAttributeId_t::CURRENT_VCLK0, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current DCLK0", + "Current DCLK0 frequency in MHz", + details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::CURRENT_UCLK, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Current UCLK", + "Current UCLK frequency in MHz", + details::AMDGpuMetricAttributeId_t::CURRENT_UCLK, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::CLOCK_MEGAHERTZ), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::NUM_PARTITION, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("Number of Partitions", + "Number of partitions in the GPU", + details::AMDGpuMetricAttributeId_t::NUM_PARTITION, + 
details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::QUANTITY), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("PCIe LC Perf Other End Recovery", + "PCIe link control performance other end recovery", + details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::COUNT_ACCUMULATOR), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Busy Instantaneous", + "GFX Busy Instantaneous in percent", + details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST, + details::AMDGpuMetricAttributeType_t::TYPE_UINT32, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::JPEG_BUSY, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("JPEG Busy Instantaneous", + "JPEG Busy Instantaneous in percent", + details::AMDGpuMetricAttributeId_t::JPEG_BUSY, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::VCN_BUSY, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("VCN Busy Instantaneous", + "VCN Busy Instantaneous in percent", + details::AMDGpuMetricAttributeId_t::VCN_BUSY, + details::AMDGpuMetricAttributeType_t::TYPE_UINT16, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_BUSY_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Busy Accumulator", + "GFX Busy Accumulator in percent", + details::AMDGpuMetricAttributeId_t::GFX_BUSY_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + 
details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Below Host Limit PPT Accumulator", + "GFX Below Host Limit PPT Accumulator in percent", + details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Below Host Limit THM Accumulator", + "GFX Below Host Limit THM Accumulator in percent", + details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Low Utilization Accumulator", + "GFX Low Utilization Accumulator in percent", + details::AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }}, + + { details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC, + details::AMDGpuMetricAttributeData_t{ + details::AMDGpuMetricAttributeInstance_t("GFX Below Host Limit Total Accumulator", + "GFX Below Host Limit Total Accumulator in percent", + details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC, + details::AMDGpuMetricAttributeType_t::TYPE_UINT64, + details::AMDGpuMetricUnitType_t::PERCENT), + static_cast(0) + }} +}; + + +using AMDGpuDynamicMetricsOffsetMap_t = std::map; +using AMDGpuDynamicMetricsOffsetIt_t = AMDGpuDynamicMetricsOffsetMap_t::const_iterator; + +/* + * This is the actual representation of the 
whole dynamic metrics data structure, for either:
+ *   - 'gpu_metrics' file
+ *   - 'xcp_metrics' file
+ *
+ * The object is move-only. The header, the parsed rows and the offset map are
+ * replaced together by parse_from_buffer() while it holds m_mutex exclusively;
+ * a cursor holds the same mutex in shared mode for as long as it exists.
+ */
+class AMDGpuDynamicMetrics_t
+{
+  public:
+    AMDGpuDynamicMetrics_t() = default;
+    AMDGpuDynamicMetrics_t(const AMDGpuDynamicMetrics_t&) = delete;
+    AMDGpuDynamicMetrics_t& operator=(const AMDGpuDynamicMetrics_t&) = delete;
+
+    // Move construction transfers every data member (in declaration order).
+    // The mutex is not movable, so the new object keeps its own, unlocked one.
+    // NOTE(review): inst.m_mutex is not taken here -- presumably callers guarantee
+    // nothing else is using `inst` while it is moved from; confirm.
+    AMDGpuDynamicMetrics_t(AMDGpuDynamicMetrics_t&& inst) noexcept
+      : m_metric_source_file_path(std::move(inst.m_metric_source_file_path)),
+        m_header(inst.m_header),
+        m_attr_count(inst.m_attr_count),
+        m_dynamic_metrics_data(std::move(inst.m_dynamic_metrics_data)),
+        m_dynamic_metrics_data_offsets(std::move(inst.m_dynamic_metrics_data_offsets)) {
+    }
+
+    // Move assignment transfers the same members; self-assignment is a no-op.
+    AMDGpuDynamicMetrics_t& operator=(AMDGpuDynamicMetrics_t&& inst) noexcept {
+        if (this != &inst) {
+            m_metric_source_file_path = std::move(inst.m_metric_source_file_path);
+            m_header = inst.m_header;
+            m_attr_count = inst.m_attr_count;
+            m_dynamic_metrics_data = std::move(inst.m_dynamic_metrics_data);
+            m_dynamic_metrics_data_offsets = std::move(inst.m_dynamic_metrics_data_offsets);
+        }
+        return *this;
+    }
+
+    ~AMDGpuDynamicMetrics_t() = default;
+
+    // Parsing helpers
+    auto parse_from_buffer(const std::byte* data, std::size_t size) noexcept -> rsmi_status_t;
+    auto parse_from_file(const std::string& metrics_file_path, std::size_t read_size = 0) -> rsmi_status_t;
+
+    // Rows produced by the last successful parse
+    auto get_metric_rows() const noexcept
+      -> const details::AMDGpuMetricSchemaType_t& { return m_dynamic_metrics_data; }
+
+    // Header copied from the last successfully parsed buffer
+    auto get_header() const noexcept
+      -> const details::AMDGpuDynamicMetricsHeader_v1_t& { return m_header; }
+
+    /*
+     * The Cursor here, is a helper class to help with navigation within the dynamic metrics data
+     * based on the data offsets.
+     *
+     * It takes a shared lock on the owning object's mutex in its constructor and
+     * keeps it until the cursor is destroyed.
+     */
+    class AMDGpuDynamicMetricsCursor_t
+    {
+      public:
+        AMDGpuDynamicMetricsCursor_t(const AMDGpuDynamicMetrics_t& metrics_data,
+                                     std::uint64_t start_offset = 0)
+          : m_metrics(metrics_data),
+            m_current_offset(start_offset),
+            m_read_lock(metrics_data.m_mutex) {
+            // Start on the first attribute located at or after the requested offset
+            // (the offset map is only read once the shared lock is held).
+            m_current_metric_attribute =
+                m_metrics.m_dynamic_metrics_data_offsets.lower_bound(m_current_offset);
+        }
+
+        ~AMDGpuDynamicMetricsCursor_t() = default;
+
+      private:
+        const AMDGpuDynamicMetrics_t& m_metrics;
+        std::uint64_t m_current_offset{0};
+        AMDGpuDynamicMetricsOffsetIt_t m_current_metric_attribute;
+        mutable std::shared_lock<std::shared_mutex> m_read_lock;
+    };
+
+  private:
+    std::string m_metric_source_file_path{};
+    details::AMDGpuDynamicMetricsHeader_v1_t m_header{};
+    uint32_t m_attr_count{0};
+    details::AMDGpuMetricSchemaType_t m_dynamic_metrics_data{};
+    AMDGpuDynamicMetricsOffsetMap_t m_dynamic_metrics_data_offsets{};
+    mutable std::shared_mutex m_mutex;
+
+};
+
+template +constexpr auto is_multivalued_attribute_v = details::is_multivalued_attribute::value; + +using AMDGPUMetricsDynDataBuffer_t = std::vector; +rsmi_status_t read_dynamic_gpu_metrics_file(const std::string& metrics_file_path, + const size_t read_size, + AMDGPUMetricsDynDataBuffer_t& out); +} // namespace amd::smi + + +#endif // ROCM_SMI_ROCM_SMI_DYN_GPU_METRICS_H_ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index c2fd0585d6..bea50610a4 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -25,6 +25,7 @@ #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h" #include #include @@ -42,7 +43,6 @@ #include #include - /** * All 1.4 and newer GPU metrics are now defined in this header. 
* @@ -89,12 +89,12 @@ constexpr uint32_t kRSMI_MAX_NUM_XCC = 8; // Note: This *must* match MAX_XCP constexpr uint32_t kRSMI_MAX_NUM_XCP = 8; - struct AMDGpuMetricsHeader_v1_t { uint16_t m_structure_size; uint8_t m_format_revision; uint8_t m_content_revision; }; + struct amdgpu_xcp_metrics { /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; @@ -788,7 +788,7 @@ struct AMDGpuMetrics_v18_t { /* PCIE other end recovery counter */ uint32_t m_pcie_lc_perf_other_end_recovery; }; -using AMGpuMetricsLatest_t = AMDGpuMetrics_v18_t; +using AMGpuMetricsLatest_t = AMDGpuDynamicMetrics_t; /** * This is GPU Metrics version that gets to public access. @@ -1053,7 +1053,8 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV15 = (0x1 << 5), kGpuMetricV16 = (0x1 << 6), kGpuMetricV17 = (0x1 << 7), - kGpuMetricV18 = (0x1 << 8), // Added new version flag + kGpuMetricV18 = (0x1 << 8), // Added new version flag: Last static GPU Metrics + kGpuMetricV19 = (0x1 << 9), // Dyn.GPU Metrics }; using AMDGpuMetricVersionTranslationTbl_t = std::map; using GpuMetricTypePtr_t = std::shared_ptr; @@ -1311,6 +1312,47 @@ class GpuMetricsBase_v18_t final : public GpuMetricsBase_t { std::shared_ptr m_gpu_metric_ptr; }; 
+class GpuMetricsBaseDynamic_t final : public GpuMetricsBase_t {
+  public:
+    ~GpuMetricsBaseDynamic_t() = default;
+
+    // Unused: always reports a table size of 0
+    size_t sizeof_metric_table() override { return 0; }
+
+    // Unused: always returns nullptr
+    GpuMetricTypePtr_t get_metrics_table() override { return nullptr; }
+
+    // Version flag derived from the stored header: bit <content_revision> when
+    // the format revision is 1, kGpuMetricNone otherwise.
+    AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
+        // The revision comes from parsed file data; shifting `1u` by its full
+        // width or more is undefined, so such revisions are rejected as well.
+        constexpr unsigned kShiftLimit = 8u * sizeof(1u);
+        if ((m_header.m_format_revision != 1) ||
+            (m_header.m_content_revision >= kShiftLimit)) {
+            return AMDGpuMetricVersionFlags_t::kGpuMetricNone;
+        }
+
+        return static_cast<AMDGpuMetricVersionFlags_t>(1u << m_header.m_content_revision);
+    }
+
+    // Store header and metrics table
+    rsmi_status_t set_parsed_dynamic(AMDGpuDynamicMetrics_t&& parsed) noexcept {
+        m_dyn = std::move(parsed);
+        m_header = m_dyn.get_header();
+        return rsmi_status_t::RSMI_STATUS_SUCCESS;
+    }
+
+    rsmi_status_t populate_metrics_dynamic_tbl() override;
+
+    AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;
+
+  private:
+    AMDGpuDynamicMetrics_t m_dyn;
+    details::AMDGpuDynamicMetricsHeader_v1_t m_header{};
+
+};
+
 template rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); diff --git a/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc b/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc new file mode 100644 index 0000000000..87344a39e0 --- /dev/null +++ b/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc @@ -0,0 +1,333 @@ +/* + * MIT License + * + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Developed by: + * + * AMD ML Software Engineering + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of Advanced Micro Devices, Inc, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */
+
+#include "rocm_smi/rocm_smi.h"
+#include "rocm_smi/rocm_smi_main.h"
+#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h"
+#include "rocm_smi/rocm_smi_logger.h"
+#include "rocm_smi/rocm_smi_utils.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <mutex>
+#include <optional>
+#include <shared_mutex>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+namespace amd::smi
+{
+
+using namespace details;
+
+// Read position inside the raw metrics buffer, plus the number of bytes still
+// available from that position.
+struct Cursor {
+    const std::byte* byte_ptr;
+    std::size_t remainder;
+};
+
+// Used when mismatch in schema to safely skip value.
+// Advances the cursor past `instances` elements of type `t`. Returns false and
+// leaves the cursor untouched when the element size is reported as 0, when the
+// byte count would overflow, or when fewer bytes than needed remain.
+static inline bool skip_payload(Cursor& cur,
+                                AMDGpuMetricAttributeType_t t,
+                                uint64_t instances) {
+
+    const std::size_t elem = get_metric_bytes(t);
+    if (elem == 0 || instances > std::numeric_limits<std::size_t>::max() / elem) {
+        return false;
+    }
+
+    const std::size_t bytes = static_cast<std::size_t>(instances) * elem;
+    if (cur.remainder < bytes) {
+        return false;
+    }
+
+    cur.byte_ptr += bytes;
+    cur.remainder -= bytes;
+    return true;
+}
+
+// Lookup a schema instance for (attr_id, attr_type).
+//   RSMI_STATUS_SUCCESS       - id known and its schema type equals attr_type;
+//                               schema_inst receives a copy of the schema entry
+//   RSMI_STATUS_NOT_SUPPORTED - id known but declared with a different type
+//   RSMI_STATUS_NOT_FOUND     - id absent from AMDGpuMetricsBaseSchema
+static inline rsmi_status_t schema_lookup_instance(AMDGpuMetricAttributeId_t attr_id,
+                                                   AMDGpuMetricAttributeType_t attr_type,
+                                                   AMDGpuMetricAttributeInstance_t& schema_inst) {
+
+    const auto attr_id_itr = AMDGpuMetricsBaseSchema.find(attr_id);
+    if (attr_id_itr == AMDGpuMetricsBaseSchema.end()) {
+        return RSMI_STATUS_NOT_FOUND;
+    }
+
+    const auto& inst = attr_id_itr->second.m_instance;
+    if (inst.m_attribute_type != attr_type) {
+        return RSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    schema_inst = inst;
+    return RSMI_STATUS_SUCCESS;
+}
+
+// Copies one T out of the buffer and advances the cursor by sizeof(T).
+// Returns std::nullopt (cursor untouched) when fewer than sizeof(T) bytes remain.
+template <typename T>
+static inline std::optional<T> read_scalar(Cursor& c) {
+    // Ensure we can read safely
+    if (c.remainder < sizeof(T)) {
+        return std::nullopt;
+    }
+    T v{};
+    std::memcpy(&v, c.byte_ptr, sizeof(T));
+    c.byte_ptr += sizeof(T);
+    c.remainder -= sizeof(T);
+    return v;
+}
+
+// Copies `count` consecutive T values out of the buffer and advances the cursor.
+// Returns std::nullopt (cursor untouched) when count is 0, when the byte count
+// would overflow, or when the buffer does not hold the whole array.
+template <typename T>
+static inline std::optional<std::vector<T>> read_vector(Cursor& c, std::size_t count) {
+
+    static_assert(std::is_integral_v<T> && std::is_trivially_copyable_v<T>,
+                  "metrics expect integral element types");
+
+    // Prevent size_t overflow
+    if (count > SIZE_MAX / sizeof(T) || count == 0) {
+        return std::nullopt;
+    }
+
+    // Ensure we can read entire array safely
+    const std::size_t bytes = count * sizeof(T);
+    if (c.remainder < bytes) {
+        return std::nullopt;
+    }
+
+    std::vector<T> out;
+    out.resize(count);
+    std::memcpy(out.data(), c.byte_ptr, bytes);
+    c.byte_ptr += bytes;
+    c.remainder -= bytes;
+    return out;
+}
+
+// Template to fill AMDGpuMetricAttributeValue_t with either a scalar or vector:
+// exactly one instance is stored as a scalar T, anything else as std::vector<T>.
+template <typename T>
+static inline std::optional<AMDGpuMetricAttributeValue_t> read_metric_value(Cursor& c,
+                                                                            uint64_t instances) {
+
+    if (instances == 1) {
+        if (auto v = read_scalar<T>(c)) {
+            return AMDGpuMetricAttributeValue_t{*v};
+        }
+        return std::nullopt;
+    }
+    if (auto vv = read_vector<T>(c, static_cast<std::size_t>(instances))) {
+        return AMDGpuMetricAttributeValue_t{std::move(*vv)};
+    }
+    return std::nullopt;
+}
+
+// Parses a raw dynamic-metrics buffer laid out as:
+//   header | uint32 attribute count | { uint64 encoded descriptor, values... } * count
+// Attributes whose (id, type) pair is not in the schema are skipped without
+// producing a row. Object state is replaced only after the whole buffer parsed
+// successfully; on any error the previously parsed data is left as it was.
+auto AMDGpuDynamicMetrics_t::parse_from_buffer(const std::byte* data,
+                                               std::size_t size) noexcept -> rsmi_status_t {
+
+    rsmi_status_t status = RSMI_STATUS_SUCCESS;
+    if (!data || (size < (sizeof(AMDGpuDynamicMetricsHeader_v1_t) + sizeof(uint32_t)))) {
+        return RSMI_STATUS_INSUFFICIENT_SIZE;
+    }
+
+    // Grab header
+    details::AMDGpuDynamicMetricsHeader_v1_t hdr{};
+    std::memcpy(&hdr, data, sizeof(hdr));
+
+    // Advance metrics pointer past header and keep track of remaining file size
+    Cursor cur{ (data + sizeof(hdr)), (size - sizeof(hdr)) };
+
+    // Grab attribute count, directly after header and increment
+    const auto attr_count_opt = read_scalar<uint32_t>(cur);
+    if (!attr_count_opt) {
+        return RSMI_STATUS_UNEXPECTED_SIZE;
+    }
+    const uint32_t attr_count = *attr_count_opt;
+
+    // Every attribute carries at least its 8-byte descriptor, so a count larger
+    // than the bytes left can hold is corrupt; this also bounds the reserve below.
+    if (attr_count == 0 || attr_count > (cur.remainder / sizeof(uint64_t))) {
+        return RSMI_STATUS_UNEXPECTED_SIZE;
+    }
+
+    details::AMDGpuMetricSchemaType_t metrics_data;
+    metrics_data.reserve(attr_count);
+    AMDGpuDynamicMetricsOffsetMap_t offsets;
+    for (uint32_t i = 0; i < attr_count; ++i) {
+
+        // Absolute offset for attribute start in file
+        const std::size_t entry_start = static_cast<std::size_t>(cur.byte_ptr - data);
+
+        // Read attribute instance and increment
+        const auto enc_opt = read_scalar<uint64_t>(cur);
+        if (!enc_opt) {
+            return RSMI_STATUS_UNEXPECTED_SIZE;
+        }
+
+        const auto dec = amdgpu_metrics_decode_attr(*enc_opt);
+
+        const auto attr_type = static_cast<AMDGpuMetricAttributeType_t>(dec.m_attr_type);
+        const auto attr_id = static_cast<AMDGpuMetricAttributeId_t>(dec.m_attr_id);
+        const auto instances = static_cast<uint64_t>(dec.m_attr_instance);
+
+        if (instances == 0) {
+            return RSMI_STATUS_UNEXPECTED_SIZE;
+        }
+
+        // Schema lookup
+        AMDGpuMetricAttributeInstance_t inst{};
+        status = schema_lookup_instance(attr_id, attr_type, inst);
+        if (status != RSMI_STATUS_SUCCESS) {
+            // Widened to uint64_t so the enum values are logged as numbers
+            std::ostringstream ss;
+            ss << __PRETTY_FUNCTION__
+               << " | Warn: schema lookup miss"
+               << " | Attr ID: " << static_cast<uint64_t>(attr_id)
+               << " | Attr Type: " << static_cast<uint64_t>(attr_type)
+               << " | Returning = " << getRSMIStatusString(status)
+               << " |";
+            LOG_TRACE(ss);
+
+            if (!skip_payload(cur, attr_type, instances)) {
+                return status;
+            }
+            continue;  // Do not emit row, go to next attribute
+        }
+
+        // Read scalar or all vector values after attribute instance
+        std::optional<AMDGpuMetricAttributeValue_t> mv;
+        switch (attr_type) {
+            case AMDGpuMetricAttributeType_t::TYPE_UINT8:
+                mv = read_metric_value<uint8_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_INT8:
+                mv = read_metric_value<int8_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_UINT16:
+                mv = read_metric_value<uint16_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_INT16:
+                mv = read_metric_value<int16_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_UINT32:
+                mv = read_metric_value<uint32_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_INT32:
+                mv = read_metric_value<int32_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_UINT64:
+                mv = read_metric_value<uint64_t>(cur, instances);
+                break;
+            case AMDGpuMetricAttributeType_t::TYPE_INT64:
+                mv = read_metric_value<int64_t>(cur, instances);
+                break;
+            // A type this parser cannot decode is a support problem, not a
+            // buffer-size problem.
+            default:
+                return RSMI_STATUS_NOT_SUPPORTED;
+        }
+
+        if (!mv) {
+            return RSMI_STATUS_UNEXPECTED_SIZE;
+        }
+
+        // Move the value into the row so vector payloads are not copied
+        const uint32_t row_index = static_cast<uint32_t>(metrics_data.size());
+        metrics_data.emplace_back(inst, std::move(*mv));
+        offsets.try_emplace(entry_start, row_index);
+    }
+
+    // Publish the new state in one step, excluding concurrent readers
+    {
+        std::unique_lock<std::shared_mutex> lk(m_mutex);
+        m_header = hdr;
+        m_attr_count = attr_count;
+        m_dynamic_metrics_data.swap(metrics_data);
+        m_dynamic_metrics_data_offsets.swap(offsets);
+    }
+    return RSMI_STATUS_SUCCESS;
+}
+
+// Reads the metrics file into a temporary buffer and parses it.
+// read_size is forwarded unchanged to read_dynamic_gpu_metrics_file().
+auto AMDGpuDynamicMetrics_t::parse_from_file(const std::string& metrics_file_path,
+                                             std::size_t read_size) -> rsmi_status_t {
+    AMDGPUMetricsDynDataBuffer_t buf;
+
+    const auto st = read_dynamic_gpu_metrics_file(metrics_file_path, read_size, buf);
+    if (st != RSMI_STATUS_SUCCESS) {
+        return st;
+    }
+
+    return parse_from_buffer(reinterpret_cast<const std::byte*>(buf.data()), buf.size());
+}
+
+// Reads a metrics file into `out`.
+//   read_size != 0 : read at most read_size bytes
+//   read_size == 0 : read the whole file (this is the default that
+//                    parse_from_file() passes, and used to be rejected)
+// On success `out` is sized to the number of bytes actually read.
+//   RSMI_STATUS_NOT_FOUND - the file could not be opened
+//   RSMI_STATUS_NO_DATA   - the file was opened but no bytes could be read
+// NOTE(review): like the original code, this treats every element of `out` as
+// one byte -- confirm against the AMDGPUMetricsDynDataBuffer_t definition.
+rsmi_status_t read_dynamic_gpu_metrics_file(const std::string& metrics_file_path,
+                                            const size_t read_size,
+                                            AMDGPUMetricsDynDataBuffer_t& out) {
+
+    // Clear output buffer and open file stream
+    out.clear();
+    std::ifstream gpu_metrics_file(metrics_file_path, std::ios::binary);
+    if (!gpu_metrics_file.is_open()) {
+        return RSMI_STATUS_NOT_FOUND;
+    }
+
+    std::size_t total_bytes = 0;
+    if (read_size != 0) {
+        out.resize(read_size);
+        gpu_metrics_file.read(reinterpret_cast<char*>(out.data()),
+                              static_cast<std::streamsize>(read_size));
+        total_bytes = static_cast<std::size_t>(gpu_metrics_file.gcount());
+    } else {
+        // Size unknown up front: grow the buffer chunk by chunk until a read
+        // comes back short (end of file or error).
+        constexpr std::size_t kChunkBytes = 4096;
+        do {
+            out.resize(total_bytes + kChunkBytes);
+            gpu_metrics_file.read(reinterpret_cast<char*>(out.data()) + total_bytes,
+                                  static_cast<std::streamsize>(kChunkBytes));
+            total_bytes += static_cast<std::size_t>(gpu_metrics_file.gcount());
+        } while (gpu_metrics_file);
+    }
+
+    if (total_bytes == 0) {
+        out.clear();
+        return RSMI_STATUS_NO_DATA;
+    }
+
+    out.resize(total_bytes);
+    return RSMI_STATUS_SUCCESS;
+
+}
+
+} // namespace amd::smi
diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index 97dee9e585..e340e03d80 100644 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -21,6 +21,7 @@ */ #include "rocm_smi/rocm_smi_gpu_metrics.h" +#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h" // Dynamic metrics #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_main.h" @@ -46,6 +47,8 @@ #include #include #include +#include +#include using namespace amd::smi; @@ -145,6 +148,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, {join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18}, + {join_metrics_version(1, 9), AMDGpuMetricVersionFlags_t::kGpuMetricV19}, // Dynamic GPU Metrics }; /** @@ -365,6 +369,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared(GpuMetricsBase_v16_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared(GpuMetricsBase_v17_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV18, std::make_shared(GpuMetricsBase_v18_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV19, std::make_shared()}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) @@ -417,6 +422,58 @@ constexpr T init_max_uint_types() } } +AMDGpuMetricsDataType_t dtype_from_attr(details::AMDGpuMetricAttributeType_t t) { + switch (t) { + case details::AMDGpuMetricAttributeType_t::TYPE_UINT8: { + return AMDGpuMetricsDataType_t::kUInt8; + } + 
case details::AMDGpuMetricAttributeType_t::TYPE_UINT16: {
+            return AMDGpuMetricsDataType_t::kUInt16;
+        }
+        case details::AMDGpuMetricAttributeType_t::TYPE_UINT32: {
+            return AMDGpuMetricsDataType_t::kUInt32;
+        }
+        // Every attribute type without its own case above is reported as kUInt64
+        default: {
+            return AMDGpuMetricsDataType_t::kUInt64;
+        }
+    }
+}
+
+// Converts an integral value to uint64_t. Unsigned inputs are cast directly;
+// signed inputs are first cast to `intermediate_type` and then to uint64_t.
+// NOTE(review): the template parameter list and the operands of the
+// std::conditional_t / static_cast expressions are missing in this copy of the
+// patch -- confirm the intermediate type against the upstream file.
+template
+constexpr uint64_t safe_way_to_uint64(Tp value) {
+    if constexpr (std::is_signed_v) {
+        using intermediate_type = std::conditional_t>;
+        return static_cast(static_cast(value));
+    } else {
+        return static_cast(value);
+    }
+}
+
+// Existing format_metric_row doesn't take vectors, so overload and write our own
+// Produces one table value per element of `vec`:
+//   m_value         - the element converted with safe_way_to_uint64()
+//   m_info          - value_title + " : " + element index
+//   m_original_type - dtype_from_attr(attr_type), identical for every element
+// The loop index is a uint16_t and the element count `n` is vec.size() clamped
+// with std::min, while capacity is reserved for vec.size() entries.
+// NOTE(review): the clamp limit's type is missing in this copy -- presumably the
+// uint16_t maximum; confirm.
+template
+AMDGpuDynamicMetricTblValues_t
+format_metric_row(const std::vector& vec, const std::string& value_title, details::AMDGpuMetricAttributeType_t attr_type)
+{
+    AMDGpuDynamicMetricTblValues_t out;
+    out.reserve(vec.size());
+
+    const auto n = static_cast(
+        std::min(vec.size(), std::numeric_limits::max()));
+
+    for (uint16_t idx = 0; idx < n; ++idx) {
+        uint64_t u64 = safe_way_to_uint64(vec[idx]);
+        AMDGpuDynamicMetricsValue_t amdgpu_dynamic_metric_value_init{};
+        amdgpu_dynamic_metric_value_init.m_value = u64;
+        amdgpu_dynamic_metric_value_init.m_info = value_title + " : " + std::to_string(idx);
+        amdgpu_dynamic_metric_value_init.m_original_type = dtype_from_attr(attr_type);
+        out.emplace_back(std::move(amdgpu_dynamic_metric_value_init));
+    }
+    return out;
+}
+
+// Type trait: the primary template derives from std::false_type and the
+// specialization from std::true_type.
+// NOTE(review): the specialization's argument list is missing in this copy --
+// presumably it matches std::vector instantiations; confirm.
+template struct is_vector : std::false_type {};
+template struct is_vector> : std::true_type {};
+
 template AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::string& value_title) { @@ -483,6 +540,235 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str return multi_values; } +rsmi_status_t GpuMetricsBaseDynamic_t::populate_metrics_dynamic_tbl() { + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + auto m_metrics_dynamic_tbl = 
AMDGpuDynamicMetricsTbl_t{}; + + auto emit = [&](AMDGpuMetricsClassId_t cls, AMDGpuMetricsUnitType_t unit, + const char* label, + const details::AMDGpuMetricAttributeData_t& row) { + + auto rows = std::visit([&](const auto& x) -> AMDGpuDynamicMetricTblValues_t { + using S = std::decay_t; + if constexpr (is_vector::value) { // Would like to use is_multivalued() here, but compiler needs well-formed + return format_metric_row(x, std::string(label), row.m_instance.m_attribute_type); + } else { + return format_metric_row(x, std::string(label)); + } + }, row.m_value); + + m_metrics_dynamic_tbl[cls].insert({unit, std::move(rows)}); + }; + + for (const auto& r : m_dyn.get_metric_rows()) { + switch (r.m_instance.m_attribute_id) { + + // Power energy and temperature + case details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT: + emit(AMDGpuMetricsClassId_t::kGpuMetricTemperature, AMDGpuMetricsUnitType_t::kMetricTempHotspot, + "temperature_hotspot", r); + break; + case details::AMDGpuMetricAttributeId_t::TEMPERATURE_MEM: + emit(AMDGpuMetricsClassId_t::kGpuMetricTemperature, AMDGpuMetricsUnitType_t::kMetricTempMem, + "temperature_mem", r); + break; + case details::AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC: + emit(AMDGpuMetricsClassId_t::kGpuMetricTemperature, AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + "temperature_vrsoc", r); + break; + case details::AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER: + emit(AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + "curr_socket_power", r); + break; + case details::AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR: + emit(AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + "energy_acc", r); + break; + + // Utilization + case details::AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY: + emit(AMDGpuMetricsClassId_t::kGpuMetricUtilization, AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + "average_gfx_activity", r); + break; + case 
details::AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY: + emit(AMDGpuMetricsClassId_t::kGpuMetricUtilization, AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + "average_umc_activity", r); + break; + case details::AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricUtilization, AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + "gfx_activity_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricUtilization, AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + "mem_activity_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS: + emit(AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus, AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + "gfxclk_lock_status", r); + break; + + // Metric Timestamp + case details::AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP: + emit(AMDGpuMetricsClassId_t::kGpuMetricTimestamp, AMDGpuMetricsUnitType_t::kMetricTSFirmware, + "firmware_timestamp", r); + break; + case details::AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER: + emit(AMDGpuMetricsClassId_t::kGpuMetricTimestamp, AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + "system_clock_counter", r); + break; + + // Throttle Residency + case details::AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, + "accumulation_counter", r); + break; + + // Link Width Speed + case details::AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + "pcie_link_width", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + "pcie_link_speed", r); + break; + case details::AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH: + 
emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + "xgmi_link_width", r); + break; + case details::AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + "xgmi_link_speed", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + "pcie_bandwidth_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + "pcie_bandwidth_inst", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + "pcie_l0_recov_count_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + "pcie_replay_count_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + "pcie_replay_rollover_count_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + "pcie_nak_sent_count_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + "pcie_nak_rcvd_count_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC: + 
emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + "xgmi_read_data_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + "xgmi_write_data_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, + "xgmi_link_status", r); + break; + + // Current Clock + case details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK: + emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + "current_gfxclk", r); + break; + case details::AMDGpuMetricAttributeId_t::CURRENT_SOCCLK: + emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + "current_socclk", r); + break; + case details::AMDGpuMetricAttributeId_t::CURRENT_VCLK0: + emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + "current_vclk0", r); + break; + case details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0: + emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + "current_dclk0", r); + break; + case details::AMDGpuMetricAttributeId_t::CURRENT_UCLK: + emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrUClock, + "current_uclk", r); + break; + + // Throttle Residency + case details::AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, + "prochot_residency_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, + "ppt_residency_acc", r); 
+ break; + case details::AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, + "socket_thm_residency_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, + "vr_thm_residency_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, + "hbm_thm_residency_acc", r); + break; + + // XCP stats + case details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + "xcp_stats->gfx_busy_inst", r); + break; + case details::AMDGpuMetricAttributeId_t::JPEG_BUSY: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricJpegBusy, + "xcp_stats->jpeg_busy", r); + break; + case details::AMDGpuMetricAttributeId_t::VCN_BUSY: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricVcnBusy, + "xcp_stats->vcn_busy", r); + break; + case details::AMDGpuMetricAttributeId_t::GFX_BUSY_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + "xcp_stats->gfx_busy_acc", r); + break; + + default: + ss << __PRETTY_FUNCTION__ + << " UNKNOWN Attribute " + << static_cast(r.m_instance.m_attribute_id) + << " |"; + LOG_ERROR(ss); + break; + } + } + + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) << " |"; + LOG_TRACE(ss); + + { std::lock_guard lk(s_base_tbl_mu); + // Copy to base class + this->m_base_metrics_dynamic_tbl = m_metrics_dynamic_tbl; + } + + return status_code; +} rsmi_status_t 
GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl()
 {
     std::ostringstream ss;
@@ -1843,6 +2129,199 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
     return status_code;
 }
 
+AMGpuMetricsPublicLatestTupl_t GpuMetricsBaseDynamic_t::copy_internal_to_external_metrics() {
+    // Flattens the parsed dynamic metric rows (m_dyn) into the fixed public metrics struct.
+    // Returns {status, struct}; status is always RSMI_STATUS_SUCCESS in this implementation.
+    std::ostringstream ss;
+    auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
+    ss << __PRETTY_FUNCTION__ << " | ======= start =======";
+    LOG_TRACE(ss);
+
+    // Pre-fill via init_max_public_gpu_matrics() so fields that no row reports keep that initial value.
+    AMGpuMetricsPublicLatest_t out{};
+    init_max_public_gpu_matrics(out);
+
+    out.common_header.structure_size  = m_header.m_structure_size;
+    out.common_header.format_revision = m_header.m_format_revision;
+    out.common_header.content_revision= m_header.m_content_revision;
+
+    // Scalar rows: store whichever integral alternative the row holds, cast to the field's own type.
+    auto assign_by_type = [&](auto& dst, const details::AMDGpuMetricAttributeData_t& r) {
+        using D = std::decay_t<decltype(dst)>;
+        std::visit([&](const auto& x) {
+            using S = std::decay_t<decltype(x)>;
+            if constexpr (std::is_integral_v<S>) { dst = static_cast<D>(x); }
+        }, r.m_value);
+    };
+
+    // Array rows: copy at most `cap` elements into the fixed-size field `dst`. The row must hold a
+    // std::vector of dst's element type; any other alternative is skipped instead of dereferenced.
+    auto assign_vector = [&](auto& dst, const details::AMDGpuMetricAttributeData_t& r, std::size_t cap) {
+        using Dst = std::remove_reference_t<decltype(dst)>;
+        using T   = std::remove_cv_t<std::remove_extent_t<Dst>>;
+        const auto* v = std::get_if<std::vector<T>>(&r.m_value);
+        if (v == nullptr) { return; }  // std::get_if yields nullptr when the row holds another type
+        std::copy_n(v->data(), std::min(v->size(), cap), dst);
+    };
+
+    // Route each parsed row to its public field; attribute ids without a field are ignored.
+    for (const auto& r : m_dyn.get_metric_rows()) {
+
+        switch (r.m_instance.m_attribute_id) {
+            // Temps
+            case details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT:
+                assign_by_type(out.temperature_hotspot, r); break;
+            case details::AMDGpuMetricAttributeId_t::TEMPERATURE_MEM:
+                assign_by_type(out.temperature_mem, r); break;
+            case details::AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC:
+                assign_by_type(out.temperature_vrsoc, r); break;
+
+            // Power/Energy
+            case details::AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER:
+                assign_by_type(out.current_socket_power, r); break;
+            case details::AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR:
+                assign_by_type(out.energy_accumulator, r); break;
+
+            // Utilization
+            case details::AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY:
+                assign_by_type(out.average_gfx_activity, r); break;
+            case details::AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY:
+                assign_by_type(out.average_umc_activity, r); break;
+            case details::AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC:
+                assign_by_type(out.gfx_activity_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC:
+                assign_by_type(out.mem_activity_acc, r); break;
+
+            // Timestamps / Lock
+            case details::AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER:
+                assign_by_type(out.system_clock_counter, r); break;
+            case details::AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP:
+                assign_by_type(out.firmware_timestamp, r); break;
+            case details::AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS:
+                assign_by_type(out.gfxclk_lock_status, r); break;
+
+            // Link width/speed, bandwidth, counts
+            case details::AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH:
+                assign_by_type(out.pcie_link_width, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED:
+                assign_by_type(out.pcie_link_speed, r); break;
+            case details::AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH:
+                assign_by_type(out.xgmi_link_width, r); break;
+            case details::AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED:
+                assign_by_type(out.xgmi_link_speed, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC:
+                assign_by_type(out.pcie_bandwidth_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST:
+                assign_by_type(out.pcie_bandwidth_inst, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC:
+                assign_by_type(out.pcie_l0_to_recov_count_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC:
+                assign_by_type(out.pcie_replay_count_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC:
+                assign_by_type(out.pcie_replay_rover_count_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC:
+                assign_by_type(out.pcie_nak_sent_count_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC:
+                assign_by_type(out.pcie_nak_rcvd_count_acc, r); break;
+
+            // Residency / counters
+            case details::AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER:
+                assign_by_type(out.accumulation_counter, r); break;
+            case details::AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC:
+                assign_by_type(out.prochot_residency_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC:
+                assign_by_type(out.ppt_residency_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC:
+                assign_by_type(out.socket_thm_residency_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC:
+                assign_by_type(out.vr_thm_residency_acc, r); break;
+            case details::AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC:
+                assign_by_type(out.hbm_thm_residency_acc, r); break;
+
+            // VRAM max bandwidth
+            case details::AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH:
+                assign_by_type(out.vram_max_bandwidth, r); break;
+
+            // XGMI accumulators / link status (arrays)
+            case details::AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC: {
+                assign_vector(out.xgmi_read_data_acc, r, RSMI_MAX_NUM_XGMI_LINKS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC: {
+                assign_vector(out.xgmi_write_data_acc, r, RSMI_MAX_NUM_XGMI_LINKS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS: {
+                assign_vector(out.xgmi_link_status, r, RSMI_MAX_NUM_XGMI_LINKS); break;
+            }
+
+            // Current clocks (arrays) + uclk (scalar)
+            case details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK: {
+                assign_vector(out.current_gfxclks, r, RSMI_MAX_NUM_GFX_CLKS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::CURRENT_SOCCLK: {
+                assign_vector(out.current_socclks, r, RSMI_MAX_NUM_CLKS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::CURRENT_VCLK0: {
+                assign_vector(out.current_vclk0s, r, RSMI_MAX_NUM_CLKS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0: {
+                assign_vector(out.current_dclk0s, r, RSMI_MAX_NUM_CLKS); break;
+            }
+
+            case details::AMDGpuMetricAttributeId_t::CURRENT_UCLK:
+                assign_by_type(out.current_uclk, r); break;
+
+            case details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY:
+                assign_by_type(out.pcie_lc_perf_other_end_recovery, r); break;
+
+            // XCP stats
+            // Only fill in entry 0
+            case details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST: {
+                assign_vector(out.xcp_stats[0].gfx_busy_inst, r, RSMI_MAX_NUM_XCC); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::JPEG_BUSY: {
+                assign_vector(out.xcp_stats[0].jpeg_busy, r, RSMI_MAX_NUM_JPEG_ENG_V1); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::VCN_BUSY: {
+                assign_vector(out.xcp_stats[0].vcn_busy, r, RSMI_MAX_NUM_VCNS); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::GFX_BUSY_ACC: {
+                assign_vector(out.xcp_stats[0].gfx_busy_acc, r, RSMI_MAX_NUM_XCC); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC: {
+                assign_vector(out.xcp_stats[0].gfx_below_host_limit_ppt_acc, r, RSMI_MAX_NUM_XCC); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC: {
+                assign_vector(out.xcp_stats[0].gfx_below_host_limit_thm_acc, r, RSMI_MAX_NUM_XCC); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC: {
+                assign_vector(out.xcp_stats[0].gfx_low_utilization_acc, r, RSMI_MAX_NUM_XCC); break;
+            }
+            case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC: {
+                assign_vector(out.xcp_stats[0].gfx_below_host_limit_total_acc, r, RSMI_MAX_NUM_XCC); break;
+            }
+
+            default: break;
+        }
+
+    }
+
+    out.current_gfxclk = out.current_gfxclks[0];  // scalar fields mirror the first array entry
+    out.current_socclk = out.current_socclks[0];
+    out.current_vclk0 = out.current_vclk0s[0];
+    out.current_vclk1 = out.current_vclk0s[1];  // NOTE(review): vclk1 is taken from index 1 of the vclk0s array - confirm intended
+    out.current_dclk0 = out.current_dclk0s[0];
+    out.current_dclk1 = out.current_dclk0s[1];  // NOTE(review): same pattern as vclk1 - confirm intended
+
+    ss << __PRETTY_FUNCTION__
+       << " | ======= end ======= "
+       << " | Success "
+       << " | Returning = " << getRSMIStatusString(status_code)
+       << " |";
+    LOG_TRACE(ss);
+
+    return std::make_tuple(status_code, out);
+}
+
 AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics()
 {
     std::ostringstream ss;
@@ -3874,6 +4353,7 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
     auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
                                  sizeof(AMDGpuMetricsHeader_v1_t),
                                  &m_gpu_metrics_header);
+
     if ((status_code = ErrnoToRsmiStatus(op_result)) !=
         rsmi_status_t::RSMI_STATUS_SUCCESS) {
         ss << __PRETTY_FUNCTION__
@@ -3948,24 +4428,54 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data()
         return status_code;
     }
 
-    auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
-                                 m_gpu_metrics_header.m_structure_size,
-                                 m_gpu_metrics_ptr->get_metrics_table().get());
-    if ((status_code = ErrnoToRsmiStatus(op_result)) !=
-        rsmi_status_t::RSMI_STATUS_SUCCESS) {
-        ss << __PRETTY_FUNCTION__
-           << " | ======= end ======= "
-           << " | Fail "
-           << " | Device #: " << index()
-           << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
-           << " | Cause: readDevInfo(kDevGpuMetrics)"
-           << " | Returning = "
-           << getRSMIStatusString(status_code)
-           << " Could not read Metrics Header: "
-           << print_unsigned_int(m_gpu_metrics_header.m_structure_size)
-           << " |";
-        LOG_ERROR(ss);
-        return status_code;
+    if (m_is_dynamic_gpu_metrics_supported){
+
+        std::string file_name = "/sys/class/drm/card" +
+                                std::to_string(index()) +
+                                "/device/gpu_metrics";
+
+        // Parse blob to schema rows AMDGpuDynamicMetrics_t
+        AMDGpuDynamicMetrics_t parsed;
+        rsmi_status_t st = parsed.parse_from_file(file_name, m_gpu_metrics_header.m_structure_size);
+
+        if (st != RSMI_STATUS_SUCCESS) {
+            ss << __PRETTY_FUNCTION__
+               << " | ======= end ======= "
+               << " | Fail "
+               << " | Device #: " << index()
+               << " | Cause:
read_dynamic_gpu_metrics_file()" + << " | Returning rocmsmi_status = " << getRSMIStatusString(st) << " |"; + LOG_ERROR(ss); + return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + } + + // Store AMDGpuDynamicMetrics_t + auto* dyn = static_cast(m_gpu_metrics_ptr.get()); + status_code = dyn->set_parsed_dynamic(std::move(parsed)); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + return status_code; + } + + } else { + auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, + m_gpu_metrics_header.m_structure_size, + m_gpu_metrics_ptr->get_metrics_table().get()); + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ss); + return status_code; + } } // All metric units are pushed in. @@ -4027,6 +4537,9 @@ rsmi_status_t Device::setup_gpu_metrics_reading() return status_code; } + m_is_dynamic_gpu_metrics_supported = (static_cast>(gpu_metrics_flag_version) >= + static_cast>(AMDGpuMetricVersionFlags_t::kGpuMetricV19)); + // m_gpu_metrics_ptr.reset(); m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); @@ -4047,7 +4560,6 @@ rsmi_status_t Device::setup_gpu_metrics_reading() m_gpu_metrics_ptr->set_device_id(m_device_id); m_gpu_metrics_ptr->set_partition_id(m_partition_id); - // // m_gpu_metrics_ptr has the pointer to the proper object type/version. status_code = dev_read_gpu_metrics_all_data(); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {