From 83589929dbd3e4ee9138506b6a1192e14c906ab2 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 24 Oct 2023 21:12:05 -0500 Subject: [PATCH] rocm_smi_lib: Fix Refactoring gpu_metrics code Uses new support for 'gpu_metrics_v1_4' Code changes related to the following: * rsmi gpu_metrics APIs * rsmi gpu_metrics Logs * new data structure fields added in 1.4 * added APIs for all other existing metrics before 1.4 * added support to older metrics; 1.1, and 1.2 * public APIs renamed to start with prefix 'rsmi_dev_metrics_' * Unit tests updated * Examples updated Build changes related to the following: None Change-Id: Ibdaf031be9d916020b4049544dbd725858c7711d Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: 2c8ba4cae949807ee6aed6123e786fbe9517c4c6] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 1267 ++++++++- .../include/rocm_smi/rocm_smi_device.h | 14 +- .../include/rocm_smi/rocm_smi_gpu_metrics.h | 161 +- .../rocm-smi-lib/python_smi_tools/README.md | 4 +- .../rocm_smi/example/rocm_smi_example.cc | 383 ++- projects/rocm-smi-lib/src/rocm_smi.cc | 1022 ++++++- projects/rocm-smi-lib/src/rocm_smi_device.cc | 11 +- .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 2420 ++++++++++++----- .../functional/gpu_metrics_read.cc | 378 ++- .../functional/measure_api_execution_time.cc | 895 +++++- .../rocm_smi_test/functional/temp_read.cc | 2 +- 11 files changed, 5624 insertions(+), 933 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 3b5d41d72c..26d0ca8686 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -80,6 +80,7 @@ extern "C" { //! 
The number of points that make up a voltage-frequency curve definition #define RSMI_NUM_VOLTAGE_CURVE_POINTS 3 + /** * @brief Error codes retured by rocm_smi_lib functions */ @@ -910,6 +911,7 @@ typedef rsmi_od_volt_freq_data_t rsmi_od_volt_freq_data; */ struct metrics_table_header_t { // TODO(amd) Doxygen documents + // Note: This should match: AMDGpuMetricsHeader_v1_t /// \cond Ignore in docs. uint16_t structure_size; uint8_t format_revision; @@ -920,95 +922,175 @@ struct metrics_table_header_t { /** * @brief The following structure holds the gpu metrics values for a device. */ -// Below is the assumed version of gpu_metric data on the device. If the device -// is using this version, we can read data directly into rsmi_gpu_metrics_t. -// If the device is using an older format, a conversion of formats will be -// required. -// DGPU targets have a format version of 1. APU targets have a format version of -// 2. Currently, only version 1 (DGPU) gpu_metrics is supported. -#define RSMI_GPU_METRICS_API_FORMAT_VER 1 -// The content version increments when gpu_metrics is extended with new and/or -// existing field sizes are changed. 
-/** - * @brief The GPU metrics version 1 - */ -#define RSMI_GPU_METRICS_API_CONTENT_VER_1 1 -/** - * @brief The GPU metrics version 2 - */ -#define RSMI_GPU_METRICS_API_CONTENT_VER_2 2 -/** - * @brief The GPU metrics version 3 - */ -#define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 -/** - * @brief This should match NUM_HBM_INSTANCES - */ -#define RSMI_NUM_HBM_INSTANCES 4 /** * @brief Unit conversion factor for HBM temperatures */ #define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 +/** + * @brief This should match kRSMI_MAX_NUM_HBM_INSTANCES + */ +#define RSMI_NUM_HBM_INSTANCES 4 + +/** + * @brief This should match kRSMI_MAX_NUM_VCN + */ +#define RSMI_MAX_NUM_VCN 4 + +/** + * @brief This should match kRSMI_MAX_NUM_CLKS + */ +#define RSMI_MAX_NUM_CLKS 4 + +/** + * @brief This should match kRSMI_MAX_NUM_XGMI_LINKS + */ +#define RSMI_MAX_NUM_XGMI_LINKS 8 + +/** + * @brief This should match kRSMI_MAX_NUM_GFX_CLKS + */ +#define RSMI_MAX_NUM_GFX_CLKS 8 + + typedef struct { // TODO(amd) Doxygen documents + // Note: This structure is extended to fit the needs of different GPU metric + // versions when exposing data through the structure. + // Depending on the version, some data members will hold data, and + // some will not. A good example is the set of 'current clocks': + // - current_gfxclk, current_socclk, current_vclk0, current_dclk0 + // These are single-valued data members, up to version 1.3. + // For version 1.4 and up these are multi-valued data members (arrays) + // and their counterparts; + // - current_gfxclks[], current_socclks[], current_vclk0s[], + // current_dclk0s[] + // will hold the data /// \cond Ignore in docs. 
+ + /* + * v1.0 Base + */ struct metrics_table_header_t common_header; -/* Temperature */ - uint16_t temperature_edge; - uint16_t temperature_hotspot; - uint16_t temperature_mem; - uint16_t temperature_vrgfx; - uint16_t temperature_vrsoc; - uint16_t temperature_vrmem; + // Temperature + uint16_t temperature_edge; + uint16_t temperature_hotspot; + uint16_t temperature_mem; + uint16_t temperature_vrgfx; + uint16_t temperature_vrsoc; + uint16_t temperature_vrmem; -/* Utilization */ - uint16_t average_gfx_activity; - uint16_t average_umc_activity; // memory controller - uint16_t average_mm_activity; // UVD or VCN + // Utilization + uint16_t average_gfx_activity; + uint16_t average_umc_activity; // memory controller + uint16_t average_mm_activity; // UVD or VCN -/* Power/Energy */ - uint16_t average_socket_power; - uint64_t energy_accumulator; // v1 mod. (32->64) + // Power/Energy + uint16_t average_socket_power; + uint64_t energy_accumulator; // v1 mod. (32->64) -/* Driver attached timestamp (in ns) */ - uint64_t system_clock_counter; // v1 mod. (moved from top of struct) + // Driver attached timestamp (in ns) + uint64_t system_clock_counter; // v1 mod. 
(moved from top of struct) -/* Average clocks */ - uint16_t average_gfxclk_frequency; - uint16_t average_socclk_frequency; - uint16_t average_uclk_frequency; - uint16_t average_vclk0_frequency; - uint16_t average_dclk0_frequency; - uint16_t average_vclk1_frequency; - uint16_t average_dclk1_frequency; + // Average clocks + uint16_t average_gfxclk_frequency; + uint16_t average_socclk_frequency; + uint16_t average_uclk_frequency; + uint16_t average_vclk0_frequency; + uint16_t average_dclk0_frequency; + uint16_t average_vclk1_frequency; + uint16_t average_dclk1_frequency; -/* Current clocks */ - uint16_t current_gfxclk; - uint16_t current_socclk; - uint16_t current_uclk; - uint16_t current_vclk0; - uint16_t current_dclk0; - uint16_t current_vclk1; - uint16_t current_dclk1; + // Current clocks + uint16_t current_gfxclk; + uint16_t current_socclk; + uint16_t current_uclk; + uint16_t current_vclk0; + uint16_t current_dclk0; + uint16_t current_vclk1; + uint16_t current_dclk1; -/* Throttle status */ - uint32_t throttle_status; + // Throttle status + uint32_t throttle_status; -/* Fans */ - uint16_t current_fan_speed; + // Fans + uint16_t current_fan_speed; -/* Link width/speed */ - uint16_t pcie_link_width; // v1 mod.(8->16) - uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) + // Link width/speed + uint16_t pcie_link_width; // v1 mod.(8->16) + uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. 
(8->16) - uint16_t padding; // new in v1 - uint32_t gfx_activity_acc; // new in v1 - uint32_t mem_activity_acc; // new in v1 - uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 + /* + * v1.1 additions + */ + uint32_t gfx_activity_acc; // new in v1 + uint32_t mem_activity_acc; // new in v1 + uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 + + + /* + * v1.2 additions + */ + // PMFW attached timestamp (10ns resolution) + uint64_t firmware_timestamp; + + + /* + * v1.3 additions + */ + // Voltage (mV) + uint16_t voltage_soc; + uint16_t voltage_gfx; + uint16_t voltage_mem; + + // Throttle status + uint64_t indep_throttle_status; + + + /* + * v1.4 additions + */ + // Power (Watts) + uint16_t current_socket_power; + + // Utilization (%) + uint16_t vcn_activity[RSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode) + + // Clock Lock Status. Each bit corresponds to clock instance + uint32_t gfxclk_lock_status; + + // XGMI bus width and bitrate (in Gbps) + uint16_t xgmi_link_width; + uint16_t xgmi_link_speed; + + // PCIE accumulated bandwidth (GB/sec) + uint64_t pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth (GB/sec) + uint64_t pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + uint64_t pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + uint64_t pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + uint64_t pcie_replay_rover_count_acc; + + // XGMI accumulated data transfer size(KiloBytes) + uint64_t xgmi_read_data_acc[RSMI_MAX_NUM_XGMI_LINKS]; + uint64_t xgmi_write_data_acc[RSMI_MAX_NUM_XGMI_LINKS]; + + // Current clocks (multi-valued; used by v1.4 and up) + uint16_t current_gfxclks[RSMI_MAX_NUM_GFX_CLKS]; + uint16_t current_socclks[RSMI_MAX_NUM_CLKS]; + uint16_t current_vclk0s[RSMI_MAX_NUM_CLKS]; + uint16_t current_dclk0s[RSMI_MAX_NUM_CLKS]; + /// \endcond } rsmi_gpu_metrics_t; @@ -4272,6 +4354,1055 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind); 
/** @} */ // end of EvntNotif + +/*****************************************************************************/ +/** @defgroup GPUMetrics GPU Metric Functions + * These functions are used to get granular information about all counters + * available in GPU Metrics. + * @{ + */ + +/** + * Metric multi-valued counter types + */ +typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES]; +typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCN]; +typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; +typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; +typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS]; +typedef uint16_t GPUMetricCurrSocClk_t[RSMI_MAX_NUM_CLKS]; +typedef uint16_t GPUMetricCurrVClk0_t[RSMI_MAX_NUM_CLKS]; +typedef uint16_t GPUMetricCurrDClk0_t[RSMI_MAX_NUM_CLKS]; + + +/****** + * Metric single-valued counter types + */ + +/** + * @brief Get the 'temp_hotspot' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_hotspot' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] hotspot_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value); + +/** + * @brief Get the 'temp_vrsoc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_vrsoc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] vrsoc_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value); + +/** + * @brief Get the 'curr_socket_power' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'socket_power' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); + +/** + * @brief Get the 'avg_gfx_activity' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'gfx_activity' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] gfx_activity_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value); + +/** + * @brief Get the 'avg_umc_activity' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'umc_activity' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] umc_activity_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value); + +/** + * @brief Get the 'energy_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'energy_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] energy_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value); + +/** + * @brief Get the 'system_clock_counter' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'system_clock_counter' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] system_clock_counter_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value); + +/** + * @brief Get the 'firmware_timestamp' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'firmware_timestamp' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] firmware_timestamp_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value); + +/** + * @brief Get the 'throttle_status' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t in which + * the 'throttle_status' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] throttle_status_value a pointer to uint32_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value); + +/** + * @brief Get the 'pcie_link_width' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'pcie_link_width' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_link_width_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value); + +/** + * @brief Get the 'pcie_link_speed' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'pcie_link_speed' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_link_speed_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value); + +/** + * @brief Get the 'xgmi_link_width' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'xgmi_link_width' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] xgmi_link_width_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value); + +/** + * @brief Get the 'xgmi_link_speed' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'xgmi_link_speed' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] xgmi_link_speed_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value); + +/** + * @brief Get the 'gfxclk_lock_status' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t in which + * the 'gfxclk_lock_status' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] gfxclk_lock_status_value a pointer to uint32_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value); + +/** + * @brief Get the 'gfx_activity_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t in which + * the 'gfx_activity_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] gfx_activity_acc_value a pointer to uint32_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value); + +/** + * @brief Get the 'mem_activity_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t in which + * the 'mem_activity_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] mem_activity_acc_value a pointer to uint32_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value); + +/** + * @brief Get the 'pcie_bandwidth_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'pcie_bandwidth_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_bandwidth_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value); + +/** + * @brief Get the 'pcie_bandwidth_inst' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'pcie_bandwidth_inst' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_bandwidth_inst_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value); + +/** + * @brief Get the 'pcie_l0_recov_count_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'pcie_l0_recov_count_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); + +/** + * @brief Get the 'pcie_replay_count_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'pcie_replay_count_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); + +/** + * @brief Get the 'pcie_replay_rover_count_acc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'pcie_replay_rover_count_acc' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); + +/** + * @brief Get the 'curr_uclk' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_uclk' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] uclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value); + + +/****** + * Metric multi-valued counter types + */ + +/** + * @brief Get the 'temp_hbm' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_hbm' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] temp_hbm_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding a 4 (RSMI_NUM_HBM_INSTANCES) + * element array (GPUMetricTempHbm_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value); + +/** + * @brief Get the 'vcn_activity' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'vcn_activity' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCN) + * element array (GPUMetricVcnActivity_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value); + +/** + * @brief Get the 'xgmi_read_data' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'xgmi_read_data' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] xgmi_read_data_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) + * element array (GPUMetricXgmiReadDataAcc_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value); + +/** + * @brief Get the 'xgmi_write_data' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'xgmi_write_data' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] xgmi_write_data_acc_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) + * element array (GPUMetricXgmiWriteDataAcc_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value); + +/** + * @brief Get the 'curr_gfxclk' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_gfxclk' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_gfxclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_GFX_CLKS) + * element array (GPUMetricCurrGfxClk_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value); + +/** + * @brief Get the 'curr_socclk' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_socclk' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_socclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) + * element array (GPUMetricCurrSocClk_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value); + +/** + * @brief Get the 'curr_vclk0' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_vclk0' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) + * element array (GPUMetricCurrVClk0_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value); + +/** + * @brief Get the 'curr_dclk0' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_dclk0' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) + * element array (GPUMetricCurrDClk0_t) + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value); + +/** + * @brief Get the 'temp_edge' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_edge' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] edge_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value); + +/** + * @brief Get the 'temp_vrgfx' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_vrgfx' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] vrgfx_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value); + +/** + * @brief Get the 'temp_vrmem' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'temp_vrmem' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] vrmem_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value); + +/** + * @brief Get the 'avg_mm_activity' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_mm_activity' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] mm_activity_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value); + +/** + * @brief Get the 'curr_vclk1' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_vclk1' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value); + +/** + * @brief Get the 'curr_dclk1' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_dclk1' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value); + +/** + * @brief Get the 'indep_throttle_status' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t in which + * the 'indep_throttle_status' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] throttle_status_value a pointer to uint64_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value); + +/** + * @brief Get the 'avg_socket_power' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_socket_power' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); + +/** + * @brief Get the 'curr_fan_speed' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'curr_fan_speed' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] fan_speed_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value); + +/** + * @brief Get the 'avg_gfx_clock_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_gfx_clock_frequency' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_soc_clock_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_soc_clock_frequency' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_uclock_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_uclock_frequency' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_vclock0_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_vclock0_frequency' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_dclock0_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_dclock0_frequency' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_vclock1_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_vclock1_frequency' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'avg_dclock1_frequency' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'avg_dclock1_frequency' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); + +/** + * @brief Get the 'volt_soc' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'volt_soc' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] voltage_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value); + +/** + * @brief Get the 'volt_gfx' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'volt_gfx' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] voltage_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value); + +/** + * @brief Get the 'volt_mem' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'volt_mem' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] voltage_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value); + +/** + * @brief Get the 'metrics_header_info' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a metrics_table_header_t in which + * the 'metrics_header_info' will be stored + * + * @param[in] dv_ind a device index + * + * @param[inout] header_value a pointer to metrics_table_header_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header_value); + +/** + * @brief Get the 'xcd_counter' from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t in which + * the 'xcd_counter' will stored + * + * @param[in] dv_ind a device index + * + * @param[inout] xcd_counter_value a pointer to uint16_t to which the device gpu + * metric unit will be stored + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +rsmi_status_t +rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value); + +/** + * @brief Get the log from the GPU metrics associated with the device + * + * @details Given a device index @p dv_ind it will log all the gpu metric info + * related to the device. The 'logging' feature must be on. + * + * @param[in] dv_ind a device index + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * + */ +rsmi_status_t +rsmi_dev_metrics_log_get(uint32_t dv_ind); + + #ifdef __cplusplus } #endif // __cplusplus diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 5712affa87..cf8e650788 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -53,6 +53,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" @@ -220,7 +221,7 @@ class Device { void set_evt_notif_anon_fd(uint32_t fd) { evt_notif_anon_fd_ = static_cast(fd);} int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;} - metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;} + void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -230,17 +231,15 @@ class Device { template std::string readBootPartitionState(uint32_t dv_ind); rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); - void dev_set_gpu_metric(GpuMetricsBasePtr gpu_metrics_ptr) { m_gpu_metrics_ptr = gpu_metrics_ptr; }; + void dev_set_gpu_metric(GpuMetricsBasePtr gpu_metrics_ptr) { m_gpu_metrics_ptr = std::move(gpu_metrics_ptr); }; GpuMetricsBasePtr& dev_get_gpu_metric() { return m_gpu_metrics_ptr; }; const AMDGpuMetricsHeader_v1_t& 
dev_get_metrics_header() {return m_gpu_metrics_header; } rsmi_status_t setup_gpu_metrics_reading(); rsmi_status_t dev_read_gpu_metrics_header_data(); rsmi_status_t dev_read_gpu_metrics_all_data(); - rsmi_status_t dev_log_gpu_metrics(); rsmi_status_t run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values); - - template - rsmi_status_t dev_run_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, T& metric_value); + rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); + AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); private: @@ -265,7 +264,6 @@ class Device { bool returnWriteErr = false); rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); - uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set #include #include #include #include +#include #include @@ -97,12 +99,11 @@ struct AMDGpuMetricsHeader_v1_t }; -struct AMDGpuMetricsBase_t; -using AMDGpuMetricsBaseRef = AMDGpuMetricsBase_t&; struct AMDGpuMetricsBase_t { virtual ~AMDGpuMetricsBase_t() = default; }; +using AMDGpuMetricsBaseRef = AMDGpuMetricsBase_t&; struct AMDGpuMetrics_v11_t : AMDGpuMetricsBase_t { @@ -316,7 +317,7 @@ struct AMDGpuMetrics_v14_t : AMDGpuMetricsBase_t uint16_t m_temperature_vrsoc; // Power (Watts) - uint16_t m_curr_socket_power; + uint16_t m_current_socket_power; // Utilization (%) uint16_t m_average_gfx_activity; @@ -340,8 +341,8 @@ struct AMDGpuMetrics_v14_t : AMDGpuMetricsBase_t uint16_t m_pcie_link_speed; // in 0.1 GT/s // XGMI bus width and bitrate (in Gbps) - uint16_t m_xgmi_link_width; - uint16_t m_xgmi_link_speed; + uint16_t m_xgmi_link_width; + uint16_t m_xgmi_link_speed; // Utilization Accumulated (%) uint32_t m_gfx_activity_acc; @@ -353,6 +354,15 @@ struct AMDGpuMetrics_v14_t : AMDGpuMetricsBase_t // PCIE instantaneous bandwidth (GB/sec) uint64_t m_pcie_bandwidth_inst; + // PCIE L0 to recovery state transition accumulated 
count + uint64_t m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + uint64_t m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + uint64_t m_pcie_replay_rover_count_acc; + // XGMI accumulated data transfer size(KiloBytes) uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; @@ -371,33 +381,52 @@ struct AMDGpuMetrics_v14_t : AMDGpuMetricsBase_t }; using AMGpuMetricsLatest_t = AMDGpuMetrics_v14_t; +/** + * This is GPU Metrics version that gets to public access. + * It is a unique/unified version (joined) of the previous + * versions (1.2 to latest 1.4). Data fields not used/relevant + * for the current driver version and GPU metrics version will + * not be populated, and therefore 0s (zeroes). + * + * If/in case anything new is added to a new version and there is + * a requirement to make it publicly available, into a single static + * table/form/struct, then it should be added here. + * + */ +using AMGpuMetricsPublicLatest_t = rsmi_gpu_metrics_t; +using AMGpuMetricsPublicLatestTupl_t = std::tuple; + +using GpuMetricU16Tbl_t = std::vector; +using GpuMetricU32Tbl_t = std::vector; +using GpuMetricU64Tbl_t = std::vector; using GPUMetricTempHbm_t = decltype(AMDGpuMetrics_v13_t::m_temperature_hbm); -using GPUMetricTempHbmTbl_t = std::array; +using GPUMetricTempHbmTbl_t = GpuMetricU16Tbl_t; using GPUMetricVcnActivity_t = decltype(AMDGpuMetrics_v14_t::m_vcn_activity); -using GPUMetricVcnActivityTbl_t = std::array; +using GPUMetricVcnActivityTbl_t = GpuMetricU16Tbl_t; using GPUMetricXgmiReadDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_read_data_acc); using GPUMetricXgmiWriteDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_write_data_acc); -using GPUMetricXgmiAccTbl_t = std::array; +using GPUMetricXgmiAccTbl_t = GpuMetricU64Tbl_t; using GPUMetricCurrGfxClk_t = decltype(AMDGpuMetrics_v14_t::m_current_gfxclk); -using GPUMetricCurrGfxClkTbl_t = std::array; +using 
GPUMetricCurrGfxClkTbl_t = GpuMetricU16Tbl_t; using GPUMetricCurrSocClk_t = decltype(AMDGpuMetrics_v14_t::m_current_socclk); -using GPUMetricCurrSocClkTbl_t = std::array; +using GPUMetricCurrSocClkTbl_t = GpuMetricU16Tbl_t; using GPUMetricCurrVClk0_t = decltype(AMDGpuMetrics_v14_t::m_current_vclk0); -using GPUMetricCurrVClkTbl_t = std::array; +using GPUMetricCurrVClkTbl_t = GpuMetricU16Tbl_t; using GPUMetricCurrDClk0_t = decltype(AMDGpuMetrics_v14_t::m_current_dclk0); -using GPUMetricCurrDClkTbl_t = std::array; +using GPUMetricCurrDClkTbl_t = GpuMetricU16Tbl_t; -/* +//// +/************************************************************ * When a new metric table is released, we have to update: * - 1. Constants related to the new metrics added; + 1. Constants related to the new metrics added (if any); (ie: kRSMI_MAX_NUM_XGMI_LINKS) 2. Constants related to new version: (ie: kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1) @@ -411,19 +440,35 @@ using GPUMetricCurrDClkTbl_t = std::array; 5. AMGpuMetricsLatest_t -> Newest AMDGpuMetrics_v1x_t 6. AMDGpuMetricVersionFlags_t (ie: AMDGpuMetricVersionFlags_t::kGpuMetricV14) + 7. Create the proper API using granular controls used by + rsmi_dev_gpu_metrics_info_query() (ie: rsmi_dev_temp_hotspot_get()) + + -> Remember to check/update: + - AMDGpuMetricsUnitType_t + - amdgpu_metrics_unit_type_translation_table + - AMDGpuMetrics_v1X_t structure in question + - populate_metrics_dynamic_tbl() + - copy_internal_to_external_metrics() + - init_max_public_gpu_matrics() */ using AMDGpuMetricTypeId_t = uint32_t; using AMDGpuMetricTypeIdSeq_t = uint32_t; using AMDGpuMetricVersionFlagId_t = uint32_t; +//// +/* + * + * These are used as Metric class, so Metric Units can be properly grouped. + * Each Metric Unit (or a set of them) is related to a Metric class. 
+ * + */ enum class AMDGpuMetricsClassId_t : AMDGpuMetricTypeId_t { - kGpuMetricHeader = 0, + kGpuMetricHeader, kGpuMetricTemperature, kGpuMetricUtilization, kGpuMetricPowerEnergy, - kGpuMetricSystemClockCounter, kGpuMetricAverageClock, kGpuMetricCurrentClock, kGpuMetricThrottleStatus, @@ -435,6 +480,22 @@ enum class AMDGpuMetricsClassId_t : AMDGpuMetricTypeId_t }; using AMDGpuMetricsClassIdTranslationTbl_t = std::map; +/* + * + * These are the Metric units. Each one represents a specific metric we want + * to either store or retrieve. + * + * This also gives a more granular control over to what exactly is needed, + * helping to generalize metric queries. + * + * Each type a new (non-existing metric unit) metric is added, it should be + * updated here. + * - Their names matches (closely, regardless of their version) the name of + * the data structure members they represent. + * + * All metric units not flagged as v1.4 were either part of the base or + * added/changed up to v1.3 + */ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t { // kGpuMetricTemperature counters @@ -452,7 +513,7 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricAvgMmActivity, kMetricGfxActivityAccumulator, kMetricMemActivityAccumulator, - kMetricVcnActivity, + kMetricVcnActivity, //v1.4 // kGpuMetricAverageClock counters kMetricAvgGfxClockFrequency, @@ -464,11 +525,11 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricAvgDClock1Frequency, // kGpuMetricCurrentClock counters - kMetricCurrGfxClock, - kMetricCurrSocClock, + kMetricCurrGfxClock, //v1.4: Changed to multi-valued + kMetricCurrSocClock, //v1.4: Changed to multi-valued kMetricCurrUClock, - kMetricCurrVClock0, - kMetricCurrDClock0, + kMetricCurrVClock0, //v1.4: Changed to multi-valued + kMetricCurrDClock0, //v1.4: Changed to multi-valued kMetricCurrVClock1, kMetricCurrDClock1, @@ -477,7 +538,7 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricIndepThrottleStatus, // 
kGpuMetricGfxClkLockStatus counters - kMetricGfxClkLockStatus, + kMetricGfxClkLockStatus, //v1.4 // kGpuMetricCurrentFanSpeed counters kMetricCurrFanSpeed, @@ -485,22 +546,25 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t // kGpuMetricLinkWidthSpeed counters kMetricPcieLinkWidth, kMetricPcieLinkSpeed, - kMetricPcieBandwidthAccumulator, - kMetricPcieBandwidthInst, - kMetricXgmiLinkWidth, - kMetricXgmiLinkSpeed, - kMetricXgmiReadDataAccumulator, - kMetricXgmiWriteDataAccumulator, + kMetricPcieBandwidthAccumulator, //v1.4 + kMetricPcieBandwidthInst, //v1.4 + kMetricXgmiLinkWidth, //v1.4 + kMetricXgmiLinkSpeed, //v1.4 + kMetricXgmiReadDataAccumulator, //v1.4 + kMetricXgmiWriteDataAccumulator, //v1.4 + kMetricPcieL0RecovCountAccumulator, //v1.4 + kMetricPcieReplayCountAccumulator, //v1.4 + kMetricPcieReplayRollOverCountAccumulator, //v1.4 // kGpuMetricPowerEnergy counters kMetricAvgSocketPower, - kMetricCurrSocketPower, - kMetricEnergyAccumulator, + kMetricCurrSocketPower, //v1.4 + kMetricEnergyAccumulator, //v1.4 // kGpuMetricVoltage counters - kMetricVoltageSoc, - kMetricVoltageGfx, - kMetricVoltageMem, + kMetricVoltageSoc, //v1.3 + kMetricVoltageGfx, //v1.3 + kMetricVoltageMem, //v1.3 // kGpuMetricTimestamp counters kMetricTSClockCounter, @@ -526,8 +590,12 @@ struct AMDGpuDynamicMetricsValue_t using AMDGpuDynamicMetricTblValues_t = std::vector; using AMDGpuDynamicMetricsTbl_t = std::map>; -// Note: All supported metric versions are listed her -// If not here, they are not supported + +/* + * + * Note: All supported metric versions are listed here, otherwise unsupported + * + */ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t { kGpuMetricNone = 0x0, @@ -537,7 +605,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV13 = (0x1 << 3), kGpuMetricV14 = (0x1 << 4), }; -using AMDGpuMetricVersionTranslationTbl_t = std::map; +using AMDGpuMetricVersionTranslationTbl_t = std::map; class GpuMetricsBase_t; @@ -551,7 
+619,7 @@ class GpuMetricsBase_t virtual AMDGpuMetricsBaseRef get_metrics_table() = 0; virtual AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() = 0; virtual rsmi_status_t populate_metrics_dynamic_tbl() = 0; - + virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0; virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { return m_metrics_dynamic_tbl; } @@ -575,7 +643,7 @@ class GpuMetricsBase_v11_t final : public GpuMetricsBase_t AMDGpuMetricsBaseRef get_metrics_table() override { - return m_gpu_metrics_tbl; + return this->m_gpu_metrics_tbl; } AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override @@ -584,6 +652,7 @@ class GpuMetricsBase_v11_t final : public GpuMetricsBase_t } rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; private: @@ -602,7 +671,7 @@ class GpuMetricsBase_v12_t final : public GpuMetricsBase_t AMDGpuMetricsBaseRef get_metrics_table() override { - return m_gpu_metrics_tbl; + return this->m_gpu_metrics_tbl; } AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override @@ -611,7 +680,7 @@ class GpuMetricsBase_v12_t final : public GpuMetricsBase_t } rsmi_status_t populate_metrics_dynamic_tbl() override; - + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; private: AMDGpuMetrics_v12_t m_gpu_metrics_tbl; @@ -629,7 +698,7 @@ class GpuMetricsBase_v13_t final : public GpuMetricsBase_t AMDGpuMetricsBaseRef get_metrics_table() override { - return m_gpu_metrics_tbl; + return this->m_gpu_metrics_tbl; } AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override @@ -638,6 +707,7 @@ class GpuMetricsBase_v13_t final : public GpuMetricsBase_t } rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; private: @@ -656,7 +726,7 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t AMDGpuMetricsBaseRef 
get_metrics_table() override { - return m_gpu_metrics_tbl; + return this->m_gpu_metrics_tbl; } AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override @@ -665,6 +735,7 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t } rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; private: @@ -677,5 +748,9 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit } // namespace amd::smi -#endif // ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ +rsmi_status_t +rsmi_dev_gpu_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t& header_value); + + +#endif // ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ diff --git a/projects/rocm-smi-lib/python_smi_tools/README.md b/projects/rocm-smi-lib/python_smi_tools/README.md index bbb8652019..d9cc2a2a6b 100644 --- a/projects/rocm-smi-lib/python_smi_tools/README.md +++ b/projects/rocm-smi-lib/python_smi_tools/README.md @@ -1,6 +1,4 @@ -## Synopsis - -Radeon Open Compute Platform - System Management Interface - Command Line tool. 
+## Radeon Open Compute (ROCm) - System Management Interface - Command Line Tool This tool acts as a command line interface for manipulating and monitoring the amdgpu kernel, and is intended to replace diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index cef1fd1263..546d72398c 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -47,10 +47,12 @@ #include #include -#include -#include +#include #include +#include #include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -166,9 +168,9 @@ void print_function_header_with_rsmi_ret( } static void print_test_header(const char *str, uint32_t dv_ind) { - std::cout << "********************************" << "\n"; + std::cout << "******************************************" << "\n"; std::cout << "*** " << str << "\n"; - std::cout << "********************************" << "\n"; + std::cout << "******************************************" << "\n"; std::cout << "Device index: " << dv_ind << "\n"; } @@ -728,6 +730,40 @@ template constexpr float convert_mw_to_w(T mw) { return static_cast(mw / 1000.0); } +template +auto print_error_or_value(rsmi_status_t status_code, const T& metric) { + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + if constexpr (std::is_array_v) { + auto idx = uint16_t(0); + auto str_values = std::string(); + const auto num_elems = static_cast(std::end(metric) - std::begin(metric)); + str_values = ("\n\t\t num of values: " + std::to_string(num_elems) + "\n"); + for (const auto& el : metric) { + str_values += "\t\t [" + std::to_string(idx) + "]: " + std::to_string(el) + "\n"; + ++idx; + } + return str_values; + } + else if constexpr ((std::is_same_v) || + (std::is_same_v) || + (std::is_same_v)) { + return std::to_string(metric); + } + } + else { + return ("\n\t\tStatus: [" + 
std::to_string(status_code) + "] " + "-> " + amd::smi::getRSMIStatusString(status_code)); + } +}; + +template +std::string print_unsigned_int(T value) { + std::stringstream ss; + ss << static_cast(value | 0); + + return ss.str(); +} + + int main() { rsmi_status_t ret; @@ -742,7 +778,7 @@ int main() { rsmi_dev_perf_level_t pfl; rsmi_frequencies_t f; uint32_t num_monitor_devs = 0; - rsmi_gpu_metrics_t p; + rsmi_gpu_metrics_t gpu_metrics; std::string val_str; RSMI_POWER_TYPE power_type = RSMI_INVALID_POWER; @@ -804,43 +840,308 @@ int main() { std::cout << "Not Supported\n"; } - ret = rsmi_dev_gpu_metrics_info_get(i, &p); - print_test_header("GPU METRICS", i); - print_function_header_with_rsmi_ret(ret, - "rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &p)"); - std::cout << "\t**p.common_header.content_revision: " << std::dec - << p.common_header.content_revision << "\n"; - std::cout << "\t**p.common_header.format_revision: " << std::dec - << p.common_header.format_revision << "\n"; - std::cout << "\t**p.average_gfxclk_frequency: " << std::dec - << p.average_gfxclk_frequency << "\n"; - std::cout << "\t**p.average_socclk_frequency: " << std::dec - << p.average_socclk_frequency << "\n"; - std::cout << "\t**p.average_uclk_frequency: " << std::dec - << p.average_uclk_frequency << "\n"; - std::cout << "\t**p.average_vclk0_frequency: " << std::dec - << p.average_vclk0_frequency << "\n"; - std::cout << "\t**p.average_dclk0_frequency: " << std::dec - << p.average_dclk0_frequency << "\n"; - std::cout << "\t**p.average_vclk1_frequency: " << std::dec - << p.average_vclk1_frequency << "\n"; - std::cout << "\t**p.average_dclk1_frequency: " << std::dec - << p.average_dclk1_frequency << "\n"; + // + std::cout << "\n"; + print_test_header("GPU METRICS: Using static struct (Backwards Compatibility) ", i); + print_function_header_with_rsmi_ret(ret, "rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &gpu_metrics)"); + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics); + + 
std::cout << "\t**.common_header.format_revision : " + << print_unsigned_int(gpu_metrics.common_header.format_revision) << "\n"; + std::cout << "\t**.common_header.content_revision : " + << print_unsigned_int(gpu_metrics.common_header.content_revision) << "\n"; + + std::cout << "\t**.temperature_edge : " << std::dec + << gpu_metrics.temperature_edge << "\n"; + std::cout << "\t**.temperature_hotspot : " << std::dec + << gpu_metrics.temperature_hotspot << "\n"; + std::cout << "\t**.temperature_mem : " << std::dec + << gpu_metrics.temperature_mem << "\n"; + std::cout << "\t**.temperature_vrgfx : " << std::dec + << gpu_metrics.temperature_vrgfx << "\n"; + std::cout << "\t**.temperature_vrsoc : " << std::dec + << gpu_metrics.temperature_vrsoc << "\n"; + std::cout << "\t**.temperature_vrmem : " << std::dec + << gpu_metrics.temperature_vrmem << "\n"; + std::cout << "\t**.average_gfx_activity : " << std::dec + << gpu_metrics.average_gfx_activity << "\n"; + std::cout << "\t**.average_umc_activity : " << std::dec + << gpu_metrics.average_umc_activity << "\n"; + std::cout << "\t**.average_mm_activity : " << std::dec + << gpu_metrics.average_mm_activity << "\n"; + std::cout << "\t**.average_socket_power : " << std::dec + << gpu_metrics.average_socket_power << "\n"; + std::cout << "\t**.energy_accumulator : " << std::dec + << gpu_metrics.energy_accumulator << "\n"; + std::cout << "\t**.system_clock_counter : " << std::dec + << gpu_metrics.system_clock_counter << "\n"; + std::cout << "\t**.average_gfxclk_frequency : " << std::dec + << gpu_metrics.average_gfxclk_frequency << "\n"; + std::cout << "\t**.average_socclk_frequency : " << std::dec + << gpu_metrics.average_socclk_frequency << "\n"; + std::cout << "\t**.average_uclk_frequency : " << std::dec + << gpu_metrics.average_uclk_frequency << "\n"; + std::cout << "\t**.average_vclk0_frequency : " << std::dec + << gpu_metrics.average_vclk0_frequency<< "\n"; + std::cout << "\t**.average_dclk0_frequency : " << std::dec + << 
gpu_metrics.average_dclk0_frequency << "\n"; + std::cout << "\t**.average_vclk1_frequency : " << std::dec + << gpu_metrics.average_vclk1_frequency << "\n"; + std::cout << "\t**.average_dclk1_frequency : " << std::dec + << gpu_metrics.average_dclk1_frequency << "\n"; + std::cout << "\t**.current_gfxclk : " << std::dec + << gpu_metrics.current_gfxclk << "\n"; + std::cout << "\t**.current_socclk : " << std::dec + << gpu_metrics.current_socclk << "\n"; + std::cout << "\t**.current_uclk : " << std::dec + << gpu_metrics.current_uclk << "\n"; + std::cout << "\t**.current_vclk0 : " << std::dec + << gpu_metrics.current_vclk0 << "\n"; + std::cout << "\t**.current_dclk0 : " << std::dec + << gpu_metrics.current_dclk0 << "\n"; + std::cout << "\t**.current_vclk1 : " << std::dec + << gpu_metrics.current_vclk1 << "\n"; + std::cout << "\t**.current_dclk1 : " << std::dec + << gpu_metrics.current_dclk1 << "\n"; + std::cout << "\t**.throttle_status : " << std::dec + << gpu_metrics.throttle_status << "\n"; + std::cout << "\t**.current_fan_speed : " << std::dec + << gpu_metrics.current_fan_speed << "\n"; + std::cout << "\t**.pcie_link_width : " << std::dec + << gpu_metrics.pcie_link_width << "\n"; + std::cout << "\t**.pcie_link_speed : " << std::dec + << gpu_metrics.pcie_link_speed << "\n"; + std::cout << "\t**.gfx_activity_acc : " << std::dec + << gpu_metrics.gfx_activity_acc << "\n"; + std::cout << "\t**.mem_activity_acc : " << std::dec + << gpu_metrics.mem_activity_acc << "\n"; + std::cout << "\t**.firmware_timestamp : " << std::dec + << gpu_metrics.firmware_timestamp << "\n"; + std::cout << "\t**.voltage_soc : " << std::dec + << gpu_metrics.voltage_soc << "\n"; + std::cout << "\t**.voltage_gfx : " << std::dec + << gpu_metrics.voltage_gfx << "\n"; + std::cout << "\t**.voltage_mem : " << std::dec + << gpu_metrics.voltage_mem << "\n"; + std::cout << "\t**.indep_throttle_status : " << std::dec + << gpu_metrics.indep_throttle_status << "\n"; + std::cout << "\t**.current_socket_power : " 
<< std::dec + << gpu_metrics.current_socket_power << "\n"; + std::cout << "\t**.gfxclk_lock_status : " << std::dec + << gpu_metrics.gfxclk_lock_status << "\n"; + std::cout << "\t**.xgmi_link_width : " << std::dec + << gpu_metrics.xgmi_link_width << "\n"; + std::cout << "\t**.xgmi_link_speed : " << std::dec + << gpu_metrics.xgmi_link_speed << "\n"; + std::cout << "\t**.pcie_bandwidth_acc : " << std::dec + << gpu_metrics.pcie_bandwidth_acc << "\n"; + std::cout << "\t**.pcie_bandwidth_inst : " << std::dec + << gpu_metrics.pcie_bandwidth_inst << "\n"; + std::cout << "\t**.pcie_l0_to_recov_count_acc : " << std::dec + << gpu_metrics.pcie_l0_to_recov_count_acc << "\n"; + std::cout << "\t**.pcie_replay_count_acc : " << std::dec + << gpu_metrics.pcie_replay_count_acc << "\n"; + std::cout << "\t**.pcie_replay_rover_count_acc : " << std::dec + << gpu_metrics.pcie_replay_rover_count_acc << "\n"; + + std::cout << "\t**.temperature_hbm[] : " << std::dec << "\n"; + for (const auto& temp : gpu_metrics.temperature_hbm) { + std::cout << "\t -> " << std::dec << temp << "\n"; + } + + std::cout << "\t**.vcn_activity[] : " << std::dec << "\n"; + for (const auto& vcn : gpu_metrics.vcn_activity) { + std::cout << "\t -> " << std::dec << vcn << "\n"; + } + + std::cout << "\t**.xgmi_read_data_acc[] : " << std::dec << "\n"; + for (const auto& read_data : gpu_metrics.xgmi_read_data_acc) { + std::cout << "\t -> " << std::dec << read_data << "\n"; + } + + std::cout << "\t**.xgmi_write_data_acc[] : " << std::dec << "\n"; + for (const auto& write_data : gpu_metrics.xgmi_write_data_acc) { + std::cout << "\t -> " << std::dec << write_data << "\n"; + } + + std::cout << "\t**.current_gfxclks[] : " << std::dec << "\n"; + for (const auto& gfxclk : gpu_metrics.current_gfxclks) { + std::cout << "\t -> " << std::dec << gfxclk << "\n"; + } + + std::cout << "\t**.current_socclks[] : " << std::dec << "\n"; + for (const auto& socclk : gpu_metrics.current_socclks) { + std::cout << "\t -> " << std::dec << socclk 
<< "\n"; + } + + std::cout << "\t**.current_vclk0s[] : " << std::dec << "\n"; + for (const auto& vclk : gpu_metrics.current_vclk0s) { + std::cout << "\t -> " << std::dec << vclk << "\n"; + } + + std::cout << "\t**.current_dclk0s[] : " << std::dec << "\n"; + for (const auto& dclk : gpu_metrics.current_dclk0s) { + std::cout << "\t -> " << std::dec << dclk << "\n"; + } + std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n"; + + std::cout << "\n\n"; + print_test_header("GPU METRICS: Using direct APIs (newer)", i); + metrics_table_header_t header_values; + GPUMetricTempHbm_t hbm_values; + GPUMetricVcnActivity_t vcn_values; + GPUMetricXgmiReadDataAcc_t xgmi_read_values; + GPUMetricXgmiWriteDataAcc_t xgmi_write_values; + GPUMetricCurrGfxClk_t curr_gfxclk_values; + GPUMetricCurrSocClk_t curr_socclk_values; + GPUMetricCurrVClk0_t curr_vclk0_values; + GPUMetricCurrDClk0_t curr_dclk0_values; + + ret = rsmi_dev_metrics_header_info_get(i, &header_values); + std::cout << "\t[Metrics Header]" << "\n"; + std::cout << "\t -> format_revision : " << print_unsigned_int(header_values.format_revision) << "\n"; + std::cout << "\t -> content_revision : " << print_unsigned_int(header_values.content_revision) << "\n"; + std::cout << "\t--------------------" << "\n"; + + std::cout << "\n"; + std::cout << "\t[Temperature]" << "\n"; + ret = rsmi_dev_metrics_temp_edge_get(i, &val_ui16); + std::cout << "\t -> temp_edge(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_hotspot_get(i, &val_ui16); + std::cout << "\t -> temp_hotspot(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_mem_get(i, &val_ui16); + std::cout << "\t -> temp_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_vrgfx_get(i, &val_ui16); + std::cout << "\t -> temp_vrgfx(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_vrsoc_get(i, 
&val_ui16); + std::cout << "\t -> temp_vrsoc(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_vrmem_get(i, &val_ui16); + std::cout << "\t -> temp_vrmem(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_temp_hbm_get(i, &hbm_values); + std::cout << "\t -> temp_hbm(): " << print_error_or_value(ret, hbm_values) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Power/Energy]" << "\n"; + ret = rsmi_dev_metrics_curr_socket_power_get(i, &val_ui16); + std::cout << "\t -> current_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_energy_acc_get(i, &val_ui64); + std::cout << "\t -> energy_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_avg_socket_power_get(i, &val_ui16); + std::cout << "\t -> average_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Utilization]" << "\n"; + ret = rsmi_dev_metrics_avg_gfx_activity_get(i, &val_ui16); + std::cout << "\t -> average_gfx_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_umc_activity_get(i, &val_ui16); + std::cout << "\t -> average_umc_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_mm_activity_get(i, &val_ui16); + std::cout << "\t -> average_mm_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_vcn_activity_get(i, &vcn_values); + std::cout << "\t -> vcn_activity(): " << print_error_or_value(ret, vcn_values) << "\n"; + ret = rsmi_dev_metrics_mem_activity_acc_get(i, &val_ui32); + std::cout << "\t -> mem_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; + ret = rsmi_dev_metrics_gfx_activity_acc_get(i, &val_ui32); + std::cout << "\t -> gfx_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Average Clock]" << "\n"; + ret = 
rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &val_ui16); + std::cout << "\t -> average_gfx_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &val_ui16); + std::cout << "\t -> average_soc_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_uclock_frequency_get(i, &val_ui16); + std::cout << "\t -> average_uclock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &val_ui16); + std::cout << "\t -> average_vclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_dclock0_frequency_get(i, &val_ui16); + std::cout << "\t -> average_dclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &val_ui16); + std::cout << "\t -> average_vclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &val_ui16); + std::cout << "\t -> average_dclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Current Clock]" << "\n"; + ret = rsmi_dev_metrics_curr_vclk1_get(i, &val_ui16); + std::cout << "\t -> current_vclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_curr_dclk1_get(i, &val_ui16); + std::cout << "\t -> current_dclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_curr_uclk_get(i, &val_ui16); + std::cout << "\t -> current_uclock(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_curr_dclk0_get(i, &curr_dclk0_values); + std::cout << "\t -> current_dclk0(): " << print_error_or_value(ret, curr_dclk0_values) << "\n"; + ret = rsmi_dev_metrics_curr_gfxclk_get(i, &curr_gfxclk_values); + std::cout << "\t -> current_gfxclk(): " << print_error_or_value(ret, curr_gfxclk_values) << "\n"; + 
ret = rsmi_dev_metrics_curr_socclk_get(i, &curr_socclk_values); + std::cout << "\t -> current_soc_clock(): " << print_error_or_value(ret, curr_socclk_values) << "\n"; + ret = rsmi_dev_metrics_curr_vclk0_get(i, &curr_vclk0_values); + std::cout << "\t -> current_vclk0(): " << print_error_or_value(ret, curr_vclk0_values) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Throttle]" << "\n"; + ret = rsmi_dev_metrics_indep_throttle_status_get(i, &val_ui64); + std::cout << "\t -> indep_throttle_status(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_throttle_status_get(i, &val_ui32); + std::cout << "\t -> throttle_status(): " << print_error_or_value(ret, val_ui32) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Gfx Clock Lock]" << "\n"; + ret = rsmi_dev_metrics_gfxclk_lock_status_get(i, &val_ui32); + std::cout << "\t -> gfxclk_lock_status(): " << print_error_or_value(ret, val_ui32) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Current Fan Speed]" << "\n"; + ret = rsmi_dev_metrics_curr_fan_speed_get(i, &val_ui16); + std::cout << "\t -> current_fan_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; + ret = rsmi_dev_metrics_pcie_link_width_get(i, &val_ui16); + std::cout << "\t -> pcie_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_pcie_link_speed_get(i, &val_ui16); + std::cout << "\t -> pcie_link_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &val_ui64); + std::cout << "\t -> pcie_bandwidth_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &val_ui64); + std::cout << "\t -> pcie_bandwidth_inst(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &val_ui64); + std::cout << "\t -> pcie_l0_recov_count_accum(): " << print_error_or_value(ret, 
val_ui64) << "\n"; + ret = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &val_ui64); + std::cout << "\t -> pcie_replay_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &val_ui64); + std::cout << "\t -> pcie_replay_rollover_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_xgmi_link_width_get(i, &val_ui16); + std::cout << "\t -> xgmi_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_xgmi_link_speed_get(i, &val_ui16); + std::cout << "\t -> xgmi_link_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_xgmi_read_data_get(i, &xgmi_read_values); + std::cout << "\t -> xgmi_read_data(): " << print_error_or_value(ret, xgmi_read_values) << "\n"; + ret = rsmi_dev_metrics_xgmi_write_data_get(i, &xgmi_write_values); + std::cout << "\t -> xgmi_write_data(): " << print_error_or_value(ret, xgmi_write_values) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Voltage]" << "\n"; + ret = rsmi_dev_metrics_volt_soc_get(i, &val_ui16); + std::cout << "\t -> voltage_soc(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_volt_gfx_get(i, &val_ui16); + std::cout << "\t -> voltage_gfx(): " << print_error_or_value(ret, val_ui16) << "\n"; + ret = rsmi_dev_metrics_volt_mem_get(i, &val_ui16); + std::cout << "\t -> voltage_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; + + std::cout << "\n"; + std::cout << "\t[Timestamp]" << "\n"; + ret = rsmi_dev_metrics_system_clock_counter_get(i, &val_ui64); + std::cout << "\t -> system_clock_counter(): " << print_error_or_value(ret, val_ui64) << "\n"; + ret = rsmi_dev_metrics_firmware_timestamp_get(i, &val_ui64); + std::cout << "\t -> firmware_timestamp(): " << print_error_or_value(ret, val_ui64) << "\n"; + + std::cout << "\n"; + std::cout << "\t[XCD CounterVoltage]" << "\n"; + ret = rsmi_dev_metrics_xcd_counter_get(i, &val_ui16); + 
std::cout << "\t -> xcd_counter(): " << print_error_or_value(ret, val_ui16) << "\n"; + std::cout << "\n\n"; - std::cout << "\t**p.current_gfxclk: " << std::dec - << p.current_gfxclk << "\n"; - std::cout << "\t**p.current_socclk: " << std::dec - << p.current_socclk << "\n"; - std::cout << "\t**p.current_uclk: " << std::dec - << p.current_uclk << "\n"; - std::cout << "\t**p.current_vclk0: " << std::dec - << p.current_vclk0 << "\n"; - std::cout << "\t**p.current_dclk0: " << std::dec - << p.current_dclk0 << "\n"; - std::cout << "\t**p.current_vclk1: " << std::dec - << p.current_vclk1 << "\n"; - std::cout << "\t**p.current_dclk1: " << std::dec - << p.current_dclk1 << "\n"; ret = rsmi_dev_perf_level_get(i, &pfl); CHK_AND_PRINT_RSMI_ERR_RET(ret) diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 325d86a3eb..ba66636ae3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -3016,6 +3017,11 @@ rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power, ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); + if (power == nullptr || + timestamp == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + rsmi_status_t ret; rsmi_gpu_metrics_t gpu_metrics; ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); @@ -3023,11 +3029,6 @@ rsmi_dev_energy_count_get(uint32_t dv_ind, uint64_t *power, return ret; } - if (power == nullptr || - timestamp == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } - *power = gpu_metrics.energy_accumulator; *timestamp = gpu_metrics.system_clock_counter; // hard-coded for now since all ASICs have same resolution. 
If it ASIC @@ -3489,6 +3490,11 @@ rsmi_utilization_count_get(uint32_t dv_ind, uint64_t *timestamp) { TRY + if (timestamp == nullptr || + utilization_counters == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + rsmi_status_t ret; rsmi_gpu_metrics_t gpu_metrics; uint32_t val_ui32; @@ -3498,11 +3504,6 @@ rsmi_utilization_count_get(uint32_t dv_ind, return ret; } - if (timestamp == nullptr || - utilization_counters == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } - for (uint32_t index = 0 ; index < count; index++) { switch (utilization_counters[index].type) { case RSMI_COARSE_GRAIN_GFX_ACTIVITY: @@ -5549,13 +5550,45 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { // so they can be used/tested. // rsmi_status_t -rsmi_dev_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) +rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(edge_value != nullptr); + if (edge_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempEdge); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *edge_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(hotspot_value != nullptr); + if (hotspot_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHotspot); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *hotspot_value); ostrstream << __PRETTY_FUNCTION__ @@ -5571,13 +5604,18 @@ rsmi_dev_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) } rsmi_status_t -rsmi_dev_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) +rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(mem_value != nullptr); + if (mem_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempMem); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_value); ostrstream << __PRETTY_FUNCTION__ @@ -5593,13 +5631,45 @@ rsmi_dev_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) } rsmi_status_t -rsmi_dev_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) +rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(vrgfx_value != nullptr); + if (vrgfx_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrGfx); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrgfx_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << 
__PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(vrsoc_value != nullptr); + if (vrsoc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrSoc); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrsoc_value); ostrstream << __PRETTY_FUNCTION__ @@ -5615,13 +5685,45 @@ rsmi_dev_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) } rsmi_status_t -rsmi_dev_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) +rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(vrmem_value != nullptr); + if (vrmem_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrMem); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrmem_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(socket_power_value != nullptr); + if (socket_power_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); ostrstream << __PRETTY_FUNCTION__ @@ -5637,13 +5739,45 @@ 
rsmi_dev_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) } rsmi_status_t -rsmi_dev_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) +rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(socket_power_value != nullptr); + if (socket_power_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(gfx_activity_value != nullptr); + if (gfx_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_value); ostrstream << __PRETTY_FUNCTION__ @@ -5659,13 +5793,18 @@ rsmi_dev_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) } rsmi_status_t -rsmi_dev_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) +rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; 
LOG_TRACE(ostrstream); + assert(umc_activity_value != nullptr); + if (umc_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *umc_activity_value); ostrstream << __PRETTY_FUNCTION__ @@ -5681,13 +5820,45 @@ rsmi_dev_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) } rsmi_status_t -rsmi_dev_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) +rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(mm_activity_value != nullptr); + if (mm_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mm_activity_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(energy_acc_value != nullptr); + if (energy_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *energy_acc_value); ostrstream << __PRETTY_FUNCTION__ @@ -5703,13 +5874,18 @@ 
rsmi_dev_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) } rsmi_status_t -rsmi_dev_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value) +rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(system_clock_counter_value != nullptr); + if (system_clock_counter_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSClockCounter); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *system_clock_counter_value); ostrstream << __PRETTY_FUNCTION__ @@ -5725,13 +5901,18 @@ rsmi_dev_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counte } rsmi_status_t -rsmi_dev_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value) +rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(firmware_timestamp_value != nullptr); + if (firmware_timestamp_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSFirmware); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *firmware_timestamp_value); ostrstream << __PRETTY_FUNCTION__ @@ -5747,13 +5928,45 @@ rsmi_dev_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_va } rsmi_status_t -rsmi_dev_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) +rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(throttle_status_value 
!= nullptr); + if (throttle_status_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(throttle_status_value != nullptr); + if (throttle_status_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricThrottleStatus); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); ostrstream << __PRETTY_FUNCTION__ @@ -5769,13 +5982,45 @@ rsmi_dev_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) } rsmi_status_t -rsmi_dev_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) +rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(fan_speed_value != nullptr); + if (fan_speed_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *fan_speed_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | 
End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(pcie_link_width_value != nullptr); + if (pcie_link_width_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_width_value); ostrstream << __PRETTY_FUNCTION__ @@ -5791,13 +6036,18 @@ rsmi_dev_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) } rsmi_status_t -rsmi_dev_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) +rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(pcie_link_speed_value != nullptr); + if (pcie_link_speed_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_speed_value); ostrstream << __PRETTY_FUNCTION__ @@ -5813,13 +6063,18 @@ rsmi_dev_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) } rsmi_status_t -rsmi_dev_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) +rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; 
LOG_TRACE(ostrstream); + assert(xgmi_link_width_value != nullptr); + if (xgmi_link_width_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_width_value); ostrstream << __PRETTY_FUNCTION__ @@ -5835,13 +6090,18 @@ rsmi_dev_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) } rsmi_status_t -rsmi_dev_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) +rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(xgmi_link_speed_value != nullptr); + if (xgmi_link_speed_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_speed_value); ostrstream << __PRETTY_FUNCTION__ @@ -5857,13 +6117,18 @@ rsmi_dev_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) } rsmi_status_t -rsmi_dev_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value) +rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(gfxclk_lock_status_value != nullptr); + if (gfxclk_lock_status_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfxclk_lock_status_value); ostrstream << __PRETTY_FUNCTION__ @@ -5879,13 +6144,18 @@ rsmi_dev_gfxclk_lock_status_get(uint32_t dv_ind, 
uint32_t* gfxclk_lock_status_va } rsmi_status_t -rsmi_dev_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) +rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(gfx_activity_acc_value != nullptr); + if (gfx_activity_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_acc_value); ostrstream << __PRETTY_FUNCTION__ @@ -5901,13 +6171,18 @@ rsmi_dev_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) } rsmi_status_t -rsmi_dev_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) +rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(mem_activity_acc_value != nullptr); + if (mem_activity_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_activity_acc_value); ostrstream << __PRETTY_FUNCTION__ @@ -5923,13 +6198,18 @@ rsmi_dev_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) } rsmi_status_t -rsmi_dev_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value) +rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(pcie_bandwidth_acc_value != nullptr); + if (pcie_bandwidth_acc_value == 
nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_acc_value); ostrstream << __PRETTY_FUNCTION__ @@ -5945,13 +6225,18 @@ rsmi_dev_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_va } rsmi_status_t -rsmi_dev_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value) +rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(pcie_bandwidth_inst_value != nullptr); + if (pcie_bandwidth_inst_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_inst_value); ostrstream << __PRETTY_FUNCTION__ @@ -5967,13 +6252,99 @@ rsmi_dev_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_ } rsmi_status_t -rsmi_dev_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) +rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(pcie_count_acc_value != nullptr); + if (pcie_count_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) 
+ << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(pcie_count_acc_value != nullptr); + if (pcie_count_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(pcie_count_acc_value != nullptr); + if (pcie_count_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t 
+rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(uclk_value != nullptr); + if (uclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrUClock); auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *uclk_value); ostrstream << __PRETTY_FUNCTION__ @@ -5989,155 +6360,258 @@ rsmi_dev_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) } rsmi_status_t -rsmi_dev_vcn_activity_get(uint32_t dv_ind, amd::smi::GPUMetricVcnActivityTbl_t* vcn_activity_value) +rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_activity_value; - *vcn_activity_value = tmp_vcn_activity_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(temp_hbm_value != nullptr); + if (temp_hbm_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); + amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); + std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_hbl_tbl.size() << " | Returning = " << status_code 
<< " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_xgmi_read_data_get(uint32_t dv_ind, amd::smi::GPUMetricXgmiAccTbl_t* xgmi_read_data_acc_value) +rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); + assert(vcn_activity_value != nullptr); + if (vcn_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_read_data_acc_value; - *xgmi_read_data_acc_value = tmp_xgmi_read_data_acc_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); + std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_vcn_tbl.size() << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_xgmi_write_data_get(uint32_t dv_ind, amd::smi::GPUMetricXgmiAccTbl_t* xgmi_write_data_acc_value) +rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_write_data_acc_value; - *xgmi_write_data_acc_value = tmp_xgmi_write_data_acc_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(xgmi_read_data_acc_value != nullptr); + if (xgmi_read_data_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); + amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_xgmi_acc_tbl.size() << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_curr_gfxclk_get(uint32_t dv_ind, amd::smi::GPUMetricCurrGfxClkTbl_t* current_gfxclk_value) +rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricCurrGfxClkTbl_t tmp_current_gfxclk_value; - *current_gfxclk_value = tmp_current_gfxclk_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(xgmi_write_data_acc_value != nullptr); + if (xgmi_write_data_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; 
+ } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); + amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_xgmi_acc_tbl.size() << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_curr_socclk_get(uint32_t dv_ind, amd::smi::GPUMetricCurrSocClkTbl_t* current_socclk_value) +rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricCurrSocClkTbl_t tmp_current_socclk_value; - *current_socclk_value = tmp_current_socclk_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(current_gfxclk_value != nullptr); + if (current_gfxclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); + amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*current_gfxclk_value) - 
std::begin(*current_gfxclk_value)); + std::copy_n(std::begin(tmp_curr_gfxclk_tbl), max_num_elems, *current_gfxclk_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_curr_gfxclk_tbl.size() << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_curr_vclk0_get(uint32_t dv_ind, amd::smi::GPUMetricCurrVClkTbl_t* current_vclk_value) +rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricCurrVClkTbl_t tmp_current_vclk_value; - *current_vclk_value = tmp_current_vclk_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(current_socclk_value != nullptr); + if (current_socclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); + amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); + std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value); + } ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_curr_socclk_tbl.size() << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; 
LOG_INFO(ostrstream); + return status_code; CATCH } rsmi_status_t -rsmi_dev_curr_vdlk0_get(uint32_t dv_ind, amd::smi::GPUMetricCurrDClkTbl_t* current_dclk_value) +rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value) { TRY std::ostringstream ostrstream; ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ostrstream); - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricCurrDClkTbl_t tmp_current_dclk_value; - *current_dclk_value = tmp_current_dclk_value; - auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + assert(current_vclk_value != nullptr); + if (current_vclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); + amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); + std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value); + } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_curr_vclk0_tbl.size() + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(current_vclk_value != nullptr); + if (current_vclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock1); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_vclk_value); ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | End Result " @@ -6145,6 +6619,426 @@ rsmi_dev_curr_vdlk0_get(uint32_t dv_ind, amd::smi::GPUMetricCurrDClkTbl_t* curre << " | Metric Type: " << static_cast(gpu_metric_unit) << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(current_dclk_value != nullptr); + if (current_dclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); + amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + const auto max_num_elems = + static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); + std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value); + } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Metric Size: " << tmp_curr_dclk0_tbl.size() + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + 
LOG_TRACE(ostrstream); + + assert(current_dclk_value != nullptr); + if (current_dclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock1); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_dclk_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << 
__PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << 
" " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(clock_frequency_value != nullptr); + if (clock_frequency_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(voltage_value != nullptr); + if (voltage_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageSoc); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << 
__PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(voltage_value != nullptr); + if (voltage_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageGfx); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(voltage_value != nullptr); + if (voltage_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageMem); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(header_value != nullptr); + if (header_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + auto status_code = rsmi_dev_gpu_metrics_header_info_get(dv_ind, *header_value); + 
ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Format Revision: " << header_value->format_revision + << " | Content Revision: " << header_value->content_revision + << " | Header Size: " << header_value->structure_size + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + + +rsmi_status_t +rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(xcd_counter_value != nullptr); + if (xcd_counter_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + auto xcd_counter = uint16_t(0); + GPUMetricCurrGfxClk_t curr_gfxclk_table{}; + auto status_code = rsmi_dev_metrics_curr_gfxclk_get(dv_ind, &curr_gfxclk_table); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + for (const auto& gfxclk : curr_gfxclk_table) { + if ((gfxclk != 0) && (gfxclk != UINT16_MAX)) { + xcd_counter++; + } + } + } + + *xcd_counter_value = xcd_counter; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | XCDs counter: " << xcd_counter + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_metrics_log_get(uint32_t dv_ind) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + GET_DEV_FROM_INDX + auto status_code = dev->dev_log_gpu_metrics(ostrstream); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << "All GPU Metrics..." 
+ << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; CATCH } diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 149cf22718..60d344f598 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -530,7 +530,7 @@ static const std::map kDevFuncDependsMap = { Device::Device(std::string p, RocmSMI_env_vars const *e) : monitor_(nullptr), path_(p), env_(e), evt_notif_anon_fd_(-1), - gpu_metrics_ver_{0, 0, 0} { + m_gpu_metrics_header{0, 0, 0} { #ifndef DEBUG env_ = nullptr; #endif @@ -875,7 +875,14 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, if ((num*b_size) != b_size) { ss << "Could not read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" - << sysfs_path << "), binary size error, " + << sysfs_path << "), binary size error; " + << "[buff: " + << p_binary_data + << " size: " + << b_size + << " read: " + << num + << "]" << ", returning ENOENT (" << std::strerror(ENOENT) << ")"; LOG_ERROR(ss); return ENOENT; diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 6833d35e91..ef233bb88e 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -54,16 +54,20 @@ #include #include +#include #include #include #include #include +#include #include #include #include #include +#include #include // NOLINT #include +#include #include #include #include @@ -74,478 +78,6 @@ using namespace amd::smi; #define TRY try { #define CATCH } catch (...) 
{return amd::smi::handleException();} -// Put definitions of old gpu_metrics formats here -typedef struct { - struct metrics_table_header_t common_header; - - /* Driver attached timestamp (in ns) */ - uint64_t system_clock_counter; - -/* Temperature */ - uint16_t temperature_edge; - uint16_t temperature_hotspot; - uint16_t temperature_mem; - uint16_t temperature_vrgfx; - uint16_t temperature_vrsoc; - uint16_t temperature_vrmem; - -/* Utilization */ - uint16_t average_gfx_activity; - uint16_t average_umc_activity; // memory controller - uint16_t average_mm_activity; // UVD or VCN - -/* Power/Energy */ - uint16_t average_socket_power; - uint32_t energy_accumulator; - -/* Average clocks */ - uint16_t average_gfxclk_frequency; - uint16_t average_socclk_frequency; - uint16_t average_uclk_frequency; - uint16_t average_vclk0_frequency; - uint16_t average_dclk0_frequency; - uint16_t average_vclk1_frequency; - uint16_t average_dclk1_frequency; - -/* Current clocks */ - uint16_t current_gfxclk; - uint16_t current_socclk; - uint16_t current_uclk; - uint16_t current_vclk0; - uint16_t current_dclk0; - uint16_t current_vclk1; - uint16_t current_dclk1; - -/* Throttle status */ - uint32_t throttle_status; - -/* Fans */ - uint16_t current_fan_speed; - -/* Link width/speed */ - uint8_t pcie_link_width; - uint8_t pcie_link_speed; // in 0.1 GT/s -} rsmi_gpu_metrics_v_1_0_t; - -typedef struct { - rsmi_gpu_metrics_t base; - uint64_t firmware_timestamp; -} rsmi_gpu_metrics_v_1_2; - -typedef struct { - rsmi_gpu_metrics_t base; - /* PMFW attached timestamp (10ns resolution) */ - uint64_t firmware_timestamp; - - /* Voltage (mV) */ - uint16_t voltage_soc; - uint16_t voltage_gfx; - uint16_t voltage_mem; - - uint16_t padding1; - - /* Throttle status (ASIC independent) */ - uint64_t indep_throttle_status; - -} rsmi_gpu_metrics_v_1_3; - - -// log current gpu_metrics file content read -// any metrics value can be a nullptr -void log_gpu_metrics(const metrics_table_header_t 
*gpu_metrics_table_header, - const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, - const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, - const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { - if (!RocmSMI::getInstance().isLoggingOn()) { - return; - } - std::ostringstream ss; - if (gpu_metrics_table_header != nullptr) { - ss - /* Common Header */ - << print_unsigned_hex_and_int( - gpu_metrics_table_header->structure_size, - "gpu_metrics_table_header->structure_size") - << print_unsigned_hex_and_int( - gpu_metrics_table_header->format_revision, - "gpu_metrics_table_header->format_revision") - << print_unsigned_hex_and_int( - gpu_metrics_table_header->content_revision, - "gpu_metrics_table_header->content_revision"); - LOG_DEBUG(ss); - } - if (rsmi_gpu_metrics == nullptr) { - return; - } - - ss - /* Common Header */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->common_header.structure_size, - "rsmi_gpu_metrics->common_header.structure_size") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->common_header.format_revision, - "rsmi_gpu_metrics->common_header.format_revision") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->common_header.content_revision, - "rsmi_gpu_metrics->common_header.content_revision") - /* Temperature */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_edge, - "rsmi_gpu_metrics->temperature_edge") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_hotspot, - "rsmi_gpu_metrics->temperature_hotspot") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_mem, - "rsmi_gpu_metrics->temperature_mem") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_vrgfx, - "rsmi_gpu_metrics->temperature_vrgfx") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_vrsoc, - "rsmi_gpu_metrics->temperature_vrsoc") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_vrmem, - "rsmi_gpu_metrics->temperature_vrmem") - /* Utilization */ - << print_unsigned_hex_and_int( - 
rsmi_gpu_metrics->average_gfx_activity, - "rsmi_gpu_metrics->average_gfx_activity") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_umc_activity, - "rsmi_gpu_metrics->average_umc_activity") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_mm_activity, - "rsmi_gpu_metrics->average_mm_activity") - /* Power/Energy */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_socket_power, - "rsmi_gpu_metrics->average_socket_power") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->energy_accumulator, - "rsmi_gpu_metrics->energy_accumulator") - /* Driver attached timestamp (in ns) */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->system_clock_counter, - "rsmi_gpu_metrics->system_clock_counter") - /* Average clocks */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_gfxclk_frequency, - "rsmi_gpu_metrics->average_gfxclk_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_socclk_frequency, - "rsmi_gpu_metrics->average_socclk_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_uclk_frequency, - "rsmi_gpu_metrics->average_uclk_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_vclk0_frequency, - "rsmi_gpu_metrics->average_vclk0_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_dclk0_frequency, - "rsmi_gpu_metrics->average_dclk0_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_vclk1_frequency, - "rsmi_gpu_metrics->average_vclk1_frequency") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->average_dclk1_frequency, - "rsmi_gpu_metrics->average_dclk1_frequency") - /* Current clocks */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_gfxclk, - "rsmi_gpu_metrics->current_gfxclk") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_socclk, - "rsmi_gpu_metrics->current_socclk") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_uclk, - "rsmi_gpu_metrics->current_uclk") - << 
print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_vclk0, - "rsmi_gpu_metrics->current_vclk0") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_dclk0, - "rsmi_gpu_metrics->current_dclk0") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_vclk1, - "rsmi_gpu_metrics->current_vclk1") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_dclk1, - "rsmi_gpu_metrics->current_dclk1") - /* Throttle status */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->throttle_status, - "rsmi_gpu_metrics->throttle_status") - /* Fans */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->current_fan_speed, - "rsmi_gpu_metrics->current_fan_speed") - /* Link width/speed */ - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->pcie_link_width, - "rsmi_gpu_metrics->pcie_link_width") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->pcie_link_speed, - "rsmi_gpu_metrics->pcie_link_speed") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->padding, - "rsmi_gpu_metrics->padding") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->gfx_activity_acc, - "rsmi_gpu_metrics->gfx_activity_acc") - << print_unsigned_hex_and_int( - rsmi_gpu_metrics->mem_activity_acc, - "rsmi_gpu_metrics->mem_activity_acc"); - for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { - ss << print_unsigned_hex_and_int( - rsmi_gpu_metrics->temperature_hbm[i], - "rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]"); - } - - if (rsmi_gpu_metrics_v_1_2 != nullptr) { - /* PMFW attached timestamp (10ns resolution) */ - ss - << print_unsigned_hex_and_int( - rsmi_gpu_metrics_v_1_2->firmware_timestamp, - "rsmi_gpu_metrics_v_1_2->firmware_timestamp"); - } - - if (gpu_metrics_v_1_3 != nullptr) { - /* PMFW attached timestamp (10ns resolution) */ - ss - << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->firmware_timestamp, - "gpu_metrics_v_1_3->firmware_timestamp") - /* Voltage (mV) */ - << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->voltage_soc, - "gpu_metrics_v_1_3->voltage_soc") 
- << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->voltage_gfx, - "gpu_metrics_v_1_3->voltage_gfx") - << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->voltage_mem, - "gpu_metrics_v_1_3->voltage_mem") - << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->padding1, - "gpu_metrics_v_1_3->padding1") - /* Throttle status (ASIC independent) */ - << print_unsigned_hex_and_int( - gpu_metrics_v_1_3->indep_throttle_status, - "gpu_metrics_v_1_3->indep_throttle_status"); - } - LOG_DEBUG(ss); -} - -static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, - rsmi_gpu_metrics_t *data, uint8_t content_v) { - assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 && - content_v != RSMI_GPU_METRICS_API_CONTENT_VER_2 && - content_v != RSMI_GPU_METRICS_API_CONTENT_VER_3 ); - if (content_v == RSMI_GPU_METRICS_API_CONTENT_VER_1 || - content_v == RSMI_GPU_METRICS_API_CONTENT_VER_2 || - content_v == RSMI_GPU_METRICS_API_CONTENT_VER_3 ) { - // This function shouldn't be called if content version is - // RSMI_GPU_METRICS_API_CONTENT_VER_1 or RSMI_GPU_METRICS_API_CONTENT_VER_2 - // or RSMI_GPU_METRICS_API_CONTENT_VER_3 - return RSMI_STATUS_INVALID_ARGS; - } - void *metric_data = nullptr; - size_t data_size; - rsmi_status_t ret; - - rsmi_gpu_metrics_v_1_0_t metric_data_v_1_0; - - if (content_v == 0) { - metric_data = &metric_data_v_1_0; - data_size = sizeof(rsmi_gpu_metrics_v_1_0_t); - } // else { ... 
handle other conversions to v1 - - assert(metric_data != nullptr && "Unexpected conversion attempted."); - ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, data_size, - metric_data); - if (ret != RSMI_STATUS_SUCCESS) { - return ret; - } - -#define ASSIGN_DATA_FIELD(FIELD, SRC) \ - data->FIELD = (SRC)->FIELD; - -#define ASSIGN_COMMON_FORMATS(SRC) \ - ASSIGN_DATA_FIELD(common_header, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_edge, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_hotspot, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_mem, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_vrgfx, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_vrsoc, (SRC)) \ - ASSIGN_DATA_FIELD(temperature_vrmem, (SRC)) \ - ASSIGN_DATA_FIELD(average_gfx_activity, (SRC)) \ - ASSIGN_DATA_FIELD(average_umc_activity, (SRC)) \ - ASSIGN_DATA_FIELD(average_mm_activity, (SRC)) \ - ASSIGN_DATA_FIELD(average_socket_power, (SRC)) \ - ASSIGN_DATA_FIELD(system_clock_counter, (SRC)) \ - ASSIGN_DATA_FIELD(average_gfxclk_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_socclk_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_uclk_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_vclk0_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_dclk0_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_vclk1_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(average_dclk1_frequency, (SRC)) \ - ASSIGN_DATA_FIELD(current_gfxclk, (SRC)) \ - ASSIGN_DATA_FIELD(current_socclk, (SRC)) \ - ASSIGN_DATA_FIELD(current_uclk, (SRC)) \ - ASSIGN_DATA_FIELD(current_vclk0, (SRC)) \ - ASSIGN_DATA_FIELD(current_dclk0, (SRC)) \ - ASSIGN_DATA_FIELD(current_vclk1, (SRC)) \ - ASSIGN_DATA_FIELD(current_dclk1, (SRC)) \ - ASSIGN_DATA_FIELD(throttle_status, (SRC)) \ - ASSIGN_DATA_FIELD(current_fan_speed, (SRC)) - - // Now handle differences from format 1 - if (content_v == 0) { - // First handle all data that is common to Format1 and other formats - ASSIGN_COMMON_FORMATS( - reinterpret_cast(metric_data)) - - // Then, the differences: - data->energy_accumulator = static_cast( - 
reinterpret_cast( - metric_data)->energy_accumulator); - data->pcie_link_width = static_cast( - reinterpret_cast( - metric_data)->pcie_link_width); - data->pcie_link_speed = static_cast( - reinterpret_cast( - metric_data)->pcie_link_speed); - - // These fields didn't exist in v0 - data->gfx_activity_acc = 0; - data->mem_activity_acc = 0; - (void)memset(data->temperature_hbm, 0, - RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t)); - } // else handle other conversions to format 1 -#undef ASSIGN_DATA_FIELD -#undef ASSIGN_COMMON_FORMATS - return RSMI_STATUS_SUCCESS; -} - -// Translate gpu_metrics version 1.2 to rsmi_gpu_metrics_t. gpu_metrics -// version 1.2 provides timestamp provided by the firmware. This timestamp -// is sampled atomically along when gpu_metric information. Use this -// timestamp instead of system_clock_counter - -static void map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t( - const rsmi_gpu_metrics_v_1_2 *gpu_metrics_v_1_2, - rsmi_gpu_metrics_t *rsmi_gpu_metrics) -{ - memcpy(rsmi_gpu_metrics, &gpu_metrics_v_1_2->base, - sizeof(rsmi_gpu_metrics_t)); - // firmware_timestamp is at 10ns resolution - rsmi_gpu_metrics->system_clock_counter = - gpu_metrics_v_1_2->firmware_timestamp * 10; -} - -static void map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t( - const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, - rsmi_gpu_metrics_t *rsmi_gpu_metrics) -{ - memcpy(rsmi_gpu_metrics, &gpu_metrics_v_1_3->base, - sizeof(rsmi_gpu_metrics_t)); - // firmware_timestamp is at 10ns resolution - rsmi_gpu_metrics->system_clock_counter = - gpu_metrics_v_1_3->firmware_timestamp * 10; - -} - -rsmi_status_t -rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { - TRY - DEVICE_MUTEX - CHK_SUPPORT_NAME_ONLY(smu) - rsmi_gpu_metrics_v_1_2 smu_v_1_2; - rsmi_gpu_metrics_v_1_3 smu_v_1_3; - rsmi_status_t ret; - - std::ostringstream ss; - if (!dev->gpu_metrics_ver().structure_size) { - ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, - sizeof(struct metrics_table_header_t), 
&dev->gpu_metrics_ver()); - log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr); - - if (ret != RSMI_STATUS_SUCCESS) { - ss << "Returning = " << getRSMIStatusString(ret) - << ",\ndev->gpu_metrics_ver().structure_size = " - << print_unsigned_int(dev->gpu_metrics_ver().structure_size) - << ", could not read common header"; - LOG_ERROR(ss); - return ret; - } - } - // only supports gpu_metrics_v1_x version - if (dev->gpu_metrics_ver().format_revision != 1) { - ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) - << ",\ndev->gpu_metrics_ver().format_revision = " - << print_unsigned_int(dev->gpu_metrics_ver().format_revision) - << " was not equal to 1"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } - - // Initialize the smu fields to zero as some of them only valid in - // a specific version. - *smu = {}; - - bool isRevisionExpected = ((dev->gpu_metrics_ver().content_revision == 1) || - (dev->gpu_metrics_ver().content_revision == 2) || - (dev->gpu_metrics_ver().content_revision == 3)); - if (isRevisionExpected == false) { - ss << __PRETTY_FUNCTION__ << " | content revision was = " - << print_unsigned_hex_and_int(dev->gpu_metrics_ver().content_revision) - << ", expected version 1,2, or 3 | returning " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } - if (dev->gpu_metrics_ver().content_revision == - RSMI_GPU_METRICS_API_CONTENT_VER_1) { - ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, - sizeof(rsmi_gpu_metrics_t), smu); - ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1"; - LOG_DEBUG(ss); - log_gpu_metrics(nullptr, nullptr, nullptr, smu); - } else if (dev->gpu_metrics_ver().content_revision == - RSMI_GPU_METRICS_API_CONTENT_VER_2) { - ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, - sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2); - map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu); - ss << __PRETTY_FUNCTION__ << " | 
RSMI_GPU_METRICS_API_CONTENT_VER_2"; - LOG_DEBUG(ss); - log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu); - } else if (dev->gpu_metrics_ver().content_revision == - RSMI_GPU_METRICS_API_CONTENT_VER_3) { - ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, - sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3); - map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu); - ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3"; - LOG_DEBUG(ss); - log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu); - } else { - ret = GetGPUMetricsFormat1(dv_ind, smu, - dev->gpu_metrics_ver().content_revision); - ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1"; - LOG_DEBUG(ss); - log_gpu_metrics(nullptr, nullptr, nullptr, smu); - } - - if (ret != RSMI_STATUS_SUCCESS) { - return ret; - } - - return ret; - CATCH -} - namespace amd::smi { @@ -576,24 +108,54 @@ uint64_t actual_timestamp_in_secs() return duration_cast(system_clock::now().time_since_epoch()).count(); } +auto timestamp_to_time_point(uint64_t timestamp_in_secs) +{ + using namespace std::chrono; + system_clock::time_point time_point{seconds{timestamp_in_secs}}; + std::time_t timestamp_time = system_clock::to_time_t(time_point); + return timestamp_time; +} + + std::string stringfy_metrics_header(const AMDGpuMetricsHeader_v1_t& metrics_header) { std::stringstream metrics_header_info; metrics_header_info - << "Format: " << print_unsigned_hex_and_int(metrics_header.m_format_revision) - << "." << print_unsigned_hex_and_int(metrics_header.m_content_revision) - << " Size: " << print_unsigned_hex_and_int(metrics_header.m_structure_size); + << "{Header Info: " + << print_unsigned_int(metrics_header.m_format_revision) + << "." 
+ << print_unsigned_int(metrics_header.m_content_revision) + << " Size: " + << print_unsigned_int(metrics_header.m_structure_size) + << "} " + << "[Format: " << print_unsigned_hex_and_int(metrics_header.m_format_revision) + << " Revision: " << print_unsigned_hex_and_int(metrics_header.m_content_revision) + << " Size: " << print_unsigned_hex_and_int(metrics_header.m_structure_size) + << "]" + << "\n"; return metrics_header_info.str(); } +std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metrics_header) +{ + std::stringstream metrics_header_info; + metrics_header_info + << print_unsigned_int(metrics_header.m_format_revision) + << "." + << print_unsigned_int(metrics_header.m_content_revision); + + return metrics_header_info.str(); +} + + // -// version 1,0: 256 -// version 1,1: 257 -// version 1,2: 258 -// version 1,3: 259 -// version 1,4: 260 -// version 1,5: 261 +// version 1.0: 256 +// version 1.1: 257 +// version 1.2: 258 +// version 1.3: 259 +// version 1.4: 260 +// version 1.5: 261 // const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table { @@ -612,7 +174,6 @@ const AMDGpuMetricsClassIdTranslationTbl_t amdgpu_metrics_class_id_translation_t {AMDGpuMetricsClassId_t::kGpuMetricTemperature, "Temperature"}, {AMDGpuMetricsClassId_t::kGpuMetricUtilization, "Utilization"}, {AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, "Power/Energy"}, - {AMDGpuMetricsClassId_t::kGpuMetricSystemClockCounter, "System Clock"}, {AMDGpuMetricsClassId_t::kGpuMetricAverageClock, "Average Clock"}, {AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, "Current Clock"}, {AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus, "Throttle"}, @@ -679,6 +240,9 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation {AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, {AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, {AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, 
"XgmiWriteDataAcc"}, + {AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"}, + {AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"}, + {AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"}, // kGpuMetricPowerEnergy counters {AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, "AvgSocketPower"}, @@ -698,14 +262,72 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation AMDGpuMetricVersionFlags_t translate_header_to_flag_version(const AMDGpuMetricsHeader_v1_t& metrics_header) { + std::ostringstream ostrstream; + auto version_id(AMDGpuMetricVersionFlags_t::kGpuMetricNone); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + const auto flag_version = join_metrics_version(metrics_header); if (amdgpu_metric_version_translation_table.find(flag_version) != amdgpu_metric_version_translation_table.end()) { - return amdgpu_metric_version_translation_table.at(flag_version); + version_id = amdgpu_metric_version_translation_table.at(flag_version); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Translation Tbl: " << flag_version + << " | Metric Version: " << stringfy_metrics_header(metrics_header) + << " | Returning = " + << static_cast(version_id) + << " |"; + LOG_TRACE(ostrstream); + return version_id; } - return AMDGpuMetricVersionFlags_t::kGpuMetricNone; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Translation Tbl: " << flag_version + << " | Metric Version: " << stringfy_metrics_header(metrics_header) + << " | Returning = " + << static_cast(version_id) + << " |"; + LOG_ERROR(ostrstream); + return version_id; } +uint16_t translate_flag_to_metric_version(AMDGpuMetricVersionFlags_t version_flag) +{ + std::ostringstream ostrstream; + auto version_id = uint16_t(0); + ostrstream << __PRETTY_FUNCTION__ << " | ======= 
start ======="; + LOG_TRACE(ostrstream); + + for (const auto& [key, value] : amdgpu_metric_version_translation_table) { + if (value == version_flag) { + version_id = key; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Version Flag: " << static_cast(version_flag) + << " | Unified Version: " << version_id + << " | Str. Version: " << stringfy_metric_header_version(disjoin_metrics_version(version_id)) + << " |"; + LOG_TRACE(ostrstream); + return version_id; + } + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Version Flag: " << static_cast(version_flag) + << " | Unified Version: " << version_id + << " | Str. Version: " << stringfy_metric_header_version(disjoin_metrics_version(version_id)) + << " |"; + LOG_TRACE(ostrstream); + return version_id; +} + + rsmi_status_t is_gpu_metrics_version_supported(const AMDGpuMetricsHeader_v1_t& metrics_header) { const auto flag_version = join_metrics_version(metrics_header); @@ -714,71 +336,65 @@ rsmi_status_t is_gpu_metrics_version_supported(const AMDGpuMetricsHeader_v1_t& m ? 
rsmi_status_t::RSMI_STATUS_SUCCESS : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; } -struct AMDGpuMetricsLogInfo_t -{ - rsmi_status_t m_status_code; - std::string m_title; - std::string m_pretty_function; -}; - -class AMDGpuMetricsLogger_t -{ - public: - enum class LogInfoType_t - { - kLogError, - kLogAlarm, - kLogInfo, - kLogBuffer, - kLogTrace, - kLogDebug, - }; - - void operator()(const AMDGpuMetricsLogInfo_t& log_info) - { - m_ostrstream << log_info.m_pretty_function << log_info.m_title; - LOG_TRACE(m_ostrstream); - - m_ostrstream << log_info.m_pretty_function - << " | ======= end ======= " - << " | Fail " - << " | Device #: " - << " | Metric Version: " - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(log_info.m_status_code) - << " |"; - - LOG_ERROR(m_ostrstream); - } - - private: - std::ostringstream m_ostrstream; - -}; - AMDGpuMetricFactories_t amd_gpu_metrics_factory_table { - {AMDGpuMetricVersionFlags_t::kGpuMetricV11, std::make_unique(GpuMetricsBase_v11_t{})}, - {AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_unique(GpuMetricsBase_v12_t{})}, - {AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_unique(GpuMetricsBase_v13_t{})}, - {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_unique(GpuMetricsBase_v14_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV11, std::make_shared(GpuMetricsBase_v11_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_shared(GpuMetricsBase_v12_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_shared(GpuMetricsBase_v13_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared(GpuMetricsBase_v14_t{})}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) { + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + auto contains = [](const AMDGpuMetricVersionFlags_t metric_version) { return 
(amd_gpu_metrics_factory_table.find(metric_version) != amd_gpu_metrics_factory_table.end()); }; if (contains(gpu_metric_version)) { - return std::move(amd_gpu_metrics_factory_table[gpu_metric_version]); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Factory Version: " << static_cast(gpu_metric_version) + << " |"; + LOG_TRACE(ostrstream); + + return (amd_gpu_metrics_factory_table[gpu_metric_version]); } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Factory Version: " << static_cast(gpu_metric_version) + << " | Returning = " + << "No object from factory." + << " |"; + LOG_ERROR(ostrstream); return nullptr; } + +template +constexpr bool is_dependent_false_v = false; + +template +constexpr T init_max_uint_types() +{ + if constexpr ((std::is_same_v) || + (std::is_same_v) || + (std::is_same_v) || + (std::is_same_v)) { + return std::numeric_limits::max(); + } + else { + static_assert(is_dependent_false_v, "Error: Type not supported..."); + } +} + template AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::string& value_title) { @@ -787,20 +403,20 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str auto get_data_type_info = [&]() { auto data_type(AMDGpuMetricsDataType_t::kUInt64); if constexpr (std::is_array_v) { - const uint8_t check_uint8[]={1}; - const uint16_t check_uint16[]={2}; - const uint32_t check_uint32[]={3}; - const uint64_t check_uint64[]={4}; - if constexpr (std::is_same_v) { + const uint8_t kCheckUint8[]={1}; + const uint16_t kCheckUint16[]={2}; + const uint32_t kCheckUint32[]={3}; + const uint64_t kCheckUint64[]={4}; + if constexpr (std::is_same_v) { data_type = AMDGpuMetricsDataType_t::kUInt8; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { data_type = AMDGpuMetricsDataType_t::kUInt16; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { data_type = 
AMDGpuMetricsDataType_t::kUInt32; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { data_type = AMDGpuMetricsDataType_t::kUInt64; } return std::make_tuple(data_type, static_cast(std::end(metric) - std::begin(metric))); @@ -848,12 +464,40 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. + // + auto run_metric_adjustments_v14 = [&]() { + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ostrstream); + + // firmware_timestamp is at 10ns resolution + ostrstream << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ostrstream); + }; + + + // Adjustments/Changes specific to this version + run_metric_adjustments_v14(); + // Temperature Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, - format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, - "temperature_hotspot")) - ); m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, @@ -873,7 +517,7 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() // Power/Energy Info m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, - format_metric_row(m_gpu_metrics_tbl.m_curr_socket_power, + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, "curr_socket_power")) ); m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] @@ -957,15 +601,30 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() "xgmi_link_speed")) ); m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator,\ + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, "pcie_bandwidth_acc")) ); m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, - format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, "pcie_bandwidth_inst")) ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, + "pcie_replay_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + "pcie_replay_rollover_count_acc")) + ); m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, @@ -1004,11 +663,325 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() "current_uclk")) ); - return rsmi_status_t::RSMI_STATUS_SUCCESS; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; +} + +rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_metrics) +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + rsmi_gpu_metrics.temperature_edge = init_max_uint_types(); + rsmi_gpu_metrics.temperature_hotspot = init_max_uint_types(); + rsmi_gpu_metrics.temperature_mem = init_max_uint_types(); + rsmi_gpu_metrics.temperature_vrgfx = init_max_uint_types(); + rsmi_gpu_metrics.temperature_vrsoc = init_max_uint_types(); + rsmi_gpu_metrics.temperature_vrmem = init_max_uint_types(); + rsmi_gpu_metrics.average_gfx_activity = init_max_uint_types(); + rsmi_gpu_metrics.average_umc_activity = init_max_uint_types(); + rsmi_gpu_metrics.average_mm_activity = init_max_uint_types(); + rsmi_gpu_metrics.average_socket_power = init_max_uint_types(); + rsmi_gpu_metrics.energy_accumulator = init_max_uint_types(); + rsmi_gpu_metrics.system_clock_counter = init_max_uint_types(); + rsmi_gpu_metrics.average_gfxclk_frequency = init_max_uint_types(); + rsmi_gpu_metrics.average_socclk_frequency = init_max_uint_types(); + rsmi_gpu_metrics.average_uclk_frequency = 
init_max_uint_types(); + rsmi_gpu_metrics.average_vclk0_frequency = init_max_uint_types(); + rsmi_gpu_metrics.average_dclk0_frequency = init_max_uint_types(); + rsmi_gpu_metrics.average_vclk1_frequency = init_max_uint_types(); + rsmi_gpu_metrics.average_dclk1_frequency = init_max_uint_types(); + rsmi_gpu_metrics.current_gfxclk = init_max_uint_types(); + rsmi_gpu_metrics.current_socclk = init_max_uint_types(); + rsmi_gpu_metrics.current_uclk = init_max_uint_types(); + rsmi_gpu_metrics.current_vclk0 = init_max_uint_types(); + rsmi_gpu_metrics.current_dclk0 = init_max_uint_types(); + rsmi_gpu_metrics.current_vclk1 = init_max_uint_types(); + rsmi_gpu_metrics.current_dclk1 = init_max_uint_types(); + rsmi_gpu_metrics.throttle_status = init_max_uint_types(); + rsmi_gpu_metrics.current_fan_speed = init_max_uint_types(); + rsmi_gpu_metrics.pcie_link_width = init_max_uint_types(); + rsmi_gpu_metrics.pcie_link_speed = init_max_uint_types(); + rsmi_gpu_metrics.gfx_activity_acc = init_max_uint_types(); + rsmi_gpu_metrics.mem_activity_acc = init_max_uint_types(); + + std::fill(std::begin(rsmi_gpu_metrics.temperature_hbm), + std::end(rsmi_gpu_metrics.temperature_hbm), + init_max_uint_types()); + + rsmi_gpu_metrics.firmware_timestamp = init_max_uint_types(); + rsmi_gpu_metrics.voltage_soc = init_max_uint_types(); + rsmi_gpu_metrics.voltage_gfx = init_max_uint_types(); + rsmi_gpu_metrics.voltage_mem = init_max_uint_types(); + rsmi_gpu_metrics.indep_throttle_status = init_max_uint_types(); + rsmi_gpu_metrics.current_socket_power = init_max_uint_types(); + + std::fill(std::begin(rsmi_gpu_metrics.vcn_activity), + std::end(rsmi_gpu_metrics.vcn_activity), + init_max_uint_types()); + + rsmi_gpu_metrics.gfxclk_lock_status = init_max_uint_types(); + rsmi_gpu_metrics.xgmi_link_width = init_max_uint_types(); + rsmi_gpu_metrics.xgmi_link_speed = init_max_uint_types(); + rsmi_gpu_metrics.pcie_bandwidth_acc = init_max_uint_types(); + rsmi_gpu_metrics.pcie_bandwidth_inst = init_max_uint_types(); 
+ rsmi_gpu_metrics.pcie_l0_to_recov_count_acc = init_max_uint_types(); + rsmi_gpu_metrics.pcie_replay_count_acc = init_max_uint_types(); + rsmi_gpu_metrics.pcie_replay_rover_count_acc = init_max_uint_types(); + + std::fill(std::begin(rsmi_gpu_metrics.xgmi_read_data_acc), + std::end(rsmi_gpu_metrics.xgmi_read_data_acc), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.xgmi_write_data_acc), + std::end(rsmi_gpu_metrics.xgmi_write_data_acc), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.current_gfxclks), + std::end(rsmi_gpu_metrics.current_gfxclks), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.current_socclks), + std::end(rsmi_gpu_metrics.current_socclks), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.current_vclk0s), + std::end(rsmi_gpu_metrics.current_vclk0s), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.current_dclk0s), + std::end(rsmi_gpu_metrics.current_dclk0s), + init_max_uint_types()); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; +} + + +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v14_t::copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. 
+ init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + + // vcn_activity + const auto vcn_activity_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_vcn_activity) - + std::begin(m_gpu_metrics_tbl.m_vcn_activity)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_vcn_activity), + vcn_activity_num_elems, + metrics_public_init.vcn_activity); + + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Throttle status + metrics_public_init.throttle_status = m_gpu_metrics_tbl.m_throttle_status; + + // Clock Lock Status. 
Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + 
xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.3) + metrics_public_init.current_gfxclk = metrics_public_init.current_gfxclks[0]; + metrics_public_init.average_gfxclk_frequency = metrics_public_init.current_gfxclks[0]; + + metrics_public_init.current_socclk = metrics_public_init.current_socclks[0]; + metrics_public_init.average_socclk_frequency = metrics_public_init.current_socclks[0]; + + metrics_public_init.current_vclk0 = 
metrics_public_init.current_vclk0s[0]; + metrics_public_init.average_vclk0_frequency = metrics_public_init.current_vclk0s[0]; + + metrics_public_init.current_vclk1 = metrics_public_init.current_vclk0s[1]; + metrics_public_init.average_vclk1_frequency = metrics_public_init.current_vclk0s[1]; + + metrics_public_init.current_dclk0 = metrics_public_init.current_dclk0s[0]; + metrics_public_init.average_dclk0_frequency = metrics_public_init.current_dclk0s[0]; + + metrics_public_init.current_dclk1 = metrics_public_init.current_dclk0s[1]; + metrics_public_init.average_dclk1_frequency = metrics_public_init.current_dclk0s[1]; + + return metrics_public_init; + }(); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); } rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() { + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. 
+ // + auto run_metric_adjustments_v13 = [&]() { + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ostrstream); + + // firmware_timestamp is at 10ns resolution + ostrstream << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ostrstream); + }; + + + // Adjustments/Changes specific to this version + run_metric_adjustments_v13(); + // Temperature Info m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempEdge, @@ -1219,19 +1192,776 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() "voltage_mem")) ); - return rsmi_status_t::RSMI_STATUS_SUCCESS; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; } +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. 
+ init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + // Temperature + metrics_public_init.temperature_edge = m_gpu_metrics_tbl.m_temperature_edge; + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrgfx = m_gpu_metrics_tbl.m_temperature_vrgfx; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + metrics_public_init.temperature_vrmem = m_gpu_metrics_tbl.m_temperature_vrmem; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + metrics_public_init.average_mm_activity = m_gpu_metrics_tbl.m_average_mm_activity; + + // Power/Energy + metrics_public_init.average_socket_power = m_gpu_metrics_tbl.m_average_socket_power; // 1.3 and 1.4 have the same value + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Average clocks + metrics_public_init.average_gfxclk_frequency = m_gpu_metrics_tbl.m_average_gfxclk_frequency; + metrics_public_init.average_socclk_frequency = m_gpu_metrics_tbl.m_average_socclk_frequency; + metrics_public_init.average_uclk_frequency = m_gpu_metrics_tbl.m_average_uclk_frequency; + metrics_public_init.average_vclk0_frequency = m_gpu_metrics_tbl.m_average_vclk0_frequency; + metrics_public_init.average_dclk0_frequency = 
m_gpu_metrics_tbl.m_average_dclk0_frequency; + metrics_public_init.average_vclk1_frequency = m_gpu_metrics_tbl.m_average_vclk1_frequency; + metrics_public_init.average_dclk1_frequency = m_gpu_metrics_tbl.m_average_dclk1_frequency; + + // Current clocks + metrics_public_init.current_gfxclk = m_gpu_metrics_tbl.m_current_gfxclk; + metrics_public_init.current_socclk = m_gpu_metrics_tbl.m_current_socclk; + metrics_public_init.current_vclk0 = m_gpu_metrics_tbl.m_current_vclk0; + metrics_public_init.current_dclk0 = m_gpu_metrics_tbl.m_current_dclk0; + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + metrics_public_init.current_vclk1 = m_gpu_metrics_tbl.m_current_vclk1; + metrics_public_init.current_dclk1 = m_gpu_metrics_tbl.m_current_dclk1; + + // Throttle status + metrics_public_init.throttle_status = m_gpu_metrics_tbl.m_throttle_status; + + // Fans + metrics_public_init.current_fan_speed = m_gpu_metrics_tbl.m_current_fan_speed; + + // Link width/speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // temperature_hbm + const auto temp_hbm_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_temperature_hbm) - + std::begin(m_gpu_metrics_tbl.m_temperature_hbm)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_temperature_hbm), + temp_hbm_num_elems, + metrics_public_init.temperature_hbm); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Voltage (mV) + metrics_public_init.voltage_soc = m_gpu_metrics_tbl.m_voltage_soc; + metrics_public_init.voltage_gfx = m_gpu_metrics_tbl.m_voltage_gfx; + metrics_public_init.voltage_mem = m_gpu_metrics_tbl.m_voltage_mem; + + // Throttle status + 
metrics_public_init.indep_throttle_status = m_gpu_metrics_tbl.m_indep_throttle_status; + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.2) + metrics_public_init.current_socket_power = metrics_public_init.average_socket_power; + + return metrics_public_init; + }(); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); +} + + rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() { - // TODO: Implement these; - return rsmi_status_t::RSMI_STATUS_NOT_YET_IMPLEMENTED; + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. 
+ // + auto run_metric_adjustments_v12 = [&]() { + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ostrstream); + + // firmware_timestamp is at 10ns resolution + ostrstream << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ostrstream); + }; + + + // Adjustments/Changes specific to this version + run_metric_adjustments_v12(); + + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempEdge, + format_metric_row(m_gpu_metrics_tbl.m_temperature_edge, + "temperature_edge")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrGfx, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrgfx, + "temperature_vrgfx")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrmem, + "temperature_vrmem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHbm, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hbm, + "[temperature_hbm]")) + ); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_average_socket_power, + "average_socket_power")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc")) + ); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_mm_activity, + "average_mm_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc")) + ); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter")) + ); + + // Fan Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentFanSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, + format_metric_row(m_gpu_metrics_tbl.m_current_fan_speed, + "current_fan_speed")) + ); + + // Throttle Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_throttle_status, + "throttle_status")) + ); + + // Average Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_gfxclk_frequency, + "average_gfxclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_socclk_frequency, + "average_socclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency, + 
format_metric_row(m_gpu_metrics_tbl.m_average_uclk_frequency, + "average_uclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk0_frequency, + "average_vclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk0_frequency, + "average_dclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk1_frequency, + "average_vclk1_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk1_frequency, + "average_dclk1_frequency")) + ); + + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "current_gfxclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "current_socclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + 
"current_vclk0")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "current_dclk0")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk1, + "current_vclk1")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk1, + "current_dclk1")) + ); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed")) + ); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; +} + +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v12_t::copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. 
+ init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + // Temperature + metrics_public_init.temperature_edge = m_gpu_metrics_tbl.m_temperature_edge; + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrgfx = m_gpu_metrics_tbl.m_temperature_vrgfx; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + metrics_public_init.temperature_vrmem = m_gpu_metrics_tbl.m_temperature_vrmem; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + metrics_public_init.average_mm_activity = m_gpu_metrics_tbl.m_average_mm_activity; + + // Power/Energy + metrics_public_init.average_socket_power = m_gpu_metrics_tbl.m_average_socket_power; + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Average clocks + metrics_public_init.average_gfxclk_frequency = m_gpu_metrics_tbl.m_average_gfxclk_frequency; + metrics_public_init.average_socclk_frequency = m_gpu_metrics_tbl.m_average_socclk_frequency; + metrics_public_init.average_uclk_frequency = m_gpu_metrics_tbl.m_average_uclk_frequency; + metrics_public_init.average_vclk0_frequency = m_gpu_metrics_tbl.m_average_vclk0_frequency; + metrics_public_init.average_dclk0_frequency = m_gpu_metrics_tbl.m_average_dclk0_frequency; + 
metrics_public_init.average_vclk1_frequency = m_gpu_metrics_tbl.m_average_vclk1_frequency; + metrics_public_init.average_dclk1_frequency = m_gpu_metrics_tbl.m_average_dclk1_frequency; + + // Current clocks + metrics_public_init.current_gfxclk = m_gpu_metrics_tbl.m_current_gfxclk; + metrics_public_init.current_socclk = m_gpu_metrics_tbl.m_current_socclk; + metrics_public_init.current_vclk0 = m_gpu_metrics_tbl.m_current_vclk0; + metrics_public_init.current_dclk0 = m_gpu_metrics_tbl.m_current_dclk0; + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + metrics_public_init.current_vclk1 = m_gpu_metrics_tbl.m_current_vclk1; + metrics_public_init.current_dclk1 = m_gpu_metrics_tbl.m_current_dclk1; + + // Throttle status + metrics_public_init.throttle_status = m_gpu_metrics_tbl.m_throttle_status; + + // Fans + metrics_public_init.current_fan_speed = m_gpu_metrics_tbl.m_current_fan_speed; + + // Link width/speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // temperature_hbm + const auto temp_hbm_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_temperature_hbm) - + std::begin(m_gpu_metrics_tbl.m_temperature_hbm)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_temperature_hbm), + temp_hbm_num_elems, + metrics_public_init.temperature_hbm); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.1) + + + return metrics_public_init; + }(); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + 
LOG_TRACE(ostrstream); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); } rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() { - // TODO: Implement these; - return rsmi_status_t::RSMI_STATUS_NOT_YET_IMPLEMENTED; + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. + // + auto run_metric_adjustments_v11 = [&]() { + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ostrstream); + }; + + + // Adjustments/Changes specific to this version + run_metric_adjustments_v11(); + + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempEdge, + format_metric_row(m_gpu_metrics_tbl.m_temperature_edge, + "temperature_edge")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrGfx, + 
format_metric_row(m_gpu_metrics_tbl.m_temperature_vrgfx, + "temperature_vrgfx")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrmem, + "temperature_vrmem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHbm, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hbm, + "[temperature_hbm]")) + ); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_average_socket_power, + "average_socket_power")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc")) + ); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_mm_activity, + "average_mm_activity")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc")) + ); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter")) + ); + + // Fan Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentFanSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, + format_metric_row(m_gpu_metrics_tbl.m_current_fan_speed, + "current_fan_speed")) + ); + + // Throttle Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_throttle_status, + "throttle_status")) + ); + + // Average Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_gfxclk_frequency, + "average_gfxclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_socclk_frequency, + "average_socclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_uclk_frequency, + 
"average_uclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk0_frequency, + "average_vclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk0_frequency, + "average_dclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk1_frequency, + "average_vclk1_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk1_frequency, + "average_dclk1_frequency")) + ); + + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "current_gfxclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "current_socclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "current_vclk0")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "current_dclk0")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk1, + "current_vclk1")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk1, + "current_dclk1")) + ); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed")) + ); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; +} + +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v11_t::copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. 
+ init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + // Temperature + metrics_public_init.temperature_edge = m_gpu_metrics_tbl.m_temperature_edge; + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrgfx = m_gpu_metrics_tbl.m_temperature_vrgfx; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + metrics_public_init.temperature_vrmem = m_gpu_metrics_tbl.m_temperature_vrmem; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + metrics_public_init.average_mm_activity = m_gpu_metrics_tbl.m_average_mm_activity; + + // Power/Energy + metrics_public_init.average_socket_power = m_gpu_metrics_tbl.m_average_socket_power; + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Average clocks + metrics_public_init.average_gfxclk_frequency = m_gpu_metrics_tbl.m_average_gfxclk_frequency; + metrics_public_init.average_socclk_frequency = m_gpu_metrics_tbl.m_average_socclk_frequency; + metrics_public_init.average_uclk_frequency = m_gpu_metrics_tbl.m_average_uclk_frequency; + metrics_public_init.average_vclk0_frequency = m_gpu_metrics_tbl.m_average_vclk0_frequency; + metrics_public_init.average_dclk0_frequency = m_gpu_metrics_tbl.m_average_dclk0_frequency; + 
metrics_public_init.average_vclk1_frequency = m_gpu_metrics_tbl.m_average_vclk1_frequency; + metrics_public_init.average_dclk1_frequency = m_gpu_metrics_tbl.m_average_dclk1_frequency; + + // Current clocks + metrics_public_init.current_gfxclk = m_gpu_metrics_tbl.m_current_gfxclk; + metrics_public_init.current_socclk = m_gpu_metrics_tbl.m_current_socclk; + metrics_public_init.current_vclk0 = m_gpu_metrics_tbl.m_current_vclk0; + metrics_public_init.current_dclk0 = m_gpu_metrics_tbl.m_current_dclk0; + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + metrics_public_init.current_vclk1 = m_gpu_metrics_tbl.m_current_vclk1; + metrics_public_init.current_dclk1 = m_gpu_metrics_tbl.m_current_dclk1; + + // Throttle status + metrics_public_init.throttle_status = m_gpu_metrics_tbl.m_throttle_status; + + // Fans + metrics_public_init.current_fan_speed = m_gpu_metrics_tbl.m_current_fan_speed; + + // Link width/speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // temperature_hbm + const auto temp_hbm_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_temperature_hbm) - + std::begin(m_gpu_metrics_tbl.m_temperature_hbm)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_temperature_hbm), + temp_hbm_num_elems, + metrics_public_init.temperature_hbm); + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.0) + + + return metrics_public_init; + }(); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); } @@ -1308,15 +2038,19 @@ 
rsmi_status_t Device::dev_read_gpu_metrics_all_data() ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - // At this point we should have a valid gpu_metrics pointer. - if (!m_gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + // At this point we should have a valid gpu_metrics pointer, and + // we already read the header; setup_gpu_metrics_reading() + if ((!m_gpu_metrics_ptr) || + ((!m_gpu_metrics_header.m_structure_size) || + (!m_gpu_metrics_header.m_format_revision) || + (!m_gpu_metrics_header.m_content_revision))) { + status_code = rsmi_status_t::RSMI_STATUS_SETTING_UNAVAILABLE; ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " << " | Device #: " << index() << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" + << " | Cause: Couldn't get a valid metric object. setup_gpu_metrics_reading()" << " | Returning = " << getRSMIStatusString(status_code) << " |"; @@ -1325,7 +2059,7 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data() } auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, - m_gpu_metrics_ptr->sizeof_metric_table(), + m_gpu_metrics_header.m_structure_size, &m_gpu_metrics_ptr->get_metrics_table()); if ((status_code = ErrnoToRsmiStatus(op_result)) != rsmi_status_t::RSMI_STATUS_SUCCESS) { @@ -1344,6 +2078,21 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data() return status_code; } + // All metric units are pushed in. 
+ status_code = m_gpu_metrics_ptr->populate_metrics_dynamic_tbl(); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + } + m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -1388,24 +2137,28 @@ rsmi_status_t Device::setup_gpu_metrics_reading() return status_code; } - auto gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); - if (!gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; + // + // if/in case setup_gpu_metrics_reading() was called already use the same pointer + if (!m_gpu_metrics_ptr) { + m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); + if (!m_gpu_metrics_ptr) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } } - /// gpu_metrics_ptr has the pointer to the proper object 
type/version. - dev_set_gpu_metric(gpu_metrics_ptr); + // + // m_gpu_metrics_ptr has the pointer to the proper object type/version. status_code = dev_read_gpu_metrics_all_data(); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { ostrstream << __PRETTY_FUNCTION__ @@ -1421,139 +2174,312 @@ rsmi_status_t Device::setup_gpu_metrics_reading() return status_code; } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); return status_code; } -rsmi_status_t Device::dev_log_gpu_metrics() + +template +struct MetricValueCast_t; + +template<> +struct MetricValueCast_t +{ + using value_type = std::uint8_t; +}; +template<> +struct MetricValueCast_t +{ + using value_type = std::uint16_t; +}; +template<> +struct MetricValueCast_t +{ + using value_type = std::uint32_t; +}; + +template<> +struct MetricValueCast_t +{ + using value_type = std::uint64_t; +}; + +template +auto get_casted_value(const AMDGpuDynamicMetricsValue_t& metrics_value) +{ + using ValueType_t = typename MetricValueCast_t
::value_type; + return static_cast(metrics_value.m_value); +} + + +rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics) { std::ostringstream ostrstream; + std::ostringstream tmp_outstream_metrics; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - // At this point we should have a valid gpu_metrics pointer. + // If we still don't have a valid gpu_metrics pointer; + // meaning, we didn't run any queries, and just want to + // print all the gpu metrics content, we need to setup + // the environment first. if (!m_gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + // At this point we should have a valid gpu_metrics pointer. 
+ status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } } // Header info + const auto kDoubleLine = std::string("+==============================+"); + const auto kSingleLine = std::string("+------------------------------+"); auto header_output = [&]() { const auto gpu_metrics_header = dev_get_metrics_header(); - ostrstream << "GPU Metrics Header: \n"; - ostrstream << "Timestamp: " << m_gpu_metrics_updated_timestamp << "\n"; - ostrstream << "Based on: " << static_cast(m_gpu_metrics_ptr->get_gpu_metrics_version_used()) << "\n"; - ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_structure_size, " ->structure_size "); - ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_format_revision, " ->format_revision "); - ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_content_revision, " ->content_revision "); - LOG_DEBUG(ostrstream); + const auto timestamp_time = timestamp_to_time_point(m_gpu_metrics_updated_timestamp); + tmp_outstream_metrics << "\n" << kDoubleLine << "\n"; + tmp_outstream_metrics << "*** GPU Metrics Header: ***"; + tmp_outstream_metrics << "\n"; + tmp_outstream_metrics << "Timestamp: " + << " [" + << m_gpu_metrics_updated_timestamp + << "] " + << std::ctime(×tamp_time); + tmp_outstream_metrics << "Version: " + << print_unsigned_int(gpu_metrics_header.m_format_revision) + << "." 
+ << print_unsigned_int(gpu_metrics_header.m_content_revision) + << " [Flag: " + << static_cast(m_gpu_metrics_ptr->get_gpu_metrics_version_used()) + << "] " + << "\n"; + + tmp_outstream_metrics << print_unsigned_hex_and_int(gpu_metrics_header.m_structure_size, " ->structure_size "); + tmp_outstream_metrics << print_unsigned_hex_and_int(gpu_metrics_header.m_format_revision, " ->format_revision "); + tmp_outstream_metrics << print_unsigned_hex_and_int(gpu_metrics_header.m_content_revision, " ->content_revision "); + tmp_outstream_metrics << "\n" << kSingleLine << "\n"; return; }; + // Metrics info auto table_content_output = [&]() { const auto gpu_metrics_tbl = m_gpu_metrics_ptr->get_metrics_dynamic_tbl(); - - ostrstream << "GPU Metrics Data: \n"; + tmp_outstream_metrics << "\n"; + tmp_outstream_metrics << "*** GPU Metrics Data: *** \n"; for (const auto& [metric_class, metric_data] : gpu_metrics_tbl) { - ostrstream << amdgpu_metrics_class_id_translation_table.at(metric_class) << "\n"; + tmp_outstream_metrics << "\n"; + tmp_outstream_metrics << "[ " << amdgpu_metrics_class_id_translation_table.at(metric_class) << " ]" << "\n"; for (const auto& [metric_unit, metric_values] : metric_data) { + auto tmp_metric_info = ("[ " + amdgpu_metrics_unit_type_translation_table.at(metric_unit) + " ]"); for (const auto& metric_value : metric_values) { - ostrstream << print_unsigned_hex_and_int(metric_value.m_value, metric_value.m_info); + switch (metric_value.m_original_type) { + case (AMDGpuMetricsDataType_t::kUInt16): + { + auto value = get_casted_value(metric_value); + tmp_outstream_metrics << print_unsigned_hex_and_int((value), metric_value.m_info) << " -> " << tmp_metric_info; + } + break; + + case (AMDGpuMetricsDataType_t::kUInt32): + { + auto value = get_casted_value(metric_value); + tmp_outstream_metrics << print_unsigned_hex_and_int((value), metric_value.m_info) << " -> " << tmp_metric_info; + } + break; + + case (AMDGpuMetricsDataType_t::kUInt64): + { + auto value = 
get_casted_value(metric_value); + tmp_outstream_metrics << print_unsigned_hex_and_int((value), metric_value.m_info) << " -> " << tmp_metric_info; + } + break; + + default: + tmp_outstream_metrics << "Error: No data type conversion for original type: " << static_cast(metric_value.m_original_type) << "\n"; + break; + } } } + tmp_outstream_metrics << "\n\n"; } + tmp_outstream_metrics << "\n" << kDoubleLine << "\n"; return; }; // header_output(); table_content_output(); + outstream_metrics << tmp_outstream_metrics.rdbuf(); + LOG_DEBUG(tmp_outstream_metrics); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); return status_code; } +AMGpuMetricsPublicLatestTupl_t Device::dev_copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + std::ostringstream tmp_outstream_metrics; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + if (!m_gpu_metrics_ptr) { + // At this point we should have a valid gpu_metrics pointer. 
+ status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return std::make_tuple(status_code, AMGpuMetricsPublicLatest_t()); + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return m_gpu_metrics_ptr->copy_internal_to_external_metrics(); +} + + rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values) { std::ostringstream ostrstream; - auto status_code(rsmi_status_t::RSMI_STATUS_NOT_FOUND); + auto status_code(rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED); ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - status_code = setup_gpu_metrics_reading(); - if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { - return status_code; - } - if (!m_gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + 
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } } // Lookup the dynamic table + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Metric Unit: " << static_cast(metric_counter) + << " |"; + LOG_INFO(ostrstream); const auto gpu_metrics_tbl = m_gpu_metrics_ptr->get_metrics_dynamic_tbl(); for (const auto& [metric_class, metric_data] : gpu_metrics_tbl) { for (const auto& [metric_unit, metric_values] : metric_data) { if (metric_unit == metric_counter) { values = metric_values; - return rsmi_status_t::RSMI_STATUS_SUCCESS; + status_code = rsmi_status_t::RSMI_STATUS_SUCCESS; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Metric Unit: " << static_cast(metric_counter) + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } } } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); return status_code; } -template struct is_vector : std::false_type {}; -template struct is_vector> : std::true_type {}; -template inline constexpr bool is_vector_v = is_vector::value; +template +constexpr inline bool 
is_metric_data_type_supported_v = + ((std::is_same_v) || (std::is_same_v) || + (std::is_same_v) || (std::is_same_v) || + (std::is_same_v) || (std::is_same_v)); -template struct is_array : std::false_type {}; -template struct is_array> : std::true_type {}; +template +struct is_std_vector : std::false_type {}; -template struct is_bounded_uint8_array : std::false_type {}; -template struct is_bounded_uint16_array : std::false_type {}; -template struct is_bounded_uint32_array : std::false_type {}; -template struct is_bounded_uint64_array : std::false_type {}; +template +struct is_std_vector> : std::true_type {}; -template -struct is_bounded_uint8_array : std::true_type {}; - -template -struct is_bounded_uint16_array : std::true_type {}; - -template -struct is_bounded_uint32_array : std::true_type {}; - -template -struct is_bounded_uint64_array : std::true_type {}; - -template struct is_bounded_array : std::false_type {}; - -template -struct is_bounded_array : std::true_type {}; +template +inline constexpr bool is_std_vector_v = is_std_vector::value; +template +constexpr bool is_std_vector_type_supported_v() +{ + if constexpr (is_std_vector_v) { + using ValueType_t = typename T::value_type; + return (is_metric_data_type_supported_v); + } + return false; +}; template rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value) @@ -1563,79 +2489,173 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - AMDGpuDynamicMetricTblValues_t tmp_values{}; - GET_DEV_FROM_INDX - status_code = dev->run_internal_gpu_metrics_query(metric_counter, tmp_values); - if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || tmp_values.empty()) { - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Metric Version: " << 
stringfy_metrics_header(dev->dev_get_metrics_header()) - << " | Cause: Couldn't find metric/counter requested" - << " | Metric Type: " << static_cast(metric_counter) - << " " << amdgpu_metrics_unit_type_translation_table.at(metric_counter) - << " | Values: " << tmp_values.size() - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } - - const auto num_stored_elems = (std::end(tmp_values) - std::begin(tmp_values)); - if constexpr (std::is_array_v) { - std::variant tmp_value; - auto idx = uint16_t(0); - for (const auto& value : tmp_values) { - tmp_value = value.m_value; - idx++; - switch (value.m_original_type) { - case AMDGpuMetricsDataType_t::kUInt8: - metric_value[idx] = std::get(tmp_value); - break; - - case AMDGpuMetricsDataType_t::kUInt16: - metric_value[idx] = std::get(tmp_value); - break; - - case AMDGpuMetricsDataType_t::kUInt32: - metric_value[idx] = std::get(tmp_value); - break; - - case AMDGpuMetricsDataType_t::kUInt64: - metric_value[idx] = std::get(tmp_value); - break; - - default: - break; + static constexpr bool is_supported_vector_type = [&]() { + if constexpr (is_std_vector_v) { + if (is_std_vector_type_supported_v()) { + return true; } + } + return false; + }(); - metric_value[idx++] = tmp_value; + + if constexpr ((is_supported_vector_type) || (is_metric_data_type_supported_v)) { + // Get all stored values for the metric unit/counter + AMDGpuDynamicMetricTblValues_t tmp_values{}; + GET_DEV_FROM_INDX + status_code = dev->run_internal_gpu_metrics_query(metric_counter, tmp_values); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || tmp_values.empty()) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Version: " << stringfy_metrics_header(dev->dev_get_metrics_header()) + << " | Cause: Couldn't find metric/counter requested" + << " | Metric Type: " << static_cast(metric_counter) + << " " << 
amdgpu_metrics_unit_type_translation_table.at(metric_counter) + << " | Values: " << tmp_values.size() + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + if constexpr (is_std_vector_v) { + using ValueType_t = typename T::value_type; + ValueType_t tmp_value; + + for (const auto& value : tmp_values) { + tmp_value = static_cast(value.m_value); + metric_value.push_back(tmp_value); + } + } + else if constexpr (is_metric_data_type_supported_v) { + T tmp_value(0); + tmp_value = static_cast(tmp_values[0].m_value); + metric_value = tmp_value; } } - if constexpr ((std::is_same_v) || (std::is_same_v) || - (std::is_same_v) || (std::is_same_v)) { - T tmp_value(0); - tmp_value = static_cast(tmp_values[0].m_value); - metric_value = tmp_value; + else { + static_assert(is_dependent_false_v, "Error: Data Type not supported..."); } + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(metric_counter) + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); return status_code; } template -rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint8_t& metric_value); +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint16_t& metric_value); template -rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint16_t& metric_value); +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint32_t& metric_value); template -rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint32_t& metric_value); +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint64_t& metric_value); template 
-rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint64_t& metric_value); +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, GpuMetricU16Tbl_t& metric_value); +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, GpuMetricU32Tbl_t& metric_value); + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query +(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, GpuMetricU64Tbl_t& metric_value); } //namespace amd::smi + +rsmi_status_t +rsmi_dev_gpu_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t& header_value) +{ + TRY + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + GET_DEV_FROM_INDX + status_code = dev->dev_read_gpu_metrics_header_data(); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + auto tmp_header_info = dev->dev_get_metrics_header(); + std::memcpy(&header_value, &tmp_header_info, sizeof(metrics_table_header_t)); + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; + CATCH +} + +//dev_read_gpu_metrics_header_data + +/** + * Note: These keep backwards compatibility with previous GPU metrics work + */ +// log current gpu_metrics file content read +// any metrics value can be a nullptr +rsmi_status_t +rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { + TRY + DEVICE_MUTEX + CHK_SUPPORT_NAME_ONLY(smu) + + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + assert(smu != nullptr); + if (smu == 
nullptr) { + status_code = rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + return status_code; + } + + dev->dev_log_gpu_metrics(ostrstream); + const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics(); + if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Returning = " + << getRSMIStatusString(error_code) + << " |"; + LOG_ERROR(ostrstream); + return error_code; + } + + *smu = external_metrics; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; + CATCH +} + diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index f7944ddcbf..70a6028d18 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -100,7 +100,7 @@ void TestGpuMetricsRead::Run(void) { PrintDeviceHeader(i); IF_VERB(STANDARD) { - std::cout << "\t**GPU METRICS:\n"; + std::cout << "\t**GPU METRICS: Using static struct (Backwards Compatibility):\n"; } rsmi_gpu_metrics_t smu; err = rsmi_dev_gpu_metrics_info_get(i, &smu); @@ -191,4 +191,380 @@ void TestGpuMetricsRead::Run(void) { err = rsmi_dev_gpu_metrics_info_get(i, nullptr); ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); } + + + // + auto val_ui16 = uint16_t(0); + auto val_ui32 = uint32_t(0); + auto val_ui64 = uint64_t(0); + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + PrintDeviceHeader(i); + + auto temp_edge_value = val_ui16; + status_code = rsmi_dev_metrics_temp_edge_get(i, &temp_edge_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + 
CHK_ERR_ASRT(status_code); + } + + auto temp_hotspot_value = val_ui16; + status_code = rsmi_dev_metrics_temp_hotspot_get(i, &temp_hotspot_value); + CHK_ERR_ASRT(status_code); + + auto temp_mem_value = val_ui16; + status_code = rsmi_dev_metrics_temp_mem_get(i, &temp_mem_value); + CHK_ERR_ASRT(status_code); + + auto temp_vrgfx_value = val_ui16; + status_code = rsmi_dev_metrics_temp_vrgfx_get(i, &temp_vrgfx_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_vrsoc_value = val_ui16; + status_code = rsmi_dev_metrics_temp_vrsoc_get(i, &temp_vrsoc_value); + CHK_ERR_ASRT(status_code); + + auto temp_vrmem_value = val_ui16; + status_code = rsmi_dev_metrics_temp_vrmem_get(i, &temp_vrmem_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricTempHbm_t temp_hbm_values; + status_code = rsmi_dev_metrics_temp_hbm_get(i, &temp_hbm_values); + CHK_ERR_ASRT(status_code); + + auto temp_curr_socket_power_value = val_ui16; + status_code = rsmi_dev_metrics_curr_socket_power_get(i, &temp_curr_socket_power_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_energy_accum_value = val_ui64; + status_code = rsmi_dev_metrics_energy_acc_get(i, &temp_energy_accum_value); + CHK_ERR_ASRT(status_code); + + auto temp_avg_socket_power_value = val_ui16; + status_code = rsmi_dev_metrics_avg_socket_power_get(i, &temp_avg_socket_power_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_gfx_activity_value = val_ui16; + status_code = rsmi_dev_metrics_avg_gfx_activity_get(i, &temp_avg_gfx_activity_value); + CHK_ERR_ASRT(status_code); + + auto temp_avg_umc_activity_value = val_ui16; + status_code = rsmi_dev_metrics_avg_umc_activity_get(i, &temp_avg_umc_activity_value); + CHK_ERR_ASRT(status_code); + + auto temp_avg_mm_activity_value = val_ui16; + status_code = rsmi_dev_metrics_avg_mm_activity_get(i, 
&temp_avg_mm_activity_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricVcnActivity_t temp_vcn_values; + status_code = rsmi_dev_metrics_vcn_activity_get(i, &temp_vcn_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_mem_activity_accum_value = val_ui32; + status_code = rsmi_dev_metrics_mem_activity_acc_get(i, &temp_mem_activity_accum_value); + CHK_ERR_ASRT(status_code); + + auto temp_gfx_activity_accum_value = val_ui32; + status_code = rsmi_dev_metrics_gfx_activity_acc_get(i, &temp_gfx_activity_accum_value); + CHK_ERR_ASRT(status_code); + + auto temp_avg_gfx_clock_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &temp_avg_gfx_clock_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_soc_clock_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &temp_avg_soc_clock_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_uclock_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_uclock_frequency_get(i, &temp_avg_uclock_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_vclock0_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &temp_avg_vclock0_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_dclock0_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_dclock0_frequency_get(i, &temp_avg_dclock0_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_avg_vclock1_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &temp_avg_vclock1_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + 
CHK_ERR_ASRT(status_code); + } + + auto temp_avg_dclock1_freq_value = val_ui16; + status_code = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &temp_avg_dclock1_freq_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_curr_vclk1_value = val_ui16; + status_code = rsmi_dev_metrics_curr_vclk1_get(i, &temp_curr_vclk1_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_curr_dclk1_value = val_ui16; + status_code = rsmi_dev_metrics_curr_dclk1_get(i, &temp_curr_dclk1_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_curr_uclk_value = val_ui16; + status_code = rsmi_dev_metrics_curr_uclk_get(i, &temp_curr_uclk_value); + CHK_ERR_ASRT(status_code); + + GPUMetricCurrDClk0_t temp_curr_dclk0_values; + status_code = rsmi_dev_metrics_curr_dclk0_get(i, &temp_curr_dclk0_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricCurrGfxClk_t temp_curr_gfxclk_values; + status_code = rsmi_dev_metrics_curr_gfxclk_get(i, &temp_curr_gfxclk_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricCurrSocClk_t temp_curr_socclk_values; + status_code = rsmi_dev_metrics_curr_socclk_get(i, &temp_curr_socclk_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricCurrVClk0_t temp_curr_vclk0_values; + status_code = rsmi_dev_metrics_curr_vclk0_get(i, &temp_curr_vclk0_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_indep_throttle_status_value = val_ui64; + status_code = rsmi_dev_metrics_indep_throttle_status_get(i, &temp_indep_throttle_status_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_throttle_status_value = val_ui32; + status_code = rsmi_dev_metrics_throttle_status_get(i, 
&temp_throttle_status_value); + CHK_ERR_ASRT(status_code); + + auto temp_gfxclk_lock_status_value = val_ui32; + status_code = rsmi_dev_metrics_gfxclk_lock_status_get(i, &temp_gfxclk_lock_status_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + auto temp_curr_fan_speed_value = val_ui16; + status_code = rsmi_dev_metrics_curr_fan_speed_get(i, &temp_curr_fan_speed_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_pcie_link_width_value = val_ui16; + status_code = rsmi_dev_metrics_pcie_link_width_get(i, &temp_pcie_link_width_value); + CHK_ERR_ASRT(status_code); + + auto temp_pcie_link_speed_value = val_ui16; + status_code = rsmi_dev_metrics_pcie_link_speed_get(i, &temp_pcie_link_speed_value); + CHK_ERR_ASRT(status_code); + + auto temp_pcie_bandwidth_accum_value = val_ui64; + status_code = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &temp_pcie_bandwidth_accum_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_pcie_bandwidth_inst_value = val_ui64; + status_code = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &temp_pcie_bandwidth_inst_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_pcie_l0_recov_count_accum_value = val_ui64; + status_code = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &temp_pcie_l0_recov_count_accum_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_pcie_replay_count_accum_value = val_ui64; + status_code = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &temp_pcie_replay_count_accum_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_pcie_replay_rover_count_accum_value = val_ui64; + status_code = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &temp_pcie_replay_rover_count_accum_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + 
CHK_ERR_ASRT(status_code); + } + + auto temp_xgmi_link_width_value = val_ui16; + status_code = rsmi_dev_metrics_xgmi_link_width_get(i, &temp_xgmi_link_width_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_xgmi_link_speed_value = val_ui16; + status_code = rsmi_dev_metrics_xgmi_link_speed_get(i, &temp_xgmi_link_speed_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricXgmiReadDataAcc_t temp_xgmi_read_values; + status_code = rsmi_dev_metrics_xgmi_read_data_get(i, &temp_xgmi_read_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values; + status_code = rsmi_dev_metrics_xgmi_write_data_get(i, &temp_xgmi_write_values); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_voltage_soc_value = val_ui16; + status_code = rsmi_dev_metrics_volt_soc_get(i, &temp_voltage_soc_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_voltage_gfx_value = val_ui16; + status_code = rsmi_dev_metrics_volt_gfx_get(i, &temp_voltage_gfx_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_voltage_mem_value = val_ui16; + status_code = rsmi_dev_metrics_volt_mem_get(i, &temp_voltage_mem_value); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + auto temp_system_clock_counter_value = val_ui64; + status_code = rsmi_dev_metrics_system_clock_counter_get(i, &temp_system_clock_counter_value); + CHK_ERR_ASRT(status_code); + + auto temp_firmware_timestamp_value = val_ui64; + status_code = rsmi_dev_metrics_firmware_timestamp_get(i, &temp_firmware_timestamp_value); + CHK_ERR_ASRT(status_code); + + auto temp_xcd_counter_value = val_ui16; + status_code = rsmi_dev_metrics_xcd_counter_get(i, &temp_xcd_counter_value); + if (status_code 
!= RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + + IF_VERB(STANDARD) { + std::cout << "\n"; + std::cout << "\t[Temperature]" << "\n"; + std::cout << "\t -> temp_edge(): " << temp_edge_value << "\n"; + std::cout << "\t -> temp_hotspot(): " << temp_hotspot_value << "\n"; + std::cout << "\t -> temp_mem(): " << temp_mem_value << "\n"; + std::cout << "\t -> temp_vrgfx(): " << temp_vrgfx_value << "\n"; + std::cout << "\t -> temp_vrsoc(): " << temp_vrsoc_value << "\n"; + std::cout << "\t -> temp_vrmem(): " << temp_vrmem_value << "\n"; + std::cout << "\t -> temp_hbm(): " << temp_hbm_values << "\n"; + + std::cout << "\n"; + std::cout << "\t[Power/Energy]" << "\n"; + std::cout << "\t -> current_socket_power(): " << temp_curr_socket_power_value << "\n"; + std::cout << "\t -> energy_accum(): " << temp_energy_accum_value << "\n"; + std::cout << "\t -> average_socket_power(): " << temp_avg_socket_power_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Utilization]" << "\n"; + std::cout << "\t -> average_gfx_activity(): " << temp_avg_gfx_activity_value << "\n"; + std::cout << "\t -> average_umc_activity(): " << temp_avg_umc_activity_value << "\n"; + std::cout << "\t -> average_mm_activity(): " << temp_avg_mm_activity_value << "\n"; + std::cout << "\t -> vcn_activity(): " << temp_vcn_values << "\n"; + std::cout << "\t -> mem_activity_accum(): " << temp_mem_activity_accum_value << "\n"; + std::cout << "\t -> gfx_activity_accum(): " << temp_gfx_activity_accum_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Average Clock]" << "\n"; + std::cout << "\t -> average_gfx_clock_frequency(): " << temp_avg_gfx_clock_freq_value << "\n"; + std::cout << "\t -> average_soc_clock_frequency(): " << temp_avg_soc_clock_freq_value << "\n"; + std::cout << "\t -> average_uclock_frequency(): " << temp_avg_uclock_freq_value << "\n"; + std::cout << "\t -> average_vclock0_frequency(): " << temp_avg_vclock0_freq_value << "\n"; + std::cout << "\t -> 
average_dclock0_frequency(): " << temp_avg_dclock0_freq_value << "\n"; + std::cout << "\t -> average_vclock1_frequency(): " << temp_avg_vclock1_freq_value << "\n"; + std::cout << "\t -> average_dclock1_frequency(): " << temp_avg_dclock1_freq_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Current Clock]" << "\n"; + std::cout << "\t -> current_vclock1(): " << temp_curr_vclk1_value << "\n"; + std::cout << "\t -> current_dclock1(): " << temp_curr_dclk1_value << "\n"; + std::cout << "\t -> current_uclock(): " << temp_curr_uclk_value << "\n"; + std::cout << "\t -> current_dclk0(): " << temp_curr_dclk0_values << "\n"; + std::cout << "\t -> current_gfxclk(): " << temp_curr_gfxclk_values << "\n"; + std::cout << "\t -> current_soc_clock(): " << temp_curr_socclk_values << "\n"; + std::cout << "\t -> current_vclk0(): " << temp_curr_vclk0_values << "\n"; + + std::cout << "\n"; + std::cout << "\t[Throttle]" << "\n"; + std::cout << "\t -> indep_throttle_status(): " << temp_indep_throttle_status_value << "\n"; + std::cout << "\t -> throttle_status(): " << temp_throttle_status_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Gfx Clock Lock]" << "\n"; + std::cout << "\t -> gfxclk_lock_status(): " << temp_gfxclk_lock_status_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Current Fan Speed]" << "\n"; + std::cout << "\t -> current_fan_speed(): " << temp_curr_fan_speed_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; + std::cout << "\t -> pcie_link_width(): " << temp_pcie_link_width_value << "\n"; + std::cout << "\t -> pcie_link_speed(): " << temp_pcie_link_speed_value << "\n"; + std::cout << "\t -> pcie_bandwidth_accum(): " << temp_pcie_bandwidth_accum_value << "\n"; + std::cout << "\t -> pcie_bandwidth_inst(): " << temp_pcie_bandwidth_inst_value << "\n"; + std::cout << "\t -> pcie_l0_recov_count_accum(): " << temp_pcie_l0_recov_count_accum_value << "\n"; + std::cout << "\t -> pcie_replay_count_accum(): " << 
temp_pcie_replay_count_accum_value << "\n"; + std::cout << "\t -> pcie_replay_rollover_count_accum(): " << temp_pcie_replay_rover_count_accum_value << "\n"; + std::cout << "\t -> xgmi_link_width(): " << temp_xgmi_link_width_value << "\n"; + std::cout << "\t -> xgmi_link_speed(): " << temp_xgmi_link_speed_value << "\n"; + std::cout << "\t -> xgmi_read_data(): " << temp_xgmi_read_values << "\n"; + std::cout << "\t -> xgmi_write_data(): " << temp_xgmi_write_values << "\n"; + + std::cout << "\n"; + std::cout << "\t[Voltage]" << "\n"; + std::cout << "\t -> voltage_soc(): " << temp_voltage_soc_value << "\n"; + std::cout << "\t -> voltage_gfx(): " << temp_voltage_gfx_value << "\n"; + std::cout << "\t -> voltage_mem(): " << temp_voltage_mem_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[Timestamp]" << "\n"; + std::cout << "\t -> system_clock_counter(): " << temp_system_clock_counter_value << "\n"; + std::cout << "\t -> firmware_timestamp(): " << temp_firmware_timestamp_value << "\n"; + + std::cout << "\n"; + std::cout << "\t[XCD CounterVoltage]" << "\n"; + std::cout << "\t -> xcd_counter(): " << temp_xcd_counter_value << "\n"; + std::cout << "\n\n"; + } + } + } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/measure_api_execution_time.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/measure_api_execution_time.cc index d53f3b3fa1..ff471cb2ee 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/measure_api_execution_time.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/measure_api_execution_time.cc @@ -167,7 +167,900 @@ void TestMeasureApiExecutionTime::Run(void) { skip = false; std::cout << "----------------------------------------------------------------------------" << std::endl; + //Test execution time for each individual gpu metric + auto val_ui16 = uint16_t(0); + auto val_ui32 = uint32_t(0); + auto val_ui64 = uint64_t(0); + GPUMetricTempHbm_t temp_hbm_values; + GPUMetricVcnActivity_t temp_vcn_values; + 
GPUMetricCurrDClk0_t temp_curr_dclk0_values; + GPUMetricCurrGfxClk_t temp_curr_gfxclk_values; + GPUMetricCurrSocClk_t temp_curr_socclk_values; + GPUMetricCurrVClk0_t temp_curr_vclk0_values; + GPUMetricXgmiReadDataAcc_t temp_xgmi_read_values; + GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + start = std::chrono::high_resolution_clock::now(); + auto start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_edge_get(dv_ind, &val_ui16); } - std::cout.precision(prev); + auto stop_api = std::chrono::high_resolution_clock::now(); + auto duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_edge_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_hotspot_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_hotspot_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = 
rsmi_dev_metrics_temp_mem_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_mem_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_vrgfx_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_vrgfx_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_vrsoc_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_vrsoc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + 
+ start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_vrmem_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_vrmem_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_temp_hbm_get(dv_ind, &temp_hbm_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_temp_hbm_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_socket_power_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_socket_power_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); 
+ } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_energy_acc_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_energy_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_socket_power_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_socket_power_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_gfx_activity_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_gfx_activity_get() 
execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_umc_activity_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_umc_activity_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_mm_activity_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_mm_activity_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_vcn_activity_get(dv_ind, &temp_vcn_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); 
+ if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_vcn_activity_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_mem_activity_acc_get(dv_ind, &val_ui32); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_mem_activity_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_gfx_activity_acc_get(dv_ind, &val_ui32); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_gfx_activity_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = 
rsmi_dev_metrics_avg_gfx_clock_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_gfx_clock_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_soc_clock_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_soc_clock_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_uclock_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_uclock_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << 
"----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_vclock0_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_vclock0_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_dclock0_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_dclock0_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_vclock1_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_vclock1_frequency_get() 
execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_avg_dclock1_frequency_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_avg_dclock1_frequency_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_vclk1_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_vclk1_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_dclk1_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if 
(status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_dclk1_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_uclk_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_uclk_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_dclk0_get(dv_ind, &temp_curr_dclk0_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_dclk0_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_gfxclk_get(dv_ind, 
&temp_curr_gfxclk_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_gfxclk_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_socclk_get(dv_ind, &temp_curr_socclk_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_socclk_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_vclk0_get(dv_ind, &temp_curr_vclk0_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_vclk0_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << 
std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_indep_throttle_status_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_indep_throttle_status_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_throttle_status_get(dv_ind, &val_ui32); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_throttle_status_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_gfxclk_lock_status_get(dv_ind, &val_ui32); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_gfxclk_lock_status_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + 
EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_curr_fan_speed_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_curr_fan_speed_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_link_width_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_link_width_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_link_speed_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout 
<< "\rsmi_dev_metrics_pcie_link_speed_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_bandwidth_acc_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_bandwidth_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_bandwidth_inst_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_bandwidth_inst_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + 
duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_l0_recov_count_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_replay_count_acc_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_replay_count_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_pcie_replay_rover_count_acc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = 
std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_xgmi_link_width_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_xgmi_link_width_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_xgmi_link_speed_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_xgmi_link_speed_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_xgmi_read_data_get(dv_ind, &temp_xgmi_read_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_xgmi_read_data_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * 
repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_xgmi_write_data_get(dv_ind, &temp_xgmi_write_values); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_xgmi_write_data_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_volt_soc_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_volt_soc_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_volt_gfx_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_volt_gfx_get() 
execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_volt_mem_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_volt_mem_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_system_clock_counter_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_system_clock_counter_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_firmware_timestamp_get(dv_ind, &val_ui64); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if 
(status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_firmware_timestamp_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + start_api = std::chrono::high_resolution_clock::now(); + for (int i=0; i < repeat; ++i) { + status_code = rsmi_dev_metrics_xcd_counter_get(dv_ind, &val_ui16); + } + stop_api = std::chrono::high_resolution_clock::now(); + duration_api = std::chrono::duration_cast(stop_api - start_api); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + skip = true; + } + if (!skip) { + std::cout << "\rsmi_dev_metrics_xcd_counter_get() execution time: " + << (float(duration_api.count()) / repeat) << " microseconds" << std::endl; + EXPECT_LT(duration_api.count(), 500 * repeat); + } + skip = false; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + stop = std::chrono::high_resolution_clock::now(); /* end of the whole metrics sweep for this device */ + duration = std::chrono::duration_cast(stop - start); /* overall elapsed time; NOTE(review): assumes `start` was captured just before the first metrics API loop - confirm */ + if (!skip) { /* skip was reset to false just above, so this summary always runs */ + const auto kTOTAL_GPU_METRICS_APIS = uint16_t(52); /* number of metrics APIs the total budget below is scaled by */ + std::cout << "\rTotal execution time (All APIs): " + << (float(duration.count()) / repeat) << " microseconds" << std::endl; /* report the overall `duration`, not the last API's `duration_api` */ + EXPECT_LT(duration.count(), 500 * (repeat * kTOTAL_GPU_METRICS_APIS)); /* same 500-per-call budget as the per-API checks, applied to the overall time */ + } + skip = false; + std::cout << "============================================================================" << std::endl; + + } + std::cout.precision(prev); + } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/temp_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/temp_read.cc index 8bc17bf013..d4a534717d 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/temp_read.cc +++ 
b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/temp_read.cc @@ -106,7 +106,7 @@ void TestTempRead::Run(void) { return; } - uint32_t type; + uint32_t type(0); for (uint32_t x = 0; x < num_iterations(); ++x) { for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i);