diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e31a5867b..9fcb207e86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,44 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ***All information listed below is for reference and subject to change.*** +## amd_smi_lib for ROCm 6.5.0 + +### Added + +- **Added support for GPU metrics 1.8**. + - Added new fields for `rsmi_gpu_metrics_t` including: + - Adding the following metrics to allow new calculations for violation status: + - Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts + - Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts + - Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how long low utilization caused the GPU to be below application clocks. + - Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]` - violation counts for how long the GPU was held below application clocks by any limiter (see the new violation metrics above). + - Increasing available JPEG engines to 40. + Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI. 
+ +### Changed + +- N/A + +### Removed + +- N/A + +### Optimized + +- N/A + +### Resolved issues + +- N/A + +### Upcoming changes + +- N/A + +### Known issues + +- N/A + ## rocm_smi_lib for ROCm 6.4 ### Added diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 7ccab0c6db..bb3c4cd49c 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -954,6 +954,11 @@ typedef struct metrics_table_header_t metrics_table_header_t; */ #define RSMI_MAX_NUM_JPEG_ENGS 32 +/** + * @brief This should match kRSMI_MAX_NUM_JPEG_ENG_V1 + */ +#define RSMI_MAX_NUM_JPEG_ENG_V1 40 + /** * @brief This should match kRSMI_MAX_NUM_CLKS */ @@ -1001,7 +1006,7 @@ struct amdgpu_xcp_metrics_t { */ /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC]; - uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS]; + uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENG_V1]; uint16_t vcn_busy[RSMI_MAX_NUM_VCNS]; /* Utilization Accumulated (%) */ @@ -1012,6 +1017,14 @@ struct amdgpu_xcp_metrics_t { */ /* Total App Clock Counter Accumulated */ uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC]; + + /** + * v1.8 additions + */ + uint64_t gfx_below_host_limit_ppt_acc[RSMI_MAX_NUM_XCC]; + uint64_t gfx_below_host_limit_thm_acc[RSMI_MAX_NUM_XCC]; + uint64_t gfx_low_utilization_acc[RSMI_MAX_NUM_XCC]; + uint64_t gfx_below_host_limit_total_acc[RSMI_MAX_NUM_XCC]; }; typedef struct { @@ -1218,7 +1231,7 @@ typedef struct { /* * v1.7 additions */ - /* VRAM max bandwidth at max memory clock (GB/s) */ + /* VRAM max bandwidth at max memory clock */ uint64_t vram_max_bandwidth; /* XGMI link status(up/down) */ diff --git a/include/rocm_smi/rocm_smi_gpu_metrics.h b/include/rocm_smi/rocm_smi_gpu_metrics.h index 5712ea41ef..d9325cf9ff 100644 --- a/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -1,44 +1,23 @@ /* - * ============================================================================= - * The University of Illinois/NCSA - * Open Source 
License (NCSA) - * - * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ #ifndef ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ @@ -52,9 +31,12 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include @@ -72,10 +54,11 @@ constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_1 = 1; constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_2 = 2; constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_3 = 3; constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4 = 4; +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8 = 8; constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MAJOR_VER = kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1; -constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINON_VER - = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4; +constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER + = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_8; // Note: This *must* match NUM_HBM_INSTANCES @@ -96,6 +79,10 @@ constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4; // Note: This *must* match NUM_JPEG_ENG constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32; +// Note: Updated for amdgpu_xcp_metrics_v1_2. 
+// Document provides NUM_JPEG_ENG_V1 but will rename to kRSMI_MAX_NUM_JPEG_ENG_V1 +constexpr uint32_t kRSMI_MAX_NUM_JPEG_ENG_V1 = 40; + // Note: This *must* match MAX_XCC constexpr uint32_t kRSMI_MAX_NUM_XCC = 8; @@ -108,6 +95,15 @@ struct AMDGpuMetricsHeader_v1_t { uint8_t m_format_revision; uint8_t m_content_revision; }; +struct amdgpu_xcp_metrics { + /* Utilization Instantaneous (%) */ + uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; + uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; + uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; + + /* Utilization Accumulated (%) */ + uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; +}; struct amdgpu_xcp_metrics_v1_1 { /* Utilization Instantaneous (%) */ @@ -122,14 +118,21 @@ struct amdgpu_xcp_metrics_v1_1 { uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC]; }; -struct amdgpu_xcp_metrics { +/* new for gpu metrics v1.8 */ +struct amdgpu_xcp_metrics_v1_2 { /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; - uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; + uint16_t jpeg_busy[kRSMI_MAX_NUM_JPEG_ENG_V1]; uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; /* Utilization Accumulated (%) */ uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; + + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_ppt_acc[kRSMI_MAX_NUM_XCC]; + uint64_t gfx_below_host_limit_thm_acc[kRSMI_MAX_NUM_XCC]; + uint64_t gfx_low_utilization_acc[kRSMI_MAX_NUM_XCC]; + uint64_t gfx_below_host_limit_total_acc[kRSMI_MAX_NUM_XCC]; }; struct AMDGpuMetricsBase_t { @@ -602,7 +605,7 @@ struct AMDGpuMetrics_v17_t { uint16_t m_average_gfx_activity; uint16_t m_average_umc_activity; // memory controller - /* VRAM max bandwidth at max memory clock */ + /* VRAM max bandwidth at max memory clock (GB/s) */ uint64_t m_vram_max_bandwidth; // new for 1.7 /* Energy (15.259uJ (2^-16) units) */ @@ -685,7 +688,107 @@ struct AMDGpuMetrics_v17_t { /* PCIE other end recovery counter */ uint32_t m_pcie_lc_perf_other_end_recovery; }; -using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t; 
+ +struct AMDGpuMetrics_v18_t { + ~AMDGpuMetrics_v18_t() = default; + struct AMDGpuMetricsHeader_v1_t m_common_header; + + /* Temperature (Celsius) */ + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrsoc; + + /* Power (Watts) */ + uint16_t m_current_socket_power; + + /* Utilization (%) */ + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + + /* VRAM max bandwidth (in GB/sec) at max memory clock */ + uint64_t m_mem_max_bandwidth; + + /* Energy (15.259uJ (2^-16) units) */ + uint64_t m_energy_accumulator; + + /* Driver attached timestamp (in ns) */ + uint64_t m_system_clock_counter; + + /* Accumulation cycle counter */ + uint32_t m_accumulation_counter; + + /* Accumulated throttler residencies */ + uint32_t m_prochot_residency_acc; + uint32_t m_ppt_residency_acc; + uint32_t m_socket_thm_residency_acc; + uint32_t m_vr_thm_residency_acc; + uint32_t m_hbm_thm_residency_acc; + + /* Clock Lock Status. Each bit corresponds to clock instance */ + uint32_t m_gfxclk_lock_status; + + /* Link width (number of lanes) and speed (in 0.1 GT/s) */ + uint16_t m_pcie_link_width; + uint16_t m_pcie_link_speed; + + /* XGMI bus width and bitrate (in Gbps) */ + uint16_t m_xgmi_link_width; + uint16_t m_xgmi_link_speed; + + /* Utilization Accumulated (%) */ + uint32_t m_gfx_activity_acc; + uint32_t m_mem_activity_acc; + + /*PCIE accumulated bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_acc; + + /*PCIE instantaneous bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_inst; + + /* PCIE L0 to recovery state transition accumulated count */ + uint64_t m_pcie_l0_to_recov_count_acc; + + /* PCIE replay accumulated count */ + uint64_t m_pcie_replay_count_acc; + + /* PCIE replay rollover accumulated count */ + uint64_t m_pcie_replay_rover_count_acc; + + /* PCIE NAK sent accumulated count */ + uint32_t m_pcie_nak_sent_count_acc; + + /* PCIE NAK received accumulated count */ + uint32_t m_pcie_nak_rcvd_count_acc; + + /* 
XGMI accumulated data transfer size(KiloBytes) */ + uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + + /* XGMI link status(active/inactive) */ + uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; + + uint16_t m_padding; + + /* PMFW attached timestamp (10ns resolution) */ + uint64_t m_firmware_timestamp; + + /* Current clocks (Mhz) */ + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS]; + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_uclk; + + /* Number of current partition */ + uint16_t m_num_partition; + + /* XCP metrics stats */ + struct amdgpu_xcp_metrics_v1_2 m_xcp_stats[kRSMI_MAX_NUM_XCP]; + + /* PCIE other end recovery counter */ + uint32_t m_pcie_lc_perf_other_end_recovery; +}; +using AMGpuMetricsLatest_t = AMDGpuMetrics_v18_t; /** * This is GPU Metrics version that gets to public access. @@ -900,11 +1003,18 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricJpegBusy, // v1.6 kMetricVcnBusy, // v1.6 kMetricGfxBusyAcc, // v1.6 + kMetricGfxBelowHostLimitAccumulator, // v1.7 + kMetricPcieLCPerfOtherEndRecov, // v1.6 kMetricVramMaxBandwidth, // v1.7 kMetricXgmiLinkStatus, // v1.7 - kMetricGfxBelowHostLimitAccumulator, // v1.7 + + kMetricGfxBelowHostLimitPptAcc, // v1.8 + kMetricGfxBelowHostLimitThmAcc, // v1.8 + kMetricGfxBelowHostLimitTotalAcc, // v1.8 + kMetricGfxLowUtilitizationAcc, // v1.8 + }; using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map; @@ -943,6 +1053,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV15 = (0x1 << 5), kGpuMetricV16 = (0x1 << 6), kGpuMetricV17 = (0x1 << 7), + kGpuMetricV18 = (0x1 << 8), // Added new version flag }; using AMDGpuMetricVersionTranslationTbl_t = std::map; using GpuMetricTypePtr_t = std::shared_ptr; @@ -952,27 +1063,24 @@ class GpuMetricsBase_t { virtual ~GpuMetricsBase_t() = 
default; virtual size_t sizeof_metric_table() = 0; virtual GpuMetricTypePtr_t get_metrics_table() = 0; - virtual void dump_internal_metrics_table() = 0; virtual AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() = 0; virtual rsmi_status_t populate_metrics_dynamic_tbl() = 0; virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0; virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; } virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { - return m_metrics_dynamic_tbl; + return m_base_metrics_dynamic_tbl; } protected: - AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl; + AMDGpuDynamicMetricsTbl_t m_base_metrics_dynamic_tbl; uint64_t m_metrics_timestamp; uint32_t m_device_id; uint32_t m_partition_id; - }; using GpuMetricsBasePtr = std::shared_ptr; using AMDGpuMetricFactories_t = const std::map; - class GpuMetricsBase_v11_t final : public GpuMetricsBase_t { public: virtual ~GpuMetricsBase_v11_t() = default; @@ -989,10 +1097,6 @@ class GpuMetricsBase_v11_t final : public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override { - return; - } - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV11; } @@ -1022,10 +1126,6 @@ class GpuMetricsBase_v12_t final : public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override { - return; - } - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV12; } @@ -1054,8 +1154,6 @@ class GpuMetricsBase_v13_t final : public GpuMetricsBase_t { return (m_gpu_metric_ptr); } - void dump_internal_metrics_table() override; - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV13; } @@ -1085,8 +1183,6 @@ class GpuMetricsBase_v14_t final : 
public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override; - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV14; } @@ -1116,8 +1212,6 @@ class GpuMetricsBase_v15_t final : public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override; - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV15; } @@ -1147,8 +1241,6 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override; - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV16; } @@ -1177,8 +1269,6 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { return m_gpu_metric_ptr; } - void dump_internal_metrics_table() override; - AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { return AMDGpuMetricVersionFlags_t::kGpuMetricV17; } @@ -1191,6 +1281,34 @@ class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { std::shared_ptr m_gpu_metric_ptr; }; +class GpuMetricsBase_v18_t final : public GpuMetricsBase_t { + public: + ~GpuMetricsBase_v18_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v18_t); + } + + GpuMetricTypePtr_t get_metrics_table() override { + if (!m_gpu_metric_ptr) { + m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v18_t*){}); + } + assert(m_gpu_metric_ptr != nullptr); + return m_gpu_metric_ptr; + } + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { + return AMDGpuMetricVersionFlags_t::kGpuMetricV18; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; + + private: + AMDGpuMetrics_v18_t m_gpu_metrics_tbl; + std::shared_ptr m_gpu_metric_ptr; +}; + template rsmi_status_t 
rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 65c0d8734b..7ad404cf7d 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -3795,9 +3795,41 @@ def showGPUMetrics(deviceList): }, "xcp_stats.gfx_busy_acc": { "value": gpu_metrics.xcp_stats, - "unit": percent_unit, + "unit": count, }, "xcp_stats.gfx_below_host_limit_acc": { + "value": gpu_metrics.xcp_stats, + "unit": count, + }, + "xcp_stats.gfx_below_host_limit_ppt_acc": { + "value": gpu_metrics.xcp_stats, + "unit": count, + }, + "xcp_stats.gfx_below_host_limit_thm_acc": { + "value": gpu_metrics.xcp_stats, + "unit": count, + }, + "xcp_stats.gfx_low_utilization_acc": { + "value": gpu_metrics.xcp_stats, + "unit": count, + }, + "xcp_stats.gfx_below_host_limit_total_acc": { + "value": gpu_metrics.xcp_stats, + "unit": count, + }, + "xcp_stats.gfx_below_host_limit_ppt_acc": { + "value": gpu_metrics.xcp_stats, + "unit": percent_unit, + }, + "xcp_stats.gfx_below_host_limit_thm_acc": { + "value": gpu_metrics.xcp_stats, + "unit": percent_unit, + }, + "xcp_stats.gfx_low_utilization_acc": { + "value": gpu_metrics.xcp_stats, + "unit": percent_unit, + }, + "xcp_stats.gfx_below_host_limit_total_acc": { "value": gpu_metrics.xcp_stats, "unit": percent_unit, }, @@ -3841,6 +3873,30 @@ def showGPUMetrics(deviceList): for _, val in enumerate(item.gfx_below_host_limit_acc): print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) + if 'xcp_stats.gfx_below_host_limit_ppt_acc' in k: + for curr_xcp, item in enumerate(v['value']): + print_xcp_detail = [] + for _, val in enumerate(item.gfx_below_host_limit_ppt_acc): + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), 
xcp=str(curr_xcp)) + if 'xcp_stats.gfx_below_host_limit_thm_acc' in k: + for curr_xcp, item in enumerate(v['value']): + print_xcp_detail = [] + for _, val in enumerate(item.gfx_below_host_limit_thm_acc): + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) + if 'xcp_stats.gfx_low_utilization_acc' in k: + for curr_xcp, item in enumerate(v['value']): + print_xcp_detail = [] + for _, val in enumerate(item.gfx_low_utilization_acc): + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) + if 'xcp_stats.gfx_below_host_limit_total_acc' in k: + for curr_xcp, item in enumerate(v['value']): + print_xcp_detail = [] + for _, val in enumerate(item.gfx_below_host_limit_total_acc): + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) if int(device) < (len(deviceList) - 1): printLogSpacer() diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py index 1968cb6be6..d19fdad740 100644 --- a/python_smi_tools/rsmiBindings.py +++ b/python_smi_tools/rsmiBindings.py @@ -662,10 +662,14 @@ class amdgpu_xcp_metrics_t(Structure): # amdgpu_xcp_metrics_t._pack_ = 1 # source:False amdgpu_xcp_metrics_t._fields_ = [ ('gfx_busy_inst', c_uint32 * 8), - ('jpeg_busy', c_uint16 * 32), + ('jpeg_busy', c_uint16 * 40), ('vcn_busy', c_uint16 * 4), ('gfx_busy_acc', c_uint64 * 8), ('gfx_below_host_limit_acc', c_uint64 * 8), + ('gfx_below_host_limit_ppt_acc', c_uint64 * 8), + ('gfx_below_host_limit_thm_acc', c_uint64 * 8), + ('gfx_low_utilization_acc', c_uint64 * 8), + ('gfx_below_host_limit_total_acc', c_uint64 * 8), ] xcp_stats_t = amdgpu_xcp_metrics_t diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 
f6a3b8b0f7..794c70b747 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -991,6 +991,11 @@ int main() { std::cout << "\t -> " << std::dec << dclk << "\n"; } + std::cout << "\t**.jpeg_activity[] : " << std::dec << "\n"; + for (const auto& jpeg : gpu_metrics.jpeg_activity) { + std::cout << "\t -> " << std::dec << jpeg << "\n"; + } + std::cout << std::dec << "xcp_stats.gfx_busy_inst = \n"; auto xcp = 0; for (auto& row : gpu_metrics.xcp_stats) { @@ -1046,6 +1051,50 @@ int main() { xcp++; } + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_ppt_acc = \n"; // new for 1.8 + for (auto& row : gpu_metrics.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_ppt_acc), + std::end(row.gfx_below_host_limit_ppt_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_thm_acc = \n"; // new for 1.8 + for (auto& row : gpu_metrics.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_thm_acc), + std::end(row.gfx_below_host_limit_thm_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_low_utilization_acc = \n"; + for (auto& row : gpu_metrics.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_low_utilization_acc), + std::end(row.gfx_low_utilization_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_total_acc = \n"; + for (auto& row : gpu_metrics.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_total_acc), + std::end(row.gfx_below_host_limit_total_acc), + amd::smi::make_ostream_joiner(&std::cout, ", 
")); + std::cout << " ]\n"; + xcp++; + } + std::cout << "\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; constexpr uint16_t kMAX_ITER_TEST = 10; diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc index 2053398895..ae6e2b75f5 100755 --- a/src/rocm_smi_gpu_metrics.cc +++ b/src/rocm_smi_gpu_metrics.cc @@ -1,44 +1,23 @@ /* - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
*/ #include "rocm_smi/rocm_smi_gpu_metrics.h" @@ -84,7 +63,7 @@ namespace amd::smi constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev) { - return (format_rev << 8 | content_rev); + return static_cast((format_rev << 8 | content_rev)); } constexpr uint16_t join_metrics_version(const AMDGpuMetricsHeader_v1_t& metrics_header) @@ -168,6 +147,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15}, {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, + {join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18}, }; /** @@ -277,22 +257,27 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation {AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, "HBMThmResidencyAccumulator"}, /* v1.6 */ // kGpuMetricPartition - {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, "numPartition"}, /* v1.6 */ // kGpuMetricXcpStats - {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */ - {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */ - {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */ - {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, "GfxBusyInst"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kMetricJpegBusy, "JpegBusy"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kMetricVcnBusy, "VcnBusy"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "GfxBusyAcc"}, /* v1.6 */ // kGpuMetricLinkWidthSpeed - {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ + {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ - 
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ - {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ - {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, - "GfxBelowHostLimitAccumulator"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,"GfxBelowHostLimitAccumulator"}, /* v1.7 */ + + // kGpuMetricXcpStats v1.8 + {AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, "GfxLowUtilitizationAcc"}, /* v1.8 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, "GfxBelowHostLimitTotalAcc"}, /* v1.8 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, "GfxBelowHostLimitPptAcc"}, /* v1.8 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, "GfxBelowHostLimitThmAcc"}, /* v1.8 */ }; @@ -382,6 +367,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared(GpuMetricsBase_v15_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared(GpuMetricsBase_v16_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared(GpuMetricsBase_v17_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV18, std::make_shared(GpuMetricsBase_v18_t{})}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) @@ -500,381 +486,267 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str return multi_values; } -void GpuMetricsBase_v17_t::dump_internal_metrics_table() -{ + +rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; - auto idx = uint64_t(0); - auto idy = uint64_t(0); - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; - ss << __PRETTY_FUNCTION__ - << " | ======= DEBUG ======= " - << " | Metric Version: " - << 
stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) - << " | Size: " - << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) - << " |" - << "\n"; - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); - ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7 - << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" - << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" - << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" - << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" - << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" - << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" - << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc 
<< "\n" - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" - << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" - << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" - << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" - << " pcie_lc_perf_other_end_recovery: " - << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7 - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. 
+ // + auto run_metric_adjustments_v18 = [&]() { + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = + translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ss << __PRETTY_FUNCTION__ << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " + << stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |"; + LOG_TRACE(ss); - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // firmware_timestamp is at 10ns resolution + ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ss); + }; - ss << " xgmi_write_data_acc: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc"))); - ss << " current_gfxclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // Power/Energy 
Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, + "curr_socket_power"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc"))); - ss << " current_socclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc"))); - ss << " current_vclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // GfxLock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, 
+ "gfxclk_lock_status"))); - ss << " current_dclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter"))); - idx = 0; - idy = 0; - ss << " xcp_stats.gfx_busy_inst: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.gfx_busy_inst) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, + "xgmi_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, + "xgmi_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + "pcie_bandwidth_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, + "pcie_bandwidth_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, + "pcie_replay_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + "pcie_replay_rollover_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, + "pcie_nak_sent_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, + "pcie_nak_rcvd_count_acc"))); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, + "[xgmi_read_data_acc]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, + "[xgmi_write_data_acc]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status, + "[xgmi_link_status]"))); - idx = 0; - idy = 0; - ss << " xcp_stats.vcn_busy: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.vcn_busy) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "[current_gfxclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "[current_socclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "[current_vclk0]"))); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "[current_dclk0]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk"))); - idx = 0; - idy = 0; - ss << " xcp_stats.jpeg_busy: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.jpeg_busy) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } + /* Accumulation cycle counter */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, + format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter, + "accumulation_counter"))); - idx = 0; - idy = 0; - ss << " xcp_stats.gfx_busy_acc: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.gfx_busy_acc) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } + /* Accumulated throttler residencies */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc, + "prochot_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc, + "ppt_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc, + "socket_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc, + "vr_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc, + "hbm_thm_residency_acc"))); - LOG_DEBUG(ss); -} + /* Partition info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, + format_metric_row(m_gpu_metrics_tbl.m_num_partition, + "num_partition"))); + /* xcp_stats info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, + "xcp_stats->gfx_busy_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, + "xcp_stats->vcn_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy, + 
format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, + "xcp_stats->jpeg_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, + "xcp_stats->gfx_busy_acc"))); -void GpuMetricsBase_v16_t::dump_internal_metrics_table() -{ - std::ostringstream ss; - auto idx = uint64_t(0); - auto idy = uint64_t(0); - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; - ss << __PRETTY_FUNCTION__ - << " | ======= DEBUG ======= " - << " | Metric Version: " - << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) - << " | Size: " - << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) - << " |" - << "\n"; - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; + /* gpu metrics v1.8 xcp_stats info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_total_acc, + "xcp_stats->gfx_below_host_limit_total_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_ppt_acc, + "xcp_stats->gfx_below_host_limit_ppt_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_thm_acc, + "xcp_stats->gfx_below_host_limit_thm_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_low_utilization_acc, + "xcp_stats->gfx_low_utilization_acc"))); - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" - << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" - << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" - << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" - << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" - << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" - << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" - << " 
pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" - << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" - << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" - << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" - << " pcie_lc_perf_other_end_recovery: " - << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + /* PCIE other end recovery counter info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, + format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, + "pcie_lc_perf_other_end_recovery"))); - ss << " xgmi_write_data_acc: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + /* VRAM max bandwidth (in GB/sec) at max memory clock */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, + format_metric_row(m_gpu_metrics_tbl.m_mem_max_bandwidth, + "vram_max_bandwidth"))); - ss << " current_gfxclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Returning = " << 
getRSMIStatusString(status_code) << " |"; + LOG_TRACE(ss); - ss << " current_socclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); - ss << " current_vclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_dclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - idx = 0; - idy = 0; - ss << " xcp_stats.gfx_busy_inst: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.gfx_busy_inst) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } - - idx = 0; - idy = 0; - ss << " xcp_stats.vcn_busy: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.vcn_busy) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } - - idx = 0; - idy = 0; - ss << " xcp_stats.jpeg_busy: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; 
- } - for (auto& col : row.jpeg_busy) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } - - idx = 0; - idy = 0; - ss << " xcp_stats.gfx_busy_acc: " << "\n"; - for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { - if (idx == 0) { - ss << "\t [ "; - } - for (auto& col : row.gfx_busy_acc) { - ss << "\t [" << idx << "] [" << idy << "]: " << col; - if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { - ss << ", "; - } - if (idx + 1 != - (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { - ss << "\n"; - } else { - ss << "]\n"; - } - idy++; - } - idx++; - } - - LOG_DEBUG(ss); + return status_code; } rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { @@ -883,10 +755,7 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. 
@@ -1106,6 +975,11 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, "xcp_stats->gfx_busy_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats].insert( + std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, + "xcp_stats->gfx_below_host_limit_acc"))); + /* PCIE other end recovery counter info */ m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, @@ -1118,12 +992,6 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth, "vram_max_bandwidth"))); - /* Total App Clock Counter Accumulated */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, - "gfx_below_host_limit_acc"))); - ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Success " @@ -1131,6 +999,12 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } @@ -1140,10 +1014,7 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external 
tables. @@ -1371,127 +1242,22 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } -void GpuMetricsBase_v15_t::dump_internal_metrics_table() -{ - std::ostringstream ss; - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; - ss << __PRETTY_FUNCTION__ - << " | ======= DEBUG ======= " - << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) - << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) - << " |" - << "\n"; - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" - - << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" - - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; - - ss << " vcn_activity: " << "\n"; - auto idx = uint64_t(0); - for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " jpeg_activity: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_jpeg_activity) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" - - << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" - - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << 
m_gpu_metrics_tbl.m_average_umc_activity << "\n" - - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" - - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" - - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" - - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" - - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" - << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" - << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"; - - ss << " xgmi_read_data_acc: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " xgmi_write_data_acc: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"; - - ss << " current_gfxclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_socclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } 
- - ss << " current_vclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_dclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n"; - LOG_DEBUG(ss); -} - rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. @@ -1708,119 +1474,22 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } - -void GpuMetricsBase_v14_t::dump_internal_metrics_table() -{ - std::ostringstream ss; - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; - ss << __PRETTY_FUNCTION__ - << " | ======= DEBUG ======= " - << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) - << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) - << " |" - << "\n"; - ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" - - << " current_socket_power: " << 
m_gpu_metrics_tbl.m_current_socket_power << "\n" - - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; - - ss << " vcn_activity: " << "\n"; - auto idx = uint64_t(0); - for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" - - << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" - - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" - - << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" - - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" - - << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" - << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" - - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" - - << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" - << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" - << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" - << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" - << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"; - - ss << " xgmi_read_data_acc: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " xgmi_write_data_acc: 
" << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"; - - ss << " current_gfxclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_socclk: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_vclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " current_dclk0: " << "\n"; - idx = 0; - for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " padding: " << m_gpu_metrics_tbl.m_padding << "\n"; - LOG_DEBUG(ss); -} - rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. 
@@ -2022,6 +1691,12 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } @@ -2125,6 +1800,7 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types(); rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types(); + rsmi_gpu_metrics.accumulation_counter = init_max_uint_types(); rsmi_gpu_metrics.prochot_residency_acc = init_max_uint_types(); rsmi_gpu_metrics.ppt_residency_acc = init_max_uint_types(); @@ -2148,6 +1824,14 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m init_max_uint_types()); std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc), init_max_uint_types()); + std::fill(std::begin(row.gfx_below_host_limit_ppt_acc), std::end(row.gfx_below_host_limit_ppt_acc), + init_max_uint_types()); + std::fill(std::begin(row.gfx_below_host_limit_thm_acc), std::end(row.gfx_below_host_limit_thm_acc), + init_max_uint_types()); + std::fill(std::begin(row.gfx_low_utilization_acc), std::end(row.gfx_low_utilization_acc), + init_max_uint_types()); + std::fill(std::begin(row.gfx_below_host_limit_total_acc), std::end(row.gfx_below_host_limit_total_acc), + init_max_uint_types()); } ss << __PRETTY_FUNCTION__ @@ -2160,6 +1844,230 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m return status_code; } +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics() +{ + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + auto copy_data_from_internal_metrics_tbl = [&]() + { + 
AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. + init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Clock Lock Status. 
Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + + // PCIE NAK sent accumulated count + metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; + + // Accumulated throttler residencies + // bumped up public to uint64_t due to planned size increase for newer ASICs + metrics_public_init.accumulation_counter = m_gpu_metrics_tbl.m_accumulation_counter; + metrics_public_init.prochot_residency_acc = m_gpu_metrics_tbl.m_prochot_residency_acc; + metrics_public_init.ppt_residency_acc = 
m_gpu_metrics_tbl.m_ppt_residency_acc; + metrics_public_init.socket_thm_residency_acc = m_gpu_metrics_tbl.m_socket_thm_residency_acc; + metrics_public_init.vr_thm_residency_acc = m_gpu_metrics_tbl.m_vr_thm_residency_acc; + metrics_public_init.hbm_thm_residency_acc = m_gpu_metrics_tbl.m_hbm_thm_residency_acc; + + /* VRAM max bandwidth at max memory clock */ + metrics_public_init.vram_max_bandwidth = m_gpu_metrics_tbl.m_mem_max_bandwidth; + + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); + + // xgmi_link_status // new for 1.7 + const auto xgmi_link_status_num_elems = static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_link_status) - + std::begin(m_gpu_metrics_tbl.m_xgmi_link_status)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_link_status), + xgmi_link_status_num_elems, + metrics_public_init.xgmi_link_status); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + 
std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + + metrics_public_init.num_partition = m_gpu_metrics_tbl.m_num_partition; + + metrics_public_init.pcie_lc_perf_other_end_recovery = + m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery; + + // xcp stats + auto priv_it = std::begin(m_gpu_metrics_tbl.m_xcp_stats); + for (auto pub_it = std::begin(metrics_public_init.xcp_stats); + pub_it != std::end(metrics_public_init.xcp_stats); ++pub_it, ++priv_it) { + std::copy_n(std::begin(priv_it->gfx_busy_inst), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_inst); + std::copy_n(std::begin(priv_it->jpeg_busy), RSMI_MAX_NUM_JPEG_ENG_V1, pub_it->jpeg_busy); + std::copy_n(std::begin(priv_it->vcn_busy), RSMI_MAX_NUM_VCNS, pub_it->vcn_busy); + std::copy_n(std::begin(priv_it->gfx_busy_acc), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_ppt_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_ppt_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_thm_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_thm_acc); + std::copy_n(std::begin(priv_it->gfx_low_utilization_acc), RSMI_MAX_NUM_XCC, + 
pub_it->gfx_low_utilization_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_total_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_total_acc); + } + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.3/1.4/1.5) + metrics_public_init.current_gfxclk = metrics_public_init.current_gfxclks[0]; + + metrics_public_init.current_socclk = metrics_public_init.current_socclks[0]; + + metrics_public_init.current_vclk0 = metrics_public_init.current_vclk0s[0]; + + metrics_public_init.current_vclk1 = metrics_public_init.current_vclk0s[1]; + + metrics_public_init.current_dclk0 = metrics_public_init.current_dclk0s[0]; + + metrics_public_init.current_dclk1 = metrics_public_init.current_dclk0s[1]; + + // separate by XCP + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy[0] != UINT16_MAX) { + std::copy(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::end(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::begin(metrics_public_init.vcn_activity)); + } + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy[0] != UINT16_MAX) { + auto src_size = std::size(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy); + auto dest_size = std::size(metrics_public_init.jpeg_activity); + auto copy_size = std::min(src_size, dest_size); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy), + copy_size, + std::begin(metrics_public_init.jpeg_activity)); + } + + return metrics_public_init; + }(); + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ss); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); +}; + AMGpuMetricsPublicLatestTupl_t 
GpuMetricsBase_v17_t::copy_internal_to_external_metrics() { std::ostringstream ss; @@ -2949,77 +2857,6 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v14_t::copy_internal_to_external_m return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); } -void GpuMetricsBase_v13_t::dump_internal_metrics_table() -{ - std::ostringstream ss; - std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; - ss << __PRETTY_FUNCTION__ - << " | ======= DEBUG ======= " - << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) - << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) - << " |" - << "\n"; - ss << " temperature_edge: " << m_gpu_metrics_tbl.m_temperature_edge << "\n" - << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" - << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" - << " temperature_vrgfx: " << m_gpu_metrics_tbl.m_temperature_vrgfx << "\n" - << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" - << " temperature_vrmem: " << m_gpu_metrics_tbl.m_temperature_vrmem << "\n" - - << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" - << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" - << " average_mm_activity: " << m_gpu_metrics_tbl.m_average_mm_activity << "\n" - << " average_socket_power: " << m_gpu_metrics_tbl.m_average_socket_power << "\n" - - << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" - << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" - - << " average_gfxclk_frequency: " << m_gpu_metrics_tbl.m_average_gfxclk_frequency << "\n" - << " average_socclk_frequency: " << m_gpu_metrics_tbl.m_average_socclk_frequency << "\n" - << " average_uclk_frequency: " << m_gpu_metrics_tbl.m_average_uclk_frequency << "\n" - << " average_vclk0_frequency: " << m_gpu_metrics_tbl.m_average_vclk0_frequency 
<< "\n" - << " average_dclk0_frequency: " << m_gpu_metrics_tbl.m_average_dclk0_frequency << "\n" - << " average_vclk1_frequency: " << m_gpu_metrics_tbl.m_average_vclk1_frequency << "\n" - << " average_dclk1_frequency: " << m_gpu_metrics_tbl.m_average_dclk1_frequency << "\n" - - << " current_gfxclk: " << m_gpu_metrics_tbl.m_current_gfxclk << "\n" - << " current_socclk: " << m_gpu_metrics_tbl.m_current_socclk << "\n" - << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" - << " current_vclk0: " << m_gpu_metrics_tbl.m_current_vclk0 << "\n" - << " current_dclk0: " << m_gpu_metrics_tbl.m_current_dclk0 << "\n" - << " current_vclk1: " << m_gpu_metrics_tbl.m_current_vclk1 << "\n" - << " current_dclk1: " << m_gpu_metrics_tbl.m_current_dclk1 << "\n" - - << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" - - << " current_fan_speed: " << m_gpu_metrics_tbl.m_current_fan_speed << "\n" - - << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" - << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" - - << " padding: " << m_gpu_metrics_tbl.m_padding << "\n" - - << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" - << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"; - LOG_DEBUG(ss); - - ss << " temperature_hbm: " << "\n"; - auto idx = uint64_t(0); - for (const auto& temp : m_gpu_metrics_tbl.m_temperature_hbm) { - ss << "\t [" << idx << "]: " << temp << "\n"; - ++idx; - } - - ss << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" - - << " voltage_soc: " << m_gpu_metrics_tbl.m_voltage_soc << "\n" - << " voltage_gfx: " << m_gpu_metrics_tbl.m_voltage_gfx << "\n" - << " voltage_mem: " << m_gpu_metrics_tbl.m_voltage_mem << "\n" - - << " padding1: " << m_gpu_metrics_tbl.m_padding1 << "\n" - << " m_indep_throttle_status: " << m_gpu_metrics_tbl.m_indep_throttle_status << "\n"; - LOG_DEBUG(ss); -} rsmi_status_t 
GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; @@ -3027,10 +2864,7 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() { ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. @@ -3276,6 +3110,12 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } @@ -3385,7 +3225,7 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m metrics_public_init.vcn_activity[0] = metrics_public_init.average_mm_activity; } // average_mm_activity needs to not be UIN16_MAX and - // metrics_public_init.xcp_stats->vcn_busy[0] should also be UIN16_MAX + // metrics_public_init.xcp_stats->vcn_busy[0] should also be UINT16_MAX if (metrics_public_init.average_mm_activity != UINT16_MAX && metrics_public_init.xcp_stats->vcn_busy[0] == UINT16_MAX) { metrics_public_init.xcp_stats->vcn_busy[0] = metrics_public_init.average_mm_activity; @@ -3410,10 +3250,7 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() { ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. 
@@ -3637,6 +3474,12 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } @@ -3747,10 +3590,7 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() { ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - if (!m_metrics_dynamic_tbl.empty()) { - m_metrics_dynamic_tbl.clear(); - } - + auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. @@ -3961,6 +3801,12 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() { << " |"; LOG_TRACE(ss); + // Copy to base class + std::copy(m_metrics_dynamic_tbl.begin(), + m_metrics_dynamic_tbl.end(), + std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, + GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + return status_code; } @@ -4705,8 +4551,8 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { CHK_SUPPORT_NAME_ONLY(smu) auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); - std::ostringstream ostrstream; - std::ostringstream ss; + thread_local std::ostringstream ostrstream; + thread_local std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); @@ -4727,9 +4573,10 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { dev->set_smi_device_id(dv_ind); uint32_t partition_id = 0; - auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id); + rsmi_dev_partition_id_get(dv_ind, &partition_id); dev->set_smi_partition_id(partition_id); dev->dev_log_gpu_metrics(ostrstream); + const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics(); if (error_code != 
rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ diff --git a/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/tests/rocm_smi_test/functional/gpu_metrics_read.cc index cffac628ca..ee4aceb041 100644 --- a/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -379,6 +379,50 @@ void TestGpuMetricsRead::Run(void) { std::cout << " ]\n"; xcp++; } + // new for gpu metrics v1.8 + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_ppt_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_ppt_acc), + std::end(row.gfx_below_host_limit_ppt_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_thm_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_thm_acc), + std::end(row.gfx_below_host_limit_thm_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_low_utilization_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_low_utilization_acc), + std::end(row.gfx_low_utilization_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_total_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_total_acc), + std::end(row.gfx_below_host_limit_total_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } std::cout << "\n\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";