From 125bdaf4f5dd805d3c3ba67e55f51a1b532b8f45 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 12 Dec 2024 17:56:09 -0600 Subject: [PATCH] [SWDEV-496693] GPU metrics 1.7 Changes: - Added new GPU metrics: 1) XGMI link status - Up/Down; 1 = up; 0 = down 2) Graphics clocks below host limit (per XCP) accumulators -> used to help calculate a violation status 3) VRAM max bandwidth at max memory clock - Updated rocm-smi --showmetrics to include new metrics. Units/values reflect as indicated by driver, may differ from AMD SMI or other ROCm SMI interfaces which use these fields. - N/A fields means the device does not support providing this data. Change-Id: I17b313345f15070a76b3a30dd8d5645d212d601b Signed-off-by: Charis Poag [ROCm/rocm_smi_lib commit: 88a7e4b8ad30de1d8f3ef413a1032856984e8c3e] --- projects/rocm-smi-lib/CHANGELOG.md | 46 +- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 21 +- .../include/rocm_smi/rocm_smi_gpu_metrics.h | 151 +++- .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 20 + .../python_smi_tools/rsmiBindings.py | 3 + .../rocm_smi/example/rocm_smi_example.cc | 18 + .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 696 +++++++++++++++++- .../functional/gpu_metrics_read.cc | 20 + 8 files changed, 969 insertions(+), 6 deletions(-) diff --git a/projects/rocm-smi-lib/CHANGELOG.md b/projects/rocm-smi-lib/CHANGELOG.md index e9e5968df7..ac8af3f8e3 100644 --- a/projects/rocm-smi-lib/CHANGELOG.md +++ b/projects/rocm-smi-lib/CHANGELOG.md @@ -4,10 +4,54 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ***All information listed below is for reference and subject to change.*** -## rocm_smi_lib for ROCm 6.3 +## rocm_smi_lib for ROCm 6.4 ### Added +- **Added support for GPU metrics 1.7 to `rsmi_dev_gpu_metrics_info_get()`** +Updated `rsmi_dev_gpu_metrics_info_get()` and structure `rsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth: + - `uint64_t 
vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s) + - `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down + - `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for graphic clk below host limit violation status. + +- **Added new GPU metrics 1.7 to `rocm-smi --showmetrics`** +New metrics added to `rocm-smi --showmetrics` +```shell +$ rocm-smi --showmetrics + GPU[0] : vram_max_bandwidth (GB/s): 1555 + GPU[0] : xgmi_link_status (Up/Down): ['1', '1', '1', '1', '0', '1', '0', '1'] + GPU[0] XCP[0] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[1] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[2] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[3] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[4] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[5] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[6] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[0] XCP[7] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] : vram_max_bandwidth (GB/s): 1555 + GPU[1] : xgmi_link_status (Up/Down): ['1', '1', '1', '1', '0', '1', '0', '1'] + ... 
+ GPU[1] XCP[0] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[1] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[2] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[3] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[4] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[5] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[6] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + GPU[1] XCP[7] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0'] + ... +``` + +### Changed + +### Removed + +### Resolved issues + +### Upcoming changes + +## rocm_smi_lib for ROCm 6.3 + - **Added `rsmi_dev_memory_partition_capabilities_get` which returns driver memory partition capablities.** Driver now has the ability to report what the user can set memory partition modes to. User can now see available memory partition modes upon an invalid argument return from memory partition mode set (`rsmi_dev_memory_partition_set`). diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index caf19c1bab..8948d0f943 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -40,6 +40,7 @@ * DEALINGS WITH THE SOFTWARE. * */ + #ifndef ROCM_SMI_ROCM_SMI_H_ #define ROCM_SMI_ROCM_SMI_H_ @@ -987,6 +988,9 @@ typedef struct metrics_table_header_t metrics_table_header_t; * @brief The following structures hold the gpu statistics for a device. 
*/ struct amdgpu_xcp_metrics_t { + /* + * v1.6 additions + */ /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC]; uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS]; @@ -994,6 +998,12 @@ struct amdgpu_xcp_metrics_t { /* Utilization Accumulated (%) */ uint64_t gfx_busy_acc[RSMI_MAX_NUM_XCC]; + + /* + * v1.7 additions + */ + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC]; }; typedef struct { @@ -1173,7 +1183,7 @@ typedef struct { /** * Accumulated throttler residencies * - * Socket (thermal) - + * Socket (thermal) - * Socket thermal violation % (greater than 0% is a violation); * aka TVIOL * @@ -1197,6 +1207,15 @@ typedef struct { /* PCIE other end recovery counter */ uint32_t pcie_lc_perf_other_end_recovery; + /* + * v1.7 additions + */ + /* VRAM max bandwidth at max memory clock (GB/s) */ + uint64_t vram_max_bandwidth; + + /* XGMI link status(up/down) */ + uint16_t xgmi_link_status[RSMI_MAX_NUM_XGMI_LINKS]; + /// \endcond } rsmi_gpu_metrics_t; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_gpu_metrics.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_gpu_metrics.h index 00c0f8e70b..5712ea41ef 100644 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -109,6 +109,19 @@ struct AMDGpuMetricsHeader_v1_t { uint8_t m_content_revision; }; +struct amdgpu_xcp_metrics_v1_1 { + /* Utilization Instantaneous (%) */ + uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; + uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; + uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; + + /* Utilization Accumulated (%) */ + uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; + + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC]; +}; + struct amdgpu_xcp_metrics { /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; @@ -572,7 +585,107 @@ struct AMDGpuMetrics_v16_t { /* PCIE other end 
recovery counter */ uint32_t m_pcie_lc_perf_other_end_recovery; }; -using AMGpuMetricsLatest_t = AMDGpuMetrics_v16_t; + +struct AMDGpuMetrics_v17_t { + ~AMDGpuMetrics_v17_t() = default; + struct AMDGpuMetricsHeader_v1_t m_common_header; + + /* Temperature (Celsius) */ + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrsoc; + + /* Power (Watts) */ + uint16_t m_current_socket_power; + + /* Utilization (%) */ + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + + /* VRAM max bandwidth at max memory clock */ + uint64_t m_vram_max_bandwidth; // new for 1.7 + + /* Energy (15.259uJ (2^-16) units) */ + uint64_t m_energy_accumulator; + + /* Driver attached timestamp (in ns) */ + uint64_t m_system_clock_counter; + + /* Accumulation cycle counter */ + uint32_t m_accumulation_counter; + + /* Accumulated throttler residencies */ + uint32_t m_prochot_residency_acc; + uint32_t m_ppt_residency_acc; + uint32_t m_socket_thm_residency_acc; + uint32_t m_vr_thm_residency_acc; + uint32_t m_hbm_thm_residency_acc; + + /* Clock Lock Status. 
Each bit corresponds to clock instance */ + uint32_t m_gfxclk_lock_status; + + /* Link width (number of lanes) and speed (in 0.1 GT/s) */ + uint16_t m_pcie_link_width; + uint16_t m_pcie_link_speed; + + /* XGMI bus width and bitrate (in Gbps) */ + uint16_t m_xgmi_link_width; + uint16_t m_xgmi_link_speed; + + /* Utilization Accumulated (%) */ + uint32_t m_gfx_activity_acc; + uint32_t m_mem_activity_acc; + + /*PCIE accumulated bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_acc; + + /*PCIE instantaneous bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_inst; + + /* PCIE L0 to recovery state transition accumulated count */ + uint64_t m_pcie_l0_to_recov_count_acc; + + /* PCIE replay accumulated count */ + uint64_t m_pcie_replay_count_acc; + + /* PCIE replay rollover accumulated count */ + uint64_t m_pcie_replay_rover_count_acc; + + /* PCIE NAK sent accumulated count */ + uint32_t m_pcie_nak_sent_count_acc; + + /* PCIE NAK received accumulated count */ + uint32_t m_pcie_nak_rcvd_count_acc; + + /* XGMI accumulated data transfer size(KiloBytes) */ + uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + + /* XGMI link status(up/down) */ + uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; // new for 1.7 + + uint16_t m_padding; + + /* PMFW attached timestamp (10ns resolution) */ + uint64_t m_firmware_timestamp; + + /* Current clocks (Mhz) */ + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS]; + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_uclk; + + /* Number of current partition */ + uint16_t m_num_partition; + + /* XCP metrics stats */ + struct amdgpu_xcp_metrics_v1_1 m_xcp_stats[kRSMI_MAX_NUM_XCP]; + + /* PCIE other end recovery counter */ + uint32_t m_pcie_lc_perf_other_end_recovery; +}; +using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t; /** * This is GPU Metrics version that gets to 
public access. @@ -787,8 +900,11 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricJpegBusy, // v1.6 kMetricVcnBusy, // v1.6 kMetricGfxBusyAcc, // v1.6 - kMetricPcieLCPerfOtherEndRecov, // v1.6 + + kMetricVramMaxBandwidth, // v1.7 + kMetricXgmiLinkStatus, // v1.7 + kMetricGfxBelowHostLimitAccumulator, // v1.7 }; using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map; @@ -826,6 +942,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV14 = (0x1 << 4), kGpuMetricV15 = (0x1 << 5), kGpuMetricV16 = (0x1 << 6), + kGpuMetricV17 = (0x1 << 7), }; using AMDGpuMetricVersionTranslationTbl_t = std::map; using GpuMetricTypePtr_t = std::shared_ptr; @@ -1044,6 +1161,36 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t { std::shared_ptr m_gpu_metric_ptr; }; +class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { + public: + ~GpuMetricsBase_v17_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v17_t); + } + + GpuMetricTypePtr_t get_metrics_table() override { + if (!m_gpu_metric_ptr) { + m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v17_t*){}); + } + assert(m_gpu_metric_ptr != nullptr); + return m_gpu_metric_ptr; + } + + void dump_internal_metrics_table() override; + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { + return AMDGpuMetricVersionFlags_t::kGpuMetricV17; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; + + private: + AMDGpuMetrics_v17_t m_gpu_metrics_tbl; + std::shared_ptr m_gpu_metric_ptr; +}; + template rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index b805c1f170..699bfb306c 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ 
b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -3549,6 +3549,7 @@ def showGPUMetrics(deviceList): clock_unit="MHz" fan_speed="rpm" percent_unit="%" + vram_max_bw="GB/s" pcie_acc_unit="GB/s" pcie_lanes_unit="Lanes" pcie_speed_unit="0.1 GT/s" @@ -3557,6 +3558,7 @@ def showGPUMetrics(deviceList): time_unit="ns" time_unit_10="10ns resolution" count="Count" + link_status="Up/Down" no_unit = None for device in deviceList: @@ -3764,6 +3766,14 @@ def showGPUMetrics(deviceList): "value": validateIfMaxUint(gpu_metrics.pcie_lc_perf_other_end_recovery, UIntegerTypes.UINT32_T), "unit": count, }, + "vram_max_bandwidth": { + "value": validateIfMaxUint(gpu_metrics.vram_max_bandwidth, UIntegerTypes.UINT64_T), + "unit": vram_max_bw, + }, + "xgmi_link_status": { + "value": validateIfMaxUint(list(gpu_metrics.xgmi_link_status), UIntegerTypes.UINT16_T), + "unit": link_status, + }, "num_partition": { "value": validateIfMaxUint(gpu_metrics.num_partition, UIntegerTypes.UINT16_T), "unit": no_unit, @@ -3784,6 +3794,10 @@ def showGPUMetrics(deviceList): "value": gpu_metrics.xcp_stats, "unit": percent_unit, }, + "xcp_stats.gfx_below_host_limit_acc": { + "value": gpu_metrics.xcp_stats, + "unit": percent_unit, + }, } printLog(device, 'Metric Version and Size (Bytes)', @@ -3818,6 +3832,12 @@ def showGPUMetrics(deviceList): for _, val in enumerate(item.gfx_busy_acc): print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) + if 'xcp_stats.gfx_below_host_limit_acc' in k: + for curr_xcp, item in enumerate(v['value']): + print_xcp_detail = [] + for _, val in enumerate(item.gfx_below_host_limit_acc): + print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T)) + printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp)) if int(device) < (len(deviceList) - 1): printLogSpacer() diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py 
b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py index 6a9aa4219a..e10cedf3e2 100644 --- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py +++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py @@ -663,6 +663,7 @@ amdgpu_xcp_metrics_t._fields_ = [ ('jpeg_busy', c_uint16 * 32), ('vcn_busy', c_uint16 * 4), ('gfx_busy_acc', c_uint64 * 8), + ('gfx_below_host_limit_acc', c_uint64 * 8), ] xcp_stats_t = amdgpu_xcp_metrics_t @@ -739,5 +740,7 @@ rsmi_gpu_metrics_t._fields_ = [ ('num_partition', c_uint16), ('xcp_stats', xcp_stats_t * 8), ('pcie_lc_perf_other_end_recovery', c_uint32), + ('vram_max_bandwidth', c_uint64), + ('xgmi_link_status', c_uint16 * 8), ] amdsmi_gpu_metrics_t = rsmi_gpu_metrics_t diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index e1528739b6..f6a3b8b0f7 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -921,6 +921,8 @@ int main() { << gpu_metrics.pcie_bandwidth_acc << "\n"; std::cout << "\t**.pcie_bandwidth_inst : " << std::dec << gpu_metrics.pcie_bandwidth_inst << "\n"; + std::cout << "\t**.vram_max_bandwidth=" << std::dec + << gpu_metrics.vram_max_bandwidth << "\n"; std::cout << "\t**.pcie_l0_to_recov_count_acc : " << std::dec << gpu_metrics.pcie_l0_to_recov_count_acc << "\n"; std::cout << "\t**.pcie_replay_count_acc : " << std::dec @@ -964,6 +966,11 @@ int main() { std::cout << "\t -> " << std::dec << write_data << "\n"; } + std::cout << "\t**.xgmi_link_status[] : " << std::dec << "\n"; + for (const auto& write_data : gpu_metrics.xgmi_link_status) { + std::cout << "\t -> " << std::dec << write_data << "\n"; + } + std::cout << "\t**.current_gfxclks[] : " << std::dec << "\n"; for (const auto& gfxclk : gpu_metrics.current_gfxclks) { std::cout << "\t -> " << std::dec << gfxclk << "\n"; @@ -1028,6 +1035,17 @@ int main() { xcp++; } + xcp = 0; + std::cout << 
std::dec << "xcp_stats.gfx_below_host_limit_acc = \n"; // new for 1.7 + for (auto& row : gpu_metrics.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_acc), + std::end(row.gfx_below_host_limit_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + std::cout << "\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; constexpr uint16_t kMAX_ITER_TEST = 10; diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index c34fa09040..2053398895 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -157,6 +157,7 @@ std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metri // version 1.4: 260 // version 1.5: 261 // version 1.6: 262 +// version 1.7: 263 // const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table { @@ -166,6 +167,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14}, {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15}, {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, + {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, }; /** @@ -285,6 +287,12 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation // kGpuMetricLinkWidthSpeed {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ + + + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, + "GfxBelowHostLimitAccumulator"}, /* v1.7 */ }; @@ -373,6 +381,7 @@ AMDGpuMetricFactories_t 
amd_gpu_metrics_factory_table {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared(GpuMetricsBase_v14_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared(GpuMetricsBase_v15_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared(GpuMetricsBase_v16_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared(GpuMetricsBase_v17_t{})}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) @@ -491,6 +500,197 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str return multi_values; } +void GpuMetricsBase_v17_t::dump_internal_metrics_table() +{ + std::ostringstream ss; + auto idx = uint64_t(0); + auto idy = uint64_t(0); + std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; + ss << __PRETTY_FUNCTION__ + << " | ======= DEBUG ======= " + << " | Metric Version: " + << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) + << " | Size: " + << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) + << " |" + << "\n"; + ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" + << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" + << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" + << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" + << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" + << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; + + ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7 + << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" + << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" + << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" + << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" + << " 
ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" + << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" + << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" + << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" + << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" + << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" + << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" + << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" + << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" + << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" + << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" + << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" + << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" + << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" + << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" + << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" + << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" + << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" + << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" + << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" + << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" + << " pcie_lc_perf_other_end_recovery: " + << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7 + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + for (const auto& temp : 
m_gpu_metrics_tbl.m_xgmi_read_data_acc) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " xgmi_write_data_acc: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_gfxclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_socclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_vclk0: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_dclk0: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.gfx_busy_inst: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.gfx_busy_inst) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.vcn_busy: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.vcn_busy) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss 
<< "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.jpeg_busy: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.jpeg_busy) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.gfx_busy_acc: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.gfx_busy_acc) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + LOG_DEBUG(ss); +} + + void GpuMetricsBase_v16_t::dump_internal_metrics_table() { std::ostringstream ss; @@ -677,6 +877,263 @@ void GpuMetricsBase_v16_t::dump_internal_metrics_table() LOG_DEBUG(ss); } +rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + if (!m_metrics_dynamic_tbl.empty()) { + m_metrics_dynamic_tbl.clear(); + } + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. 
+ // + auto run_metric_adjustments_v17 = [&]() { + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = + translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ss << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ss); + + // firmware_timestamp is at 10ns resolution + ss << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ss); + }; + + // Adjustments/Changes specific to this version + run_metric_adjustments_v17(); + + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc"))); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, + "curr_socket_power"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc"))); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc"))); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter"))); + + + // GfxLock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, + "gfxclk_lock_status"))); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + 
format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, + "xgmi_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, + "xgmi_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + "pcie_bandwidth_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, + "pcie_bandwidth_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, + "pcie_replay_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + 
"pcie_replay_rollover_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, + "pcie_nak_sent_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, + "pcie_nak_rcvd_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, + "[xgmi_read_data_acc]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, + "[xgmi_write_data_acc]"))); + // new for v1.7 + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status, + "[xgmi_link_status]"))); + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "[current_gfxclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "[current_socclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "[current_vclk0]"))); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "[current_dclk0]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk"))); + + /* Accumulation cycle counter */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, + format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter, + "accumulation_counter"))); + + /* Accumulated throttler residencies */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc, + "prochot_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc, + "ppt_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc, + "socket_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc, + "vr_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc, + "hbm_thm_residency_acc"))); + + /* Partition info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, + format_metric_row(m_gpu_metrics_tbl.m_num_partition, + "num_partition"))); + + /* xcp_stats info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, + "xcp_stats->gfx_busy_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, + "xcp_stats->vcn_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, + "xcp_stats->jpeg_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, + "xcp_stats->gfx_busy_acc"))); + + /* PCIE other end recovery counter info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, + format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, + "pcie_lc_perf_other_end_recovery"))); + + /* VRAM max bandwidth at max memory clock */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, + format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth, + "vram_max_bandwidth"))); + + /* Total App Clock Counter Accumulated */ + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, + "gfx_below_host_limit_acc"))); + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ss); + + return status_code; +} + rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); @@ -714,7 +1171,6 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { // Adjustments/Changes specific to this version run_metric_adjustments_v16(); - // Temperature Info m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, @@ -1608,6 +2064,12 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m rsmi_gpu_metrics.pcie_link_speed = init_max_uint_types(); rsmi_gpu_metrics.gfx_activity_acc = init_max_uint_types(); rsmi_gpu_metrics.mem_activity_acc = init_max_uint_types(); + rsmi_gpu_metrics.vram_max_bandwidth = init_max_uint_types(); + + std::fill(std::begin(rsmi_gpu_metrics.xgmi_link_status), + std::end(rsmi_gpu_metrics.xgmi_link_status), + init_max_uint_types()); + std::fill(std::begin(rsmi_gpu_metrics.temperature_hbm), std::end(rsmi_gpu_metrics.temperature_hbm), @@ -1684,6 +2146,8 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m init_max_uint_types()); std::fill(std::begin(row.gfx_busy_acc), std::end(row.gfx_busy_acc), init_max_uint_types()); + std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc), + init_max_uint_types()); } ss << __PRETTY_FUNCTION__ @@ -1696,6 +2160,225 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m return 
status_code; } +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v17_t::copy_internal_to_external_metrics() +{ + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. + init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Clock Lock Status. 
Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + + // PCIE NAK sent accumulated count + metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; + + // Accumulated throttler residencies + // bumped up public to uint64_t due to planned size increase for newer ASICs + metrics_public_init.accumulation_counter = m_gpu_metrics_tbl.m_accumulation_counter; + metrics_public_init.prochot_residency_acc = m_gpu_metrics_tbl.m_prochot_residency_acc; + metrics_public_init.ppt_residency_acc = 
m_gpu_metrics_tbl.m_ppt_residency_acc; + metrics_public_init.socket_thm_residency_acc = m_gpu_metrics_tbl.m_socket_thm_residency_acc; + metrics_public_init.vr_thm_residency_acc = m_gpu_metrics_tbl.m_vr_thm_residency_acc; + metrics_public_init.hbm_thm_residency_acc = m_gpu_metrics_tbl.m_hbm_thm_residency_acc; + + /* VRAM max bandwidth at max memory clock */ + metrics_public_init.vram_max_bandwidth = m_gpu_metrics_tbl.m_vram_max_bandwidth; + + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); + + // xgmi_link_status // new for 1.7 + const auto xgmi_link_status_num_elems = static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_link_status) - + std::begin(m_gpu_metrics_tbl.m_xgmi_link_status)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_link_status), + xgmi_link_status_num_elems, + metrics_public_init.xgmi_link_status); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + 
std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + + metrics_public_init.num_partition = m_gpu_metrics_tbl.m_num_partition; + + metrics_public_init.pcie_lc_perf_other_end_recovery = + m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery; + + auto priv_it = std::begin(m_gpu_metrics_tbl.m_xcp_stats); + for (auto pub_it = std::begin(metrics_public_init.xcp_stats); + pub_it != std::end(metrics_public_init.xcp_stats); + ++pub_it, ++priv_it) { + std::copy_n(std::begin(priv_it->gfx_busy_inst), RSMI_MAX_NUM_XCC, + pub_it->gfx_busy_inst); + std::copy_n(std::begin(priv_it->jpeg_busy), RSMI_MAX_NUM_JPEG_ENGS, + pub_it->jpeg_busy); + std::copy_n(std::begin(priv_it->vcn_busy), RSMI_MAX_NUM_VCNS, + pub_it->vcn_busy); + std::copy_n(std::begin(priv_it->gfx_busy_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_busy_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_acc); + } + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.3/1.4/1.5) + metrics_public_init.current_gfxclk = metrics_public_init.current_gfxclks[0]; + + 
metrics_public_init.current_socclk = metrics_public_init.current_socclks[0]; + + metrics_public_init.current_vclk0 = metrics_public_init.current_vclk0s[0]; + + metrics_public_init.current_vclk1 = metrics_public_init.current_vclk0s[1]; + + metrics_public_init.current_dclk0 = metrics_public_init.current_dclk0s[0]; + + metrics_public_init.current_dclk1 = metrics_public_init.current_dclk0s[1]; + + // separate by XCP + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy[0] != UINT16_MAX) { + std::copy(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::end(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::begin(metrics_public_init.vcn_activity)); + } + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy[0] != UINT16_MAX) { + std::copy(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy), + std::end(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy), + std::begin(metrics_public_init.jpeg_activity)); + } + + return metrics_public_init; + }(); + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ss); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); + +} + AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v16_t::copy_internal_to_external_metrics() { std::ostringstream ss; @@ -2695,9 +3378,18 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m // Note: Backwards compatibility -> Handling extra/exception cases // related to earlier versions (1.2) // metrics_public_init.current_socket_power = metrics_public_init.average_socket_power; - if (metrics_public_init.average_mm_activity != UINT16_MAX) { + // average_mm_activity needs to not be UINT16_MAX and + // metrics_public_init.vcn_activity[0] should also be UINT16_MAX + 
if (metrics_public_init.average_mm_activity != UINT16_MAX + && metrics_public_init.vcn_activity[0] == UINT16_MAX) { metrics_public_init.vcn_activity[0] = metrics_public_init.average_mm_activity; } + // average_mm_activity needs to not be UINT16_MAX and + // metrics_public_init.xcp_stats->vcn_busy[0] should also be UINT16_MAX + if (metrics_public_init.average_mm_activity != UINT16_MAX + && metrics_public_init.xcp_stats->vcn_busy[0] == UINT16_MAX) { + metrics_public_init.xcp_stats->vcn_busy[0] = metrics_public_init.average_mm_activity; + } return metrics_public_init; }(); diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index 2ec93cbdf6..cffac628ca 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -269,6 +269,12 @@ void TestGpuMetricsRead::Run(void) { amd::smi::make_ostream_joiner(&std::cout, ", ")); std::cout << std::dec << "]\n"; + std::cout << std::dec << "xgmi_link_status= ["; + std::copy(std::begin(smu.xgmi_link_status), + std::end(smu.xgmi_link_status), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + // Voltage (mV) std::cout << "voltage_soc = " << std::dec << smu.voltage_soc << "\n"; std::cout << "voltage_gfx = " << std::dec << smu.voltage_gfx << "\n"; @@ -284,6 +290,9 @@ void TestGpuMetricsRead::Run(void) { std::cout << "pcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; std::cout << "pcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; + // VRAM max bandwidth at max memory clock + std::cout << "vram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; + // Counts std::cout << "pcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc << "\n"; @@ -360,6 +369,17 @@ void TestGpuMetricsRead::Run(void) { xcp++; } + xcp = 0; + std::cout << 
std::dec << "xcp_stats.gfx_below_host_limit_acc = \n"; // new for 1.7 + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_acc), + std::end(row.gfx_below_host_limit_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + std::cout << "\n\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; constexpr uint16_t kMAX_ITER_TEST = 10;