From 4f502e5daba62cea6de88b3f803ac035492ebf04 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 6 Dec 2023 03:25:38 -0600 Subject: [PATCH] Add vcn and jpeg activity Changes: - Add new engine field vcn_activity (from 1.4/1.5 gpu_metrics - Updated log output to enhance view of gpu_metric data as json pretty print - Added new fields provided in 1.5 - Added unit overview in python API, CLI is WIP Signed-off-by: Charis Poag Change-Id: I7d9f29e7ecc35dcd0697814c222cdd02b0d5518e [ROCm/amdsmi commit: 8f3861e1d9ab117a452646b9ef7042fd1b3d9638] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 32 +++-- projects/amdsmi/include/amd_smi/amdsmi.h | 134 +++++++++++------- .../include/amd_smi/impl/amd_smi_gpu_device.h | 1 + projects/amdsmi/py-interface/README.md | 117 +++++++-------- .../amdsmi/py-interface/amdsmi_interface.py | 8 ++ .../amdsmi/py-interface/amdsmi_wrapper.py | 6 + projects/amdsmi/src/amd_smi/amd_smi.cc | 28 +++- projects/amdsmi/src/amd_smi/amd_smi_utils.cc | 19 ++- 8 files changed, 229 insertions(+), 116 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index e8ec181d25..aa2ea7adeb 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -24,6 +24,7 @@ import logging import sys import threading import time +import json from _version import __version__ from amdsmi_helpers import AMDSMIHelpers @@ -941,7 +942,9 @@ class AMDSMICommands(): # Put the metrics table in the debug logs try: - logging.debug("GPU Metrics table for %s | %s", gpu_id, amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)) + gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + gpu_metric_str = json.dumps(gpu_metric_output, indent=4) + logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str) except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) @@ 
-963,14 +966,30 @@ class AMDSMICommands(): engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity') engine_usage['mem_usage'] = engine_usage.pop('umc_activity') engine_usage['mm_ip_usage'] = engine_usage.pop('mm_activity') - + engine_usage['vcn_activities'] = gpu_metric_output.pop('vcn_activity') + engine_usage['jpeg_activities[AID0]'] = gpu_metric_output.pop('jpeg_activities[AID0]') + engine_usage['jpeg_activities[AID1]'] = gpu_metric_output.pop('jpeg_activities[AID1]') + engine_usage['jpeg_activities[AID2]'] = gpu_metric_output.pop('jpeg_activities[AID2]') + engine_usage['jpeg_activities[AID3]'] = gpu_metric_output.pop('jpeg_activities[AID3]') for key, value in engine_usage.items(): - if value == 65535: + if not isinstance(value, list) and value > 100: engine_usage[key] = "N/A" + elif isinstance(value, list): + engine_usage[key] = ["N/A" if v > 100 else v for v in value] if self.logger.is_human_readable_format(): - if engine_usage[key] != "N/A": - unit = '%' + unit = '%' + if isinstance(value, list): + engine_usage[key] = [f"{v} {unit}" if str(v) != "N/A" else str(v) for v in engine_usage[key]] + save_value = engine_usage[key] + pretty_array = "[" + for i in range(len(save_value)): + if (i+1 != len(save_value)): + pretty_array += save_value[i] + ", " + else: + pretty_array += save_value[i] + "]" + engine_usage[key] = pretty_array + elif not isinstance(value, list) and engine_usage[key] != "N/A": engine_usage[key] = f"{value} {unit}" values_dict['usage'] = engine_usage @@ -1225,9 +1244,6 @@ class AMDSMICommands(): logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info()) try: - # nak_info = amdsmi_interface.amdsmi_get_gpu_pci_nak_info(args.gpu) - # pcie_dict['nak_sent_count'] = nak_info['nak_sent_count'] - # pcie_dict['nak_received_count'] = nak_info['nak_received_count'] pcie_dict['nak_sent_count'] = "N/A" pcie_dict['nak_received_count'] = "N/A" except amdsmi_exception.AmdSmiLibraryException as e: diff --git 
a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 6dff3c3306..c25c6e6381 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -88,6 +88,55 @@ typedef enum { #define AMDSMI_GPU_UUID_SIZE 38 +/** + * @brief The following structure holds the gpu metrics values for a device. + */ + +/** + * @brief Unit conversion factor for HBM temperatures + */ +#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 + +/** + * @brief This should match NUM_HBM_INSTANCES + */ +#define AMDSMI_NUM_HBM_INSTANCES 4 + +/** + * @brief This should match MAX_NUM_VCN + */ +#define AMDSMI_MAX_NUM_VCN 4 + +/** + * @brief This should match MAX_NUM_CLKS + */ +#define AMDSMI_MAX_NUM_CLKS 4 + +/** + * @brief This should match MAX_NUM_XGMI_LINKS + */ +#define AMDSMI_MAX_NUM_XGMI_LINKS 8 + +/** + * @brief This should match MAX_NUM_GFX_CLKS + */ +#define AMDSMI_MAX_NUM_GFX_CLKS 8 + +/** + * @brief This should match AMDSMI_MAX_AID + */ +#define AMDSMI_MAX_AID 4 + +/** + * @brief This should match AMDSMI_MAX_ENGINES + */ +#define AMDSMI_MAX_ENGINES 8 + +/** + * @brief This should match AMDSMI_MAX_NUM_JPEG (8*4=32) + */ +#define AMDSMI_MAX_NUM_JPEG 32 + /* string format */ #define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d" #define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d" @@ -544,6 +593,12 @@ typedef struct { uint32_t reserved[4]; } amdsmi_clk_info_t; +/** + * amdsmi_engine_usage_t: + * This structure holds common + * GPU activity values seen in both BM or + * SRIOV + **/ typedef struct { uint32_t gfx_activity; uint32_t umc_activity; @@ -1137,41 +1192,6 @@ typedef struct { /// \endcond } amd_metrics_table_header_t; -/** - * @brief The following structure holds the gpu metrics values for a device. 
- */ - -/** - * @brief Unit conversion factor for HBM temperatures - */ -#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 - -/** - * @brief This should match NUM_HBM_INSTANCES - */ -#define AMDSMI_NUM_HBM_INSTANCES 4 - -/** - * @brief This should match MAX_NUM_VCN - */ -#define AMDSMI_MAX_NUM_VCN 4 - -/** - * @brief This should match MAX_NUM_CLKS - */ -#define AMDSMI_MAX_NUM_CLKS 4 - -/** - * @brief This should match MAX_NUM_XGMI_LINKS - */ -#define AMDSMI_MAX_NUM_XGMI_LINKS 8 - -/** - * @brief This should match MAX_NUM_GFX_CLKS - */ -#define AMDSMI_MAX_NUM_GFX_CLKS 8 - - typedef struct { // TODO(amd) Doxygen documents // Note: This structure is extended to fit the needs of different GPU metric @@ -1191,7 +1211,7 @@ typedef struct { /* * v1.0 Base */ - // Temperature + // Temperature (C) uint16_t temperature_edge; uint16_t temperature_hotspot; uint16_t temperature_mem; @@ -1199,19 +1219,19 @@ typedef struct { uint16_t temperature_vrsoc; uint16_t temperature_vrmem; - // Utilization + // Utilization (%) uint16_t average_gfx_activity; uint16_t average_umc_activity; // memory controller uint16_t average_mm_activity; // UVD or VCN - // Power/Energy + // Power (W) /Energy (15.259uJ per 1ns) uint16_t average_socket_power; uint64_t energy_accumulator; // v1 mod. (32->64) // Driver attached timestamp (in ns) uint64_t system_clock_counter; // v1 mod. 
(moved from top of struct) - // Average clocks + // Average clocks (MHz) uint16_t average_gfxclk_frequency; uint16_t average_socclk_frequency; uint16_t average_uclk_frequency; @@ -1220,7 +1240,7 @@ typedef struct { uint16_t average_vclk1_frequency; uint16_t average_dclk1_frequency; - // Current clocks + // Current clocks (MHz) uint16_t current_gfxclk; uint16_t current_socclk; uint16_t current_uclk; @@ -1232,10 +1252,10 @@ typedef struct { // Throttle status uint32_t throttle_status; - // Fans + // Fans (RPM) uint16_t current_fan_speed; - // Link width/speed + // Link width (number of lanes) /speed (0.1 GT/s) uint16_t pcie_link_width; // v1 mod.(8->16) uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) @@ -1274,19 +1294,19 @@ typedef struct { uint16_t current_socket_power; // Utilization (%) - uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode) + uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN]; // Clock Lock Status. Each bit corresponds to clock instance uint32_t gfxclk_lock_status; - // XGMI bus width and bitrate (in Gbps) + // XGMI bus width and bitrate (in GB/s) uint16_t xgmi_link_width; uint16_t xgmi_link_speed; - // PCIE accumulated bandwidth (GB/sec) + // PCIe accumulated bandwidth (GB/sec) uint64_t pcie_bandwidth_acc; - // PCIE instantaneous bandwidth (GB/sec) + // PCIe instantaneous bandwidth (GB/sec) uint64_t pcie_bandwidth_inst; // PCIE L0 to recovery state transition accumulated count @@ -1298,15 +1318,33 @@ typedef struct { // PCIE replay rollover accumulated count uint64_t pcie_replay_rover_count_acc; - // XGMI accumulated data transfer size(KiloBytes) + // XGMI accumulated data transfer size (KB) uint64_t xgmi_read_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS]; uint64_t xgmi_write_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS]; - // Current clock frequencies + // Current clock frequencies (MHz) uint16_t current_gfxclks[AMDSMI_MAX_NUM_GFX_CLKS]; uint16_t current_socclks[AMDSMI_MAX_NUM_CLKS]; uint16_t 
current_vclk0s[AMDSMI_MAX_NUM_CLKS]; uint16_t current_dclk0s[AMDSMI_MAX_NUM_CLKS]; + + /* + * v1.5 additions + */ + // Memory Bandwidth Usage Accumulated (GB/sec) + uint64_t mem_bandwidth_acc; + + // Memory Bandwidth Maximum (GB/sec) + uint32_t mem_max_bandwidth; + + // PCIE NAK sent accumulated count + uint32_t pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + uint32_t pcie_nak_rcvd_count_acc; + + // JPEG activity % per AID + uint16_t jpeg_activity[AMDSMI_MAX_NUM_JPEG]; /// \endcond } amdsmi_gpu_metrics_t; diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h index 61290a5f37..b512ce3e76 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h @@ -48,6 +48,7 @@ #include "amd_smi/impl/amd_smi_processor.h" #include "amd_smi/impl/amd_smi_drm.h" #include "shared_mutex.h" // NOLINT +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 013c3477b1..3dc1df88ed 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -2117,63 +2117,68 @@ Input parameters: Output: Dictionary with fields -Field | Description -`---|--- -`temperature_edge` | edge temperature value -`temperature_hotspot` | hotspot temperature value -`temperature_mem` | memory temperature value -`temperature_vrgfx` | vrgfx temperature value -`temperature_vrsoc` | vrsoc temperature value -`temperature_vrmem` | vrmem temperature value -`average_gfx_activity` | average gfx activity -`average_umc_activity` | average umc activity -`average_mm_activity` | average mm activity -`average_socket_power` | average socket power -`energy_accumulator` | energy accumulator value -`system_clock_counter` | system clock counter -`average_gfxclk_frequency` | average gfx clock frequency 
-`average_socclk_frequency` | average soc clock frequency -`average_uclk_frequency` | average uclk frequency -`average_vclk0_frequency` | average vclk0 frequency -`average_dclk0_frequency` | average dclk0 frequency -`average_vclk1_frequency` | average vclk1 frequency -`average_dclk1_frequency` | average dclk1 frequency -`current_gfxclk` | current gfx clock -`current_socclk` | current soc clock -`current_uclk` | current uclk -`current_vclk0` | current vclk0 -`current_dclk0` | current dclk0 -`current_vclk1` | current vclk1 -`current_dclk1` | current dclk1 -`throttle_status` | current throttle status -`current_fan_speed` | current fan speed -`pcie_link_width` | pcie link width -`pcie_link_speed` | pcie link speed +| Field | Description |Unit| +|-------|-------------|----| +`temperature_edge` | Edge temperature value | Celsius (C) +`temperature_hotspot` | Hotspot (aka junction) temperature value | Celsius (C) +`temperature_mem` | Memory temperature value | Celsius (C) +`temperature_vrgfx` | vrgfx temperature value | Celsius (C) +`temperature_vrsoc` | vrsoc temperature value | Celsius (C) +`temperature_vrmem` | vrmem temperature value | Celsius (C) +`average_gfx_activity` | Average gfx activity | % +`average_umc_activity` | Average umc activity | % +`average_mm_activity` | Average mm activity | % +`average_socket_power` | Average socket power | W +`energy_accumulator` | Energy accumulated with a 15.3 uJ resolution over 1ns | uJ +`system_clock_counter` | System clock counter | ns +`average_gfxclk_frequency` | Average gfx clock frequency | MHz +`average_socclk_frequency` | Average soc clock frequency | MHz +`average_uclk_frequency` | Average uclk frequency | MHz +`average_vclk0_frequency` | Average vclk0 frequency | MHz +`average_dclk0_frequency` | Average dclk0 frequency | MHz +`average_vclk1_frequency` | Average vclk1 frequency | MHz +`average_dclk1_frequency` | Average dclk1 frequency | MHz +`current_gfxclk` | Current gfx clock | MHz +`current_socclk` | Current soc 
clock | MHz +`current_uclk` | Current uclk | MHz +`current_vclk0` | Current vclk0 | MHz +`current_dclk0` | Current dclk0 | MHz +`current_vclk1` | Current vclk1 | MHz +`current_dclk1` | Current dclk1 | MHz +`throttle_status` | Current throttle status | +`current_fan_speed` | Current fan speed | RPM +`pcie_link_width` | PCIe link width (number of lanes) | lanes +`pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s `padding` | padding -`gfx_activity_acc` | gfx activity acc -`mem_activity_acc` | mem activity acc -`temperature_hbm` | list of hbm temperatures -`firmware_timestamp` | timestamp from PMFW -`voltage_soc` | soc voltage -`voltage_gfx` | gfx voltage -`voltage_mem` | mem voltage -`indep_throttle_status` | asic independent throttle status -`current_socket_power` | current socket power -`vcn_activity` | list of encoding and decoding engine utilizations -`gfxclk_lock_status` | gfx clock lock status -`xgmi_link_width` | XGMI bus width -`xgmi_link_speed` | XGMI bitrate (in Gbps) -`pcie_bandwidth_acc` | PCIE accumulated bandwidth (GB/sec) -`pcie_bandwidth_inst` | PCIE instantaneous bandwidth (GB/sec) -`pcie_l0_to_recov_count_acc` | PCIE L0 to recovery state transition accumulated count -`pcie_replay_count_acc` | PCIE replay accumulated count -`pcie_replay_rover_count_acc` | PCIE replay rollover accumulated count -`xgmi_read_data_acc` | XGMI accumulated read data transfer size(KiloBytes) -`xgmi_write_data_acc` | XGMI accumulated write data transfer size(KiloBytes) -`current_gfxclks` | list of current gfx clock frequencies -`current_socclks` | list of current soc clock frequencies -`current_vclk0s` | list of current v0 clock frequencies -`current_dclk0s` | list of current d0 clock frequencies +`gfx_activity_acc` | gfx activity accumulated | % +`mem_activity_acc` | Memory activity accumulated | % +`temperature_hbm` | list of hbm temperatures | Celsius (C) +`firmware_timestamp` | timestamp from PMFW (10ns resolution) | ns +`voltage_soc` | 
soc voltage | mV +`voltage_gfx` | gfx voltage | mV +`voltage_mem` | mem voltage | mV +`indep_throttle_status` | ASIC independent throttle status (see drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h for bit flags) | +`current_socket_power` | Current socket power (also known as instant socket power) | W +`vcn_activity` | List of VCN encode/decode engine utilization per AID | % +`gfxclk_lock_status` | Clock lock status. Each bit corresponds to clock instance. | +`xgmi_link_width` | XGMI bus width | lanes +`xgmi_link_speed` | XGMI bitrate | GB/s +`pcie_bandwidth_acc` | PCIe accumulated bandwidth | GB/s +`pcie_bandwidth_inst` | PCIe instantaneous bandwidth | GB/s +`pcie_l0_to_recov_count_acc` | PCIe L0 to recovery state transition accumulated count | +`pcie_replay_count_acc` | PCIe replay accumulated count | +`pcie_replay_rover_count_acc` | PCIe replay rollover accumulated count | +`xgmi_read_data_acc` | XGMI accumulated read data transfer size (KiloBytes) | KB +`xgmi_write_data_acc` | XGMI accumulated write data transfer size (KiloBytes) | KB +`current_gfxclks` | List of current gfx clock frequencies | MHz +`current_socclks` | List of current soc clock frequencies | MHz +`current_vclk0s` | List of current v0 clock frequencies | MHz +`current_dclk0s` | List of current d0 clock frequencies | MHz +`mem_bandwidth_acc` | Memory bandwidth usage accumulated | GB/s +`mem_max_bandwidth` | Maximum memory bandwidth | GB/s +`pcie_nak_sent_count_acc` | PCIe NAK sent count accumulated | +`pcie_nak_rcvd_count_acc` | PCIe NAK received count accumulated | +`jpeg_activities[AIDX]` | List of JPEG engine activity for each AID (X=0-3) | % Exceptions that can be thrown by `amdsmi_get_gpu_metrics_info` function: diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index dc4a92452e..18e563c198 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2978,6 
+2978,14 @@ def amdsmi_get_gpu_metrics_info( "current_socclks": list(gpu_metrics.current_socclks), "current_vclk0s": list(gpu_metrics.current_vclk0s), "current_dclk0s": list(gpu_metrics.current_dclk0s), + "mem_bandwidth_acc": gpu_metrics.mem_bandwidth_acc, + "mem_max_bandwidth": gpu_metrics.mem_max_bandwidth, + "pcie_nak_sent_count_acc": gpu_metrics.pcie_nak_sent_count_acc, + "pcie_nak_rcvd_count_acc": gpu_metrics.pcie_nak_rcvd_count_acc, + "jpeg_activities[AID0]": list(gpu_metrics.jpeg_activities)[:8], + "jpeg_activities[AID1]": list(gpu_metrics.jpeg_activities)[8:16], + "jpeg_activities[AID2]": list(gpu_metrics.jpeg_activities)[16:24], + "jpeg_activities[AID3]": list(gpu_metrics.jpeg_activities)[24:32], } diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index dccdcd81f2..78fc72bb5f 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -894,6 +894,7 @@ amdsmi_clk_info_t = struct_amdsmi_clk_info_t class struct_amdsmi_engine_usage_t(Structure): pass + struct_amdsmi_engine_usage_t._pack_ = 1 # source:False struct_amdsmi_engine_usage_t._fields_ = [ ('gfx_activity', ctypes.c_uint32), @@ -1514,6 +1515,11 @@ struct_amdsmi_gpu_metrics_t._fields_ = [ ('current_socclks', ctypes.c_uint16 * 4), ('current_vclk0s', ctypes.c_uint16 * 4), ('current_dclk0s', ctypes.c_uint16 * 4), + ('mem_bandwidth_acc', ctypes.c_uint64), + ('mem_max_bandwidth', ctypes.c_uint32), + ('pcie_nak_sent_count_acc', ctypes.c_uint32), + ('pcie_nak_rcvd_count_acc', ctypes.c_uint32), + ('jpeg_activities', ctypes.c_uint16 * 32), ] amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 917784d582..d9bfbc9e91 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -68,6 +68,7 @@ #include "amd_smi/impl/amdgpu_drm.h" #include "amd_smi/impl/amd_smi_utils.h" 
#include "amd_smi/impl/amd_smi_processor.h" +#include "rocm_smi/rocm_smi_logger.h" #ifdef ENABLE_ESMI_LIB #include "amd_smi/impl/amd_smi_cpu_socket.h" #include "amd_smi/impl/amd_smi_cpu_core.h" @@ -1154,9 +1155,32 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info( amdsmi_gpu_metrics_t *pgpu_metrics) { AMDSMI_CHECK_INIT(); // nullptr api supported - - return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, + amdsmi_status_t ret = + rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, reinterpret_cast(pgpu_metrics)); + // WARNING: TEMPORARY - awaiting 1.5 update from amdgpu driver/firmware + // intended to be removed later + // START: REMOVE WHATS BELOW ME + uint8_t content_ver = pgpu_metrics->common_header.content_revision; + int8_t format_ver = pgpu_metrics->common_header.format_revision; + const uint8_t expected_format_ver = 1; + const uint8_t expected_content_ver = 4; + if (ret == AMDSMI_STATUS_SUCCESS && + (format_ver == expected_format_ver && + content_ver <= expected_content_ver)) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | SET JPEG_ACTIVITY to MAX_UINT16, " + << "detected content version: " << std::dec << +content_ver + << "; format version: " << std::dec << +format_ver + << "; awaiting 1.5 metrics remove once released"; + LOG_ALWAYS(ss); + std::fill_n(&pgpu_metrics->jpeg_activity[0], + (sizeof(pgpu_metrics->jpeg_activity) / + sizeof(pgpu_metrics->jpeg_activity[0])), + std::numeric_limits::max()); + } + // END: REMOVE WHATS ABOVE ME + return ret; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 2b16250dc3..79d9103699 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -43,6 +43,7 @@ #include "amd_smi/impl/amd_smi_utils.h" #include "shared_mutex.h" // NOLINT +#include "rocm_smi/rocm_smi_logger.h" static const uint32_t kAmdGpuId = 0x1002; @@ -153,6 +154,20 @@ amdsmi_status_t 
smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd fgets(info->product_name, sizeof(info->product_name), fp); fclose(fp); } + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << "Returning status = AMDSMI_STATUS_SUCCESS" + << " | model_number_path = " << model_number_path + << "; info->model_number: " << info->model_number + << "\n product_serial_path = " << product_serial_path + << "; info->product_serial: " << info->product_serial + << "\n fru_id_path = " << fru_id_path + << "; info->fru_id: " << info->fru_id + << "\n manufacturer_name_path = " << manufacturer_name_path + << "; info->manufacturer_name: " << info->manufacturer_name + << "\n product_name_path = " << product_name_path + << "; info->product_name: " << info->product_name; + LOG_INFO(ss); return AMDSMI_STATUS_SUCCESS; } @@ -236,8 +251,8 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ unsigned int dpm_level, freq; char firstChar = line[0]; - if (firstChar == 'S'){ - if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2){ + if (firstChar == 'S') { + if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2) { ranges.close(); return AMDSMI_STATUS_NO_DATA; }