Add vcn and jpeg activity
Changes:
- Add new engine field vcn_activity (from 1.4/1.5
gpu_metrics)
- Updated log output to enhance view of gpu_metric
data as json pretty print
- Added new fields provided in 1.5
- Added unit overview in python API, CLI is WIP
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Change-Id: I7d9f29e7ecc35dcd0697814c222cdd02b0d5518e
[ROCm/amdsmi commit: 8f3861e1d9]
Dieser Commit ist enthalten in:
@@ -24,6 +24,7 @@ import logging
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import json
|
||||
|
||||
from _version import __version__
|
||||
from amdsmi_helpers import AMDSMIHelpers
|
||||
@@ -941,7 +942,9 @@ class AMDSMICommands():
|
||||
|
||||
# Put the metrics table in the debug logs
|
||||
try:
|
||||
logging.debug("GPU Metrics table for %s | %s", gpu_id, amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu))
|
||||
gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
gpu_metric_str = json.dumps(gpu_metric_output, indent=4)
|
||||
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
@@ -963,14 +966,30 @@ class AMDSMICommands():
|
||||
engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity')
|
||||
engine_usage['mem_usage'] = engine_usage.pop('umc_activity')
|
||||
engine_usage['mm_ip_usage'] = engine_usage.pop('mm_activity')
|
||||
|
||||
engine_usage['vcn_activities'] = gpu_metric_output.pop('vcn_activity')
|
||||
engine_usage['jpeg_activities[AID0]'] = gpu_metric_output.pop('jpeg_activities[AID0]')
|
||||
engine_usage['jpeg_activities[AID1]'] = gpu_metric_output.pop('jpeg_activities[AID1]')
|
||||
engine_usage['jpeg_activities[AID2]'] = gpu_metric_output.pop('jpeg_activities[AID2]')
|
||||
engine_usage['jpeg_activities[AID3]'] = gpu_metric_output.pop('jpeg_activities[AID3]')
|
||||
for key, value in engine_usage.items():
|
||||
if value == 65535:
|
||||
if not isinstance(value, list) and value > 100:
|
||||
engine_usage[key] = "N/A"
|
||||
elif isinstance(value, list):
|
||||
engine_usage[key] = ["N/A" if v > 100 else v for v in value]
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
if engine_usage[key] != "N/A":
|
||||
unit = '%'
|
||||
unit = '%'
|
||||
if isinstance(value, list):
|
||||
engine_usage[key] = [f"{v} {unit}" if str(v) != "N/A" else str(v) for v in engine_usage[key]]
|
||||
save_value = engine_usage[key]
|
||||
pretty_array = "["
|
||||
for i in range(len(save_value)):
|
||||
if (i+1 != len(save_value)):
|
||||
pretty_array += save_value[i] + ", "
|
||||
else:
|
||||
pretty_array += save_value[i] + "]"
|
||||
engine_usage[key] = pretty_array
|
||||
elif not isinstance(value, list) and engine_usage[key] != "N/A":
|
||||
engine_usage[key] = f"{value} {unit}"
|
||||
|
||||
values_dict['usage'] = engine_usage
|
||||
@@ -1225,9 +1244,6 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
# nak_info = amdsmi_interface.amdsmi_get_gpu_pci_nak_info(args.gpu)
|
||||
# pcie_dict['nak_sent_count'] = nak_info['nak_sent_count']
|
||||
# pcie_dict['nak_received_count'] = nak_info['nak_received_count']
|
||||
pcie_dict['nak_sent_count'] = "N/A"
|
||||
pcie_dict['nak_received_count'] = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
|
||||
@@ -88,6 +88,55 @@ typedef enum {
|
||||
|
||||
#define AMDSMI_GPU_UUID_SIZE 38
|
||||
|
||||
/**
|
||||
* @brief The following structure holds the gpu metrics values for a device.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Unit conversion factor for HBM temperatures
|
||||
*/
|
||||
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
|
||||
|
||||
/**
|
||||
* @brief This should match NUM_HBM_INSTANCES
|
||||
*/
|
||||
#define AMDSMI_NUM_HBM_INSTANCES 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_VCN
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_VCN 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_CLKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_CLKS 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_XGMI_LINKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_XGMI_LINKS 8
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_GFX_CLKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_GFX_CLKS 8
|
||||
|
||||
/**
|
||||
* @brief This should match AMDSMI_MAX_AID
|
||||
*/
|
||||
#define AMDSMI_MAX_AID 4
|
||||
|
||||
/**
|
||||
* @brief This should match AMDSMI_MAX_ENGINES
|
||||
*/
|
||||
#define AMDSMI_MAX_ENGINES 8
|
||||
|
||||
/**
|
||||
* @brief This should match AMDSMI_MAX_NUM_JPEG (8*4=32)
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_JPEG 32
|
||||
|
||||
/* string format */
|
||||
#define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d"
|
||||
#define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d"
|
||||
@@ -544,6 +593,12 @@ typedef struct {
|
||||
uint32_t reserved[4];
|
||||
} amdsmi_clk_info_t;
|
||||
|
||||
/**
|
||||
* amdsmi_engine_usage_t:
|
||||
* This structure holds common
|
||||
* GPU activity values seen in both BM or
|
||||
* SRIOV
|
||||
**/
|
||||
typedef struct {
|
||||
uint32_t gfx_activity;
|
||||
uint32_t umc_activity;
|
||||
@@ -1137,41 +1192,6 @@ typedef struct {
|
||||
/// \endcond
|
||||
} amd_metrics_table_header_t;
|
||||
|
||||
/**
|
||||
* @brief The following structure holds the gpu metrics values for a device.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Unit conversion factor for HBM temperatures
|
||||
*/
|
||||
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
|
||||
|
||||
/**
|
||||
* @brief This should match NUM_HBM_INSTANCES
|
||||
*/
|
||||
#define AMDSMI_NUM_HBM_INSTANCES 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_VCN
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_VCN 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_CLKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_CLKS 4
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_XGMI_LINKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_XGMI_LINKS 8
|
||||
|
||||
/**
|
||||
* @brief This should match MAX_NUM_GFX_CLKS
|
||||
*/
|
||||
#define AMDSMI_MAX_NUM_GFX_CLKS 8
|
||||
|
||||
|
||||
typedef struct {
|
||||
// TODO(amd) Doxygen documents
|
||||
// Note: This structure is extended to fit the needs of different GPU metric
|
||||
@@ -1191,7 +1211,7 @@ typedef struct {
|
||||
/*
|
||||
* v1.0 Base
|
||||
*/
|
||||
// Temperature
|
||||
// Temperature (C)
|
||||
uint16_t temperature_edge;
|
||||
uint16_t temperature_hotspot;
|
||||
uint16_t temperature_mem;
|
||||
@@ -1199,19 +1219,19 @@ typedef struct {
|
||||
uint16_t temperature_vrsoc;
|
||||
uint16_t temperature_vrmem;
|
||||
|
||||
// Utilization
|
||||
// Utilization (%)
|
||||
uint16_t average_gfx_activity;
|
||||
uint16_t average_umc_activity; // memory controller
|
||||
uint16_t average_mm_activity; // UVD or VCN
|
||||
|
||||
// Power/Energy
|
||||
// Power (W) /Energy (15.259uJ per 1ns)
|
||||
uint16_t average_socket_power;
|
||||
uint64_t energy_accumulator; // v1 mod. (32->64)
|
||||
|
||||
// Driver attached timestamp (in ns)
|
||||
uint64_t system_clock_counter; // v1 mod. (moved from top of struct)
|
||||
|
||||
// Average clocks
|
||||
// Average clocks (MHz)
|
||||
uint16_t average_gfxclk_frequency;
|
||||
uint16_t average_socclk_frequency;
|
||||
uint16_t average_uclk_frequency;
|
||||
@@ -1220,7 +1240,7 @@ typedef struct {
|
||||
uint16_t average_vclk1_frequency;
|
||||
uint16_t average_dclk1_frequency;
|
||||
|
||||
// Current clocks
|
||||
// Current clocks (MHz)
|
||||
uint16_t current_gfxclk;
|
||||
uint16_t current_socclk;
|
||||
uint16_t current_uclk;
|
||||
@@ -1232,10 +1252,10 @@ typedef struct {
|
||||
// Throttle status
|
||||
uint32_t throttle_status;
|
||||
|
||||
// Fans
|
||||
// Fans (RPM)
|
||||
uint16_t current_fan_speed;
|
||||
|
||||
// Link width/speed
|
||||
// Link width (number of lanes) /speed (0.1 GT/s)
|
||||
uint16_t pcie_link_width; // v1 mod.(8->16)
|
||||
uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16)
|
||||
|
||||
@@ -1274,19 +1294,19 @@ typedef struct {
|
||||
uint16_t current_socket_power;
|
||||
|
||||
// Utilization (%)
|
||||
uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
|
||||
uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN];
|
||||
|
||||
// Clock Lock Status. Each bit corresponds to clock instance
|
||||
uint32_t gfxclk_lock_status;
|
||||
|
||||
// XGMI bus width and bitrate (in Gbps)
|
||||
// XGMI bus width and bitrate (in GB/s)
|
||||
uint16_t xgmi_link_width;
|
||||
uint16_t xgmi_link_speed;
|
||||
|
||||
// PCIE accumulated bandwidth (GB/sec)
|
||||
// PCIe accumulated bandwidth (GB/sec)
|
||||
uint64_t pcie_bandwidth_acc;
|
||||
|
||||
// PCIE instantaneous bandwidth (GB/sec)
|
||||
// PCIe instantaneous bandwidth (GB/sec)
|
||||
uint64_t pcie_bandwidth_inst;
|
||||
|
||||
// PCIE L0 to recovery state transition accumulated count
|
||||
@@ -1298,15 +1318,33 @@ typedef struct {
|
||||
// PCIE replay rollover accumulated count
|
||||
uint64_t pcie_replay_rover_count_acc;
|
||||
|
||||
// XGMI accumulated data transfer size(KiloBytes)
|
||||
// XGMI accumulated data transfer size (KB)
|
||||
uint64_t xgmi_read_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS];
|
||||
uint64_t xgmi_write_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS];
|
||||
|
||||
// Current clock frequencies
|
||||
// Current clock frequencies (MHz)
|
||||
uint16_t current_gfxclks[AMDSMI_MAX_NUM_GFX_CLKS];
|
||||
uint16_t current_socclks[AMDSMI_MAX_NUM_CLKS];
|
||||
uint16_t current_vclk0s[AMDSMI_MAX_NUM_CLKS];
|
||||
uint16_t current_dclk0s[AMDSMI_MAX_NUM_CLKS];
|
||||
|
||||
/*
|
||||
* v1.5 additions
|
||||
*/
|
||||
// Memory Bandwidth Usage Accumulated (GB/sec)
|
||||
uint64_t mem_bandwidth_acc;
|
||||
|
||||
// Memory Bandwidth Maximum (GB/sec)
|
||||
uint32_t mem_max_bandwidth;
|
||||
|
||||
// PCIE NAK sent accumulated count
|
||||
uint32_t pcie_nak_sent_count_acc;
|
||||
|
||||
// PCIE NAK received accumulated count
|
||||
uint32_t pcie_nak_rcvd_count_acc;
|
||||
|
||||
// JPEG activity % per AID
|
||||
uint16_t jpeg_activity[AMDSMI_MAX_NUM_JPEG];
|
||||
/// \endcond
|
||||
} amdsmi_gpu_metrics_t;
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include "amd_smi/impl/amd_smi_processor.h"
|
||||
#include "amd_smi/impl/amd_smi_drm.h"
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
@@ -2117,63 +2117,68 @@ Input parameters:
|
||||
|
||||
Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
`---|---
|
||||
`temperature_edge` | edge temperature value
|
||||
`temperature_hotspot` | hotspot temperature value
|
||||
`temperature_mem` | memory temperature value
|
||||
`temperature_vrgfx` | vrgfx temperature value
|
||||
`temperature_vrsoc` | vrsoc temperature value
|
||||
`temperature_vrmem` | vrmem temperature value
|
||||
`average_gfx_activity` | average gfx activity
|
||||
`average_umc_activity` | average umc activity
|
||||
`average_mm_activity` | average mm activity
|
||||
`average_socket_power` | average socket power
|
||||
`energy_accumulator` | energy accumulator value
|
||||
`system_clock_counter` | system clock counter
|
||||
`average_gfxclk_frequency` | average gfx clock frequency
|
||||
`average_socclk_frequency` | average soc clock frequency
|
||||
`average_uclk_frequency` | average uclk frequency
|
||||
`average_vclk0_frequency` | average vclk0 frequency
|
||||
`average_dclk0_frequency` | average dclk0 frequency
|
||||
`average_vclk1_frequency` | average vclk1 frequency
|
||||
`average_dclk1_frequency` | average dclk1 frequency
|
||||
`current_gfxclk` | current gfx clock
|
||||
`current_socclk` | current soc clock
|
||||
`current_uclk` | current uclk
|
||||
`current_vclk0` | current vclk0
|
||||
`current_dclk0` | current dclk0
|
||||
`current_vclk1` | current vclk1
|
||||
`current_dclk1` | current dclk1
|
||||
`throttle_status` | current throttle status
|
||||
`current_fan_speed` | current fan speed
|
||||
`pcie_link_width` | pcie link width
|
||||
`pcie_link_speed` | pcie link speed
|
||||
| Field | Description |Unit|
|
||||
|-------|-------------|----|
|
||||
`temperature_edge` | Edge temperature value | Celsius (C)
|
||||
`temperature_hotspot` | Hotspot (aka junction) temperature value | Celsius (C)
|
||||
`temperature_mem` | Memory temperature value | Celsius (C)
|
||||
`temperature_vrgfx` | vrgfx temperature value | Celsius (C)
|
||||
`temperature_vrsoc` | vrsoc temperature value | Celsius (C)
|
||||
`temperature_vrmem` | vrmem temperature value | Celsius (C)
|
||||
`average_gfx_activity` | Average gfx activity | %
|
||||
`average_umc_activity` | Average umc activity | %
|
||||
`average_mm_activity` | Average mm activity | %
|
||||
`average_socket_power` | Average socket power | W
|
||||
`energy_accumulator` | Energy accumulated with a 15.3 uJ resolution over 1ns | uJ
|
||||
`system_clock_counter` | System clock counter | ns
|
||||
`average_gfxclk_frequency` | Average gfx clock frequency | MHz
|
||||
`average_socclk_frequency` | Average soc clock frequency | MHz
|
||||
`average_uclk_frequency` | Average uclk frequency | MHz
|
||||
`average_vclk0_frequency` | Average vclk0 frequency | MHz
|
||||
`average_dclk0_frequency` | Average dclk0 frequency | MHz
|
||||
`average_vclk1_frequency` | Average vclk1 frequency | MHz
|
||||
`average_dclk1_frequency` | Average dclk1 frequency | MHz
|
||||
`current_gfxclk` | Current gfx clock | MHz
|
||||
`current_socclk` | Current soc clock | MHz
|
||||
`current_uclk` | Current uclk | MHz
|
||||
`current_vclk0` | Current vclk0 | MHz
|
||||
`current_dclk0` | Current dclk0 | MHz
|
||||
`current_vclk1` | Current vclk1 | MHz
|
||||
`current_dclk1` | Current dclk1 | MHz
|
||||
`throttle_status` | Current throttle status |
|
||||
`current_fan_speed` | Current fan speed | RPM
|
||||
`pcie_link_width` | PCIe link width (number of lanes) | lanes
|
||||
`pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s
|
||||
`padding` | padding
|
||||
`gfx_activity_acc` | gfx activity acc
|
||||
`mem_activity_acc` | mem activity acc
|
||||
`temperature_hbm` | list of hbm temperatures
|
||||
`firmware_timestamp` | timestamp from PMFW
|
||||
`voltage_soc` | soc voltage
|
||||
`voltage_gfx` | gfx voltage
|
||||
`voltage_mem` | mem voltage
|
||||
`indep_throttle_status` | asic independent throttle status
|
||||
`current_socket_power` | current socket power
|
||||
`vcn_activity` | list of encoding and decoding engine utilizations
|
||||
`gfxclk_lock_status` | gfx clock lock status
|
||||
`xgmi_link_width` | XGMI bus width
|
||||
`xgmi_link_speed` | XGMI bitrate (in Gbps)
|
||||
`pcie_bandwidth_acc` | PCIE accumulated bandwidth (GB/sec)
|
||||
`pcie_bandwidth_inst` | PCIE instantaneous bandwidth (GB/sec)
|
||||
`pcie_l0_to_recov_count_acc` | PCIE L0 to recovery state transition accumulated count
|
||||
`pcie_replay_count_acc` | PCIE replay accumulated count
|
||||
`pcie_replay_rover_count_acc` | PCIE replay rollover accumulated count
|
||||
`xgmi_read_data_acc` | XGMI accumulated read data transfer size(KiloBytes)
|
||||
`xgmi_write_data_acc` | XGMI accumulated write data transfer size(KiloBytes)
|
||||
`current_gfxclks` | list of current gfx clock frequencies
|
||||
`current_socclks` | list of current soc clock frequencies
|
||||
`current_vclk0s` | list of current v0 clock frequencies
|
||||
`current_dclk0s` | list of current d0 clock frequencies
|
||||
`gfx_activity_acc` | gfx activity accumulated | %
|
||||
`mem_activity_acc` | Memory activity accumulated | %
|
||||
`temperature_hbm` | list of hbm temperatures | Celsius (C)
|
||||
`firmware_timestamp` | timestamp from PMFW (10ns resolution) | ns
|
||||
`voltage_soc` | soc voltage | mV
|
||||
`voltage_gfx` | gfx voltage | mV
|
||||
`voltage_mem` | mem voltage | mV
|
||||
`indep_throttle_status` | ASIC independent throttle status (see drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h for bit flags) |
|
||||
`current_socket_power` | Current socket power (also known as instant socket power) | W
|
||||
`vcn_activity` | List of VCN encode/decode engine utilization per AID | %
|
||||
`gfxclk_lock_status` | Clock lock status. Each bit corresponds to clock instance. |
|
||||
`xgmi_link_width` | XGMI bus width | lanes
|
||||
`xgmi_link_speed` | XGMI bitrate | GB/s
|
||||
`pcie_bandwidth_acc` | PCIe accumulated bandwidth | GB/s
|
||||
`pcie_bandwidth_inst` | PCIe instantaneous bandwidth | GB/s
|
||||
`pcie_l0_to_recov_count_acc` | PCIe L0 to recovery state transition accumulated count |
|
||||
`pcie_replay_count_acc` | PCIe replay accumulated count |
|
||||
`pcie_replay_rover_count_acc` | PCIe replay rollover accumulated count |
|
||||
`xgmi_read_data_acc` | XGMI accumulated read data transfer size (KiloBytes) | KB
|
||||
`xgmi_write_data_acc` | XGMI accumulated write data transfer size (KiloBytes) | KB
|
||||
`current_gfxclks` | List of current gfx clock frequencies | MHz
|
||||
`current_socclks` | List of current soc clock frequencies | MHz
|
||||
`current_vclk0s` | List of current v0 clock frequencies | MHz
|
||||
`current_dclk0s` | List of current d0 clock frequencies | MHz
|
||||
`mem_bandwidth_acc` | Memory bandwidth usage accumulated | GB/s
|
||||
`mem_max_bandwidth` | Maximum memory bandwidth | GB/s
|
||||
`pcie_nak_sent_count_acc` | PCIe NAK sent count accumulated |
|
||||
`pcie_nak_rcvd_count_acc` | PCIe NAK received count accumulated |
|
||||
`jpeg_activities[AID<X>]` | List of JPEG engine activity for each AID (X=0-3) | %
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_metrics_info` function:
|
||||
|
||||
|
||||
@@ -2978,6 +2978,14 @@ def amdsmi_get_gpu_metrics_info(
|
||||
"current_socclks": list(gpu_metrics.current_socclks),
|
||||
"current_vclk0s": list(gpu_metrics.current_vclk0s),
|
||||
"current_dclk0s": list(gpu_metrics.current_dclk0s),
|
||||
"mem_bandwidth_acc": gpu_metrics.mem_bandwidth_acc,
|
||||
"mem_max_bandwidth": gpu_metrics.mem_max_bandwidth,
|
||||
"pcie_nak_sent_count_acc": gpu_metrics.pcie_nak_sent_count_acc,
|
||||
"pcie_nak_rcvd_count_acc": gpu_metrics.pcie_nak_rcvd_count_acc,
|
||||
"jpeg_activities[AID0]": list(gpu_metrics.jpeg_activities)[:8],
|
||||
"jpeg_activities[AID1]": list(gpu_metrics.jpeg_activities)[8:16],
|
||||
"jpeg_activities[AID2]": list(gpu_metrics.jpeg_activities)[16:24],
|
||||
"jpeg_activities[AID3]": list(gpu_metrics.jpeg_activities)[24:32],
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -894,6 +894,7 @@ amdsmi_clk_info_t = struct_amdsmi_clk_info_t
|
||||
class struct_amdsmi_engine_usage_t(Structure):
|
||||
pass
|
||||
|
||||
|
||||
struct_amdsmi_engine_usage_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_engine_usage_t._fields_ = [
|
||||
('gfx_activity', ctypes.c_uint32),
|
||||
@@ -1514,6 +1515,11 @@ struct_amdsmi_gpu_metrics_t._fields_ = [
|
||||
('current_socclks', ctypes.c_uint16 * 4),
|
||||
('current_vclk0s', ctypes.c_uint16 * 4),
|
||||
('current_dclk0s', ctypes.c_uint16 * 4),
|
||||
('mem_bandwidth_acc', ctypes.c_uint64),
|
||||
('mem_max_bandwidth', ctypes.c_uint32),
|
||||
('pcie_nak_sent_count_acc', ctypes.c_uint32),
|
||||
('pcie_nak_rcvd_count_acc', ctypes.c_uint32),
|
||||
('jpeg_activities', ctypes.c_uint16 * 32),
|
||||
]
|
||||
|
||||
amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t
|
||||
|
||||
@@ -68,6 +68,7 @@
|
||||
#include "amd_smi/impl/amdgpu_drm.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "amd_smi/impl/amd_smi_processor.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
#include "amd_smi/impl/amd_smi_cpu_socket.h"
|
||||
#include "amd_smi/impl/amd_smi_cpu_core.h"
|
||||
@@ -1154,9 +1155,32 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(
|
||||
amdsmi_gpu_metrics_t *pgpu_metrics) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
// nullptr api supported
|
||||
|
||||
return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
|
||||
amdsmi_status_t ret =
|
||||
rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
|
||||
reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics));
|
||||
// WARNING: TEMPORARY - awaiting 1.5 update from amdgpu driver/firmware
|
||||
// intended to be removed later
|
||||
// START: REMOVE WHATS BELOW ME
|
||||
uint8_t content_ver = pgpu_metrics->common_header.content_revision;
|
||||
int8_t format_ver = pgpu_metrics->common_header.format_revision;
|
||||
const uint8_t expected_format_ver = 1;
|
||||
const uint8_t expected_content_ver = 4;
|
||||
if (ret == AMDSMI_STATUS_SUCCESS &&
|
||||
(format_ver == expected_format_ver &&
|
||||
content_ver <= expected_content_ver)) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | SET JPEG_ACTIVITY to MAX_UINT16, "
|
||||
<< "detected content version: " << std::dec << +content_ver
|
||||
<< "; format version: " << std::dec << +format_ver
|
||||
<< "; awaiting 1.5 metrics remove once released";
|
||||
LOG_ALWAYS(ss);
|
||||
std::fill_n(&pgpu_metrics->jpeg_activity[0],
|
||||
(sizeof(pgpu_metrics->jpeg_activity) /
|
||||
sizeof(pgpu_metrics->jpeg_activity[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
}
|
||||
// END: REMOVE WHATS ABOVE ME
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
static const uint32_t kAmdGpuId = 0x1002;
|
||||
|
||||
@@ -153,6 +154,20 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd
|
||||
fgets(info->product_name, sizeof(info->product_name), fp);
|
||||
fclose(fp);
|
||||
}
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< "Returning status = AMDSMI_STATUS_SUCCESS"
|
||||
<< " | model_number_path = " << model_number_path
|
||||
<< "; info->model_number: " << info->model_number
|
||||
<< "\n product_serial_path = " << product_serial_path
|
||||
<< "; info->product_serial: " << info->product_serial
|
||||
<< "\n fru_id_path = " << fru_id_path
|
||||
<< "; info->fru_id: " << info->fru_id
|
||||
<< "\n manufacturer_name_path = " << manufacturer_name_path
|
||||
<< "; info->manufacturer_name: " << info->manufacturer_name
|
||||
<< "\n product_name_path = " << product_name_path
|
||||
<< "; info->product_name: " << info->product_name;
|
||||
LOG_INFO(ss);
|
||||
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -236,8 +251,8 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
|
||||
unsigned int dpm_level, freq;
|
||||
|
||||
char firstChar = line[0];
|
||||
if (firstChar == 'S'){
|
||||
if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2){
|
||||
if (firstChar == 'S') {
|
||||
if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2) {
|
||||
ranges.close();
|
||||
return AMDSMI_STATUS_NO_DATA;
|
||||
}
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren