Add vcn and jpeg activity

Changes:
    - Add new engine field vcn_activity (from 1.4/1.5
      gpu_metrics)
    - Updated log output to enhance view of gpu_metric
      data as json pretty print
    - Added new fields provided in 1.5
    - Added unit overview in python API, CLI is WIP

Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Change-Id: I7d9f29e7ecc35dcd0697814c222cdd02b0d5518e


[ROCm/amdsmi commit: 8f3861e1d9]
Dieser Commit ist enthalten in:
Charis Poag
2023-12-06 03:25:38 -06:00
Ursprung 030a971ce4
Commit 4f502e5dab
8 geänderte Dateien mit 229 neuen und 116 gelöschten Zeilen
@@ -24,6 +24,7 @@ import logging
import sys
import threading
import time
import json
from _version import __version__
from amdsmi_helpers import AMDSMIHelpers
@@ -941,7 +942,9 @@ class AMDSMICommands():
# Put the metrics table in the debug logs
try:
logging.debug("GPU Metrics table for %s | %s", gpu_id, amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu))
gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_output, indent=4)
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
@@ -963,14 +966,30 @@ class AMDSMICommands():
engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity')
engine_usage['mem_usage'] = engine_usage.pop('umc_activity')
engine_usage['mm_ip_usage'] = engine_usage.pop('mm_activity')
engine_usage['vcn_activities'] = gpu_metric_output.pop('vcn_activity')
engine_usage['jpeg_activities[AID0]'] = gpu_metric_output.pop('jpeg_activities[AID0]')
engine_usage['jpeg_activities[AID1]'] = gpu_metric_output.pop('jpeg_activities[AID1]')
engine_usage['jpeg_activities[AID2]'] = gpu_metric_output.pop('jpeg_activities[AID2]')
engine_usage['jpeg_activities[AID3]'] = gpu_metric_output.pop('jpeg_activities[AID3]')
for key, value in engine_usage.items():
if value == 65535:
if not isinstance(value, list) and value > 100:
engine_usage[key] = "N/A"
elif isinstance(value, list):
engine_usage[key] = ["N/A" if v > 100 else v for v in value]
if self.logger.is_human_readable_format():
if engine_usage[key] != "N/A":
unit = '%'
unit = '%'
if isinstance(value, list):
engine_usage[key] = [f"{v} {unit}" if str(v) != "N/A" else str(v) for v in engine_usage[key]]
save_value = engine_usage[key]
pretty_array = "["
for i in range(len(save_value)):
if (i+1 != len(save_value)):
pretty_array += save_value[i] + ", "
else:
pretty_array += save_value[i] + "]"
engine_usage[key] = pretty_array
elif not isinstance(value, list) and engine_usage[key] != "N/A":
engine_usage[key] = f"{value} {unit}"
values_dict['usage'] = engine_usage
@@ -1225,9 +1244,6 @@ class AMDSMICommands():
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
# nak_info = amdsmi_interface.amdsmi_get_gpu_pci_nak_info(args.gpu)
# pcie_dict['nak_sent_count'] = nak_info['nak_sent_count']
# pcie_dict['nak_received_count'] = nak_info['nak_received_count']
pcie_dict['nak_sent_count'] = "N/A"
pcie_dict['nak_received_count'] = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
+86 -48
Datei anzeigen
@@ -88,6 +88,55 @@ typedef enum {
#define AMDSMI_GPU_UUID_SIZE 38
/*
* Size and unit-conversion constants for the gpu metrics values of a device.
* Several of these are used as array dimensions in amdsmi_gpu_metrics_t.
* Each "should match" note refers to a same-purpose constant defined outside
* this header (presumably in the amdgpu driver's gpu_metrics definitions --
* TODO confirm); the values here are plain literals and are not checked
* against it.
*/
/**
* @brief Unit conversion factor for HBM temperatures
* (1 degree Centigrade = 1000 milli-degrees Centigrade).
* NOTE(review): "CENTRIGRADE" is a misspelling of "CENTIGRADE" in the macro
* name; check all users before renaming.
*/
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
/**
* @brief Number of HBM instances. This should match NUM_HBM_INSTANCES
*/
#define AMDSMI_NUM_HBM_INSTANCES 4
/**
* @brief Maximum number of VCN instances. This should match MAX_NUM_VCN
*/
#define AMDSMI_MAX_NUM_VCN 4
/**
* @brief Maximum number of clock instances. This should match MAX_NUM_CLKS
*/
#define AMDSMI_MAX_NUM_CLKS 4
/**
* @brief Maximum number of XGMI links. This should match MAX_NUM_XGMI_LINKS
*/
#define AMDSMI_MAX_NUM_XGMI_LINKS 8
/**
* @brief Maximum number of GFX clock instances. This should match
* MAX_NUM_GFX_CLKS
*/
#define AMDSMI_MAX_NUM_GFX_CLKS 8
/**
* @brief Maximum number of AIDs.
* NOTE(review): the previous comment said this "should match AMDSMI_MAX_AID",
* i.e. itself; the external constant it mirrors is not identified here.
*/
#define AMDSMI_MAX_AID 4
/**
* @brief Maximum number of engines per AID.
* NOTE(review): the previous comment said this "should match
* AMDSMI_MAX_ENGINES", i.e. itself; the external constant it mirrors is not
* identified here.
*/
#define AMDSMI_MAX_ENGINES 8
/**
* @brief Total number of JPEG activity entries:
* AMDSMI_MAX_ENGINES * AMDSMI_MAX_AID (8*4=32).
* The value is a literal, not computed, so update it by hand if either
* factor changes.
*/
#define AMDSMI_MAX_NUM_JPEG 32
/* string format */
#define AMDSMI_TIME_FORMAT "%02d:%02d:%02d.%03d"
#define AMDSMI_DATE_FORMAT "%04d-%02d-%02d:%02d:%02d:%02d.%03d"
@@ -544,6 +593,12 @@ typedef struct {
uint32_t reserved[4];
} amdsmi_clk_info_t;
/**
* amdsmi_engine_usage_t:
* This structure holds common
* GPU activity values seen in both BM or
* SRIOV
**/
typedef struct {
uint32_t gfx_activity;
uint32_t umc_activity;
@@ -1137,41 +1192,6 @@ typedef struct {
/// \endcond
} amd_metrics_table_header_t;
/**
* @brief The following structure holds the gpu metrics values for a device.
*/
/**
* @brief Unit conversion factor for HBM temperatures
*/
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
/**
* @brief This should match NUM_HBM_INSTANCES
*/
#define AMDSMI_NUM_HBM_INSTANCES 4
/**
* @brief This should match MAX_NUM_VCN
*/
#define AMDSMI_MAX_NUM_VCN 4
/**
* @brief This should match MAX_NUM_CLKS
*/
#define AMDSMI_MAX_NUM_CLKS 4
/**
* @brief This should match MAX_NUM_XGMI_LINKS
*/
#define AMDSMI_MAX_NUM_XGMI_LINKS 8
/**
* @brief This should match MAX_NUM_GFX_CLKS
*/
#define AMDSMI_MAX_NUM_GFX_CLKS 8
typedef struct {
// TODO(amd) Doxygen documents
// Note: This structure is extended to fit the needs of different GPU metric
@@ -1191,7 +1211,7 @@ typedef struct {
/*
* v1.0 Base
*/
// Temperature
// Temperature (C)
uint16_t temperature_edge;
uint16_t temperature_hotspot;
uint16_t temperature_mem;
@@ -1199,19 +1219,19 @@ typedef struct {
uint16_t temperature_vrsoc;
uint16_t temperature_vrmem;
// Utilization
// Utilization (%)
uint16_t average_gfx_activity;
uint16_t average_umc_activity; // memory controller
uint16_t average_mm_activity; // UVD or VCN
// Power/Energy
// Power (W) /Energy (15.259uJ per 1ns)
uint16_t average_socket_power;
uint64_t energy_accumulator; // v1 mod. (32->64)
// Driver attached timestamp (in ns)
uint64_t system_clock_counter; // v1 mod. (moved from top of struct)
// Average clocks
// Average clocks (MHz)
uint16_t average_gfxclk_frequency;
uint16_t average_socclk_frequency;
uint16_t average_uclk_frequency;
@@ -1220,7 +1240,7 @@ typedef struct {
uint16_t average_vclk1_frequency;
uint16_t average_dclk1_frequency;
// Current clocks
// Current clocks (MHz)
uint16_t current_gfxclk;
uint16_t current_socclk;
uint16_t current_uclk;
@@ -1232,10 +1252,10 @@ typedef struct {
// Throttle status
uint32_t throttle_status;
// Fans
// Fans (RPM)
uint16_t current_fan_speed;
// Link width/speed
// Link width (number of lanes) /speed (0.1 GT/s)
uint16_t pcie_link_width; // v1 mod.(8->16)
uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16)
@@ -1274,19 +1294,19 @@ typedef struct {
uint16_t current_socket_power;
// Utilization (%)
uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
uint16_t vcn_activity[AMDSMI_MAX_NUM_VCN];
// Clock Lock Status. Each bit corresponds to clock instance
uint32_t gfxclk_lock_status;
// XGMI bus width and bitrate (in Gbps)
// XGMI bus width and bitrate (in GB/s)
uint16_t xgmi_link_width;
uint16_t xgmi_link_speed;
// PCIE accumulated bandwidth (GB/sec)
// PCIe accumulated bandwidth (GB/sec)
uint64_t pcie_bandwidth_acc;
// PCIE instantaneous bandwidth (GB/sec)
// PCIe instantaneous bandwidth (GB/sec)
uint64_t pcie_bandwidth_inst;
// PCIE L0 to recovery state transition accumulated count
@@ -1298,15 +1318,33 @@ typedef struct {
// PCIE replay rollover accumulated count
uint64_t pcie_replay_rover_count_acc;
// XGMI accumulated data transfer size(KiloBytes)
// XGMI accumulated data transfer size (KB)
uint64_t xgmi_read_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS];
uint64_t xgmi_write_data_acc[AMDSMI_MAX_NUM_XGMI_LINKS];
// Current clock frequencies
// Current clock frequencies (MHz)
uint16_t current_gfxclks[AMDSMI_MAX_NUM_GFX_CLKS];
uint16_t current_socclks[AMDSMI_MAX_NUM_CLKS];
uint16_t current_vclk0s[AMDSMI_MAX_NUM_CLKS];
uint16_t current_dclk0s[AMDSMI_MAX_NUM_CLKS];
/*
* v1.5 additions
*/
// Memory Bandwidth Usage Accumulated (GB/sec)
uint64_t mem_bandwidth_acc;
// Memory Bandwidth Maximum (GB/sec)
uint32_t mem_max_bandwidth;
// PCIE NAK sent accumulated count
uint32_t pcie_nak_sent_count_acc;
// PCIE NAK received accumulated count
uint32_t pcie_nak_rcvd_count_acc;
// JPEG activity % per AID
uint16_t jpeg_activity[AMDSMI_MAX_NUM_JPEG];
/// \endcond
} amdsmi_gpu_metrics_t;
@@ -48,6 +48,7 @@
#include "amd_smi/impl/amd_smi_processor.h"
#include "amd_smi/impl/amd_smi_drm.h"
#include "shared_mutex.h" // NOLINT
#include "rocm_smi/rocm_smi_logger.h"
namespace amd {
namespace smi {
+61 -56
Datei anzeigen
@@ -2117,63 +2117,68 @@ Input parameters:
Output: Dictionary with fields
Field | Description
---|---
`temperature_edge` | edge temperature value
`temperature_hotspot` | hotspot temperature value
`temperature_mem` | memory temperature value
`temperature_vrgfx` | vrgfx temperature value
`temperature_vrsoc` | vrsoc temperature value
`temperature_vrmem` | vrmem temperature value
`average_gfx_activity` | average gfx activity
`average_umc_activity` | average umc activity
`average_mm_activity` | average mm activity
`average_socket_power` | average socket power
`energy_accumulator` | energy accumulator value
`system_clock_counter` | system clock counter
`average_gfxclk_frequency` | average gfx clock frequency
`average_socclk_frequency` | average soc clock frequency
`average_uclk_frequency` | average uclk frequency
`average_vclk0_frequency` | average vclk0 frequency
`average_dclk0_frequency` | average dclk0 frequency
`average_vclk1_frequency` | average vclk1 frequency
`average_dclk1_frequency` | average dclk1 frequency
`current_gfxclk` | current gfx clock
`current_socclk` | current soc clock
`current_uclk` | current uclk
`current_vclk0` | current vclk0
`current_dclk0` | current dclk0
`current_vclk1` | current vclk1
`current_dclk1` | current dclk1
`throttle_status` | current throttle status
`current_fan_speed` | current fan speed
`pcie_link_width` | pcie link width
`pcie_link_speed` | pcie link speed
| Field | Description |Unit|
|-------|-------------|----|
`temperature_edge` | Edge temperature value | Celsius (C)
`temperature_hotspot` | Hotspot (aka junction) temperature value | Celsius (C)
`temperature_mem` | Memory temperature value | Celsius (C)
`temperature_vrgfx` | vrgfx temperature value | Celsius (C)
`temperature_vrsoc` | vrsoc temperature value | Celsius (C)
`temperature_vrmem` | vrmem temperature value | Celsius (C)
`average_gfx_activity` | Average gfx activity | %
`average_umc_activity` | Average umc activity | %
`average_mm_activity` | Average mm activity | %
`average_socket_power` | Average socket power | W
`energy_accumulator` | Energy accumulated with a 15.3 uJ resolution over 1ns | uJ
`system_clock_counter` | System clock counter | ns
`average_gfxclk_frequency` | Average gfx clock frequency | MHz
`average_socclk_frequency` | Average soc clock frequency | MHz
`average_uclk_frequency` | Average uclk frequency | MHz
`average_vclk0_frequency` | Average vclk0 frequency | MHz
`average_dclk0_frequency` | Average dclk0 frequency | MHz
`average_vclk1_frequency` | Average vclk1 frequency | MHz
`average_dclk1_frequency` | Average dclk1 frequency | MHz
`current_gfxclk` | Current gfx clock | MHz
`current_socclk` | Current soc clock | MHz
`current_uclk` | Current uclk | MHz
`current_vclk0` | Current vclk0 | MHz
`current_dclk0` | Current dclk0 | MHz
`current_vclk1` | Current vclk1 | MHz
`current_dclk1` | Current dclk1 | MHz
`throttle_status` | Current throttle status |
`current_fan_speed` | Current fan speed | RPM
`pcie_link_width` | PCIe link width (number of lanes) | lanes
`pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s
`padding` | padding
`gfx_activity_acc` | gfx activity acc
`mem_activity_acc` | mem activity acc
`temperature_hbm` | list of hbm temperatures
`firmware_timestamp` | timestamp from PMFW
`voltage_soc` | soc voltage
`voltage_gfx` | gfx voltage
`voltage_mem` | mem voltage
`indep_throttle_status` | asic independent throttle status
`current_socket_power` | current socket power
`vcn_activity` | list of encoding and decoding engine utilizations
`gfxclk_lock_status` | gfx clock lock status
`xgmi_link_width` | XGMI bus width
`xgmi_link_speed` | XGMI bitrate (in Gbps)
`pcie_bandwidth_acc` | PCIE accumulated bandwidth (GB/sec)
`pcie_bandwidth_inst` | PCIE instantaneous bandwidth (GB/sec)
`pcie_l0_to_recov_count_acc` | PCIE L0 to recovery state transition accumulated count
`pcie_replay_count_acc` | PCIE replay accumulated count
`pcie_replay_rover_count_acc` | PCIE replay rollover accumulated count
`xgmi_read_data_acc` | XGMI accumulated read data transfer size(KiloBytes)
`xgmi_write_data_acc` | XGMI accumulated write data transfer size(KiloBytes)
`current_gfxclks` | list of current gfx clock frequencies
`current_socclks` | list of current soc clock frequencies
`current_vclk0s` | list of current v0 clock frequencies
`current_dclk0s` | list of current d0 clock frequencies
`gfx_activity_acc` | gfx activity accumulated | %
`mem_activity_acc` | Memory activity accumulated | %
`temperature_hbm` | list of hbm temperatures | Celsius (C)
`firmware_timestamp` | timestamp from PMFW (10ns resolution) | ns
`voltage_soc` | soc voltage | mV
`voltage_gfx` | gfx voltage | mV
`voltage_mem` | mem voltage | mV
`indep_throttle_status` | ASIC independent throttle status (see drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h for bit flags) |
`current_socket_power` | Current socket power (also known as instant socket power) | W
`vcn_activity` | List of VCN encode/decode engine utilization per AID | %
`gfxclk_lock_status` | Clock lock status. Each bit corresponds to clock instance. |
`xgmi_link_width` | XGMI bus width | lanes
`xgmi_link_speed` | XGMI bitrate | GB/s
`pcie_bandwidth_acc` | PCIe accumulated bandwidth | GB/s
`pcie_bandwidth_inst` | PCIe instantaneous bandwidth | GB/s
`pcie_l0_to_recov_count_acc` | PCIe L0 to recovery state transition accumulated count |
`pcie_replay_count_acc` | PCIe replay accumulated count |
`pcie_replay_rover_count_acc` | PCIe replay rollover accumulated count |
`xgmi_read_data_acc` | XGMI accumulated read data transfer size (KiloBytes) | KB
`xgmi_write_data_acc` | XGMI accumulated write data transfer size (KiloBytes) | KB
`current_gfxclks` | List of current gfx clock frequencies | MHz
`current_socclks` | List of current soc clock frequencies | MHz
`current_vclk0s` | List of current v0 clock frequencies | MHz
`current_dclk0s` | List of current d0 clock frequencies | MHz
`mem_bandwidth_acc` | Memory bandwidth usage accumulated | GB/s
`mem_max_bandwidth` | Maximum memory bandwidth | GB/s
`pcie_nak_sent_count_acc` | PCIe NAK sent count accumulated |
`pcie_nak_rcvd_count_acc` | PCIe NAK received count accumulated |
`jpeg_activities[AID<X>]` | List of JPEG engine activity for each AID (X=0-3) | %
Exceptions that can be thrown by `amdsmi_get_gpu_metrics_info` function:
@@ -2978,6 +2978,14 @@ def amdsmi_get_gpu_metrics_info(
"current_socclks": list(gpu_metrics.current_socclks),
"current_vclk0s": list(gpu_metrics.current_vclk0s),
"current_dclk0s": list(gpu_metrics.current_dclk0s),
"mem_bandwidth_acc": gpu_metrics.mem_bandwidth_acc,
"mem_max_bandwidth": gpu_metrics.mem_max_bandwidth,
"pcie_nak_sent_count_acc": gpu_metrics.pcie_nak_sent_count_acc,
"pcie_nak_rcvd_count_acc": gpu_metrics.pcie_nak_rcvd_count_acc,
"jpeg_activities[AID0]": list(gpu_metrics.jpeg_activities)[:8],
"jpeg_activities[AID1]": list(gpu_metrics.jpeg_activities)[8:16],
"jpeg_activities[AID2]": list(gpu_metrics.jpeg_activities)[16:24],
"jpeg_activities[AID3]": list(gpu_metrics.jpeg_activities)[24:32],
}
@@ -894,6 +894,7 @@ amdsmi_clk_info_t = struct_amdsmi_clk_info_t
class struct_amdsmi_engine_usage_t(Structure):
pass
struct_amdsmi_engine_usage_t._pack_ = 1 # source:False
struct_amdsmi_engine_usage_t._fields_ = [
('gfx_activity', ctypes.c_uint32),
@@ -1514,6 +1515,11 @@ struct_amdsmi_gpu_metrics_t._fields_ = [
('current_socclks', ctypes.c_uint16 * 4),
('current_vclk0s', ctypes.c_uint16 * 4),
('current_dclk0s', ctypes.c_uint16 * 4),
('mem_bandwidth_acc', ctypes.c_uint64),
('mem_max_bandwidth', ctypes.c_uint32),
('pcie_nak_sent_count_acc', ctypes.c_uint32),
('pcie_nak_rcvd_count_acc', ctypes.c_uint32),
('jpeg_activities', ctypes.c_uint16 * 32),
]
amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t
+26 -2
Datei anzeigen
@@ -68,6 +68,7 @@
#include "amd_smi/impl/amdgpu_drm.h"
#include "amd_smi/impl/amd_smi_utils.h"
#include "amd_smi/impl/amd_smi_processor.h"
#include "rocm_smi/rocm_smi_logger.h"
#ifdef ENABLE_ESMI_LIB
#include "amd_smi/impl/amd_smi_cpu_socket.h"
#include "amd_smi/impl/amd_smi_cpu_core.h"
@@ -1154,9 +1155,32 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(
amdsmi_gpu_metrics_t *pgpu_metrics) {
AMDSMI_CHECK_INIT();
// nullptr api supported
return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
amdsmi_status_t ret =
rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics));
// WARNING: TEMPORARY - awaiting 1.5 update from amdgpu driver/firmware
// intended to be removed later
// START: REMOVE WHATS BELOW ME
uint8_t content_ver = pgpu_metrics->common_header.content_revision;
int8_t format_ver = pgpu_metrics->common_header.format_revision;
const uint8_t expected_format_ver = 1;
const uint8_t expected_content_ver = 4;
if (ret == AMDSMI_STATUS_SUCCESS &&
(format_ver == expected_format_ver &&
content_ver <= expected_content_ver)) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | SET JPEG_ACTIVITY to MAX_UINT16, "
<< "detected content version: " << std::dec << +content_ver
<< "; format version: " << std::dec << +format_ver
<< "; awaiting 1.5 metrics remove once released";
LOG_ALWAYS(ss);
std::fill_n(&pgpu_metrics->jpeg_activity[0],
(sizeof(pgpu_metrics->jpeg_activity) /
sizeof(pgpu_metrics->jpeg_activity[0])),
std::numeric_limits<uint16_t>::max());
}
// END: REMOVE WHATS ABOVE ME
return ret;
}
+17 -2
Datei anzeigen
@@ -43,6 +43,7 @@
#include "amd_smi/impl/amd_smi_utils.h"
#include "shared_mutex.h" // NOLINT
#include "rocm_smi/rocm_smi_logger.h"
static const uint32_t kAmdGpuId = 0x1002;
@@ -153,6 +154,20 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd
fgets(info->product_name, sizeof(info->product_name), fp);
fclose(fp);
}
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< "Returning status = AMDSMI_STATUS_SUCCESS"
<< " | model_number_path = " << model_number_path
<< "; info->model_number: " << info->model_number
<< "\n product_serial_path = " << product_serial_path
<< "; info->product_serial: " << info->product_serial
<< "\n fru_id_path = " << fru_id_path
<< "; info->fru_id: " << info->fru_id
<< "\n manufacturer_name_path = " << manufacturer_name_path
<< "; info->manufacturer_name: " << info->manufacturer_name
<< "\n product_name_path = " << product_name_path
<< "; info->product_name: " << info->product_name;
LOG_INFO(ss);
return AMDSMI_STATUS_SUCCESS;
}
@@ -236,8 +251,8 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_
unsigned int dpm_level, freq;
char firstChar = line[0];
if (firstChar == 'S'){
if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2){
if (firstChar == 'S') {
if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2) {
ranges.close();
return AMDSMI_STATUS_NO_DATA;
}