[SWDEV-496693] GPU Metrics 1.7

Features added:
- [SWDEV-475244] Add new interface to get max memory bandwidth
Updated API: amdsmi_get_gpu_vram_info
Updated: struct amdsmi_vram_info_t to include vram_max_bandwidth
CLI: amd-smi static --vram

- [SWDEV-488349] Add new interface for XGMI link status
New API: amdsmi_get_gpu_xgmi_link_status
CLI: amd-smi xgmi --link-status

Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Change-Id: I1aa35b741136eb4f02f7ea9a95b865886273eb72


[ROCm/amdsmi commit: f8b8347627]
This commit is contained in:
Juan Castillo
2024-11-07 16:35:17 -06:00
committed by Arif, Maisam
parent 01d303806a
commit 2ddb2ef032
16 changed files with 1281 additions and 63 deletions
+71 -4
View File
@@ -7,6 +7,42 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Added
- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth:
- `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s)
- `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down
- `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for the graphics clock below host limit violation status.
- **Added new API `amdsmi_get_gpu_xgmi_link_status()` and CLI `amd-smi xgmi --link-status`**
New API is defined as:
```C
typedef enum {
AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down
AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up
AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled
} amdsmi_xgmi_link_status_type_t;
typedef struct {
uint32_t total_links; //!< The total links in the status array
amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS];
uint64_t reserved[7];
} amdsmi_xgmi_link_status_t;
amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status)
```
Example CLI output:
```shell
$ amd-smi xgmi --link-status
XGMI LINK STATUS:
bdf link_status
GPU0 0000:08:00.0 U U U U D U D X
GPU1 0000:44:00.0 U U U U D U D X
...
* U:Up D:Down X:Disabled
```
- **Added fclk and socclk info to `amd-smi metric -c/--clock`**.
fclk and socclk information such as min and max clock have been added to the metric command, in line with all the other clocks.
@@ -77,12 +113,43 @@ GPU: 0
DCLK1: N/A
```
## amd_smi_lib for ROCm 6.4.0
### Added
### Changed
- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`**
Updated structure `amdsmi_vram_info_t`:
```C
typedef struct {
amdsmi_vram_type_t vram_type;
amdsmi_vram_vendor_type_t vram_vendor;
uint64_t vram_size;
uint32_t vram_bit_width;
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
uint64_t reserved[4];
} amdsmi_vram_info_t;
amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
```
Example CLI output:
```shell
$ amd-smi static --vram
GPU: 0
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 16368 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
GPU: 1
VRAM:
TYPE: GDDR6
VENDOR: N/A
SIZE: 30704 MB
BIT_WIDTH: 256
MAX_BANDWIDTH: 1555 GB/s
...
```
### Removed
- **Removed `GFX_BUSY_ACC` from `amd-smi metric --usage`**.
+78 -18
View File
@@ -751,7 +751,8 @@ class AMDSMICommands():
vram_info_dict = {"type" : "N/A",
"vendor" : "N/A",
"size" : "N/A",
"bit_width" : "N/A"}
"bit_width" : "N/A",
"max_bandwidth" : "N/A"}
try:
vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu)
@@ -790,6 +791,15 @@ class AMDSMICommands():
# Populate bit width
vram_info_dict['bit_width'] = vram_info['vram_bit_width']
# Populate vram_max_bandwidth
vram_max_bw = vram_info['vram_max_bandwidth']
vram_max_bw_unit = 'GB/s'
if self.logger.is_human_readable_format():
vram_info_dict["max_bandwidth"] = f"{vram_max_bw} {vram_max_bw_unit if vram_max_bw != 'N/A' else ''}"
if self.logger.is_json_format():
vram_info_dict["max_bandwidth"] = {"value" : vram_max_bw,
"unit" : vram_max_bw_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info())
@@ -1242,7 +1252,8 @@ class AMDSMICommands():
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None):
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
):
"""Get Metric information for target gpu
Args:
@@ -1338,7 +1349,8 @@ class AMDSMICommands():
current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level",
"xgmi_err", "energy", "throttle"]
current_platform_values += [args.fan, args.voltage_curve, args.overdrive,
args.perf_level, args.xgmi_err, args.energy, args.throttle]
args.perf_level, args.xgmi_err, args.energy, args.throttle,
]
if self.helpers.is_hypervisor():
if schedule:
@@ -2221,6 +2233,7 @@ class AMDSMICommands():
'socket_thermal_accumulated': "N/A",
'vr_thermal_accumulated': "N/A",
'hbm_thermal_accumulated': "N/A",
'gfx_below_host_limit_acc': "N/A",
# violation status values - active/not active
'prochot_violation_status': "N/A",
@@ -2311,7 +2324,7 @@ class AMDSMICommands():
def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None,
cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None,
cpu_pwr_svi_telemtry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
cpu_pwr_svi_telemetry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None,
cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None,
cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None):
@@ -2354,8 +2367,8 @@ class AMDSMICommands():
args.cpu_c0_res = cpu_c0_res
if cpu_lclk_dpm_level:
args.cpu_lclk_dpm_level = cpu_lclk_dpm_level
if cpu_pwr_svi_telemtry_rails:
args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemtry_rails
if cpu_pwr_svi_telemetry_rails:
args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemetry_rails
if cpu_io_bandwidth:
args.cpu_io_bandwidth = cpu_io_bandwidth
if cpu_xgmi_bandwidth:
@@ -2488,7 +2501,7 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
static_dict["socket_dpm"]["dpml_level_range"] = "N/A"
logging.debug("Failed to get socket dpm level range for cpu %s | %s", cpu_id, e.get_error_info())
if args.cpu_pwr_svi_telemtry_rails:
if args.cpu_pwr_svi_telemetry_rails:
static_dict["svi_telemetry_all_rails"] = {}
try:
power = amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(args.cpu)
@@ -2756,8 +2769,7 @@ class AMDSMICommands():
None: Print output via AMDSMILogger to destination
"""
# TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments
# Mutually exculsive args
# Mutually exclusive args
if gpu:
args.gpu = gpu
if cpu:
@@ -2832,7 +2844,7 @@ class AMDSMICommands():
cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor)
if args.core:
self.logger.output = {}
self.logger.clear_multiple_devices_ouput()
self.logger.clear_multiple_devices_output()
self.metric_core(args, multiple_devices, core, core_boost_limit,
core_curr_active_freq_core_limit, core_energy)
if args.gpu:
@@ -2843,7 +2855,8 @@ class AMDSMICommands():
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule,
guard, guest_data, fb_usage, xgmi, throttle)
guard, guest_data, fb_usage, xgmi, throttle,
)
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
if args.cpu == None and args.core == None:
# If no args are set, print out all CPU and Core metrics info
@@ -2877,7 +2890,8 @@ class AMDSMICommands():
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule, throttle)
xgmi_err, energy, mem_usage, schedule, throttle,
)
def process(self, args, multiple_devices=False, watching_output=False,
@@ -5350,13 +5364,14 @@ class AMDSMICommands():
print("Placeholder for rocm-smi legacy commands")
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None):
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_link_status=None):
""" Get topology information for target gpus
params:
args - argparser args to pass to subcommand
multiple_devices (bool) - True if checking for multiple devices
gpu (device_handle) - device_handle for target device
metric (bool) - Value override for args.metric
xgmi_link_status (bool) - Value override for args.xgmi_link_status
return:
Nothing
@@ -5368,6 +5383,8 @@ class AMDSMICommands():
args.gpu = gpu
if metric:
args.metric = metric
if xgmi_link_status:
args.link_status = xgmi_link_status
# Handle No GPU passed
if args.gpu == None:
@@ -5377,8 +5394,9 @@ class AMDSMICommands():
args.gpu = [args.gpu]
# Handle all args being false
if not any([args.metric]):
if not any([args.metric, args.link_status]):
args.metric = True
args.link_status = True
# Clear the table header
self.logger.table_header = ''.rjust(7)
@@ -5396,9 +5414,9 @@ class AMDSMICommands():
if args.metric:
# prepend link metrics header to the table header
link_metrics_header = " " + "bdf".ljust(13) + \
"bit_rate".ljust(9) + "max_bandwidth".ljust(14) + \
"link_type".ljust(10)
link_metrics_header = " " + "bdf".ljust(14) + \
"bit_rate".ljust(10) + "max_bandwidth".ljust(15) + \
"link_type".ljust(11)
self.logger.table_header = link_metrics_header + self.logger.table_header.strip()
# Populate dictionary according to format
@@ -5544,7 +5562,7 @@ class AMDSMICommands():
# Print out the tabular output
self.logger.multiple_device_output = tabular_output
self.logger.table_title = "LINK METRIC TABLE"
self.logger.table_title = "\nLINK METRIC TABLE"
self.logger.print_output(multiple_device_enabled=True, tabular=True)
self.logger.multiple_device_output = xgmi_values
@@ -5558,6 +5576,48 @@ class AMDSMICommands():
if not self.logger.is_human_readable_format():
self.logger.print_output(multiple_device_enabled=True)
if args.link_status:
# Header modification
self.logger.table_header = ''.rjust(7)
current_header = " ".ljust(7) + \
"bdf".ljust(14) + \
"link_status".ljust(20)
self.logger.table_header = current_header + self.logger.table_header.strip()
# Process each GPU
tabular_output = []
for xgmi_dict in xgmi_values:
src_gpu_id = xgmi_dict['gpu']
src_gpu_bdf = xgmi_dict['bdf']
src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf)
# Populate link statuses
status_row = []
tabular_output_dict = {"gpu#": f"GPU{src_gpu_id}",
"gpu": src_gpu_id,
"bdf": src_gpu_bdf,
"link_status": "N/A"}
try:
link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu)
tabular_output_dict['link_status'] = link_status['status']
if self.logger.is_human_readable_format():
del tabular_output_dict['gpu']
else:
del tabular_output_dict['gpu#']
tabular_output.append(tabular_output_dict)
except amdsmi_exception.AmdSmiLibraryException as e:
xgmi_dict['link_metrics']['link_status']={"status": "failed"}
logging.debug("Failed to get XGMI link status for GPU %s | %s", src_gpu_id, e.get_error_info())
#populate link status data for output
if self.logger.is_human_readable_format():
xgmi_dict['link_status'] = tabular_output
self.logger.multiple_device_output= tabular_output
self.logger.table_title = "\nXGMI LINK STATUS"
self.logger.print_output(multiple_device_enabled=True, tabular=True)
self.logger.clear_multiple_devices_ouput()
if self.logger.is_human_readable_format():
print("\n* U:Up D:Down X:Disabled".ljust(13))
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
""" Display parition information for the target GPU
@@ -780,6 +780,8 @@ class AMDSMIHelpers():
def convert_bytes_to_readable(self, bytes_input, format_length=None):
if isinstance(bytes_input, str):
return "N/A"
for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"]:
if abs(bytes_input) < 1024:
if format_length is not None:
+9 -6
View File
@@ -132,15 +132,18 @@ class AMDSMILogger():
elif key == "gpu#":
table_values += string_value.ljust(7)
elif key == "bdf":
table_values += string_value.ljust(13)
table_values += string_value.ljust(14)
elif "bdf_" in key:
table_values += string_value.ljust(13)
elif key == "bit_rate":
table_values += string_value.ljust(9)
elif key == "max_bandwidth":
table_values += string_value.ljust(14)
elif key == "link_type":
table_values += string_value.ljust(10)
elif key == "max_bandwidth":
table_values += string_value.ljust(15)
elif key == "link_type":
table_values += string_value.ljust(11)
elif key == "link_status":
for i in value:
table_values += str(i).ljust(3)
elif key == "memory":
table_values += string_value.ljust(8)
elif key == "accelerator_type":
@@ -166,7 +169,7 @@ class AMDSMILogger():
elif key == "resources_shared":
table_values += string_value.ljust(18)
elif key == "RW":
table_values += string_value.ljust(53)
table_values += string_value.ljust(57)
elif key == "process_list":
#Add an additional padding between the first instance of GPU and NAME
table_values += ' '
@@ -1313,6 +1313,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Help text for Arguments only on Guest and BM platforms
metrics_help = "Metric XGMI information"
xgmi_link_status_help = "XGMI Link Status information"
# Create xgmi subparser
xgmi_parser = subparsers.add_parser('xgmi', help=xgmi_help, description=xgmi_subcommand_help)
@@ -1326,6 +1327,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Optional Args
xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help)
xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help)
def _add_partition_parser(self, subparsers, func):
+38 -3
View File
@@ -293,9 +293,11 @@ int main() {
CHK_AMDSMI_RET(ret)
printf(" Output of amdsmi_get_gpu_vram_info:\n");
printf("\tVRAM Size: 0x%lx (%ld) \n", vram_info.vram_size, vram_info.vram_size);
printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width, vram_info.vram_bit_width);
}
else {
printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width,
vram_info.vram_bit_width);
printf("\tVRAM max bandwidth: 0x%lx (%lu) \n\n", vram_info.vram_max_bandwidth,
vram_info.vram_max_bandwidth);
} else {
printf("\t**amdsmi_get_gpu_vram_info() not supported on this system.\n");
}
@@ -865,6 +867,18 @@ int main() {
++idx;
}
std::cout << std::dec << "\txgmi_link_status= [";
idx = 0;
for (const auto& temp : smu.xgmi_link_status) {
std::cout << temp;
if ((idx + 1) != std::size(smu.xgmi_link_status)) {
std::cout << ", ";
} else {
std::cout << "]\n";
}
++idx;
}
// Voltage (mV)
std::cout << "\tvoltage_soc = " << std::dec << smu.voltage_soc << "\n";
std::cout << "\tvoltage_gfx = " << std::dec << smu.voltage_gfx << "\n";
@@ -880,6 +894,9 @@ int main() {
std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n";
std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n";
// VRAM max bandwidth at max memory clock
std::cout << "\tvram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n";
// Counts
std::cout << "\tpcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc
<< "\n";
@@ -983,6 +1000,24 @@ int main() {
idx++;
}
idx = 0;
idy = 0;
std::cout << "\txcp_stats.gfx_below_host_limit_acc: " << "\n";
for (auto& row : smu.xcp_stats) {
std::cout << "\t XCP [" << idx << "] : [";
for (auto& col : row.gfx_below_host_limit_acc) {
if ((idy + 1) != static_cast<int>(std::size(row.gfx_below_host_limit_acc))) {
std::cout << col << ", ";
} else {
std::cout << col;
}
idy++;
}
std::cout << "]\n";
idy = 0;
idx++;
}
std::cout << "\n\n";
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
constexpr uint16_t kMAX_ITER_TEST = 10;
+53 -2
View File
@@ -710,7 +710,8 @@ typedef struct {
amdsmi_vram_vendor_type_t vram_vendor;
uint64_t vram_size;
uint32_t vram_bit_width;
uint64_t reserved[5];
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
uint64_t reserved[4];
} amdsmi_vram_info_t;
typedef struct {
@@ -1325,13 +1326,22 @@ typedef struct {
* @brief The following structures hold the gpu statistics for a device.
*/
typedef struct {
/* Utilization Instantaneous (%) */
/*
* v1.6 additions
*/
/* Utilization Instantaneous (%) */
uint32_t gfx_busy_inst[AMDSMI_MAX_NUM_XCC];
uint16_t jpeg_busy[AMDSMI_MAX_NUM_JPEG];
uint16_t vcn_busy[AMDSMI_MAX_NUM_VCN];
/* Utilization Accumulated (%) */
uint64_t gfx_busy_acc[AMDSMI_MAX_NUM_XCC];
/*
* v1.7 additions
*/
/* Total App Clock Counter Accumulated */
uint64_t gfx_below_host_limit_acc[AMDSMI_MAX_NUM_XCC];
} amdsmi_gpu_xcp_metrics_t;
typedef struct {
@@ -1533,9 +1543,30 @@ typedef struct {
/* PCIE other end recovery counter */
uint32_t pcie_lc_perf_other_end_recovery;
/*
* v1.7 additions
*/
/* VRAM max bandwidth at max memory clock (GB/s) */
uint64_t vram_max_bandwidth;
/* XGMI link status(up/down) */
uint16_t xgmi_link_status[AMDSMI_MAX_NUM_XGMI_LINKS];
/// \endcond
} amdsmi_gpu_metrics_t;
/**
 * @brief State of a single XGMI link, as stored per link in
 * amdsmi_xgmi_link_status_t::status.
 */
typedef enum {
AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down
AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up
AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled
} amdsmi_xgmi_link_status_type_t;
/**
 * @brief XGMI link status of a processor, filled in by
 * amdsmi_get_gpu_xgmi_link_status().
 */
typedef struct {
uint32_t total_links; //!< The total links in the status array
amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; //!< Per-link state; the Python binding reads only the first total_links entries
uint64_t reserved[7]; //!< Reserved (presumably for future extension - TODO confirm)
} amdsmi_xgmi_link_status_t;
#define MAX_AMDSMI_NAME_LENGTH 64
/**
@@ -4828,6 +4859,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
amdsmi_status_t
amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info);
/**
* @brief Get the XGMI link status
*
* @platform{gpu_bm_linux} @platform{host}
*
* @details Given a processor handle @p processor_handle, this function
* will return the link status for each XGMI link connected to this processor.
* If the processor link type is not XGMI, it should return AMDSMI_STATUS_NOT_SUPPORTED.
*
* @param[in] processor_handle a processor handle
*
* @param[out] link_status The status of the XGMI links connected to this processor.
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t
amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle,
amdsmi_xgmi_link_status_t* link_status);
/** @} End asicinfo */
/*****************************************************************************/
@@ -5756,6 +5806,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model);
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
*/
amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string);
#endif
/** @} auxiquer */
+1
View File
@@ -105,6 +105,7 @@ from .amdsmi_interface import amdsmi_get_clock_info
from .amdsmi_interface import amdsmi_get_pcie_info
from .amdsmi_interface import amdsmi_get_gpu_bad_page_info
from .amdsmi_interface import amdsmi_get_violation_status
from .amdsmi_interface import amdsmi_get_gpu_xgmi_link_status
# # Process Information
from .amdsmi_interface import amdsmi_get_gpu_process_list
@@ -1818,15 +1818,50 @@ def amdsmi_get_gpu_vram_info(
amdsmi_wrapper.amdsmi_get_gpu_vram_info(
processor_handle, ctypes.byref(vram_info))
)
return {
"vram_type": vram_info.vram_type,
"vram_vendor": vram_info.vram_vendor,
"vram_size": vram_info.vram_size,
"vram_bit_width": vram_info.vram_bit_width
"vram_bit_width": _validate_if_max_uint(vram_info.vram_bit_width, MaxUIntegerTypes.UINT32_T),
"vram_max_bandwidth": _validate_if_max_uint(vram_info.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T),
}
def amdsmi_get_gpu_xgmi_link_status(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
    """Return the XGMI link status of a processor.

    Parameters:
        processor_handle: handle of the device to query.

    Returns:
        A dict with two keys:
          "status": list with one entry per reported link (at most the size
              of the library's status array). Each entry is "U" (up),
              "D" (down), "X" (disabled), or "N/A" when the library reports
              a value that is not a known amdsmi_xgmi_link_status_type_t.
          "total_links": the link count reported by the library.

    Raises:
        AmdSmiParameterException: if processor_handle is not an
            amdsmi_wrapper.amdsmi_processor_handle.
        Any error raised by _check_res for a failing library call is
        propagated unchanged.
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )

    status_info = amdsmi_wrapper.amdsmi_xgmi_link_status_t()
    _check_res(
        amdsmi_wrapper.amdsmi_get_gpu_xgmi_link_status(
            processor_handle, ctypes.byref(status_info))
    )

    # Enum name -> one-letter code shown by the CLI.
    status_codes = {
        'AMDSMI_XGMI_LINK_DISABLE': "X",  # XGMI link is disabled
        'AMDSMI_XGMI_LINK_UP': "U",       # XGMI link is up
        'AMDSMI_XGMI_LINK_DOWN': "D",     # XGMI link is down
    }
    enum_names = amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues

    # Only the first total_links entries are reported; slicing also clamps to
    # the array size if the library ever reports a larger count.
    # .get() is used for both lookups so that an unrecognised raw value maps
    # to "N/A" instead of raising KeyError.
    link_status = [
        status_codes.get(enum_names.get(link), "N/A")
        for link in status_info.status[:status_info.total_links]
    ]

    return {
        "status": link_status,
        "total_links": status_info.total_links,
    }
def amdsmi_get_gpu_cache_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> List[Dict[str, Any]]:
@@ -3863,7 +3898,10 @@ def amdsmi_get_gpu_metrics_info(
"xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats),
"xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats),
"xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats),
"xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats),
"pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T),
"vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T),
"xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T),
}
# Create 2d array with each XCD's stats
@@ -3893,6 +3931,12 @@ def amdsmi_get_gpu_metrics_info(
for val in item.gfx_busy_acc:
print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
gpu_metrics_output[k][curr_xcp] = print_xcp_detail
if 'xcp_stats.gfx_below_host_limit_acc' in k:
for curr_xcp, item in enumerate(v):
print_xcp_detail = []
for val in item.gfx_below_host_limit_acc:
print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
gpu_metrics_output[k][curr_xcp] = print_xcp_detail
return gpu_metrics_output
+49 -15
View File
@@ -1044,7 +1044,8 @@ struct_amdsmi_vram_info_t._fields_ = [
('vram_size', ctypes.c_uint64),
('vram_bit_width', ctypes.c_uint32),
('PADDING_0', ctypes.c_ubyte * 4),
('reserved', ctypes.c_uint64 * 5),
('vram_max_bandwidth', ctypes.c_uint64),
('reserved', ctypes.c_uint64 * 4),
]
amdsmi_vram_info_t = struct_amdsmi_vram_info_t
@@ -1119,6 +1120,16 @@ amdsmi_process_handle_t = ctypes.c_uint32
class struct_amdsmi_proc_info_t(Structure):
pass
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
class struct_memory_usage_(Structure):
pass
@@ -1130,16 +1141,6 @@ struct_memory_usage_._fields_ = [
('reserved', ctypes.c_uint32 * 10),
]
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
struct_amdsmi_proc_info_t._fields_ = [
('name', ctypes.c_char * 256),
@@ -1739,6 +1740,7 @@ struct_amdsmi_gpu_xcp_metrics_t._fields_ = [
('jpeg_busy', ctypes.c_uint16 * 32),
('vcn_busy', ctypes.c_uint16 * 4),
('gfx_busy_acc', ctypes.c_uint64 * 8),
('gfx_below_host_limit_acc', ctypes.c_uint64 * 8),
]
amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t
@@ -1820,9 +1822,34 @@ struct_amdsmi_gpu_metrics_t._fields_ = [
('xcp_stats', struct_amdsmi_gpu_xcp_metrics_t * 8),
('pcie_lc_perf_other_end_recovery', ctypes.c_uint32),
('PADDING_5', ctypes.c_ubyte * 4),
('vram_max_bandwidth', ctypes.c_uint64),
('xgmi_link_status', ctypes.c_uint16 * 8),
]
amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t
# values for enumeration 'amdsmi_xgmi_link_status_type_t'
amdsmi_xgmi_link_status_type_t__enumvalues = {
0: 'AMDSMI_XGMI_LINK_DOWN',
1: 'AMDSMI_XGMI_LINK_UP',
2: 'AMDSMI_XGMI_LINK_DISABLE',
}
AMDSMI_XGMI_LINK_DOWN = 0
AMDSMI_XGMI_LINK_UP = 1
AMDSMI_XGMI_LINK_DISABLE = 2
amdsmi_xgmi_link_status_type_t = ctypes.c_uint32 # enum
class struct_amdsmi_xgmi_link_status_t(Structure):
pass
struct_amdsmi_xgmi_link_status_t._pack_ = 1 # source:False
struct_amdsmi_xgmi_link_status_t._fields_ = [
('total_links', ctypes.c_uint32),
('status', amdsmi_xgmi_link_status_type_t * 8),
('PADDING_0', ctypes.c_ubyte * 4),
('reserved', ctypes.c_uint64 * 7),
]
amdsmi_xgmi_link_status_t = struct_amdsmi_xgmi_link_status_t
class struct_amdsmi_name_value_t(Structure):
pass
@@ -2397,6 +2424,9 @@ amdsmi_get_pcie_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_
amdsmi_get_xgmi_info = _libraries['libamd_smi.so'].amdsmi_get_xgmi_info
amdsmi_get_xgmi_info.restype = amdsmi_status_t
amdsmi_get_xgmi_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_info_t)]
amdsmi_get_gpu_xgmi_link_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_xgmi_link_status
amdsmi_get_gpu_xgmi_link_status.restype = amdsmi_status_t
amdsmi_get_gpu_xgmi_link_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_link_status_t)]
amdsmi_get_fw_info = _libraries['libamd_smi.so'].amdsmi_get_fw_info
amdsmi_get_fw_info.restype = amdsmi_status_t
amdsmi_get_fw_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_fw_info_t)]
@@ -2763,8 +2793,9 @@ __all__ = \
'AMDSMI_VRAM_VENDOR__PLACEHOLDER3',
'AMDSMI_VRAM_VENDOR__PLACEHOLDER4',
'AMDSMI_VRAM_VENDOR__PLACEHOLDER5', 'AMDSMI_VRAM_VENDOR__SAMSUNG',
'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_STATUS_ERROR',
'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_LINK_DISABLE',
'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP',
'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN',
'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t',
'amdsmi_accelerator_partition_profile_t',
@@ -2851,7 +2882,8 @@ __all__ = \
'amdsmi_get_gpu_total_ecc_count', 'amdsmi_get_gpu_vbios_info',
'amdsmi_get_gpu_vendor_name', 'amdsmi_get_gpu_volt_metric',
'amdsmi_get_gpu_vram_info', 'amdsmi_get_gpu_vram_usage',
'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_hsmp_metrics_table',
'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_gpu_xgmi_link_status',
'amdsmi_get_hsmp_metrics_table',
'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version',
'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest',
'amdsmi_get_minmax_bandwidth_between_processors',
@@ -2924,6 +2956,7 @@ __all__ = \
'amdsmi_voltage_metric_t', 'amdsmi_voltage_type_t',
'amdsmi_vram_info_t', 'amdsmi_vram_type_t', 'amdsmi_vram_usage_t',
'amdsmi_vram_vendor_type_t', 'amdsmi_xgmi_info_t',
'amdsmi_xgmi_link_status_t', 'amdsmi_xgmi_link_status_type_t',
'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t',
'struct__links', 'struct_amd_metrics_table_header_t',
'struct_amdsmi_accelerator_partition_profile_t',
@@ -2958,7 +2991,8 @@ __all__ = \
'struct_amdsmi_vbios_info_t', 'struct_amdsmi_version_t',
'struct_amdsmi_violation_status_t', 'struct_amdsmi_vram_info_t',
'struct_amdsmi_vram_usage_t', 'struct_amdsmi_xgmi_info_t',
'struct_cache_', 'struct_engine_usage_', 'struct_fw_info_list_',
'struct_amdsmi_xgmi_link_status_t', 'struct_cache_',
'struct_engine_usage_', 'struct_fw_info_list_',
'struct_memory_usage_', 'struct_nps_flags_',
'struct_pcie_metric_', 'struct_pcie_static_',
'struct_amdsmi_bdf_t', 'uint32_t', 'uint64_t', 'uint8_t',
@@ -1085,6 +1085,9 @@ typedef struct metrics_table_header_t metrics_table_header_t;
* @brief The following structures hold the gpu statistics for a device.
*/
struct amdgpu_xcp_metrics_t {
/*
* v1.6 additions
*/
/* Utilization Instantaneous (%) */
uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC];
uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS];
@@ -1092,6 +1095,12 @@ struct amdgpu_xcp_metrics_t {
/* Utilization Accumulated (%) */
uint64_t gfx_busy_acc[RSMI_MAX_NUM_XCC];
/*
* v1.7 additions
*/
/* Total App Clock Counter Accumulated */
uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC];
};
typedef struct {
@@ -1295,6 +1304,15 @@ typedef struct {
/* PCIE other end recovery counter */
uint32_t pcie_lc_perf_other_end_recovery;
/*
* v1.7 additions
*/
/* VRAM max bandwidth at max memory clock */
uint64_t vram_max_bandwidth;
/* XGMI link status(up/down) */
uint16_t xgmi_link_status[RSMI_MAX_NUM_XGMI_LINKS];
/// \endcond
} rsmi_gpu_metrics_t;
@@ -88,6 +88,19 @@ struct AMDGpuMetricsHeader_v1_t {
uint8_t m_content_revision;
};
/*
 * Per-XCP metrics entry used by AMDGpuMetrics_v17_t::m_xcp_stats.
 * NOTE(review): field order presumably mirrors the driver's binary layout;
 * confirm before reordering or inserting members.
 */
struct amdgpu_xcp_metrics_v1_1 {
/* Utilization Instantaneous (%) */
uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES];
uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS];
/* Utilization Accumulated (%) */
uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC];
/* Total App Clock Counter Accumulated (gfx clock below host limit, one entry per XCC) */
uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC];
};
struct amdgpu_xcp_metrics {
/* Utilization Instantaneous (%) */
uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
@@ -551,7 +564,107 @@ struct AMDGpuMetrics_v16_t {
/* PCIE other end recovery counter */
uint32_t m_pcie_lc_perf_other_end_recovery;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v16_t;
/*
 * GPU metrics table, version 1.7.
 *
 * GpuMetricsBase_v17_t stores one instance of this struct and reports
 * sizeof(AMDGpuMetrics_v17_t) as the metrics table size. Members tagged
 * "new for 1.7" are the additions called out for this version.
 * NOTE(review): field order and explicit padding presumably mirror the
 * driver's binary layout - confirm before reordering or inserting members.
 */
struct AMDGpuMetrics_v17_t {
~AMDGpuMetrics_v17_t() = default;
struct AMDGpuMetricsHeader_v1_t m_common_header;
/* Temperature (Celsius) */
uint16_t m_temperature_hotspot;
uint16_t m_temperature_mem;
uint16_t m_temperature_vrsoc;
/* Power (Watts) */
uint16_t m_current_socket_power;
/* Utilization (%) */
uint16_t m_average_gfx_activity;
uint16_t m_average_umc_activity; // memory controller
/* VRAM max bandwidth at max memory clock (GB/s) */
uint64_t m_vram_max_bandwidth; // new for 1.7
/* Energy (15.259uJ (2^-16) units) */
uint64_t m_energy_accumulator;
/* Driver attached timestamp (in ns) */
uint64_t m_system_clock_counter;
/* Accumulation cycle counter */
uint32_t m_accumulation_counter;
/* Accumulated throttler residencies */
uint32_t m_prochot_residency_acc;
uint32_t m_ppt_residency_acc;
uint32_t m_socket_thm_residency_acc;
uint32_t m_vr_thm_residency_acc;
uint32_t m_hbm_thm_residency_acc;
/* Clock Lock Status. Each bit corresponds to clock instance */
uint32_t m_gfxclk_lock_status;
/* Link width (number of lanes) and speed (in 0.1 GT/s) */
uint16_t m_pcie_link_width;
uint16_t m_pcie_link_speed;
/* XGMI bus width and bitrate (in Gbps) */
uint16_t m_xgmi_link_width;
uint16_t m_xgmi_link_speed;
/* Utilization Accumulated (%) */
uint32_t m_gfx_activity_acc;
uint32_t m_mem_activity_acc;
/* PCIE accumulated bandwidth (GB/sec) */
uint64_t m_pcie_bandwidth_acc;
/* PCIE instantaneous bandwidth (GB/sec) */
uint64_t m_pcie_bandwidth_inst;
/* PCIE L0 to recovery state transition accumulated count */
uint64_t m_pcie_l0_to_recov_count_acc;
/* PCIE replay accumulated count */
uint64_t m_pcie_replay_count_acc;
/* PCIE replay rollover accumulated count */
uint64_t m_pcie_replay_rover_count_acc;
/* PCIE NAK sent accumulated count */
uint32_t m_pcie_nak_sent_count_acc;
/* PCIE NAK received accumulated count */
uint32_t m_pcie_nak_rcvd_count_acc;
/* XGMI accumulated data transfer size (KiloBytes) */
uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
/* XGMI link status (up/down) */
uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; // new for 1.7
/* Explicit filler after the link-status array (presumably to match the driver layout - TODO confirm) */
uint16_t m_padding;
/* PMFW attached timestamp (10ns resolution) */
uint64_t m_firmware_timestamp;
/* Current clocks (MHz) */
uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_uclk;
/* Number of current partition */
uint16_t m_num_partition;
/* XCP metrics stats (v1_1 entry type, which carries gfx_below_host_limit_acc) */
struct amdgpu_xcp_metrics_v1_1 m_xcp_stats[kRSMI_MAX_NUM_XCP];
/* PCIE other end recovery counter */
uint32_t m_pcie_lc_perf_other_end_recovery;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t;
/**
* This is GPU Metrics version that gets to public access.
@@ -766,8 +879,11 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
kMetricJpegBusy, // v1.6
kMetricVcnBusy, // v1.6
kMetricGfxBusyAcc, // v1.6
kMetricPcieLCPerfOtherEndRecov, // v1.6
kMetricVramMaxBandwidth, // v1.7
kMetricXgmiLinkStatus, // v1.7
kMetricGfxBelowHostLimitAccumulator, // v1.7
};
using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map<AMDGpuMetricsUnitType_t, std::string>;
@@ -805,6 +921,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
kGpuMetricV14 = (0x1 << 4),
kGpuMetricV15 = (0x1 << 5),
kGpuMetricV16 = (0x1 << 6),
kGpuMetricV17 = (0x1 << 7),
};
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
using GpuMetricTypePtr_t = std::shared_ptr<void>;
@@ -1023,6 +1140,36 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t {
std::shared_ptr<AMDGpuMetrics_v16_t> m_gpu_metric_ptr;
};
/**
 * GPU metrics handler for the v1.7 metrics table layout.
 *
 * Holds one AMDGpuMetrics_v17_t instance by value and exposes it through the
 * GpuMetricsBase_t interface.
 *
 * NOTE(review): the shared pointer handed out by get_metrics_table() aliases
 * the by-value table member of this object; a copy of this object made after
 * that call would presumably still point at the source object's table --
 * confirm no caller copies an instance after the pointer has been created.
 */
class GpuMetricsBase_v17_t final : public GpuMetricsBase_t {
 public:
  ~GpuMetricsBase_v17_t() = default;

  /** Size, in bytes, of the internal v1.7 metrics table. */
  size_t sizeof_metric_table() override {
    return sizeof(AMDGpuMetrics_v17_t);
  }

  /**
   * Returns a type-erased shared pointer to the internal metrics table.
   * The pointer is created lazily on first use and is non-owning.
   */
  GpuMetricTypePtr_t get_metrics_table() override {
    if (m_gpu_metric_ptr == nullptr) {
      // The table is a data member of this object, so the deleter must be a
      // no-op: the shared pointer only aliases the storage.
      const auto no_op_deleter = [](AMDGpuMetrics_v17_t*) {};
      m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, no_op_deleter);
    }
    assert(m_gpu_metric_ptr != nullptr);
    return m_gpu_metric_ptr;
  }

  /** Writes the content of the internal table to the debug log. */
  void dump_internal_metrics_table() override;

  /** Identifies this handler as the v1.7 implementation. */
  AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
    return AMDGpuMetricVersionFlags_t::kGpuMetricV17;
  }

  /** Rebuilds the dynamic metrics table from the internal table. */
  rsmi_status_t populate_metrics_dynamic_tbl() override;

  /** Converts the internal table into the public (latest) metrics layout. */
  AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;

 private:
  // Declaration order is kept: the table first, then the pointer aliasing it.
  AMDGpuMetrics_v17_t m_gpu_metrics_tbl;
  std::shared_ptr<AMDGpuMetrics_v17_t> m_gpu_metric_ptr;
};
/**
 * Declaration of the templated single-metric query; the definition is not
 * visible in this part of the file.
 *
 * @tparam T              type of the caller-provided result object
 * @param dv_ind          device index
 * @param metric_counter  identifies which metric unit is requested
 * @param metric_value    passed by non-const reference; presumably receives
 *                        the queried value -- confirm against the definition
 * @return an rsmi_status_t status code
 */
template<typename T>
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind,
AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
@@ -136,6 +136,7 @@ std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metri
// version 1.4: 260
// version 1.5: 261
// version 1.6: 262
// version 1.7: 263
//
const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table
{
@@ -145,6 +146,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
{join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14},
{join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15},
{join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16},
{join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17},
};
/**
@@ -264,6 +266,12 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
// kGpuMetricLinkWidthSpeed
{AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */
{AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */
{AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
"GfxBelowHostLimitAccumulator"}, /* v1.7 */
};
@@ -352,6 +360,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
{AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared<GpuMetricsBase_v14_t>(GpuMetricsBase_v14_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared<GpuMetricsBase_v16_t>(GpuMetricsBase_v16_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared<GpuMetricsBase_v17_t>(GpuMetricsBase_v17_t{})},
};
GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
@@ -470,6 +479,197 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
return multi_values;
}
namespace {

/**
 * Appends every element of a 1-D metric array to @p out, one per line, in the
 * form "\t [index]: value\n". Indices always start at 0.
 */
template <typename Range>
void append_indexed_values_v17(std::ostringstream& out, const Range& values) {
  auto idx = uint64_t(0);
  for (const auto& value : values) {
    out << "\t [" << idx << "]: " << value << "\n";
    ++idx;
  }
}

/**
 * Appends one row (one XCP) of a per-XCP stat array to @p out.
 *
 * The first row is prefixed with "\t [ ". Elements are written as
 * "\t [row] [col]: value" and separated by ", ". Every row except the last one
 * ends with "\n"; the last row ends with "]\n".
 *
 * @param row_idx   zero-based index of the row being written
 * @param num_rows  total number of rows of the stat
 * @param cols      the row's values
 */
template <typename Range>
void append_xcp_stat_row_v17(std::ostringstream& out, uint64_t row_idx,
                             uint64_t num_rows, const Range& cols) {
  const auto num_cols = static_cast<uint64_t>(std::end(cols) - std::begin(cols));
  if (row_idx == 0) {
    out << "\t [ ";
  }

  auto col_idx = uint64_t(0);
  for (const auto& col : cols) {
    out << "\t [" << row_idx << "] [" << col_idx << "]: " << col;
    if (col_idx + 1 != num_cols) {
      out << ", ";
    }
    ++col_idx;
  }

  out << ((row_idx + 1 != num_rows) ? "\n" : "]\n");
}

}  // namespace

/**
 * Dumps the content of the internal v1.7 metrics table to the debug log.
 *
 * Scalar fields are written first, followed by the 1-D arrays (XGMI link
 * status, XGMI read/write accumulators, current clocks) and finally the
 * per-XCP stats. Every array section starts with its own label line and its
 * indices start at 0.
 */
void GpuMetricsBase_v17_t::dump_internal_metrics_table()
{
  std::ostringstream ss;
  auto idx = uint64_t(0);

  // NOTE(review): this marker goes to stdout unconditionally while everything
  // else goes through LOG_DEBUG; kept as-is for behavioral compatibility.
  std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
  ss << __PRETTY_FUNCTION__
     << " | ======= DEBUG ======= "
     << " | Metric Version: "
     << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
     << " | Size: "
     << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
     << " |"
     << "\n";

  ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
     << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
     << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
     << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
     << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
     << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";

  ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n"  // new for v1.7
     << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
     << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
     << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n"
     << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n"
     << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n"
     << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n"
     << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n"
     << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n"
     << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
     << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
     << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
     << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
     << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
     << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
     << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
     << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
     << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
     << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
     << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
     << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
     << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
     << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"
     << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"
     << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n"
     << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n"
     << " pcie_lc_perf_other_end_recovery: "
     << m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n";

  // 1-D arrays: each section gets its own label and zero-based indices.
  ss << " xgmi_link_status: " << "\n";  // new for v1.7
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_xgmi_link_status);

  ss << " xgmi_read_data_acc: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_xgmi_read_data_acc);

  ss << " xgmi_write_data_acc: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_xgmi_write_data_acc);

  ss << " current_gfxclk: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_current_gfxclk);

  ss << " current_socclk: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_current_socclk);

  ss << " current_vclk0: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_current_vclk0);

  ss << " current_dclk0: " << "\n";
  append_indexed_values_v17(ss, m_gpu_metrics_tbl.m_current_dclk0);

  // Per-XCP stats: one output row per XCP, column indices restart on each row.
  const auto num_xcp_rows = static_cast<uint64_t>(
      std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::begin(m_gpu_metrics_tbl.m_xcp_stats));

  ss << " xcp_stats.gfx_busy_inst: " << "\n";
  idx = 0;
  for (const auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
    append_xcp_stat_row_v17(ss, idx, num_xcp_rows, row.gfx_busy_inst);
    ++idx;
  }

  ss << " xcp_stats.vcn_busy: " << "\n";
  idx = 0;
  for (const auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
    append_xcp_stat_row_v17(ss, idx, num_xcp_rows, row.vcn_busy);
    ++idx;
  }

  ss << " xcp_stats.jpeg_busy: " << "\n";
  idx = 0;
  for (const auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
    append_xcp_stat_row_v17(ss, idx, num_xcp_rows, row.jpeg_busy);
    ++idx;
  }

  ss << " xcp_stats.gfx_busy_acc: " << "\n";
  idx = 0;
  for (const auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
    append_xcp_stat_row_v17(ss, idx, num_xcp_rows, row.gfx_busy_acc);
    ++idx;
  }

  ss << " xcp_stats.gfx_below_host_limit_acc: " << "\n";  // new for v1.7
  idx = 0;
  for (const auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
    append_xcp_stat_row_v17(ss, idx, num_xcp_rows, row.gfx_below_host_limit_acc);
    ++idx;
  }

  LOG_DEBUG(ss);
}
void GpuMetricsBase_v16_t::dump_internal_metrics_table()
{
std::ostringstream ss;
@@ -663,6 +863,263 @@ void GpuMetricsBase_v16_t::dump_internal_metrics_table()
LOG_DEBUG(ss);
}
/**
 * Rebuilds the dynamic metrics table from the internal v1.7 metrics table.
 *
 * The dynamic table is emptied, version specific adjustments are applied to
 * the internal table (firmware_timestamp is multiplied by 10), and then one
 * formatted row per metric is inserted under its metric class.
 *
 * NOTE(review): the timestamp adjustment is applied in place, so calling this
 * twice without refreshing the internal table would presumably scale the
 * value twice -- confirm against callers.
 *
 * @return the status code, which this function never changes from
 *         RSMI_STATUS_SUCCESS
 */
rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
  std::ostringstream ss;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  // Always start from an empty table.
  m_metrics_dynamic_tbl.clear();

  //
  // Note: Metric adjustments (if any) must be applied before the values are
  //       written to the internal/external tables.
  //
  auto apply_v17_adjustments = [&]() {
    ss << __PRETTY_FUNCTION__ << " | ======= start =======";
    const auto metrics_version =
      translate_flag_to_metric_version(get_gpu_metrics_version_used());
    ss << __PRETTY_FUNCTION__
       << " | ======= info ======= "
       << " | Applying adjustments "
       << " | Metric Version: "
       << stringfy_metric_header_version(disjoin_metrics_version(metrics_version))
       << " |";
    LOG_TRACE(ss);

    // firmware_timestamp is at 10ns resolution
    const auto scaled_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
    ss << __PRETTY_FUNCTION__
       << " | ======= Changes ======= "
       << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
       << " to: " << scaled_timestamp;
    m_gpu_metrics_tbl.m_firmware_timestamp = scaled_timestamp;
    LOG_DEBUG(ss);
  };

  // Adjustments/Changes specific to this version
  apply_v17_adjustments();

  // Temperature Info
  auto& temperature_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature];
  temperature_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricTempHotspot,
                          format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot,
                                            "temperature_hotspot"));
  temperature_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricTempMem,
                          format_metric_row(m_gpu_metrics_tbl.m_temperature_mem,
                                            "temperature_mem"));
  temperature_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricTempVrSoc,
                          format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc,
                                            "temperature_vrsoc"));

  // Power/Energy Info
  auto& power_energy_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy];
  power_energy_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower,
                           format_metric_row(m_gpu_metrics_tbl.m_current_socket_power,
                                             "curr_socket_power"));
  power_energy_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator,
                           format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator,
                                             "energy_acc"));

  // Utilization Info
  auto& utilization_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization];
  utilization_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity,
                          format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity,
                                            "average_gfx_activity"));
  utilization_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity,
                          format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity,
                                            "average_umc_activity"));
  utilization_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator,
                          format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc,
                                            "gfx_activity_acc"));
  utilization_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator,
                          format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc,
                                            "mem_activity_acc"));

  // Timestamp Info
  auto& timestamp_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp];
  timestamp_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricTSFirmware,
                        format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp,
                                          "firmware_timestamp"));
  timestamp_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricTSClockCounter,
                        format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter,
                                          "system_clock_counter"));

  // GfxLock Info
  auto& gfxclk_lock_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus];
  gfxclk_lock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus,
                          format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status,
                                            "gfxclk_lock_status"));

  // Link/Width/Speed Info
  auto& link_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed];
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width,
                                     "pcie_link_width"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed,
                                     "pcie_link_speed"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth,
                   format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width,
                                     "xgmi_link_width"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed,
                   format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed,
                                     "xgmi_link_speed"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc,
                                     "pcie_bandwidth_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst,
                                     "pcie_bandwidth_inst"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc,
                                     "pcie_l0_recov_count_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc,
                                     "pcie_replay_count_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc,
                                     "pcie_replay_rollover_count_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc,
                                     "pcie_nak_sent_count_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc,
                                     "pcie_nak_rcvd_count_acc"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc,
                                     "[xgmi_read_data_acc]"));
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator,
                   format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc,
                                     "[xgmi_write_data_acc]"));
  // new for v1.7
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus,
                   format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status,
                                     "[xgmi_link_status]"));

  // CurrentClock Info
  auto& current_clock_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock];
  current_clock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock,
                            format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk,
                                              "[current_gfxclk]"));
  current_clock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrSocClock,
                            format_metric_row(m_gpu_metrics_tbl.m_current_socclk,
                                              "[current_socclk]"));
  current_clock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrVClock0,
                            format_metric_row(m_gpu_metrics_tbl.m_current_vclk0,
                                              "[current_vclk0]"));
  current_clock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrDClock0,
                            format_metric_row(m_gpu_metrics_tbl.m_current_dclk0,
                                              "[current_dclk0]"));
  current_clock_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricCurrUClock,
                            format_metric_row(m_gpu_metrics_tbl.m_current_uclk,
                                              "current_uclk"));

  /* Accumulation cycle counter and accumulated throttler residencies */
  auto& throttle_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency];
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter,
                       format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter,
                                         "accumulation_counter"));
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator,
                       format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc,
                                         "prochot_residency_acc"));
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator,
                       format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc,
                                         "ppt_residency_acc"));
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator,
                       format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc,
                                         "socket_thm_residency_acc"));
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator,
                       format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc,
                                         "vr_thm_residency_acc"));
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator,
                       format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc,
                                         "hbm_thm_residency_acc"));

  /* Partition info */
  auto& partition_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition];
  partition_tbl.emplace(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition,
                        format_metric_row(m_gpu_metrics_tbl.m_num_partition,
                                          "num_partition"));

  /* xcp_stats info: only the first element of m_xcp_stats is reported here */
  auto& first_xcp = m_gpu_metrics_tbl.m_xcp_stats[0];
  auto& xcp_stats_tbl =
    m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats];
  xcp_stats_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst,
                        format_metric_row(first_xcp.gfx_busy_inst,
                                          "xcp_stats->gfx_busy_inst"));
  xcp_stats_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricVcnBusy,
                        format_metric_row(first_xcp.vcn_busy,
                                          "xcp_stats->vcn_busy"));
  xcp_stats_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricJpegBusy,
                        format_metric_row(first_xcp.jpeg_busy,
                                          "xcp_stats->jpeg_busy"));
  xcp_stats_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc,
                        format_metric_row(first_xcp.gfx_busy_acc,
                                          "xcp_stats->gfx_busy_acc"));

  /* PCIE other end recovery counter info */
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov,
                   format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery,
                                     "pcie_lc_perf_other_end_recovery"));

  /* VRAM max bandwidth at max memory clock (filed under the link class) */
  link_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth,
                   format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth,
                                     "vram_max_bandwidth"));

  /* Graphics clock below host limit accumulators (first XCP only) */
  throttle_tbl.emplace(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
                       format_metric_row(first_xcp.gfx_below_host_limit_acc,
                                         "gfx_below_host_limit_acc"));

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success "
     << " | Returning = " << getRSMIStatusString(status_code)
     << " |";
  LOG_TRACE(ss);

  return status_code;
}
rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
std::ostringstream ss;
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
@@ -700,7 +1157,6 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
// Adjustments/Changes specific to this version
run_metric_adjustments_v16();
// Temperature Info
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot,
@@ -1594,6 +2050,12 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
rsmi_gpu_metrics.pcie_link_speed = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_link_speed)>();
rsmi_gpu_metrics.gfx_activity_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.gfx_activity_acc)>();
rsmi_gpu_metrics.mem_activity_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.mem_activity_acc)>();
rsmi_gpu_metrics.vram_max_bandwidth = init_max_uint_types<decltype(rsmi_gpu_metrics.vram_max_bandwidth)>();
std::fill(std::begin(rsmi_gpu_metrics.xgmi_link_status),
std::end(rsmi_gpu_metrics.xgmi_link_status),
init_max_uint_types<std::uint16_t>());
std::fill(std::begin(rsmi_gpu_metrics.temperature_hbm),
std::end(rsmi_gpu_metrics.temperature_hbm),
@@ -1671,6 +2133,8 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
init_max_uint_types<std::uint16_t>());
std::fill(std::begin(row.gfx_busy_acc), std::end(row.gfx_busy_acc),
init_max_uint_types<std::uint64_t>());
std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc),
init_max_uint_types<std::uint64_t>());
}
ss << __PRETTY_FUNCTION__
@@ -1683,6 +2147,225 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
return status_code;
}
/**
 * Converts the internal v1.7 metrics table into the public (latest) metrics
 * structure.
 *
 * The public structure is first initialised through
 * init_max_public_gpu_matrics(), so a field still holding its max value after
 * this call means no data was assigned to it. Scalars, arrays and per-XCP
 * stats are then copied over, and the legacy single-value clock fields are
 * derived from the first entries of the corresponding clock arrays.
 *
 * @return tuple of (status code, populated public metrics); the status code
 *         is never changed from RSMI_STATUS_SUCCESS by this function
 */
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v17_t::copy_internal_to_external_metrics()
{
  std::ostringstream ss;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  const auto& internal = m_gpu_metrics_tbl;
  AMGpuMetricsPublicLatest_t external{};

  //
  // Note: every data member starts at its max value; a field left at max
  //       received no data.
  init_max_public_gpu_matrics(external);

  // Header
  external.common_header.structure_size = internal.m_common_header.m_structure_size;
  external.common_header.format_revision = internal.m_common_header.m_format_revision;
  external.common_header.content_revision = internal.m_common_header.m_content_revision;

  // Temperature
  external.temperature_hotspot = internal.m_temperature_hotspot;
  external.temperature_mem = internal.m_temperature_mem;
  external.temperature_vrsoc = internal.m_temperature_vrsoc;

  // Power
  external.current_socket_power = internal.m_current_socket_power;

  // Utilization
  external.average_gfx_activity = internal.m_average_gfx_activity;
  external.average_umc_activity = internal.m_average_umc_activity;

  // Power/Energy
  external.energy_accumulator = internal.m_energy_accumulator;

  // Driver attached timestamp (in ns)
  external.system_clock_counter = internal.m_system_clock_counter;

  // Clock Lock Status. Each bit corresponds to clock instance
  external.gfxclk_lock_status = internal.m_gfxclk_lock_status;

  // Link width (number of lanes) and speed
  external.pcie_link_width = internal.m_pcie_link_width;
  external.pcie_link_speed = internal.m_pcie_link_speed;

  // XGMI bus width and bitrate
  external.xgmi_link_width = internal.m_xgmi_link_width;
  external.xgmi_link_speed = internal.m_xgmi_link_speed;

  // Utilization Accumulated
  external.gfx_activity_acc = internal.m_gfx_activity_acc;
  external.mem_activity_acc = internal.m_mem_activity_acc;

  // PCIE bandwidth (accumulated / instantaneous)
  external.pcie_bandwidth_acc = internal.m_pcie_bandwidth_acc;
  external.pcie_bandwidth_inst = internal.m_pcie_bandwidth_inst;

  // PCIE accumulated counters: L0-to-recovery, replay, replay rollover, NAKs
  external.pcie_l0_to_recov_count_acc = internal.m_pcie_l0_to_recov_count_acc;
  external.pcie_replay_count_acc = internal.m_pcie_replay_count_acc;
  external.pcie_replay_rover_count_acc = internal.m_pcie_replay_rover_count_acc;
  external.pcie_nak_sent_count_acc = internal.m_pcie_nak_sent_count_acc;
  external.pcie_nak_rcvd_count_acc = internal.m_pcie_nak_rcvd_count_acc;

  // Accumulated throttler residencies
  // (public fields were bumped to uint64_t for planned size increases on
  //  newer ASICs)
  external.accumulation_counter = internal.m_accumulation_counter;
  external.prochot_residency_acc = internal.m_prochot_residency_acc;
  external.ppt_residency_acc = internal.m_ppt_residency_acc;
  external.socket_thm_residency_acc = internal.m_socket_thm_residency_acc;
  external.vr_thm_residency_acc = internal.m_vr_thm_residency_acc;
  external.hbm_thm_residency_acc = internal.m_hbm_thm_residency_acc;

  /* VRAM max bandwidth at max memory clock */
  external.vram_max_bandwidth = internal.m_vram_max_bandwidth;

  // XGMI accumulated data transfer size: the whole internal array is copied
  std::copy(std::begin(internal.m_xgmi_read_data_acc),
            std::end(internal.m_xgmi_read_data_acc),
            external.xgmi_read_data_acc);
  std::copy(std::begin(internal.m_xgmi_write_data_acc),
            std::end(internal.m_xgmi_write_data_acc),
            external.xgmi_write_data_acc);

  // XGMI link status (new for 1.7)
  std::copy(std::begin(internal.m_xgmi_link_status),
            std::end(internal.m_xgmi_link_status),
            external.xgmi_link_status);

  // PMFW attached timestamp (10ns resolution)
  external.firmware_timestamp = internal.m_firmware_timestamp;

  // Current clocks: the whole internal array is copied for each clock domain
  std::copy(std::begin(internal.m_current_gfxclk),
            std::end(internal.m_current_gfxclk),
            external.current_gfxclks);
  std::copy(std::begin(internal.m_current_socclk),
            std::end(internal.m_current_socclk),
            external.current_socclks);
  std::copy(std::begin(internal.m_current_vclk0),
            std::end(internal.m_current_vclk0),
            external.current_vclk0s);
  std::copy(std::begin(internal.m_current_dclk0),
            std::end(internal.m_current_dclk0),
            external.current_dclk0s);

  external.current_uclk = internal.m_current_uclk;
  external.num_partition = internal.m_num_partition;
  external.pcie_lc_perf_other_end_recovery = internal.m_pcie_lc_perf_other_end_recovery;

  // Per-XCP stats: walk the public entries and the internal entries in step.
  auto internal_xcp = std::begin(internal.m_xcp_stats);
  for (auto& external_xcp : external.xcp_stats) {
    std::copy_n(std::begin(internal_xcp->gfx_busy_inst), RSMI_MAX_NUM_XCC,
                external_xcp.gfx_busy_inst);
    std::copy_n(std::begin(internal_xcp->jpeg_busy), RSMI_MAX_NUM_JPEG_ENGS,
                external_xcp.jpeg_busy);
    std::copy_n(std::begin(internal_xcp->vcn_busy), RSMI_MAX_NUM_VCNS,
                external_xcp.vcn_busy);
    std::copy_n(std::begin(internal_xcp->gfx_busy_acc), RSMI_MAX_NUM_XCC,
                external_xcp.gfx_busy_acc);
    std::copy_n(std::begin(internal_xcp->gfx_below_host_limit_acc), RSMI_MAX_NUM_XCC,
                external_xcp.gfx_below_host_limit_acc);
    ++internal_xcp;
  }

  //
  // Note: Backwards compatibility -> Handling extra/exception cases
  //       related to earlier versions (1.3/1.4/1.5)
  external.current_gfxclk = external.current_gfxclks[0];
  external.current_socclk = external.current_socclks[0];
  external.current_vclk0 = external.current_vclk0s[0];
  external.current_vclk1 = external.current_vclk0s[1];
  external.current_dclk0 = external.current_dclk0s[0];
  external.current_dclk1 = external.current_dclk0s[1];

  // separate by XCP: activity arrays come from the current partition's stats,
  // and only when the first entry is not UINT16_MAX
  if (this->m_partition_id < kRSMI_MAX_NUM_XCP) {
    const auto& partition_stats = internal.m_xcp_stats[this->m_partition_id];
    if (partition_stats.vcn_busy[0] != UINT16_MAX) {
      std::copy(std::begin(partition_stats.vcn_busy),
                std::end(partition_stats.vcn_busy),
                std::begin(external.vcn_activity));
    }
    if (partition_stats.jpeg_busy[0] != UINT16_MAX) {
      std::copy(std::begin(partition_stats.jpeg_busy),
                std::end(partition_stats.jpeg_busy),
                std::begin(external.jpeg_activity));
    }
  }

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success "
     << " | Returning = " << getRSMIStatusString(status_code)
     << " |";
  LOG_TRACE(ss);

  return std::make_tuple(status_code, external);
}
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v16_t::copy_internal_to_external_metrics()
{
std::ostringstream ss;
+51 -2
View File
@@ -52,6 +52,7 @@
#include "amd_smi/impl/amd_smi_utils.h"
#include "amd_smi/impl/amd_smi_processor.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi.h"
// Global std::mutex instance used to protect data shared between threads
std::mutex myMutex;
@@ -80,7 +81,7 @@ static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle proces
if (r != AMDSMI_STATUS_SUCCESS) return r;
if (device->get_processor_type() == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
*gpudevice = static_cast<amd::smi::AMDSmiGPUDevice*>(processor_handle);
*gpudevice = static_cast<amd::smi::AMDSmiGPUDevice*>(device);
return AMDSMI_STATUS_SUCCESS;
}
@@ -665,8 +666,11 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
amdsmi_gpu_metrics_t metric_info_a = {};
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
processor_handle, &metric_info_a);
processor_handle, &metric_info_a);
if (status != AMDSMI_STATUS_SUCCESS) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_metrics_info failed with status = " << smi_amdgpu_get_status_string(status, false);
LOG_ERROR(ss);
return status;
}
@@ -1053,6 +1057,43 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
return AMDSMI_STATUS_SUCCESS;
}
/**
 * @brief Report the up/down/disabled state of a GPU's XGMI links.
 *
 * The raw per-link values come from the xgmi_link_status array filled in by
 * amdsmi_get_gpu_metrics_info() and are translated as follows:
 *   - std::numeric_limits<uint16_t>::max() -> AMDSMI_XGMI_LINK_DISABLE
 *   - 0                                    -> AMDSMI_XGMI_LINK_DOWN
 *   - 1                                    -> AMDSMI_XGMI_LINK_UP
 *
 * The number of reported links starts at AMDSMI_MAX_NUM_XGMI_LINKS and is
 * lowered to the count returned by rsmi_num_monitor_devices() when that
 * count is not larger. If the device-count query fails, the count is not
 * lowered, so a failed query can no longer turn into a "success" that
 * reports zero links.
 *
 * @param[in]  processor_handle Handle of the processor to query.
 * @param[out] link_status      Receives total_links and one status entry per
 *                              reported link. It is written only when the
 *                              call succeeds; on any error it is left
 *                              untouched.
 *
 * @return AMDSMI_STATUS_SUCCESS on success,
 *         AMDSMI_STATUS_INVAL when link_status is nullptr,
 *         AMDSMI_STATUS_UNEXPECTED_DATA when a raw link value is none of the
 *         values listed above, or the status returned by
 *         amdsmi_get_gpu_metrics_info() when that call fails.
 */
amdsmi_status_t
amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle,
                                amdsmi_xgmi_link_status_t *link_status) {
    AMDSMI_CHECK_INIT();

    if (link_status == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_gpu_metrics_t metric_info = {};
    const amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
            processor_handle, &metric_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Build the answer in a local copy so the caller's struct is never left
    // half-filled when an unexpected raw value aborts the translation.
    amdsmi_xgmi_link_status_t result = {};
    result.total_links = AMDSMI_MAX_NUM_XGMI_LINKS;

    // Only trust dev_num when the query actually succeeded; otherwise keep
    // the maximum so the link data from the metrics table is still reported.
    uint32_t dev_num = 0;
    if (rsmi_num_monitor_devices(&dev_num) == RSMI_STATUS_SUCCESS
        && dev_num <= result.total_links) {
        result.total_links = dev_num;
    }

    // Translate the raw metric values into the public enum.
    constexpr uint16_t kLinkDisabled = std::numeric_limits<uint16_t>::max();
    for (uint32_t i = 0; i < result.total_links; ++i) {
        const uint16_t raw = metric_info.xgmi_link_status[i];
        if (raw == kLinkDisabled) {
            result.status[i] = AMDSMI_XGMI_LINK_DISABLE;
        } else if (raw == 0) {
            result.status[i] = AMDSMI_XGMI_LINK_DOWN;
        } else if (raw == 1) {
            result.status[i] = AMDSMI_XGMI_LINK_UP;
        } else {
            return AMDSMI_STATUS_UNEXPECTED_DATA;
        }
    }

    *link_status = result;
    return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle,
amdsmi_kfd_info_t *info) {
AMDSMI_CHECK_INIT();
@@ -1135,6 +1176,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
info->vram_size = 0;
info->vram_vendor = AMDSMI_VRAM_VENDOR__PLACEHOLDER0;
info->vram_bit_width = std::numeric_limits<decltype(info->vram_bit_width)>::max();
info->vram_max_bandwidth = std::numeric_limits<decltype(info->vram_max_bandwidth)>::max();
// Only can read vram type from libdrm
if (gpu_device->check_if_drm_is_supported()) {
@@ -1148,6 +1190,13 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
}
}
// Overwrite info->vram_max_bandwidth with the value reported by gpu_metrics
// when the metrics query succeeds; on failure the value assigned earlier is kept
amdsmi_gpu_metrics_t metric_info = {};
r = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info);
if (r == AMDSMI_STATUS_SUCCESS) {
info->vram_max_bandwidth = metric_info.vram_max_bandwidth;
}
// if vram type is greater than the max enum set it to unknown
if (info->vram_type > AMDSMI_VRAM_TYPE__MAX)
info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;
@@ -239,6 +239,12 @@ void TestGpuMetricsRead::Run(void) {
amd::smi::make_ostream_joiner(&std::cout, ", "));
std::cout << std::dec << "]\n";
std::cout << std::dec << "xgmi_link_status= [";
std::copy(std::begin(smu.xgmi_link_status),
std::end(smu.xgmi_link_status),
amd::smi::make_ostream_joiner(&std::cout, ", "));
std::cout << std::dec << "]\n";
// Voltage (mV)
std::cout << "voltage_soc = " << std::dec << smu.voltage_soc << "\n";
std::cout << "voltage_gfx = " << std::dec << smu.voltage_gfx << "\n";
@@ -254,6 +260,9 @@ void TestGpuMetricsRead::Run(void) {
std::cout << "pcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n";
std::cout << "pcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n";
// VRAM max bandwidth at max memory clock (GB/sec)
std::cout << "vram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n";
// Counts
std::cout << "pcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc
<< "\n";
@@ -329,6 +338,17 @@ void TestGpuMetricsRead::Run(void) {
xcp++;
}
xcp = 0;
std::cout << std::dec << "xcp_stats.gfx_below_host_limit_acc = \n";
for (auto& row : smu.xcp_stats) {
std::cout << "XCP[" << xcp << "] = " << "[ ";
std::copy(std::begin(row.gfx_below_host_limit_acc),
std::end(row.gfx_below_host_limit_acc),
amd::smi::make_ostream_joiner(&std::cout, ", "));
std::cout << " ]\n";
xcp++;
}
std::cout << "\n\n";
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
constexpr uint16_t kMAX_ITER_TEST = 10;
@@ -129,17 +129,19 @@ void TestIdInfoRead::Run(void) {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Device Vram type id: "
<< vram_info.vram_type << std::endl;
<< vram_info.vram_type << std::endl;
std::cout << "\t**Device Vram vendor id: "
<< vram_info.vram_vendor << std::endl;
<< vram_info.vram_vendor << std::endl;
std::cout << "\t**Device Vram size: 0x"
<< std::hex << vram_info.vram_size
<< " (" << std::dec << vram_info.vram_size << ")"
<< std::endl;
<< std::hex << vram_info.vram_size
<< " (" << std::dec << vram_info.vram_size << ")"
<< std::endl;
std::cout << "\t**Device Bit Width: 0x"
<< std::hex << vram_info.vram_bit_width
<< " (" << std::dec << vram_info.vram_bit_width << ")"
<< std::endl;
<< std::hex << vram_info.vram_bit_width
<< " (" << std::dec << vram_info.vram_bit_width << ")"
<< std::endl;
std::cout << "\t**Device Vram Max Bandwidth: "
<< vram_info.vram_max_bandwidth << " GB/s" << std::endl;
}
err = amdsmi_get_gpu_vendor_name(processor_handles_[i], buffer, kBufferLen);