[SWDEV-496693]GPU Metrics 1.7
Features added:
- [SWDEV-475244] Add new interface to get max memory bandwidth
Updated API: amdsmi_get_gpu_vram_info
Updated: struct amdsmi_vram_info_t to include vram_max_bandwidth
CLI: amd-smi static --vram
- [SWDEV-488349] Add new interface for XGMI link status
New API: amdsmi_get_gpu_xgmi_link_status
CLI: amd-smi xgmi --link-status
Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Change-Id: I1aa35b741136eb4f02f7ea9a95b865886273eb72
[ROCm/amdsmi commit: f8b8347627]
This commit is contained in:
committed by
Arif, Maisam
parent
01d303806a
commit
2ddb2ef032
@@ -7,6 +7,42 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Added
|
||||
|
||||
- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**
|
||||
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth:
|
||||
- `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s)
|
||||
- `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down
|
||||
- `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for the graphics clock below-host-limit violation status.
|
||||
|
||||
- **Added new API `amdsmi_get_gpu_xgmi_link_status()` and CLI `amd-smi xgmi --link-status`**
|
||||
New API is defined as:
|
||||
```C
|
||||
typedef enum {
|
||||
AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down
|
||||
AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up
|
||||
AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled
|
||||
} amdsmi_xgmi_link_status_type_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t total_links; //!< The total links in the status array
|
||||
amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS];
|
||||
uint64_t reserved[7];
|
||||
} amdsmi_xgmi_link_status_t;
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status)
|
||||
```
|
||||
Example CLI output:
|
||||
```shell
|
||||
$ amd-smi xgmi --link-status
|
||||
|
||||
XGMI LINK STATUS:
|
||||
bdf link_status
|
||||
GPU0 0000:08:00.0 U U U U D U D X
|
||||
GPU1 0000:44:00.0 U U U U D U D X
|
||||
...
|
||||
|
||||
* U:Up D:Down X:Disabled
|
||||
```
|
||||
|
||||
- **Added fclk and socclk info to `amd-smi metric -c/--clock`**.
|
||||
fclk and socclk information such as min and max clock have been added to the metric command, in line with all the other clocks.
|
||||
|
||||
@@ -77,12 +113,43 @@ GPU: 0
|
||||
DCLK1: N/A
|
||||
```
|
||||
|
||||
## amd_smi_lib for ROCm 6.4.0
|
||||
|
||||
### Added
|
||||
|
||||
### Changed
|
||||
|
||||
- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`**
|
||||
Updated structure `amdsmi_vram_info_t`:
|
||||
```C
|
||||
typedef struct {
|
||||
amdsmi_vram_type_t vram_type;
|
||||
amdsmi_vram_vendor_type_t vram_vendor;
|
||||
uint64_t vram_size;
|
||||
uint32_t vram_bit_width;
|
||||
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
|
||||
uint64_t reserved[4];
|
||||
} amdsmi_vram_info_t;
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
|
||||
```
|
||||
Example CLI output:
|
||||
```shell
|
||||
$ amd-smi static --vram
|
||||
GPU: 0
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 16368 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
GPU: 1
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 30704 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
...
|
||||
|
||||
```
|
||||
|
||||
### Removed
|
||||
|
||||
- **Removed `GFX_BUSY_ACC` from `amd-smi metric --usage`**.
|
||||
|
||||
@@ -751,7 +751,8 @@ class AMDSMICommands():
|
||||
vram_info_dict = {"type" : "N/A",
|
||||
"vendor" : "N/A",
|
||||
"size" : "N/A",
|
||||
"bit_width" : "N/A"}
|
||||
"bit_width" : "N/A",
|
||||
"max_bandwidth" : "N/A"}
|
||||
try:
|
||||
vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu)
|
||||
|
||||
@@ -790,6 +791,15 @@ class AMDSMICommands():
|
||||
# Populate bit width
|
||||
vram_info_dict['bit_width'] = vram_info['vram_bit_width']
|
||||
|
||||
# Populate vram_max_bandwidth
|
||||
vram_max_bw = vram_info['vram_max_bandwidth']
|
||||
vram_max_bw_unit = 'GB/s'
|
||||
if self.logger.is_human_readable_format():
|
||||
vram_info_dict["max_bandwidth"] = f"{vram_max_bw} {vram_max_bw_unit if vram_max_bw != 'N/A' else ''}"
|
||||
if self.logger.is_json_format():
|
||||
vram_info_dict["max_bandwidth"] = {"value" : vram_max_bw,
|
||||
"unit" : vram_max_bw_unit}
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
@@ -1242,7 +1252,8 @@ class AMDSMICommands():
|
||||
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
|
||||
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
|
||||
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None):
|
||||
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
|
||||
):
|
||||
"""Get Metric information for target gpu
|
||||
|
||||
Args:
|
||||
@@ -1338,7 +1349,8 @@ class AMDSMICommands():
|
||||
current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level",
|
||||
"xgmi_err", "energy", "throttle"]
|
||||
current_platform_values += [args.fan, args.voltage_curve, args.overdrive,
|
||||
args.perf_level, args.xgmi_err, args.energy, args.throttle]
|
||||
args.perf_level, args.xgmi_err, args.energy, args.throttle,
|
||||
]
|
||||
|
||||
if self.helpers.is_hypervisor():
|
||||
if schedule:
|
||||
@@ -2221,6 +2233,7 @@ class AMDSMICommands():
|
||||
'socket_thermal_accumulated': "N/A",
|
||||
'vr_thermal_accumulated': "N/A",
|
||||
'hbm_thermal_accumulated': "N/A",
|
||||
'gfx_below_host_limit_acc': "N/A",
|
||||
|
||||
# violation status values - active/not active
|
||||
'prochot_violation_status': "N/A",
|
||||
@@ -2311,7 +2324,7 @@ class AMDSMICommands():
|
||||
|
||||
def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None,
|
||||
cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None,
|
||||
cpu_pwr_svi_telemtry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
|
||||
cpu_pwr_svi_telemetry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
|
||||
cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None,
|
||||
cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None,
|
||||
cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None):
|
||||
@@ -2354,8 +2367,8 @@ class AMDSMICommands():
|
||||
args.cpu_c0_res = cpu_c0_res
|
||||
if cpu_lclk_dpm_level:
|
||||
args.cpu_lclk_dpm_level = cpu_lclk_dpm_level
|
||||
if cpu_pwr_svi_telemtry_rails:
|
||||
args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemtry_rails
|
||||
if cpu_pwr_svi_telemetry_rails:
|
||||
args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemetry_rails
|
||||
if cpu_io_bandwidth:
|
||||
args.cpu_io_bandwidth = cpu_io_bandwidth
|
||||
if cpu_xgmi_bandwidth:
|
||||
@@ -2488,7 +2501,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
static_dict["socket_dpm"]["dpml_level_range"] = "N/A"
|
||||
logging.debug("Failed to get socket dpm level range for cpu %s | %s", cpu_id, e.get_error_info())
|
||||
if args.cpu_pwr_svi_telemtry_rails:
|
||||
if args.cpu_pwr_svi_telemetry_rails:
|
||||
static_dict["svi_telemetry_all_rails"] = {}
|
||||
try:
|
||||
power = amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(args.cpu)
|
||||
@@ -2756,8 +2769,7 @@ class AMDSMICommands():
|
||||
None: Print output via AMDSMILogger to destination
|
||||
"""
|
||||
# TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments
|
||||
|
||||
# Mutually exculsive args
|
||||
# Mutually exclusive args
|
||||
if gpu:
|
||||
args.gpu = gpu
|
||||
if cpu:
|
||||
@@ -2832,7 +2844,7 @@ class AMDSMICommands():
|
||||
cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor)
|
||||
if args.core:
|
||||
self.logger.output = {}
|
||||
self.logger.clear_multiple_devices_ouput()
|
||||
self.logger.clear_multiple_devices_output()
|
||||
self.metric_core(args, multiple_devices, core, core_boost_limit,
|
||||
core_curr_active_freq_core_limit, core_energy)
|
||||
if args.gpu:
|
||||
@@ -2843,7 +2855,8 @@ class AMDSMICommands():
|
||||
clock, temperature, ecc, ecc_blocks, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule,
|
||||
guard, guest_data, fb_usage, xgmi, throttle)
|
||||
guard, guest_data, fb_usage, xgmi, throttle,
|
||||
)
|
||||
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
|
||||
if args.cpu == None and args.core == None:
|
||||
# If no args are set, print out all CPU and Core metrics info
|
||||
@@ -2877,7 +2890,8 @@ class AMDSMICommands():
|
||||
usage, watch, watch_time, iterations, power,
|
||||
clock, temperature, ecc, ecc_blocks, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule, throttle)
|
||||
xgmi_err, energy, mem_usage, schedule, throttle,
|
||||
)
|
||||
|
||||
|
||||
def process(self, args, multiple_devices=False, watching_output=False,
|
||||
@@ -5350,13 +5364,14 @@ class AMDSMICommands():
|
||||
print("Placeholder for rocm-smi legacy commands")
|
||||
|
||||
|
||||
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None):
|
||||
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_link_status=None):
|
||||
""" Get topology information for target gpus
|
||||
params:
|
||||
args - argparser args to pass to subcommand
|
||||
multiple_devices (bool) - True if checking for multiple devices
|
||||
gpu (device_handle) - device_handle for target device
|
||||
metric (bool) - Value override for args.metric
|
||||
xgmi_link_status (bool) - Value override for args.xgmi_link_status
|
||||
|
||||
return:
|
||||
Nothing
|
||||
@@ -5368,6 +5383,8 @@ class AMDSMICommands():
|
||||
args.gpu = gpu
|
||||
if metric:
|
||||
args.metric = metric
|
||||
if xgmi_link_status:
|
||||
args.link_status = xgmi_link_status
|
||||
|
||||
# Handle No GPU passed
|
||||
if args.gpu == None:
|
||||
@@ -5377,8 +5394,9 @@ class AMDSMICommands():
|
||||
args.gpu = [args.gpu]
|
||||
|
||||
# Handle all args being false
|
||||
if not any([args.metric]):
|
||||
if not any([args.metric, args.link_status]):
|
||||
args.metric = True
|
||||
args.link_status = True
|
||||
|
||||
# Clear the table header
|
||||
self.logger.table_header = ''.rjust(7)
|
||||
@@ -5396,9 +5414,9 @@ class AMDSMICommands():
|
||||
|
||||
if args.metric:
|
||||
# prepend link metrics header to the table header
|
||||
link_metrics_header = " " + "bdf".ljust(13) + \
|
||||
"bit_rate".ljust(9) + "max_bandwidth".ljust(14) + \
|
||||
"link_type".ljust(10)
|
||||
link_metrics_header = " " + "bdf".ljust(14) + \
|
||||
"bit_rate".ljust(10) + "max_bandwidth".ljust(15) + \
|
||||
"link_type".ljust(11)
|
||||
self.logger.table_header = link_metrics_header + self.logger.table_header.strip()
|
||||
|
||||
# Populate dictionary according to format
|
||||
@@ -5544,7 +5562,7 @@ class AMDSMICommands():
|
||||
|
||||
# Print out the tabular output
|
||||
self.logger.multiple_device_output = tabular_output
|
||||
self.logger.table_title = "LINK METRIC TABLE"
|
||||
self.logger.table_title = "\nLINK METRIC TABLE"
|
||||
self.logger.print_output(multiple_device_enabled=True, tabular=True)
|
||||
|
||||
self.logger.multiple_device_output = xgmi_values
|
||||
@@ -5558,6 +5576,48 @@ class AMDSMICommands():
|
||||
if not self.logger.is_human_readable_format():
|
||||
self.logger.print_output(multiple_device_enabled=True)
|
||||
|
||||
if args.link_status:
|
||||
# Header modification
|
||||
self.logger.table_header = ''.rjust(7)
|
||||
current_header = " ".ljust(7) + \
|
||||
"bdf".ljust(14) + \
|
||||
"link_status".ljust(20)
|
||||
self.logger.table_header = current_header + self.logger.table_header.strip()
|
||||
# Process each GPU
|
||||
tabular_output = []
|
||||
for xgmi_dict in xgmi_values:
|
||||
src_gpu_id = xgmi_dict['gpu']
|
||||
src_gpu_bdf = xgmi_dict['bdf']
|
||||
src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf)
|
||||
|
||||
# Populate link statuses
|
||||
status_row = []
|
||||
tabular_output_dict = {"gpu#": f"GPU{src_gpu_id}",
|
||||
"gpu": src_gpu_id,
|
||||
"bdf": src_gpu_bdf,
|
||||
"link_status": "N/A"}
|
||||
try:
|
||||
link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu)
|
||||
tabular_output_dict['link_status'] = link_status['status']
|
||||
if self.logger.is_human_readable_format():
|
||||
del tabular_output_dict['gpu']
|
||||
else:
|
||||
del tabular_output_dict['gpu#']
|
||||
tabular_output.append(tabular_output_dict)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
xgmi_dict['link_metrics']['link_status']={"status": "failed"}
|
||||
logging.debug("Failed to get XGMI link status for GPU %s | %s", src_gpu_id, e.get_error_info())
|
||||
|
||||
#populate link status data for output
|
||||
if self.logger.is_human_readable_format():
|
||||
xgmi_dict['link_status'] = tabular_output
|
||||
self.logger.multiple_device_output= tabular_output
|
||||
self.logger.table_title = "\nXGMI LINK STATUS"
|
||||
self.logger.print_output(multiple_device_enabled=True, tabular=True)
|
||||
self.logger.clear_multiple_devices_ouput()
|
||||
if self.logger.is_human_readable_format():
|
||||
print("\n* U:Up D:Down X:Disabled".ljust(13))
|
||||
|
||||
|
||||
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
|
||||
""" Display parition information for the target GPU
|
||||
|
||||
@@ -780,6 +780,8 @@ class AMDSMIHelpers():
|
||||
|
||||
|
||||
def convert_bytes_to_readable(self, bytes_input, format_length=None):
|
||||
if isinstance(bytes_input, str):
|
||||
return "N/A"
|
||||
for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"]:
|
||||
if abs(bytes_input) < 1024:
|
||||
if format_length is not None:
|
||||
|
||||
@@ -132,15 +132,18 @@ class AMDSMILogger():
|
||||
elif key == "gpu#":
|
||||
table_values += string_value.ljust(7)
|
||||
elif key == "bdf":
|
||||
table_values += string_value.ljust(13)
|
||||
table_values += string_value.ljust(14)
|
||||
elif "bdf_" in key:
|
||||
table_values += string_value.ljust(13)
|
||||
elif key == "bit_rate":
|
||||
table_values += string_value.ljust(9)
|
||||
elif key == "max_bandwidth":
|
||||
table_values += string_value.ljust(14)
|
||||
elif key == "link_type":
|
||||
table_values += string_value.ljust(10)
|
||||
elif key == "max_bandwidth":
|
||||
table_values += string_value.ljust(15)
|
||||
elif key == "link_type":
|
||||
table_values += string_value.ljust(11)
|
||||
elif key == "link_status":
|
||||
for i in value:
|
||||
table_values += str(i).ljust(3)
|
||||
elif key == "memory":
|
||||
table_values += string_value.ljust(8)
|
||||
elif key == "accelerator_type":
|
||||
@@ -166,7 +169,7 @@ class AMDSMILogger():
|
||||
elif key == "resources_shared":
|
||||
table_values += string_value.ljust(18)
|
||||
elif key == "RW":
|
||||
table_values += string_value.ljust(53)
|
||||
table_values += string_value.ljust(57)
|
||||
elif key == "process_list":
|
||||
#Add an additional padding between the first instance of GPU and NAME
|
||||
table_values += ' '
|
||||
|
||||
@@ -1313,6 +1313,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
|
||||
# Help text for Arguments only on Guest and BM platforms
|
||||
metrics_help = "Metric XGMI information"
|
||||
xgmi_link_status_help = "XGMI Link Status information"
|
||||
|
||||
# Create xgmi subparser
|
||||
xgmi_parser = subparsers.add_parser('xgmi', help=xgmi_help, description=xgmi_subcommand_help)
|
||||
@@ -1326,6 +1327,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
|
||||
# Optional Args
|
||||
xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help)
|
||||
xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help)
|
||||
|
||||
|
||||
def _add_partition_parser(self, subparsers, func):
|
||||
|
||||
@@ -293,9 +293,11 @@ int main() {
|
||||
CHK_AMDSMI_RET(ret)
|
||||
printf(" Output of amdsmi_get_gpu_vram_info:\n");
|
||||
printf("\tVRAM Size: 0x%lx (%ld) \n", vram_info.vram_size, vram_info.vram_size);
|
||||
printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width, vram_info.vram_bit_width);
|
||||
}
|
||||
else {
|
||||
printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width,
|
||||
vram_info.vram_bit_width);
|
||||
printf("\tVRAM max bandwidth: 0x%lx (%lu) \n\n", vram_info.vram_max_bandwidth,
|
||||
vram_info.vram_max_bandwidth);
|
||||
} else {
|
||||
printf("\t**amdsmi_get_gpu_vram_info() not supported on this system.\n");
|
||||
}
|
||||
|
||||
@@ -865,6 +867,18 @@ int main() {
|
||||
++idx;
|
||||
}
|
||||
|
||||
std::cout << std::dec << "\txgmi_link_status= [";
|
||||
idx = 0;
|
||||
for (const auto& temp : smu.xgmi_link_status) {
|
||||
std::cout << temp;
|
||||
if ((idx + 1) != std::size(smu.xgmi_link_status)) {
|
||||
std::cout << ", ";
|
||||
} else {
|
||||
std::cout << "]\n";
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
|
||||
// Voltage (mV)
|
||||
std::cout << "\tvoltage_soc = " << std::dec << smu.voltage_soc << "\n";
|
||||
std::cout << "\tvoltage_gfx = " << std::dec << smu.voltage_gfx << "\n";
|
||||
@@ -880,6 +894,9 @@ int main() {
|
||||
std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n";
|
||||
std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n";
|
||||
|
||||
// VRAM max bandwidth at max memory clock
|
||||
std::cout << "\tvram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n";
|
||||
|
||||
// Counts
|
||||
std::cout << "\tpcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc
|
||||
<< "\n";
|
||||
@@ -983,6 +1000,24 @@ int main() {
|
||||
idx++;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
idy = 0;
|
||||
std::cout << "\txcp_stats.gfx_below_host_limit_acc: " << "\n";
|
||||
for (auto& row : smu.xcp_stats) {
|
||||
std::cout << "\t XCP [" << idx << "] : [";
|
||||
for (auto& col : row.gfx_below_host_limit_acc) {
|
||||
if ((idy + 1) != static_cast<int>(std::size(row.gfx_below_host_limit_acc))) {
|
||||
std::cout << col << ", ";
|
||||
} else {
|
||||
std::cout << col;
|
||||
}
|
||||
idy++;
|
||||
}
|
||||
std::cout << "]\n";
|
||||
idy = 0;
|
||||
idx++;
|
||||
}
|
||||
|
||||
std::cout << "\n\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
|
||||
@@ -710,7 +710,8 @@ typedef struct {
|
||||
amdsmi_vram_vendor_type_t vram_vendor;
|
||||
uint64_t vram_size;
|
||||
uint32_t vram_bit_width;
|
||||
uint64_t reserved[5];
|
||||
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
|
||||
uint64_t reserved[4];
|
||||
} amdsmi_vram_info_t;
|
||||
|
||||
typedef struct {
|
||||
@@ -1325,13 +1326,22 @@ typedef struct {
|
||||
* @brief The following structures hold the gpu statistics for a device.
|
||||
*/
|
||||
typedef struct {
|
||||
/* Utilization Instantaneous (%) */
|
||||
/*
|
||||
* v1.6 additions
|
||||
*/
|
||||
/* Utilization Instantaneous (%) */
|
||||
uint32_t gfx_busy_inst[AMDSMI_MAX_NUM_XCC];
|
||||
uint16_t jpeg_busy[AMDSMI_MAX_NUM_JPEG];
|
||||
uint16_t vcn_busy[AMDSMI_MAX_NUM_VCN];
|
||||
|
||||
/* Utilization Accumulated (%) */
|
||||
uint64_t gfx_busy_acc[AMDSMI_MAX_NUM_XCC];
|
||||
|
||||
/*
|
||||
* v1.7 additions
|
||||
*/
|
||||
/* Total App Clock Counter Accumulated */
|
||||
uint64_t gfx_below_host_limit_acc[AMDSMI_MAX_NUM_XCC];
|
||||
} amdsmi_gpu_xcp_metrics_t;
|
||||
|
||||
typedef struct {
|
||||
@@ -1533,9 +1543,30 @@ typedef struct {
|
||||
/* PCIE other end recovery counter */
|
||||
uint32_t pcie_lc_perf_other_end_recovery;
|
||||
|
||||
/*
|
||||
* v1.7 additions
|
||||
*/
|
||||
/* VRAM max bandwidth at max memory clock (GB/s) */
|
||||
uint64_t vram_max_bandwidth;
|
||||
|
||||
/* XGMI link status(up/down) */
|
||||
uint16_t xgmi_link_status[AMDSMI_MAX_NUM_XGMI_LINKS];
|
||||
|
||||
/// \endcond
|
||||
} amdsmi_gpu_metrics_t;
|
||||
|
||||
typedef enum {
|
||||
AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down
|
||||
AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up
|
||||
AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled
|
||||
} amdsmi_xgmi_link_status_type_t;
|
||||
|
||||
/**
 * @brief Output structure filled in by amdsmi_get_gpu_xgmi_link_status().
 *
 * Holds one ::amdsmi_xgmi_link_status_type_t entry per XGMI link, together
 * with the number of entries in the array that are meaningful.
 */
typedef struct {
|
||||
uint32_t total_links; //!< The total links in the status array
|
||||
amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; //!< Per-link state (down / up / disabled), indexed by link; presumably only the first total_links entries are valid -- confirm against the implementation
|
||||
uint64_t reserved[7]; //!< Reserved; presumably kept for future fields -- confirm
|
||||
} amdsmi_xgmi_link_status_t;
|
||||
|
||||
#define MAX_AMDSMI_NAME_LENGTH 64
|
||||
|
||||
/**
|
||||
@@ -4828,6 +4859,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
|
||||
amdsmi_status_t
|
||||
amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info);
|
||||
|
||||
/**
|
||||
* @brief Get the XGMI link status
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details Given a processor handle @p processor_handle, this function
|
||||
* will return the link status for each XGMI link connected to this processor.
|
||||
* If the processor link type is not XGMI, it should return AMDSMI_STATUS_NOT_SUPPORTED.
|
||||
*
|
||||
* @param[in] processor_handle a processor handle
|
||||
*
|
||||
* @param[out] link_status The status of each XGMI link connected to this processor.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_xgmi_link_status_t* link_status);
|
||||
|
||||
/** @} End asicinfo */
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -5756,6 +5806,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model);
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string);
|
||||
|
||||
#endif
|
||||
|
||||
/** @} auxiquer */
|
||||
|
||||
@@ -105,6 +105,7 @@ from .amdsmi_interface import amdsmi_get_clock_info
|
||||
from .amdsmi_interface import amdsmi_get_pcie_info
|
||||
from .amdsmi_interface import amdsmi_get_gpu_bad_page_info
|
||||
from .amdsmi_interface import amdsmi_get_violation_status
|
||||
from .amdsmi_interface import amdsmi_get_gpu_xgmi_link_status
|
||||
|
||||
# # Process Information
|
||||
from .amdsmi_interface import amdsmi_get_gpu_process_list
|
||||
|
||||
@@ -1818,15 +1818,50 @@ def amdsmi_get_gpu_vram_info(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_vram_info(
|
||||
processor_handle, ctypes.byref(vram_info))
|
||||
)
|
||||
|
||||
return {
|
||||
"vram_type": vram_info.vram_type,
|
||||
"vram_vendor": vram_info.vram_vendor,
|
||||
"vram_size": vram_info.vram_size,
|
||||
"vram_bit_width": vram_info.vram_bit_width
|
||||
"vram_bit_width": _validate_if_max_uint(vram_info.vram_bit_width, MaxUIntegerTypes.UINT32_T),
|
||||
"vram_max_bandwidth": _validate_if_max_uint(vram_info.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T),
|
||||
}
|
||||
|
||||
|
||||
def amdsmi_get_gpu_xgmi_link_status(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
    """Return the per-link XGMI status of a GPU as single-letter codes.

    Parameters:
        processor_handle: handle of the GPU to query; must be an
            ``amdsmi_wrapper.amdsmi_processor_handle``.

    Returns:
        A dict with two keys:
          * ``"status"``: list with one entry per reported link, each being
            ``"U"`` (up), ``"D"`` (down), ``"X"`` (disabled) or ``"N/A"``
            when the library reports a value this wrapper does not know.
          * ``"total_links"``: the link count reported by the library.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` has the wrong type.
        Whatever ``_check_res`` raises for a failing library call
        (presumably ``AmdSmiLibraryException`` -- confirm against its
        definition).
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )

    status_info = amdsmi_wrapper.amdsmi_xgmi_link_status_t()
    _check_res(
        amdsmi_wrapper.amdsmi_get_gpu_xgmi_link_status(
            processor_handle, ctypes.byref(status_info))
    )

    # Enum name -> single-letter code printed by the CLI.
    symbol_by_state = {
        'AMDSMI_XGMI_LINK_DISABLE': "X",  # XGMI link is disabled
        'AMDSMI_XGMI_LINK_UP': "U",       # XGMI link is up
        'AMDSMI_XGMI_LINK_DOWN': "D",     # XGMI link is down
    }
    enum_names = amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues

    # Only the first total_links entries are reported; never read past the
    # fixed-size array even if the library reports a larger count.
    reported_links = min(status_info.total_links, len(status_info.status))

    # Use .get() on both lookups so an unexpected raw value becomes "N/A"
    # instead of raising KeyError before the fallback can apply.
    link_status = [
        symbol_by_state.get(enum_names.get(status_info.status[idx]), "N/A")
        for idx in range(reported_links)
    ]

    return {
        "status": link_status,
        "total_links": status_info.total_links,
    }
|
||||
|
||||
|
||||
def amdsmi_get_gpu_cache_info(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
) -> List[Dict[str, Any]]:
|
||||
@@ -3863,7 +3898,10 @@ def amdsmi_get_gpu_metrics_info(
|
||||
"xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats),
|
||||
"xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats),
|
||||
"xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats),
|
||||
"xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats),
|
||||
"pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T),
|
||||
"vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T),
|
||||
"xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T),
|
||||
}
|
||||
|
||||
# Create 2d array with each XCD's stats
|
||||
@@ -3893,6 +3931,12 @@ def amdsmi_get_gpu_metrics_info(
|
||||
for val in item.gfx_busy_acc:
|
||||
print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
|
||||
gpu_metrics_output[k][curr_xcp] = print_xcp_detail
|
||||
if 'xcp_stats.gfx_below_host_limit_acc' in k:
|
||||
for curr_xcp, item in enumerate(v):
|
||||
print_xcp_detail = []
|
||||
for val in item.gfx_below_host_limit_acc:
|
||||
print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
|
||||
gpu_metrics_output[k][curr_xcp] = print_xcp_detail
|
||||
return gpu_metrics_output
|
||||
|
||||
|
||||
|
||||
@@ -1044,7 +1044,8 @@ struct_amdsmi_vram_info_t._fields_ = [
|
||||
('vram_size', ctypes.c_uint64),
|
||||
('vram_bit_width', ctypes.c_uint32),
|
||||
('PADDING_0', ctypes.c_ubyte * 4),
|
||||
('reserved', ctypes.c_uint64 * 5),
|
||||
('vram_max_bandwidth', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint64 * 4),
|
||||
]
|
||||
|
||||
amdsmi_vram_info_t = struct_amdsmi_vram_info_t
|
||||
@@ -1119,6 +1120,16 @@ amdsmi_process_handle_t = ctypes.c_uint32
|
||||
class struct_amdsmi_proc_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
class struct_memory_usage_(Structure):
|
||||
pass
|
||||
|
||||
@@ -1130,16 +1141,6 @@ struct_memory_usage_._fields_ = [
|
||||
('reserved', ctypes.c_uint32 * 10),
|
||||
]
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_proc_info_t._fields_ = [
|
||||
('name', ctypes.c_char * 256),
|
||||
@@ -1739,6 +1740,7 @@ struct_amdsmi_gpu_xcp_metrics_t._fields_ = [
|
||||
('jpeg_busy', ctypes.c_uint16 * 32),
|
||||
('vcn_busy', ctypes.c_uint16 * 4),
|
||||
('gfx_busy_acc', ctypes.c_uint64 * 8),
|
||||
('gfx_below_host_limit_acc', ctypes.c_uint64 * 8),
|
||||
]
|
||||
|
||||
amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t
|
||||
@@ -1820,9 +1822,34 @@ struct_amdsmi_gpu_metrics_t._fields_ = [
|
||||
('xcp_stats', struct_amdsmi_gpu_xcp_metrics_t * 8),
|
||||
('pcie_lc_perf_other_end_recovery', ctypes.c_uint32),
|
||||
('PADDING_5', ctypes.c_ubyte * 4),
|
||||
('vram_max_bandwidth', ctypes.c_uint64),
|
||||
('xgmi_link_status', ctypes.c_uint16 * 8),
|
||||
]
|
||||
|
||||
amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t
|
||||
|
||||
# values for enumeration 'amdsmi_xgmi_link_status_type_t'
|
||||
amdsmi_xgmi_link_status_type_t__enumvalues = {
|
||||
0: 'AMDSMI_XGMI_LINK_DOWN',
|
||||
1: 'AMDSMI_XGMI_LINK_UP',
|
||||
2: 'AMDSMI_XGMI_LINK_DISABLE',
|
||||
}
|
||||
AMDSMI_XGMI_LINK_DOWN = 0
|
||||
AMDSMI_XGMI_LINK_UP = 1
|
||||
AMDSMI_XGMI_LINK_DISABLE = 2
|
||||
amdsmi_xgmi_link_status_type_t = ctypes.c_uint32 # enum
|
||||
class struct_amdsmi_xgmi_link_status_t(Structure):
|
||||
pass
|
||||
|
||||
struct_amdsmi_xgmi_link_status_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_xgmi_link_status_t._fields_ = [
|
||||
('total_links', ctypes.c_uint32),
|
||||
('status', amdsmi_xgmi_link_status_type_t * 8),
|
||||
('PADDING_0', ctypes.c_ubyte * 4),
|
||||
('reserved', ctypes.c_uint64 * 7),
|
||||
]
|
||||
|
||||
amdsmi_xgmi_link_status_t = struct_amdsmi_xgmi_link_status_t
|
||||
class struct_amdsmi_name_value_t(Structure):
|
||||
pass
|
||||
|
||||
@@ -2397,6 +2424,9 @@ amdsmi_get_pcie_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_
|
||||
amdsmi_get_xgmi_info = _libraries['libamd_smi.so'].amdsmi_get_xgmi_info
|
||||
amdsmi_get_xgmi_info.restype = amdsmi_status_t
|
||||
amdsmi_get_xgmi_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_info_t)]
|
||||
amdsmi_get_gpu_xgmi_link_status = _libraries['libamd_smi.so'].amdsmi_get_gpu_xgmi_link_status
|
||||
amdsmi_get_gpu_xgmi_link_status.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_xgmi_link_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_link_status_t)]
|
||||
amdsmi_get_fw_info = _libraries['libamd_smi.so'].amdsmi_get_fw_info
|
||||
amdsmi_get_fw_info.restype = amdsmi_status_t
|
||||
amdsmi_get_fw_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_fw_info_t)]
|
||||
@@ -2763,8 +2793,9 @@ __all__ = \
|
||||
'AMDSMI_VRAM_VENDOR__PLACEHOLDER3',
|
||||
'AMDSMI_VRAM_VENDOR__PLACEHOLDER4',
|
||||
'AMDSMI_VRAM_VENDOR__PLACEHOLDER5', 'AMDSMI_VRAM_VENDOR__SAMSUNG',
|
||||
'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_STATUS_ERROR',
|
||||
'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
|
||||
'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_LINK_DISABLE',
|
||||
'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP',
|
||||
'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
|
||||
'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN',
|
||||
'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t',
|
||||
'amdsmi_accelerator_partition_profile_t',
|
||||
@@ -2851,7 +2882,8 @@ __all__ = \
|
||||
'amdsmi_get_gpu_total_ecc_count', 'amdsmi_get_gpu_vbios_info',
|
||||
'amdsmi_get_gpu_vendor_name', 'amdsmi_get_gpu_volt_metric',
|
||||
'amdsmi_get_gpu_vram_info', 'amdsmi_get_gpu_vram_usage',
|
||||
'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_hsmp_metrics_table',
|
||||
'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_gpu_xgmi_link_status',
|
||||
'amdsmi_get_hsmp_metrics_table',
|
||||
'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version',
|
||||
'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest',
|
||||
'amdsmi_get_minmax_bandwidth_between_processors',
|
||||
@@ -2924,6 +2956,7 @@ __all__ = \
|
||||
'amdsmi_voltage_metric_t', 'amdsmi_voltage_type_t',
|
||||
'amdsmi_vram_info_t', 'amdsmi_vram_type_t', 'amdsmi_vram_usage_t',
|
||||
'amdsmi_vram_vendor_type_t', 'amdsmi_xgmi_info_t',
|
||||
'amdsmi_xgmi_link_status_t', 'amdsmi_xgmi_link_status_type_t',
|
||||
'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t',
|
||||
'struct__links', 'struct_amd_metrics_table_header_t',
|
||||
'struct_amdsmi_accelerator_partition_profile_t',
|
||||
@@ -2958,7 +2991,8 @@ __all__ = \
|
||||
'struct_amdsmi_vbios_info_t', 'struct_amdsmi_version_t',
|
||||
'struct_amdsmi_violation_status_t', 'struct_amdsmi_vram_info_t',
|
||||
'struct_amdsmi_vram_usage_t', 'struct_amdsmi_xgmi_info_t',
|
||||
'struct_cache_', 'struct_engine_usage_', 'struct_fw_info_list_',
|
||||
'struct_amdsmi_xgmi_link_status_t', 'struct_cache_',
|
||||
'struct_engine_usage_', 'struct_fw_info_list_',
|
||||
'struct_memory_usage_', 'struct_nps_flags_',
|
||||
'struct_pcie_metric_', 'struct_pcie_static_',
|
||||
'struct_amdsmi_bdf_t', 'uint32_t', 'uint64_t', 'uint8_t',
|
||||
|
||||
@@ -1085,6 +1085,9 @@ typedef struct metrics_table_header_t metrics_table_header_t;
|
||||
* @brief The following structures hold the gpu statistics for a device.
|
||||
*/
|
||||
struct amdgpu_xcp_metrics_t {
|
||||
/*
|
||||
* v1.6 additions
|
||||
*/
|
||||
/* Utilization Instantaneous (%) */
|
||||
uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC];
|
||||
uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS];
|
||||
@@ -1092,6 +1095,12 @@ struct amdgpu_xcp_metrics_t {
|
||||
|
||||
/* Utilization Accumulated (%) */
|
||||
uint64_t gfx_busy_acc[RSMI_MAX_NUM_XCC];
|
||||
|
||||
/*
|
||||
* v1.7 additions
|
||||
*/
|
||||
/* Total App Clock Counter Accumulated */
|
||||
uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC];
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@@ -1295,6 +1304,15 @@ typedef struct {
|
||||
/* PCIE other end recovery counter */
|
||||
uint32_t pcie_lc_perf_other_end_recovery;
|
||||
|
||||
/*
|
||||
* v1.7 additions
|
||||
*/
|
||||
/* VRAM max bandwidth at max memory clock */
|
||||
uint64_t vram_max_bandwidth;
|
||||
|
||||
/* XGMI link status(up/down) */
|
||||
uint16_t xgmi_link_status[RSMI_MAX_NUM_XGMI_LINKS];
|
||||
|
||||
/// \endcond
|
||||
} rsmi_gpu_metrics_t;
|
||||
|
||||
|
||||
@@ -88,6 +88,19 @@ struct AMDGpuMetricsHeader_v1_t {
|
||||
uint8_t m_content_revision;
|
||||
};
|
||||
|
||||
struct amdgpu_xcp_metrics_v1_1 {
|
||||
/* Utilization Instantaneous (%) */
|
||||
uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
|
||||
uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES];
|
||||
uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS];
|
||||
|
||||
/* Utilization Accumulated (%) */
|
||||
uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC];
|
||||
|
||||
/* Total App Clock Counter Accumulated */
|
||||
uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC];
|
||||
};
|
||||
|
||||
struct amdgpu_xcp_metrics {
|
||||
/* Utilization Instantaneous (%) */
|
||||
uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
|
||||
@@ -551,7 +564,107 @@ struct AMDGpuMetrics_v16_t {
|
||||
/* PCIE other end recovery counter */
|
||||
uint32_t m_pcie_lc_perf_other_end_recovery;
|
||||
};
|
||||
using AMGpuMetricsLatest_t = AMDGpuMetrics_v16_t;
|
||||
|
||||
struct AMDGpuMetrics_v17_t {
|
||||
~AMDGpuMetrics_v17_t() = default;
|
||||
struct AMDGpuMetricsHeader_v1_t m_common_header;
|
||||
|
||||
/* Temperature (Celsius) */
|
||||
uint16_t m_temperature_hotspot;
|
||||
uint16_t m_temperature_mem;
|
||||
uint16_t m_temperature_vrsoc;
|
||||
|
||||
/* Power (Watts) */
|
||||
uint16_t m_current_socket_power;
|
||||
|
||||
/* Utilization (%) */
|
||||
uint16_t m_average_gfx_activity;
|
||||
uint16_t m_average_umc_activity; // memory controller
|
||||
|
||||
/* VRAM max bandwidth at max memory clock (GB/s) */
|
||||
uint64_t m_vram_max_bandwidth; // new for 1.7
|
||||
|
||||
/* Energy (15.259uJ (2^-16) units) */
|
||||
uint64_t m_energy_accumulator;
|
||||
|
||||
/* Driver attached timestamp (in ns) */
|
||||
uint64_t m_system_clock_counter;
|
||||
|
||||
/* Accumulation cycle counter */
|
||||
uint32_t m_accumulation_counter;
|
||||
|
||||
/* Accumulated throttler residencies */
|
||||
uint32_t m_prochot_residency_acc;
|
||||
uint32_t m_ppt_residency_acc;
|
||||
uint32_t m_socket_thm_residency_acc;
|
||||
uint32_t m_vr_thm_residency_acc;
|
||||
uint32_t m_hbm_thm_residency_acc;
|
||||
|
||||
/* Clock Lock Status. Each bit corresponds to clock instance */
|
||||
uint32_t m_gfxclk_lock_status;
|
||||
|
||||
/* Link width (number of lanes) and speed (in 0.1 GT/s) */
|
||||
uint16_t m_pcie_link_width;
|
||||
uint16_t m_pcie_link_speed;
|
||||
|
||||
/* XGMI bus width and bitrate (in Gbps) */
|
||||
uint16_t m_xgmi_link_width;
|
||||
uint16_t m_xgmi_link_speed;
|
||||
|
||||
/* Utilization Accumulated (%) */
|
||||
uint32_t m_gfx_activity_acc;
|
||||
uint32_t m_mem_activity_acc;
|
||||
|
||||
/*PCIE accumulated bandwidth (GB/sec) */
|
||||
uint64_t m_pcie_bandwidth_acc;
|
||||
|
||||
/*PCIE instantaneous bandwidth (GB/sec) */
|
||||
uint64_t m_pcie_bandwidth_inst;
|
||||
|
||||
/* PCIE L0 to recovery state transition accumulated count */
|
||||
uint64_t m_pcie_l0_to_recov_count_acc;
|
||||
|
||||
/* PCIE replay accumulated count */
|
||||
uint64_t m_pcie_replay_count_acc;
|
||||
|
||||
/* PCIE replay rollover accumulated count */
|
||||
uint64_t m_pcie_replay_rover_count_acc;
|
||||
|
||||
/* PCIE NAK sent accumulated count */
|
||||
uint32_t m_pcie_nak_sent_count_acc;
|
||||
|
||||
/* PCIE NAK received accumulated count */
|
||||
uint32_t m_pcie_nak_rcvd_count_acc;
|
||||
|
||||
/* XGMI accumulated data transfer size(KiloBytes) */
|
||||
uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
|
||||
uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
|
||||
|
||||
/* XGMI link status(up/down) */
|
||||
uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; // new for 1.7
|
||||
|
||||
uint16_t m_padding;
|
||||
|
||||
/* PMFW attached timestamp (10ns resolution) */
|
||||
uint64_t m_firmware_timestamp;
|
||||
|
||||
/* Current clocks (Mhz) */
|
||||
uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
|
||||
uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_uclk;
|
||||
|
||||
/* Number of current partition */
|
||||
uint16_t m_num_partition;
|
||||
|
||||
/* XCP metrics stats */
|
||||
struct amdgpu_xcp_metrics_v1_1 m_xcp_stats[kRSMI_MAX_NUM_XCP];
|
||||
|
||||
/* PCIE other end recovery counter */
|
||||
uint32_t m_pcie_lc_perf_other_end_recovery;
|
||||
};
|
||||
using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t;
|
||||
|
||||
/**
|
||||
* This is GPU Metrics version that gets to public access.
|
||||
@@ -766,8 +879,11 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
|
||||
kMetricJpegBusy, // v1.6
|
||||
kMetricVcnBusy, // v1.6
|
||||
kMetricGfxBusyAcc, // v1.6
|
||||
|
||||
kMetricPcieLCPerfOtherEndRecov, // v1.6
|
||||
|
||||
kMetricVramMaxBandwidth, // v1.7
|
||||
kMetricXgmiLinkStatus, // v1.7
|
||||
kMetricGfxBelowHostLimitAccumulator, // v1.7
|
||||
};
|
||||
using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map<AMDGpuMetricsUnitType_t, std::string>;
|
||||
|
||||
@@ -805,6 +921,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
|
||||
kGpuMetricV14 = (0x1 << 4),
|
||||
kGpuMetricV15 = (0x1 << 5),
|
||||
kGpuMetricV16 = (0x1 << 6),
|
||||
kGpuMetricV17 = (0x1 << 7),
|
||||
};
|
||||
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
|
||||
using GpuMetricTypePtr_t = std::shared_ptr<void>;
|
||||
@@ -1023,6 +1140,36 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t {
|
||||
std::shared_ptr<AMDGpuMetrics_v16_t> m_gpu_metric_ptr;
|
||||
};
|
||||
|
||||
class GpuMetricsBase_v17_t final : public GpuMetricsBase_t {
|
||||
public:
|
||||
~GpuMetricsBase_v17_t() = default;
|
||||
|
||||
size_t sizeof_metric_table() override {
|
||||
return sizeof(AMDGpuMetrics_v17_t);
|
||||
}
|
||||
|
||||
GpuMetricTypePtr_t get_metrics_table() override {
|
||||
if (!m_gpu_metric_ptr) {
|
||||
m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v17_t*){});
|
||||
}
|
||||
assert(m_gpu_metric_ptr != nullptr);
|
||||
return m_gpu_metric_ptr;
|
||||
}
|
||||
|
||||
void dump_internal_metrics_table() override;
|
||||
|
||||
AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
|
||||
return AMDGpuMetricVersionFlags_t::kGpuMetricV17;
|
||||
}
|
||||
|
||||
rsmi_status_t populate_metrics_dynamic_tbl() override;
|
||||
AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;
|
||||
|
||||
private:
|
||||
AMDGpuMetrics_v17_t m_gpu_metrics_tbl;
|
||||
std::shared_ptr<AMDGpuMetrics_v17_t> m_gpu_metric_ptr;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind,
|
||||
AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
|
||||
|
||||
@@ -136,6 +136,7 @@ std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metri
|
||||
// version 1.4: 260
|
||||
// version 1.5: 261
|
||||
// version 1.6: 262
|
||||
// version 1.7: 263
|
||||
//
|
||||
const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table
|
||||
{
|
||||
@@ -145,6 +146,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
|
||||
{join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14},
|
||||
{join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15},
|
||||
{join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16},
|
||||
{join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17},
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -264,6 +266,12 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
|
||||
|
||||
// kGpuMetricLinkWidthSpeed
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */
|
||||
|
||||
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator,
|
||||
"GfxBelowHostLimitAccumulator"}, /* v1.7 */
|
||||
};
|
||||
|
||||
|
||||
@@ -352,6 +360,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared<GpuMetricsBase_v14_t>(GpuMetricsBase_v14_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared<GpuMetricsBase_v16_t>(GpuMetricsBase_v16_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared<GpuMetricsBase_v17_t>(GpuMetricsBase_v17_t{})},
|
||||
};
|
||||
|
||||
GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
|
||||
@@ -470,6 +479,197 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
|
||||
return multi_values;
|
||||
}
|
||||
|
||||
void GpuMetricsBase_v17_t::dump_internal_metrics_table()
|
||||
{
|
||||
std::ostringstream ss;
|
||||
auto idx = uint64_t(0);
|
||||
auto idy = uint64_t(0);
|
||||
std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= DEBUG ======= "
|
||||
<< " | Metric Version: "
|
||||
<< stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
|
||||
<< " | Size: "
|
||||
<< print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
|
||||
<< " |"
|
||||
<< "\n";
|
||||
ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
|
||||
<< " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
|
||||
<< " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
|
||||
<< " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
|
||||
<< " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
|
||||
<< " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
|
||||
|
||||
ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7
|
||||
<< " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
|
||||
<< " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
|
||||
<< " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n"
|
||||
<< " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n"
|
||||
<< " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n"
|
||||
<< " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n"
|
||||
<< " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n"
|
||||
<< " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n"
|
||||
<< " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
|
||||
<< " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
|
||||
<< " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
|
||||
<< " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
|
||||
<< " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
|
||||
<< " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
|
||||
<< " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
|
||||
<< " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
|
||||
<< " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
|
||||
<< " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
|
||||
<< " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
|
||||
<< " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
|
||||
<< " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
|
||||
<< " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"
|
||||
<< " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"
|
||||
<< " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n"
|
||||
<< " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n"
|
||||
<< " pcie_lc_perf_other_end_recovery: "
|
||||
<< m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ss << " xgmi_write_data_acc: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ss << " current_gfxclk: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ss << " current_socclk: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ss << " current_vclk0: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ss << " current_dclk0: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
|
||||
ss << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
idy = 0;
|
||||
ss << " xcp_stats.gfx_busy_inst: " << "\n";
|
||||
for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
|
||||
if (idx == 0) {
|
||||
ss << "\t [ ";
|
||||
}
|
||||
for (auto& col : row.gfx_busy_inst) {
|
||||
ss << "\t [" << idx << "] [" << idy << "]: " << col;
|
||||
if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) {
|
||||
ss << ", ";
|
||||
}
|
||||
if (idx + 1 !=
|
||||
(std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
|
||||
ss << "\n";
|
||||
} else {
|
||||
ss << "]\n";
|
||||
}
|
||||
idy++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
idy = 0;
|
||||
ss << " xcp_stats.vcn_busy: " << "\n";
|
||||
for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
|
||||
if (idx == 0) {
|
||||
ss << "\t [ ";
|
||||
}
|
||||
for (auto& col : row.vcn_busy) {
|
||||
ss << "\t [" << idx << "] [" << idy << "]: " << col;
|
||||
if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) {
|
||||
ss << ", ";
|
||||
}
|
||||
if (idx + 1 !=
|
||||
(std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
|
||||
ss << "\n";
|
||||
} else {
|
||||
ss << "]\n";
|
||||
}
|
||||
idy++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
idy = 0;
|
||||
ss << " xcp_stats.jpeg_busy: " << "\n";
|
||||
for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
|
||||
if (idx == 0) {
|
||||
ss << "\t [ ";
|
||||
}
|
||||
for (auto& col : row.jpeg_busy) {
|
||||
ss << "\t [" << idx << "] [" << idy << "]: " << col;
|
||||
if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) {
|
||||
ss << ", ";
|
||||
}
|
||||
if (idx + 1 !=
|
||||
(std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
|
||||
ss << "\n";
|
||||
} else {
|
||||
ss << "]\n";
|
||||
}
|
||||
idy++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
idy = 0;
|
||||
ss << " xcp_stats.gfx_busy_acc: " << "\n";
|
||||
for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) {
|
||||
if (idx == 0) {
|
||||
ss << "\t [ ";
|
||||
}
|
||||
for (auto& col : row.gfx_busy_acc) {
|
||||
ss << "\t [" << idx << "] [" << idy << "]: " << col;
|
||||
if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) {
|
||||
ss << ", ";
|
||||
}
|
||||
if (idx + 1 !=
|
||||
(std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) {
|
||||
ss << "\n";
|
||||
} else {
|
||||
ss << "]\n";
|
||||
}
|
||||
idy++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
|
||||
void GpuMetricsBase_v16_t::dump_internal_metrics_table()
|
||||
{
|
||||
std::ostringstream ss;
|
||||
@@ -663,6 +863,263 @@ void GpuMetricsBase_v16_t::dump_internal_metrics_table()
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
/**
 * @brief Rebuilds the dynamic metrics table (metric class -> metric unit ->
 *        formatted values) from the v1.7 internal table.
 *
 * Any previous content of the dynamic table is discarded. Before the rows
 * are written, m_firmware_timestamp is multiplied by 10 in place (the field
 * is reported at 10ns resolution).
 *
 * @return The initial status value, RSMI_STATUS_SUCCESS; nothing in this
 *         function changes it.
 */
rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
  using ClassId = AMDGpuMetricsClassId_t;
  using Unit = AMDGpuMetricsUnitType_t;

  std::ostringstream ss;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  m_metrics_dynamic_tbl.clear();

  //
  // Note: value adjustments (if any) have to be applied before the values
  //       are written to the internal/external tables.
  //
  auto apply_v17_adjustments = [&]() {
    ss << __PRETTY_FUNCTION__ << " | ======= start =======";
    const auto gpu_metrics_version =
      translate_flag_to_metric_version(get_gpu_metrics_version_used());
    ss << __PRETTY_FUNCTION__
       << " | ======= info ======= "
       << " | Applying adjustments "
       << " | Metric Version: " << stringfy_metric_header_version(
                                     disjoin_metrics_version(gpu_metrics_version))
       << " |";
    LOG_TRACE(ss);

    // firmware_timestamp is at 10ns resolution
    const auto adjusted_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
    ss << __PRETTY_FUNCTION__
       << " | ======= Changes ======= "
       << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
       << " to: " << adjusted_timestamp;
    m_gpu_metrics_tbl.m_firmware_timestamp = adjusted_timestamp;
    LOG_DEBUG(ss);
  };

  // Adjustments/Changes specific to this version
  apply_v17_adjustments();

  // Files one formatted row under its metric class; an already present unit
  // is left untouched (insert semantics).
  auto add_row = [this](ClassId class_id, Unit unit_type,
                        AMDGpuDynamicMetricTblValues_t row_values) {
    m_metrics_dynamic_tbl[class_id]
      .insert(std::make_pair(unit_type, std::move(row_values)));
  };

  auto& tbl = m_gpu_metrics_tbl;
  auto& first_xcp = tbl.m_xcp_stats[0];  // only the first XCP entry is exported

  // Temperature Info
  add_row(ClassId::kGpuMetricTemperature, Unit::kMetricTempHotspot,
          format_metric_row(tbl.m_temperature_hotspot, "temperature_hotspot"));
  add_row(ClassId::kGpuMetricTemperature, Unit::kMetricTempMem,
          format_metric_row(tbl.m_temperature_mem, "temperature_mem"));
  add_row(ClassId::kGpuMetricTemperature, Unit::kMetricTempVrSoc,
          format_metric_row(tbl.m_temperature_vrsoc, "temperature_vrsoc"));

  // Power/Energy Info
  add_row(ClassId::kGpuMetricPowerEnergy, Unit::kMetricCurrSocketPower,
          format_metric_row(tbl.m_current_socket_power, "curr_socket_power"));
  add_row(ClassId::kGpuMetricPowerEnergy, Unit::kMetricEnergyAccumulator,
          format_metric_row(tbl.m_energy_accumulator, "energy_acc"));

  // Utilization Info
  add_row(ClassId::kGpuMetricUtilization, Unit::kMetricAvgGfxActivity,
          format_metric_row(tbl.m_average_gfx_activity, "average_gfx_activity"));
  add_row(ClassId::kGpuMetricUtilization, Unit::kMetricAvgUmcActivity,
          format_metric_row(tbl.m_average_umc_activity, "average_umc_activity"));
  add_row(ClassId::kGpuMetricUtilization, Unit::kMetricGfxActivityAccumulator,
          format_metric_row(tbl.m_gfx_activity_acc, "gfx_activity_acc"));
  add_row(ClassId::kGpuMetricUtilization, Unit::kMetricMemActivityAccumulator,
          format_metric_row(tbl.m_mem_activity_acc, "mem_activity_acc"));

  // Timestamp Info
  add_row(ClassId::kGpuMetricTimestamp, Unit::kMetricTSFirmware,
          format_metric_row(tbl.m_firmware_timestamp, "firmware_timestamp"));
  add_row(ClassId::kGpuMetricTimestamp, Unit::kMetricTSClockCounter,
          format_metric_row(tbl.m_system_clock_counter, "system_clock_counter"));

  // GfxLock Info
  add_row(ClassId::kGpuMetricGfxClkLockStatus, Unit::kMetricGfxClkLockStatus,
          format_metric_row(tbl.m_gfxclk_lock_status, "gfxclk_lock_status"));

  // Link/Width/Speed Info
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieLinkWidth,
          format_metric_row(tbl.m_pcie_link_width, "pcie_link_width"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieLinkSpeed,
          format_metric_row(tbl.m_pcie_link_speed, "pcie_link_speed"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkWidth,
          format_metric_row(tbl.m_xgmi_link_width, "xgmi_link_width"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkSpeed,
          format_metric_row(tbl.m_xgmi_link_speed, "xgmi_link_speed"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieBandwidthAccumulator,
          format_metric_row(tbl.m_pcie_bandwidth_acc, "pcie_bandwidth_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieBandwidthInst,
          format_metric_row(tbl.m_pcie_bandwidth_inst, "pcie_bandwidth_inst"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieL0RecovCountAccumulator,
          format_metric_row(tbl.m_pcie_l0_to_recov_count_acc, "pcie_l0_recov_count_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieReplayCountAccumulator,
          format_metric_row(tbl.m_pcie_replay_count_acc, "pcie_replay_count_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieReplayRollOverCountAccumulator,
          format_metric_row(tbl.m_pcie_replay_rover_count_acc,
                            "pcie_replay_rollover_count_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieNakSentCountAccumulator,
          format_metric_row(tbl.m_pcie_nak_sent_count_acc, "pcie_nak_sent_count_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieNakReceivedCountAccumulator,
          format_metric_row(tbl.m_pcie_nak_rcvd_count_acc, "pcie_nak_rcvd_count_acc"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiReadDataAccumulator,
          format_metric_row(tbl.m_xgmi_read_data_acc, "[xgmi_read_data_acc]"));
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiWriteDataAccumulator,
          format_metric_row(tbl.m_xgmi_write_data_acc, "[xgmi_write_data_acc]"));
  // new for v1.7
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkStatus,
          format_metric_row(tbl.m_xgmi_link_status, "[xgmi_link_status]"));

  // CurrentClock Info
  add_row(ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrGfxClock,
          format_metric_row(tbl.m_current_gfxclk, "[current_gfxclk]"));
  add_row(ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrSocClock,
          format_metric_row(tbl.m_current_socclk, "[current_socclk]"));
  add_row(ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrVClock0,
          format_metric_row(tbl.m_current_vclk0, "[current_vclk0]"));
  add_row(ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrDClock0,
          format_metric_row(tbl.m_current_dclk0, "[current_dclk0]"));
  add_row(ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrUClock,
          format_metric_row(tbl.m_current_uclk, "current_uclk"));

  /* Accumulation cycle counter */
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricAccumulationCounter,
          format_metric_row(tbl.m_accumulation_counter, "accumulation_counter"));

  /* Accumulated throttler residencies */
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricProchotResidencyAccumulator,
          format_metric_row(tbl.m_prochot_residency_acc, "prochot_residency_acc"));
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricPPTResidencyAccumulator,
          format_metric_row(tbl.m_ppt_residency_acc, "ppt_residency_acc"));
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricSocketThmResidencyAccumulator,
          format_metric_row(tbl.m_socket_thm_residency_acc, "socket_thm_residency_acc"));
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricVRThmResidencyAccumulator,
          format_metric_row(tbl.m_vr_thm_residency_acc, "vr_thm_residency_acc"));
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricHBMThmResidencyAccumulator,
          format_metric_row(tbl.m_hbm_thm_residency_acc, "hbm_thm_residency_acc"));

  /* Partition info */
  add_row(ClassId::kGpuMetricPartition, Unit::kGpuMetricNumPartition,
          format_metric_row(tbl.m_num_partition, "num_partition"));

  /* xcp_stats info */
  add_row(ClassId::kGpuMetricXcpStats, Unit::kMetricGfxBusyInst,
          format_metric_row(first_xcp.gfx_busy_inst, "xcp_stats->gfx_busy_inst"));
  add_row(ClassId::kGpuMetricXcpStats, Unit::kMetricVcnBusy,
          format_metric_row(first_xcp.vcn_busy, "xcp_stats->vcn_busy"));
  add_row(ClassId::kGpuMetricXcpStats, Unit::kMetricJpegBusy,
          format_metric_row(first_xcp.jpeg_busy, "xcp_stats->jpeg_busy"));
  add_row(ClassId::kGpuMetricXcpStats, Unit::kMetricGfxBusyAcc,
          format_metric_row(first_xcp.gfx_busy_acc, "xcp_stats->gfx_busy_acc"));

  /* PCIE other end recovery counter info */
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieLCPerfOtherEndRecov,
          format_metric_row(tbl.m_pcie_lc_perf_other_end_recovery,
                            "pcie_lc_perf_other_end_recovery"));

  /* VRAM max bandwidth at max memory clock */
  add_row(ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricVramMaxBandwidth,
          format_metric_row(tbl.m_vram_max_bandwidth, "vram_max_bandwidth"));

  /* Total App Clock Counter Accumulated */
  add_row(ClassId::kGpuMetricThrottleResidency, Unit::kMetricGfxBelowHostLimitAccumulator,
          format_metric_row(first_xcp.gfx_below_host_limit_acc,
                            "gfx_below_host_limit_acc"));

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success "
     << " | Returning = " << getRSMIStatusString(status_code)
     << " |";
  LOG_TRACE(ss);

  return status_code;
}
|
||||
|
||||
rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
|
||||
std::ostringstream ss;
|
||||
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
|
||||
@@ -700,7 +1157,6 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
|
||||
|
||||
// Adjustments/Changes specific to this version
|
||||
run_metric_adjustments_v16();
|
||||
|
||||
// Temperature Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot,
|
||||
@@ -1594,6 +2050,12 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
rsmi_gpu_metrics.pcie_link_speed = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_link_speed)>();
|
||||
rsmi_gpu_metrics.gfx_activity_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.gfx_activity_acc)>();
|
||||
rsmi_gpu_metrics.mem_activity_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.mem_activity_acc)>();
|
||||
rsmi_gpu_metrics.vram_max_bandwidth = init_max_uint_types<decltype(rsmi_gpu_metrics.vram_max_bandwidth)>();
|
||||
|
||||
std::fill(std::begin(rsmi_gpu_metrics.xgmi_link_status),
|
||||
std::end(rsmi_gpu_metrics.xgmi_link_status),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
|
||||
std::fill(std::begin(rsmi_gpu_metrics.temperature_hbm),
|
||||
std::end(rsmi_gpu_metrics.temperature_hbm),
|
||||
@@ -1671,6 +2133,8 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
std::fill(std::begin(row.gfx_busy_acc), std::end(row.gfx_busy_acc),
|
||||
init_max_uint_types<std::uint64_t>());
|
||||
std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc),
|
||||
init_max_uint_types<std::uint64_t>());
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
@@ -1683,6 +2147,225 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
return status_code;
|
||||
}
|
||||
|
||||
/**
 * Builds the public (external) GPU metrics structure from the internal
 * v1.7 metrics table held in m_gpu_metrics_tbl.
 *
 * Every public field is first set to its maximum value by
 * init_max_public_gpu_matrics(); a field that still holds the maximum
 * afterwards therefore had no data assigned to it.
 *
 * @return Tuple of (status code, populated public metrics). The status code
 *         returned by this function is always RSMI_STATUS_SUCCESS.
 */
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v17_t::copy_internal_to_external_metrics()
{
  std::ostringstream ss;
  rsmi_status_t status_code = rsmi_status_t::RSMI_STATUS_SUCCESS;
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  // Short read-only alias for the internal table that is being exported.
  const auto& tbl = m_gpu_metrics_tbl;

  // Start from "all fields at max" so unassigned fields stay recognizable.
  AMGpuMetricsPublicLatest_t pub{};
  init_max_public_gpu_matrics(pub);

  // ---- Common header ----
  pub.common_header.structure_size = tbl.m_common_header.m_structure_size;
  pub.common_header.format_revision = tbl.m_common_header.m_format_revision;
  pub.common_header.content_revision = tbl.m_common_header.m_content_revision;

  // ---- Temperature ----
  pub.temperature_hotspot = tbl.m_temperature_hotspot;
  pub.temperature_mem = tbl.m_temperature_mem;
  pub.temperature_vrsoc = tbl.m_temperature_vrsoc;

  // ---- Power ----
  pub.current_socket_power = tbl.m_current_socket_power;

  // ---- Utilization ----
  pub.average_gfx_activity = tbl.m_average_gfx_activity;
  pub.average_umc_activity = tbl.m_average_umc_activity;

  // ---- Power/Energy ----
  pub.energy_accumulator = tbl.m_energy_accumulator;

  // ---- Driver attached timestamp (in ns) ----
  pub.system_clock_counter = tbl.m_system_clock_counter;

  // ---- Clock lock status; each bit corresponds to a clock instance ----
  pub.gfxclk_lock_status = tbl.m_gfxclk_lock_status;

  // ---- PCIe link width (number of lanes) and speed ----
  pub.pcie_link_width = tbl.m_pcie_link_width;
  pub.pcie_link_speed = tbl.m_pcie_link_speed;

  // ---- XGMI bus width and bitrate ----
  pub.xgmi_link_width = tbl.m_xgmi_link_width;
  pub.xgmi_link_speed = tbl.m_xgmi_link_speed;

  // ---- Accumulated utilization ----
  pub.gfx_activity_acc = tbl.m_gfx_activity_acc;
  pub.mem_activity_acc = tbl.m_mem_activity_acc;

  // ---- PCIe bandwidth (accumulated / instantaneous) and event counters ----
  pub.pcie_bandwidth_acc = tbl.m_pcie_bandwidth_acc;
  pub.pcie_bandwidth_inst = tbl.m_pcie_bandwidth_inst;
  pub.pcie_l0_to_recov_count_acc = tbl.m_pcie_l0_to_recov_count_acc;
  pub.pcie_replay_count_acc = tbl.m_pcie_replay_count_acc;
  pub.pcie_replay_rover_count_acc = tbl.m_pcie_replay_rover_count_acc;
  pub.pcie_nak_sent_count_acc = tbl.m_pcie_nak_sent_count_acc;
  pub.pcie_nak_rcvd_count_acc = tbl.m_pcie_nak_rcvd_count_acc;

  // ---- Accumulated throttler residencies ----
  // The public fields were bumped up to uint64_t due to a planned size
  // increase for newer ASICs.
  pub.accumulation_counter = tbl.m_accumulation_counter;
  pub.prochot_residency_acc = tbl.m_prochot_residency_acc;
  pub.ppt_residency_acc = tbl.m_ppt_residency_acc;
  pub.socket_thm_residency_acc = tbl.m_socket_thm_residency_acc;
  pub.vr_thm_residency_acc = tbl.m_vr_thm_residency_acc;
  pub.hbm_thm_residency_acc = tbl.m_hbm_thm_residency_acc;

  // ---- VRAM max bandwidth at max memory clock ----
  pub.vram_max_bandwidth = tbl.m_vram_max_bandwidth;

  // ---- XGMI accumulated data transfer size (read, then write) ----
  std::copy(std::begin(tbl.m_xgmi_read_data_acc),
            std::end(tbl.m_xgmi_read_data_acc),
            pub.xgmi_read_data_acc);
  std::copy(std::begin(tbl.m_xgmi_write_data_acc),
            std::end(tbl.m_xgmi_write_data_acc),
            pub.xgmi_write_data_acc);

  // ---- XGMI link status (new for 1.7) ----
  std::copy(std::begin(tbl.m_xgmi_link_status),
            std::end(tbl.m_xgmi_link_status),
            pub.xgmi_link_status);

  // ---- PMFW attached timestamp (10ns resolution) ----
  pub.firmware_timestamp = tbl.m_firmware_timestamp;

  // ---- Current clocks: the whole internal array goes to the public one ----
  std::copy(std::begin(tbl.m_current_gfxclk),
            std::end(tbl.m_current_gfxclk),
            pub.current_gfxclks);
  std::copy(std::begin(tbl.m_current_socclk),
            std::end(tbl.m_current_socclk),
            pub.current_socclks);
  std::copy(std::begin(tbl.m_current_vclk0),
            std::end(tbl.m_current_vclk0),
            pub.current_vclk0s);
  std::copy(std::begin(tbl.m_current_dclk0),
            std::end(tbl.m_current_dclk0),
            pub.current_dclk0s);

  pub.current_uclk = tbl.m_current_uclk;
  pub.num_partition = tbl.m_num_partition;
  pub.pcie_lc_perf_other_end_recovery = tbl.m_pcie_lc_perf_other_end_recovery;

  // ---- Per-XCP statistics ----
  // Walk the public rows and advance through the internal rows in lock-step.
  auto src_xcp = std::begin(tbl.m_xcp_stats);
  for (auto& dst_xcp : pub.xcp_stats) {
    std::copy_n(std::begin(src_xcp->gfx_busy_inst), RSMI_MAX_NUM_XCC,
                dst_xcp.gfx_busy_inst);
    std::copy_n(std::begin(src_xcp->jpeg_busy), RSMI_MAX_NUM_JPEG_ENGS,
                dst_xcp.jpeg_busy);
    std::copy_n(std::begin(src_xcp->vcn_busy), RSMI_MAX_NUM_VCNS,
                dst_xcp.vcn_busy);
    std::copy_n(std::begin(src_xcp->gfx_busy_acc), RSMI_MAX_NUM_XCC,
                dst_xcp.gfx_busy_acc);
    std::copy_n(std::begin(src_xcp->gfx_below_host_limit_acc), RSMI_MAX_NUM_XCC,
                dst_xcp.gfx_below_host_limit_acc);
    ++src_xcp;
  }

  // ---- Backwards compatibility with earlier versions (1.3/1.4/1.5) ----
  // The scalar clock fields are filled from the per-instance arrays.
  pub.current_gfxclk = pub.current_gfxclks[0];
  pub.current_socclk = pub.current_socclks[0];
  pub.current_vclk0 = pub.current_vclk0s[0];
  pub.current_vclk1 = pub.current_vclk0s[1];
  pub.current_dclk0 = pub.current_dclk0s[0];
  pub.current_dclk1 = pub.current_dclk0s[1];

  // ---- Separate by XCP ----
  // The flat vcn/jpeg activity arrays are filled from the row of the current
  // partition, and only when the first entry of that row is not UINT16_MAX.
  if (this->m_partition_id < kRSMI_MAX_NUM_XCP) {
    const auto& partition_stats = tbl.m_xcp_stats[this->m_partition_id];
    if (partition_stats.vcn_busy[0] != UINT16_MAX) {
      std::copy(std::begin(partition_stats.vcn_busy),
                std::end(partition_stats.vcn_busy),
                std::begin(pub.vcn_activity));
    }
    if (partition_stats.jpeg_busy[0] != UINT16_MAX) {
      std::copy(std::begin(partition_stats.jpeg_busy),
                std::end(partition_stats.jpeg_busy),
                std::begin(pub.jpeg_activity));
    }
  }

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success "
     << " | Returning = " << getRSMIStatusString(status_code)
     << " |";
  LOG_TRACE(ss);

  return std::make_tuple(status_code, pub);
}
|
||||
|
||||
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v16_t::copy_internal_to_external_metrics()
|
||||
{
|
||||
std::ostringstream ss;
|
||||
|
||||
@@ -52,6 +52,7 @@
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "amd_smi/impl/amd_smi_processor.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
// a global instance of std::mutex to protect data passed during threads
|
||||
std::mutex myMutex;
|
||||
@@ -80,7 +81,7 @@ static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle proces
|
||||
if (r != AMDSMI_STATUS_SUCCESS) return r;
|
||||
|
||||
if (device->get_processor_type() == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
|
||||
*gpudevice = static_cast<amd::smi::AMDSmiGPUDevice*>(processor_handle);
|
||||
*gpudevice = static_cast<amd::smi::AMDSmiGPUDevice*>(device);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -665,8 +666,11 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
|
||||
amdsmi_gpu_metrics_t metric_info_a = {};
|
||||
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
|
||||
processor_handle, &metric_info_a);
|
||||
processor_handle, &metric_info_a);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_metrics_info failed with status = " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -1053,6 +1057,43 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Reports the state of the XGMI links of a GPU.
 *
 * The per-link values are read from the xgmi_link_status array returned by
 * amdsmi_get_gpu_metrics_info() and translated as follows:
 *   - maximum uint16_t value -> AMDSMI_XGMI_LINK_DISABLE
 *   - 0                      -> AMDSMI_XGMI_LINK_DOWN
 *   - 1                      -> AMDSMI_XGMI_LINK_UP
 *
 * The number of reported links is the number of monitored devices, capped at
 * AMDSMI_MAX_NUM_XGMI_LINKS.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param link_status       Output structure. It is written only when the call
 *                          succeeds; on any failure it is left untouched.
 *
 * @return AMDSMI_STATUS_SUCCESS on success,
 *         AMDSMI_STATUS_INVAL if link_status is null,
 *         the status of amdsmi_get_gpu_metrics_info() if that call fails,
 *         AMDSMI_STATUS_API_FAILED if the device count cannot be queried,
 *         AMDSMI_STATUS_UNEXPECTED_DATA if a link reports an unknown value.
 */
amdsmi_status_t
amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle,
                                amdsmi_xgmi_link_status_t *link_status) {
    AMDSMI_CHECK_INIT();

    if (link_status == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_gpu_metrics_t metric_info = {};
    const amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
            processor_handle, &metric_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Do not ignore a failed device-count query: a zero count would otherwise
    // be reported to the caller as "success with no links".
    uint32_t dev_num = 0;
    const auto r = rsmi_num_monitor_devices(&dev_num);
    if (r != RSMI_STATUS_SUCCESS) {
        return AMDSMI_STATUS_API_FAILED;
    }

    // Assemble the answer in a zero-initialized local so the caller never sees
    // a half-filled structure or uninitialized entries.
    amdsmi_xgmi_link_status_t result = {};
    result.total_links = AMDSMI_MAX_NUM_XGMI_LINKS;
    if (dev_num <= result.total_links) {
        result.total_links = dev_num;
    }

    // Translate the raw metric values into the public enum.
    for (uint32_t i = 0; i < result.total_links; i++) {
        const auto raw_state = metric_info.xgmi_link_status[i];
        if (raw_state == std::numeric_limits<uint16_t>::max()) {
            result.status[i] = AMDSMI_XGMI_LINK_DISABLE;
        } else if (raw_state == 0) {
            result.status[i] = AMDSMI_XGMI_LINK_DOWN;
        } else if (raw_state == 1) {
            result.status[i] = AMDSMI_XGMI_LINK_UP;
        } else {
            return AMDSMI_STATUS_UNEXPECTED_DATA;
        }
    }

    *link_status = result;
    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_kfd_info_t *info) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
@@ -1135,6 +1176,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
info->vram_size = 0;
|
||||
info->vram_vendor = AMDSMI_VRAM_VENDOR__PLACEHOLDER0;
|
||||
info->vram_bit_width = std::numeric_limits<decltype(info->vram_bit_width)>::max();
|
||||
info->vram_max_bandwidth = std::numeric_limits<decltype(info->vram_max_bandwidth)>::max();
|
||||
|
||||
// Only can read vram type from libdrm
|
||||
if (gpu_device->check_if_drm_is_supported()) {
|
||||
@@ -1148,6 +1190,13 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
}
|
||||
}
|
||||
|
||||
// set info->vram_max_bandwidth to gpu_metrics vram_max_bandwidth if it is not set
|
||||
amdsmi_gpu_metrics_t metric_info = {};
|
||||
r = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info);
|
||||
if (r == AMDSMI_STATUS_SUCCESS) {
|
||||
info->vram_max_bandwidth = metric_info.vram_max_bandwidth;
|
||||
}
|
||||
|
||||
// if vram type is greater than the max enum set it to unknown
|
||||
if (info->vram_type > AMDSMI_VRAM_TYPE__MAX)
|
||||
info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -239,6 +239,12 @@ void TestGpuMetricsRead::Run(void) {
|
||||
amd::smi::make_ostream_joiner(&std::cout, ", "));
|
||||
std::cout << std::dec << "]\n";
|
||||
|
||||
std::cout << std::dec << "xgmi_link_status= [";
|
||||
std::copy(std::begin(smu.xgmi_link_status),
|
||||
std::end(smu.xgmi_link_status),
|
||||
amd::smi::make_ostream_joiner(&std::cout, ", "));
|
||||
std::cout << std::dec << "]\n";
|
||||
|
||||
// Voltage (mV)
|
||||
std::cout << "voltage_soc = " << std::dec << smu.voltage_soc << "\n";
|
||||
std::cout << "voltage_gfx = " << std::dec << smu.voltage_gfx << "\n";
|
||||
@@ -254,6 +260,9 @@ void TestGpuMetricsRead::Run(void) {
|
||||
std::cout << "pcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n";
|
||||
std::cout << "pcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n";
|
||||
|
||||
// VRAM max bandwidth at max memory clock (GB/sec)
|
||||
std::cout << "vram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n";
|
||||
|
||||
// Counts
|
||||
std::cout << "pcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc
|
||||
<< "\n";
|
||||
@@ -329,6 +338,17 @@ void TestGpuMetricsRead::Run(void) {
|
||||
xcp++;
|
||||
}
|
||||
|
||||
xcp = 0;
|
||||
std::cout << std::dec << "xcp_stats.gfx_below_host_limit_acc = \n";
|
||||
for (auto& row : smu.xcp_stats) {
|
||||
std::cout << "XCP[" << xcp << "] = " << "[ ";
|
||||
std::copy(std::begin(row.gfx_below_host_limit_acc),
|
||||
std::end(row.gfx_below_host_limit_acc),
|
||||
amd::smi::make_ostream_joiner(&std::cout, ", "));
|
||||
std::cout << " ]\n";
|
||||
xcp++;
|
||||
}
|
||||
|
||||
std::cout << "\n\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
|
||||
@@ -129,17 +129,19 @@ void TestIdInfoRead::Run(void) {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Vram type id: "
|
||||
<< vram_info.vram_type << std::endl;
|
||||
<< vram_info.vram_type << std::endl;
|
||||
std::cout << "\t**Device Vram vendor id: "
|
||||
<< vram_info.vram_vendor << std::endl;
|
||||
<< vram_info.vram_vendor << std::endl;
|
||||
std::cout << "\t**Device Vram size: 0x"
|
||||
<< std::hex << vram_info.vram_size
|
||||
<< " (" << std::dec << vram_info.vram_size << ")"
|
||||
<< std::endl;
|
||||
<< std::hex << vram_info.vram_size
|
||||
<< " (" << std::dec << vram_info.vram_size << ")"
|
||||
<< std::endl;
|
||||
std::cout << "\t**Device Bit Width: 0x"
|
||||
<< std::hex << vram_info.vram_bit_width
|
||||
<< " (" << std::dec << vram_info.vram_bit_width << ")"
|
||||
<< std::endl;
|
||||
<< std::hex << vram_info.vram_bit_width
|
||||
<< " (" << std::dec << vram_info.vram_bit_width << ")"
|
||||
<< std::endl;
|
||||
std::cout << "\t**Device Vram Max Bandwidth: "
|
||||
<< vram_info.vram_max_bandwidth << " GB/s" << std::endl;
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_vendor_name(processor_handles_[i], buffer, kBufferLen);
|
||||
|
||||
Reference in New Issue
Block a user