SWDEV-440760: Removed specific gpu_metric calls & fixed pcie metrics

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I679ecede4825c119925de3c9140453653f3f84aa


[ROCm/amdsmi commit: fec1173321]
This commit is contained in:
Maisam Arif
2024-01-24 05:19:20 -06:00
والد 037a4283cd
کامیت 3273fb6239
2 فایلهای تغییر یافته به همراه 131 افزوده شده و 37 حذف شده
@@ -106,7 +106,7 @@ class AMDSMICommands():
if self.logger.is_human_readable_format():
print(f'AMDSMI Tool: {__version__} | '\
f'AMDSMI Library version: {amdsmi_lib_version_str} | ' \
f'ROCm version: {rocm_version_str}' )
f'ROCm version: {rocm_version_str}')
elif self.logger.is_json_format() or self.logger.is_csv_format():
self.logger.print_output()
@@ -1119,8 +1119,8 @@ class AMDSMICommands():
# Put the metrics table in the debug logs
try:
gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_output, indent=4)
gpu_metric_debug_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
gpu_metric_str = json.dumps(gpu_metric_debug_output, indent=4)
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
@@ -1143,8 +1143,12 @@ class AMDSMICommands():
engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity')
engine_usage['umc_activity'] = engine_usage.pop('umc_activity')
engine_usage['mm_activity'] = engine_usage.pop('mm_activity')
engine_usage['vcn_activity'] = gpu_metric_output.pop('vcn_activity')
engine_usage['jpeg_activity'] = gpu_metric_output.pop('jpeg_activity')
# TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity
gpu_metric_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
engine_usage['vcn_activity'] = gpu_metric_info.pop('vcn_activity')
engine_usage['jpeg_activity'] = gpu_metric_info.pop('jpeg_activity')
for key, value in engine_usage.items():
if not isinstance(value, list) and value > 100:
engine_usage[key] = "N/A"
@@ -1214,15 +1218,16 @@ class AMDSMICommands():
logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info())
try:
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu)
if throttle_status:
power_dict['throttle_status'] = "THROTTLED"
else:
power_dict['throttle_status'] = "UNTHROTTLED"
power_dict['throttle_status'] = "N/A"
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
if throttle_status != "N/A":
if throttle_status:
power_dict['throttle_status'] = "THROTTLED"
else:
power_dict['throttle_status'] = "UNTHROTTLED"
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e.get_error_info())
values_dict['power'] = power_dict
if "clock" in current_platform_args:
if args.clock:
@@ -1271,8 +1276,8 @@ class AMDSMICommands():
logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info())
try:
is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_gfxclk_lock_status(args.gpu)
if self.logger.is_human_readable_format():
is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status']
if is_clk_locked != "N/A":
if is_clk_locked:
is_clk_locked = "LOCKED"
else:
@@ -1393,7 +1398,10 @@ class AMDSMICommands():
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
try:
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_count_acc(args.gpu)
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc']
if pci_replay_counter == "N/A":
# raising exception here to fall back to sysfs
raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED)
pcie_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
@@ -1406,22 +1414,23 @@ class AMDSMICommands():
logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info())
try:
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc(args.gpu)
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc']
pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['l0_to_recovery_count'] = "N/A"
logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc(args.gpu)
pcie_dict['replay_rollover_count'] = pci_replay_rollover_counter
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc']
pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['replay_roll_over_count'] = "N/A"
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_dict['nak_sent_count'] = "N/A"
pcie_dict['nak_received_count'] = "N/A"
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc']
pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc']
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['nak_sent_count'] = "N/A"
pcie_dict['nak_received_count'] = "N/A"
@@ -2343,12 +2352,12 @@ class AMDSMICommands():
if ((len(self.device_handles) and ((((not gpus) and (not cpus) and (not cores)) or gpus)
and not cpu_options and not core_options))):
self.metric_gpu( args, multiple_devices, watching_output, gpu,
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_block, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule,
guard, guest_data, fb_usage, xgmi)
self.metric_gpu(args, multiple_devices, watching_output, gpu,
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_block, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule,
guard, guest_data, fb_usage, xgmi)
if ((len(self.cpu_handles) and ((((not gpus) and (not cpus) and (not cores)) or cpus)
@@ -3357,12 +3366,11 @@ class AMDSMICommands():
if args.power_usage:
try:
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
power_usage = gpu_metrics_info['current_socket_power']
if power_usage >= 0xFFFFFFFF:
power_usage = gpu_metrics_info['average_socket_power']
if power_usage >= 0xFFFFFFFF:
power_usage = "N/A"
monitor_values['power_usage'] = power_usage
monitor_values['power_usage'] = gpu_metrics_info['current_socket_power']
if monitor_values['power_usage'] == "N/A": # Fallback to average_socket_power for older gpu_metrics versions
monitor_values['power_usage'] = gpu_metrics_info['average_socket_power']
if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A":
monitor_values['power_usage'] = f"{monitor_values['power_usage']} W"
except amdsmi_exception.AmdSmiLibraryException as e:
@@ -3379,7 +3387,7 @@ class AMDSMICommands():
logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info())
try:
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_temp_mem(args.gpu)
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_mem']
monitor_values['memory_temperature'] = temperature
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['memory_temperature'] = "N/A"
@@ -3395,7 +3403,7 @@ class AMDSMICommands():
self.logger.table_header += 'MEM_TEMP'.rjust(10)
if args.gfx:
try:
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_gfx_activity(args.gpu)
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity']
monitor_values['gfx'] = gfx_util
if self.logger.is_human_readable_format():
monitor_values['gfx'] = f"{monitor_values['gfx']} %"
@@ -3417,7 +3425,7 @@ class AMDSMICommands():
self.logger.table_header += 'GFX_CLOCK'.rjust(11)
if args.mem:
try:
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_umc_activity(args.gpu)
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity']
monitor_values['mem'] = mem_util
if self.logger.is_human_readable_format():
monitor_values['mem'] = f"{monitor_values['mem']} %"
@@ -3440,7 +3448,7 @@ class AMDSMICommands():
if args.encoder:
try:
# Get List of vcn activity values
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_vcn_activity(args.gpu)
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
encoding_activity_avg = []
for value in encoder_util:
if value < 150: # each encoder chiplet's value range should be a percent
@@ -3493,7 +3501,7 @@ class AMDSMICommands():
self.logger.table_header += 'DEC_CLOCK'.rjust(11)
if args.throttle_status:
try:
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu)
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
if throttle_status:
throttle_status = "THROTTLED"
else:
@@ -3250,7 +3250,7 @@ def amdsmi_get_gpu_metrics_info(
)
)
return {
gpu_metrics_output = {
"temperature_edge": gpu_metrics.temperature_edge,
"temperature_hotspot": gpu_metrics.temperature_hotspot,
"temperature_mem": gpu_metrics.temperature_mem,
@@ -3312,6 +3312,92 @@ def amdsmi_get_gpu_metrics_info(
"jpeg_activity": list(gpu_metrics.jpeg_activity),
}
# Validate support for each gpu_metric
uint_16_values = ['temperature_edge', 'temperature_hotspot', 'temperature_mem',
'temperature_vrgfx', 'temperature_vrsoc', 'temperature_vrmem',
'average_gfx_activity', 'average_umc_activity', 'average_mm_activity',
'average_socket_power', 'average_gfxclk_frequency', 'average_socclk_frequency',
'average_uclk_frequency', 'average_vclk0_frequency', 'average_dclk0_frequency',
'average_vclk1_frequency', 'average_dclk1_frequency', 'current_gfxclk',
'current_socclk', 'current_uclk', 'current_vclk0', 'current_dclk0',
'current_vclk1', 'current_dclk1', 'current_fan_speed', 'pcie_link_width',
'pcie_link_speed', 'voltage_soc', 'voltage_gfx', 'voltage_mem',
'current_socket_power', 'xgmi_link_width', 'xgmi_link_speed']
for value in uint_16_values:
if gpu_metrics_output[value] == 0xFFFF:
gpu_metrics_output[value] = "N/A"
uint_32_values = ['gfx_activity_acc', 'mem_activity_acc', 'mem_max_bandwidth',
'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc']
for value in uint_32_values:
if gpu_metrics_output[value] == 0xFFFFFFFF:
gpu_metrics_output[value] = "N/A"
uint_64_values = ['energy_accumulator', 'system_clock_counter', 'firmware_timestamp',
'pcie_bandwidth_acc', 'pcie_bandwidth_inst',
'pcie_l0_to_recov_count_acc', 'pcie_replay_count_acc',
'pcie_replay_rover_count_acc', 'mem_bandwidth_acc']
for value in uint_64_values:
if gpu_metrics_output[value] == 0xFFFFFFFFFFFFFFFF:
gpu_metrics_output[value] = "N/A"
# Custom validation for specific gpu_metrics
if gpu_metrics_output['throttle_status'] == 0xFFFFFFFF:
gpu_metrics_output['throttle_status'] = "N/A"
else:
gpu_metrics_output['throttle_status'] = bool(gpu_metrics_output['throttle_status'])
for idx, temp in enumerate(gpu_metrics_output['temperature_hbm']):
if temp == 0xFFFF:
gpu_metrics_output['temperature_hbm'][idx] = "N/A"
if gpu_metrics_output['indep_throttle_status'] == 0xFFFFFFFFFFFFFFFF:
gpu_metrics_output['indep_throttle_status'] = "N/A"
else:
gpu_metrics_output['indep_throttle_status'] = bool(gpu_metrics_output['indep_throttle_status'])
for idx, activity in enumerate(gpu_metrics_output['vcn_activity']):
if activity == 0xFFFF:
gpu_metrics_output['vcn_activity'][idx] = "N/A"
if gpu_metrics_output['gfxclk_lock_status'] == 0xFFFFFFFF:
gpu_metrics_output['gfxclk_lock_status'] = "N/A"
else:
gpu_metrics_output['gfxclk_lock_status'] = bool(gpu_metrics_output['gfxclk_lock_status'])
for idx, data in enumerate(gpu_metrics_output['xgmi_read_data_acc']):
if data == 0xFFFFFFFFFFFFFFFF:
gpu_metrics_output['xgmi_read_data_acc'][idx] = "N/A"
for idx, data in enumerate(gpu_metrics_output['xgmi_write_data_acc']):
if data == 0xFFFFFFFFFFFFFFFF:
gpu_metrics_output['xgmi_write_data_acc'][idx] = "N/A"
for idx, clk in enumerate(gpu_metrics_output['current_gfxclks']):
if clk == 0xFFFF:
gpu_metrics_output['current_gfxclks'][idx] = "N/A"
for idx, clk in enumerate(gpu_metrics_output['current_socclks']):
if clk == 0xFFFF:
gpu_metrics_output['current_socclks'][idx] = "N/A"
for idx, clk in enumerate(gpu_metrics_output['current_vclk0s']):
if clk == 0xFFFF:
gpu_metrics_output['current_vclk0s'][idx] = "N/A"
for idx, clk in enumerate(gpu_metrics_output['current_dclk0s']):
if clk == 0xFFFF:
gpu_metrics_output['current_dclk0s'][idx] = "N/A"
for idx, activity in enumerate(gpu_metrics_output['jpeg_activity']):
if activity == 0xFFFF:
gpu_metrics_output['jpeg_activity'][idx] = "N/A"
return gpu_metrics_output
def amdsmi_get_gpu_od_volt_curve_regions(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, num_regions: int