SWDEV-440760: Removed specific gpu_metric calls & fixed pcie metrics
Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I679ecede4825c119925de3c9140453653f3f84aa
[ROCm/amdsmi commit: fec1173321]
This commit is contained in:
@@ -106,7 +106,7 @@ class AMDSMICommands():
|
||||
if self.logger.is_human_readable_format():
|
||||
print(f'AMDSMI Tool: {__version__} | '\
|
||||
f'AMDSMI Library version: {amdsmi_lib_version_str} | ' \
|
||||
f'ROCm version: {rocm_version_str}' )
|
||||
f'ROCm version: {rocm_version_str}')
|
||||
elif self.logger.is_json_format() or self.logger.is_csv_format():
|
||||
self.logger.print_output()
|
||||
|
||||
@@ -1119,8 +1119,8 @@ class AMDSMICommands():
|
||||
|
||||
# Put the metrics table in the debug logs
|
||||
try:
|
||||
gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
gpu_metric_str = json.dumps(gpu_metric_output, indent=4)
|
||||
gpu_metric_debug_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
gpu_metric_str = json.dumps(gpu_metric_debug_output, indent=4)
|
||||
logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
@@ -1143,8 +1143,12 @@ class AMDSMICommands():
|
||||
engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity')
|
||||
engine_usage['umc_activity'] = engine_usage.pop('umc_activity')
|
||||
engine_usage['mm_activity'] = engine_usage.pop('mm_activity')
|
||||
engine_usage['vcn_activity'] = gpu_metric_output.pop('vcn_activity')
|
||||
engine_usage['jpeg_activity'] = gpu_metric_output.pop('jpeg_activity')
|
||||
|
||||
# TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity
|
||||
gpu_metric_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
engine_usage['vcn_activity'] = gpu_metric_info.pop('vcn_activity')
|
||||
engine_usage['jpeg_activity'] = gpu_metric_info.pop('jpeg_activity')
|
||||
|
||||
for key, value in engine_usage.items():
|
||||
if not isinstance(value, list) and value > 100:
|
||||
engine_usage[key] = "N/A"
|
||||
@@ -1214,15 +1218,16 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu)
|
||||
if throttle_status:
|
||||
power_dict['throttle_status'] = "THROTTLED"
|
||||
else:
|
||||
power_dict['throttle_status'] = "UNTHROTTLED"
|
||||
power_dict['throttle_status'] = "N/A"
|
||||
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
|
||||
if throttle_status != "N/A":
|
||||
if throttle_status:
|
||||
power_dict['throttle_status'] = "THROTTLED"
|
||||
else:
|
||||
power_dict['throttle_status'] = "UNTHROTTLED"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
|
||||
values_dict['power'] = power_dict
|
||||
if "clock" in current_platform_args:
|
||||
if args.clock:
|
||||
@@ -1271,8 +1276,8 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_gfxclk_lock_status(args.gpu)
|
||||
if self.logger.is_human_readable_format():
|
||||
is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status']
|
||||
if is_clk_locked != "N/A":
|
||||
if is_clk_locked:
|
||||
is_clk_locked = "LOCKED"
|
||||
else:
|
||||
@@ -1393,7 +1398,10 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_count_acc(args.gpu)
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc']
|
||||
if pci_replay_counter == "N/A":
|
||||
# raising exception here to fall back to sysfs
|
||||
raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED)
|
||||
pcie_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
@@ -1406,22 +1414,23 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info())
|
||||
|
||||
try:
|
||||
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc(args.gpu)
|
||||
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc']
|
||||
pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['l0_to_recovery_count'] = "N/A"
|
||||
logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc(args.gpu)
|
||||
pcie_dict['replay_rollover_count'] = pci_replay_rollover_counter
|
||||
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc']
|
||||
pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['replay_roll_over_count'] = "N/A"
|
||||
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_dict['nak_sent_count'] = "N/A"
|
||||
pcie_dict['nak_received_count'] = "N/A"
|
||||
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc']
|
||||
pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['nak_sent_count'] = "N/A"
|
||||
pcie_dict['nak_received_count'] = "N/A"
|
||||
@@ -2343,12 +2352,12 @@ class AMDSMICommands():
|
||||
|
||||
if ((len(self.device_handles) and ((((not gpus) and (not cpus) and (not cores)) or gpus)
|
||||
and not cpu_options and not core_options))):
|
||||
self.metric_gpu( args, multiple_devices, watching_output, gpu,
|
||||
usage, watch, watch_time, iterations, power,
|
||||
clock, temperature, ecc, ecc_block, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule,
|
||||
guard, guest_data, fb_usage, xgmi)
|
||||
self.metric_gpu(args, multiple_devices, watching_output, gpu,
|
||||
usage, watch, watch_time, iterations, power,
|
||||
clock, temperature, ecc, ecc_block, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, schedule,
|
||||
guard, guest_data, fb_usage, xgmi)
|
||||
|
||||
|
||||
if ((len(self.cpu_handles) and ((((not gpus) and (not cpus) and (not cores)) or cpus)
|
||||
@@ -3357,12 +3366,11 @@ class AMDSMICommands():
|
||||
if args.power_usage:
|
||||
try:
|
||||
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
power_usage = gpu_metrics_info['current_socket_power']
|
||||
if power_usage >= 0xFFFFFFFF:
|
||||
power_usage = gpu_metrics_info['average_socket_power']
|
||||
if power_usage >= 0xFFFFFFFF:
|
||||
power_usage = "N/A"
|
||||
monitor_values['power_usage'] = power_usage
|
||||
|
||||
monitor_values['power_usage'] = gpu_metrics_info['current_socket_power']
|
||||
if monitor_values['power_usage'] == "N/A": # Fallback to average_socket_power for older gpu_metrics versions
|
||||
monitor_values['power_usage'] = gpu_metrics_info['average_socket_power']
|
||||
|
||||
if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A":
|
||||
monitor_values['power_usage'] = f"{monitor_values['power_usage']} W"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
@@ -3379,7 +3387,7 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_temp_mem(args.gpu)
|
||||
temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_mem']
|
||||
monitor_values['memory_temperature'] = temperature
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['memory_temperature'] = "N/A"
|
||||
@@ -3395,7 +3403,7 @@ class AMDSMICommands():
|
||||
self.logger.table_header += 'MEM_TEMP'.rjust(10)
|
||||
if args.gfx:
|
||||
try:
|
||||
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_gfx_activity(args.gpu)
|
||||
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity']
|
||||
monitor_values['gfx'] = gfx_util
|
||||
if self.logger.is_human_readable_format():
|
||||
monitor_values['gfx'] = f"{monitor_values['gfx']} %"
|
||||
@@ -3417,7 +3425,7 @@ class AMDSMICommands():
|
||||
self.logger.table_header += 'GFX_CLOCK'.rjust(11)
|
||||
if args.mem:
|
||||
try:
|
||||
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_umc_activity(args.gpu)
|
||||
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity']
|
||||
monitor_values['mem'] = mem_util
|
||||
if self.logger.is_human_readable_format():
|
||||
monitor_values['mem'] = f"{monitor_values['mem']} %"
|
||||
@@ -3440,7 +3448,7 @@ class AMDSMICommands():
|
||||
if args.encoder:
|
||||
try:
|
||||
# Get List of vcn activity values
|
||||
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_vcn_activity(args.gpu)
|
||||
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
|
||||
encoding_activity_avg = []
|
||||
for value in encoder_util:
|
||||
if value < 150: # each encoder chiplet's value range should be a percent
|
||||
@@ -3493,7 +3501,7 @@ class AMDSMICommands():
|
||||
self.logger.table_header += 'DEC_CLOCK'.rjust(11)
|
||||
if args.throttle_status:
|
||||
try:
|
||||
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu)
|
||||
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
|
||||
if throttle_status:
|
||||
throttle_status = "THROTTLED"
|
||||
else:
|
||||
|
||||
@@ -3250,7 +3250,7 @@ def amdsmi_get_gpu_metrics_info(
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
gpu_metrics_output = {
|
||||
"temperature_edge": gpu_metrics.temperature_edge,
|
||||
"temperature_hotspot": gpu_metrics.temperature_hotspot,
|
||||
"temperature_mem": gpu_metrics.temperature_mem,
|
||||
@@ -3312,6 +3312,92 @@ def amdsmi_get_gpu_metrics_info(
|
||||
"jpeg_activity": list(gpu_metrics.jpeg_activity),
|
||||
}
|
||||
|
||||
# Validate support for each gpu_metric
|
||||
uint_16_values = ['temperature_edge', 'temperature_hotspot', 'temperature_mem',
|
||||
'temperature_vrgfx', 'temperature_vrsoc', 'temperature_vrmem',
|
||||
'average_gfx_activity', 'average_umc_activity', 'average_mm_activity',
|
||||
'average_socket_power', 'average_gfxclk_frequency', 'average_socclk_frequency',
|
||||
'average_uclk_frequency', 'average_vclk0_frequency', 'average_dclk0_frequency',
|
||||
'average_vclk1_frequency', 'average_dclk1_frequency', 'current_gfxclk',
|
||||
'current_socclk', 'current_uclk', 'current_vclk0', 'current_dclk0',
|
||||
'current_vclk1', 'current_dclk1', 'current_fan_speed', 'pcie_link_width',
|
||||
'pcie_link_speed', 'voltage_soc', 'voltage_gfx', 'voltage_mem',
|
||||
'current_socket_power', 'xgmi_link_width', 'xgmi_link_speed']
|
||||
|
||||
for value in uint_16_values:
|
||||
if gpu_metrics_output[value] == 0xFFFF:
|
||||
gpu_metrics_output[value] = "N/A"
|
||||
|
||||
uint_32_values = ['gfx_activity_acc', 'mem_activity_acc', 'mem_max_bandwidth',
|
||||
'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc']
|
||||
|
||||
for value in uint_32_values:
|
||||
if gpu_metrics_output[value] == 0xFFFFFFFF:
|
||||
gpu_metrics_output[value] = "N/A"
|
||||
|
||||
uint_64_values = ['energy_accumulator', 'system_clock_counter', 'firmware_timestamp',
|
||||
'pcie_bandwidth_acc', 'pcie_bandwidth_inst',
|
||||
'pcie_l0_to_recov_count_acc', 'pcie_replay_count_acc',
|
||||
'pcie_replay_rover_count_acc', 'mem_bandwidth_acc']
|
||||
|
||||
for value in uint_64_values:
|
||||
if gpu_metrics_output[value] == 0xFFFFFFFFFFFFFFFF:
|
||||
gpu_metrics_output[value] = "N/A"
|
||||
|
||||
# Custom validation for specific gpu_metrics
|
||||
if gpu_metrics_output['throttle_status'] == 0xFFFFFFFF:
|
||||
gpu_metrics_output['throttle_status'] = "N/A"
|
||||
else:
|
||||
gpu_metrics_output['throttle_status'] = bool(gpu_metrics_output['throttle_status'])
|
||||
|
||||
for idx, temp in enumerate(gpu_metrics_output['temperature_hbm']):
|
||||
if temp == 0xFFFF:
|
||||
gpu_metrics_output['temperature_hbm'][idx] = "N/A"
|
||||
|
||||
if gpu_metrics_output['indep_throttle_status'] == 0xFFFFFFFFFFFFFFFF:
|
||||
gpu_metrics_output['indep_throttle_status'] = "N/A"
|
||||
else:
|
||||
gpu_metrics_output['indep_throttle_status'] = bool(gpu_metrics_output['indep_throttle_status'])
|
||||
|
||||
for idx, activity in enumerate(gpu_metrics_output['vcn_activity']):
|
||||
if activity == 0xFFFF:
|
||||
gpu_metrics_output['vcn_activity'][idx] = "N/A"
|
||||
|
||||
if gpu_metrics_output['gfxclk_lock_status'] == 0xFFFFFFFF:
|
||||
gpu_metrics_output['gfxclk_lock_status'] = "N/A"
|
||||
else:
|
||||
gpu_metrics_output['gfxclk_lock_status'] = bool(gpu_metrics_output['gfxclk_lock_status'])
|
||||
|
||||
for idx, data in enumerate(gpu_metrics_output['xgmi_read_data_acc']):
|
||||
if data == 0xFFFFFFFFFFFFFFFF:
|
||||
gpu_metrics_output['xgmi_read_data_acc'][idx] = "N/A"
|
||||
|
||||
for idx, data in enumerate(gpu_metrics_output['xgmi_write_data_acc']):
|
||||
if data == 0xFFFFFFFFFFFFFFFF:
|
||||
gpu_metrics_output['xgmi_write_data_acc'][idx] = "N/A"
|
||||
|
||||
for idx, clk in enumerate(gpu_metrics_output['current_gfxclks']):
|
||||
if clk == 0xFFFF:
|
||||
gpu_metrics_output['current_gfxclks'][idx] = "N/A"
|
||||
|
||||
for idx, clk in enumerate(gpu_metrics_output['current_socclks']):
|
||||
if clk == 0xFFFF:
|
||||
gpu_metrics_output['current_socclks'][idx] = "N/A"
|
||||
|
||||
for idx, clk in enumerate(gpu_metrics_output['current_vclk0s']):
|
||||
if clk == 0xFFFF:
|
||||
gpu_metrics_output['current_vclk0s'][idx] = "N/A"
|
||||
|
||||
for idx, clk in enumerate(gpu_metrics_output['current_dclk0s']):
|
||||
if clk == 0xFFFF:
|
||||
gpu_metrics_output['current_dclk0s'][idx] = "N/A"
|
||||
|
||||
for idx, activity in enumerate(gpu_metrics_output['jpeg_activity']):
|
||||
if activity == 0xFFFF:
|
||||
gpu_metrics_output['jpeg_activity'][idx] = "N/A"
|
||||
|
||||
return gpu_metrics_output
|
||||
|
||||
|
||||
def amdsmi_get_gpu_od_volt_curve_regions(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle, num_regions: int
|
||||
|
||||
Reference in new issue
Block a user