diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 1c1ee3ed59..1179709e6d 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -106,7 +106,7 @@ class AMDSMICommands(): if self.logger.is_human_readable_format(): print(f'AMDSMI Tool: {__version__} | '\ f'AMDSMI Library version: {amdsmi_lib_version_str} | ' \ - f'ROCm version: {rocm_version_str}' ) + f'ROCm version: {rocm_version_str}') elif self.logger.is_json_format() or self.logger.is_csv_format(): self.logger.print_output() @@ -1119,8 +1119,8 @@ class AMDSMICommands(): # Put the metrics table in the debug logs try: - gpu_metric_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - gpu_metric_str = json.dumps(gpu_metric_output, indent=4) + gpu_metric_debug_output = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + gpu_metric_str = json.dumps(gpu_metric_debug_output, indent=4) logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str) except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) @@ -1143,8 +1143,12 @@ class AMDSMICommands(): engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity') engine_usage['umc_activity'] = engine_usage.pop('umc_activity') engine_usage['mm_activity'] = engine_usage.pop('mm_activity') - engine_usage['vcn_activity'] = gpu_metric_output.pop('vcn_activity') - engine_usage['jpeg_activity'] = gpu_metric_output.pop('jpeg_activity') + + # TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity + gpu_metric_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + engine_usage['vcn_activity'] = gpu_metric_info.pop('vcn_activity') + engine_usage['jpeg_activity'] = gpu_metric_info.pop('jpeg_activity') + for key, value in engine_usage.items(): if not isinstance(value, list) and value > 100: engine_usage[key] = "N/A" @@ -1214,15 +1218,16 @@ class AMDSMICommands(): logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) try: - throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu) - if throttle_status: - power_dict['throttle_status'] = "THROTTLED" - else: - power_dict['throttle_status'] = "UNTHROTTLED" + power_dict['throttle_status'] = "N/A" + throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status'] + if throttle_status != "N/A": + if throttle_status: + power_dict['throttle_status'] = "THROTTLED" + else: + power_dict['throttle_status'] = "UNTHROTTLED" except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e.get_error_info()) - values_dict['power'] = power_dict if "clock" in current_platform_args: if args.clock: @@ -1271,8 +1276,8 @@ class AMDSMICommands(): logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) try: - is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_gfxclk_lock_status(args.gpu) - if self.logger.is_human_readable_format(): + is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status'] + if is_clk_locked != "N/A": if is_clk_locked: is_clk_locked = "LOCKED" else: @@ -1393,7 +1398,10 @@ class AMDSMICommands(): logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) try: - pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_count_acc(args.gpu) + pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc'] + if pci_replay_counter == "N/A": + # raising exception here to fall back to sysfs + raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) pcie_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) @@ -1406,22 +1414,23 @@ class AMDSMICommands(): logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info()) try: - l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc(args.gpu) + l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc'] pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter except amdsmi_exception.AmdSmiLibraryException as e: pcie_dict['l0_to_recovery_count'] = "N/A" logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info()) try: - pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc(args.gpu) - pcie_dict['replay_rollover_count'] = pci_replay_rollover_counter + pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc'] + pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter except amdsmi_exception.AmdSmiLibraryException as e: pcie_dict['replay_roll_over_count'] = "N/A" logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info()) try: - pcie_dict['nak_sent_count'] = "N/A" - pcie_dict['nak_received_count'] = "N/A" + gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc'] + pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc'] except amdsmi_exception.AmdSmiLibraryException as e: pcie_dict['nak_sent_count'] = "N/A" pcie_dict['nak_received_count'] = "N/A" @@ -2343,12 +2352,12 @@ class AMDSMICommands(): if ((len(self.device_handles) and ((((not gpus) and (not cpus) and (not cores)) or gpus) and not cpu_options and not core_options))): - self.metric_gpu( args, multiple_devices, watching_output, gpu, - usage, watch, watch_time, iterations, power, - clock, temperature, ecc, ecc_block, pcie, - fan, voltage_curve, overdrive, perf_level, - xgmi_err, energy, mem_usage, schedule, - guard, guest_data, fb_usage, xgmi) + self.metric_gpu(args, multiple_devices, watching_output, gpu, + usage, watch, watch_time, iterations, power, + clock, temperature, ecc, ecc_block, pcie, + fan, voltage_curve, overdrive, perf_level, + xgmi_err, energy, mem_usage, schedule, + guard, guest_data, fb_usage, xgmi) if ((len(self.cpu_handles) and ((((not gpus) and (not cpus) and (not cores)) or cpus) @@ -3357,12 +3366,11 @@ class AMDSMICommands(): if args.power_usage: try: gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - power_usage = gpu_metrics_info['current_socket_power'] - if power_usage >= 0xFFFFFFFF: - power_usage = gpu_metrics_info['average_socket_power'] - if power_usage >= 0xFFFFFFFF: - power_usage = "N/A" - monitor_values['power_usage'] = power_usage + + monitor_values['power_usage'] = gpu_metrics_info['current_socket_power'] + if monitor_values['power_usage'] == "N/A": # Fallback to average_socket_power for older gpu_metrics versions + monitor_values['power_usage'] = gpu_metrics_info['average_socket_power'] + if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A": monitor_values['power_usage'] = f"{monitor_values['power_usage']} W" except amdsmi_exception.AmdSmiLibraryException as e: @@ -3379,7 +3387,7 @@ class AMDSMICommands(): logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e.get_error_info()) try: - temperature = amdsmi_interface.amdsmi_get_gpu_metrics_temp_mem(args.gpu) + temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_mem'] monitor_values['memory_temperature'] = temperature except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['memory_temperature'] = "N/A" @@ -3395,7 +3403,7 @@ class AMDSMICommands(): self.logger.table_header += 'MEM_TEMP'.rjust(10) if args.gfx: try: - gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_gfx_activity(args.gpu) + gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity'] monitor_values['gfx'] = gfx_util if self.logger.is_human_readable_format(): monitor_values['gfx'] = f"{monitor_values['gfx']} %" @@ -3417,7 +3425,7 @@ class AMDSMICommands(): self.logger.table_header += 'GFX_CLOCK'.rjust(11) if args.mem: try: - mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_avg_umc_activity(args.gpu) + mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity'] monitor_values['mem'] = mem_util if self.logger.is_human_readable_format(): monitor_values['mem'] = f"{monitor_values['mem']} %" @@ -3440,7 +3448,7 @@ class AMDSMICommands(): if args.encoder: try: # Get List of vcn activity values - encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_vcn_activity(args.gpu) + encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity'] encoding_activity_avg = [] for value in encoder_util: if value < 150: # each encoder chiplet's value range should be a percent @@ -3493,7 +3501,7 @@ class AMDSMICommands(): self.logger.table_header += 'DEC_CLOCK'.rjust(11) if args.throttle_status: try: - throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu) + throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status'] if throttle_status: throttle_status = "THROTTLED" else: diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 96343f2006..ac6bbf736e 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -3250,7 +3250,7 @@ def amdsmi_get_gpu_metrics_info( ) ) - return { + gpu_metrics_output = { "temperature_edge": gpu_metrics.temperature_edge, "temperature_hotspot": gpu_metrics.temperature_hotspot, "temperature_mem": gpu_metrics.temperature_mem, @@ -3312,6 +3312,92 @@ def amdsmi_get_gpu_metrics_info( "jpeg_activity": list(gpu_metrics.jpeg_activity), } + # Validate support for each gpu_metric + uint_16_values = ['temperature_edge', 'temperature_hotspot', 'temperature_mem', + 'temperature_vrgfx', 'temperature_vrsoc', 'temperature_vrmem', + 'average_gfx_activity', 'average_umc_activity', 'average_mm_activity', + 'average_socket_power', 'average_gfxclk_frequency', 'average_socclk_frequency', + 'average_uclk_frequency', 'average_vclk0_frequency', 'average_dclk0_frequency', + 'average_vclk1_frequency', 'average_dclk1_frequency', 'current_gfxclk', + 'current_socclk', 'current_uclk', 'current_vclk0', 'current_dclk0', + 'current_vclk1', 'current_dclk1', 'current_fan_speed', 'pcie_link_width', + 'pcie_link_speed', 'voltage_soc', 'voltage_gfx', 'voltage_mem', + 'current_socket_power', 'xgmi_link_width', 'xgmi_link_speed'] + + for value in uint_16_values: + if gpu_metrics_output[value] == 0xFFFF: + gpu_metrics_output[value] = "N/A" + + uint_32_values = ['gfx_activity_acc', 'mem_activity_acc', 'mem_max_bandwidth', + 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc'] + + for value in uint_32_values: + if gpu_metrics_output[value] == 0xFFFFFFFF: + gpu_metrics_output[value] = "N/A" + + uint_64_values = ['energy_accumulator', 'system_clock_counter', 'firmware_timestamp', + 'pcie_bandwidth_acc', 'pcie_bandwidth_inst', + 'pcie_l0_to_recov_count_acc', 'pcie_replay_count_acc', + 'pcie_replay_rover_count_acc', 'mem_bandwidth_acc'] + + for value in uint_64_values: + if gpu_metrics_output[value] == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output[value] = "N/A" + + # Custom validation for specific gpu_metrics + if gpu_metrics_output['throttle_status'] == 0xFFFFFFFF: + gpu_metrics_output['throttle_status'] = "N/A" + else: + gpu_metrics_output['throttle_status'] = bool(gpu_metrics_output['throttle_status']) + + for idx, temp in enumerate(gpu_metrics_output['temperature_hbm']): + if temp == 0xFFFF: + gpu_metrics_output['temperature_hbm'][idx] = "N/A" + + if gpu_metrics_output['indep_throttle_status'] == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output['indep_throttle_status'] = "N/A" + else: + gpu_metrics_output['indep_throttle_status'] = bool(gpu_metrics_output['indep_throttle_status']) + + for idx, activity in enumerate(gpu_metrics_output['vcn_activity']): + if activity == 0xFFFF: + gpu_metrics_output['vcn_activity'][idx] = "N/A" + + if gpu_metrics_output['gfxclk_lock_status'] == 0xFFFFFFFF: + gpu_metrics_output['gfxclk_lock_status'] = "N/A" + else: + gpu_metrics_output['gfxclk_lock_status'] = bool(gpu_metrics_output['gfxclk_lock_status']) + + for idx, data in enumerate(gpu_metrics_output['xgmi_read_data_acc']): + if data == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output['xgmi_read_data_acc'][idx] = "N/A" + + for idx, data in enumerate(gpu_metrics_output['xgmi_write_data_acc']): + if data == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output['xgmi_write_data_acc'][idx] = "N/A" + + for idx, clk in enumerate(gpu_metrics_output['current_gfxclks']): + if clk == 0xFFFF: + gpu_metrics_output['current_gfxclks'][idx] = "N/A" + + for idx, clk in enumerate(gpu_metrics_output['current_socclks']): + if clk == 0xFFFF: + gpu_metrics_output['current_socclks'][idx] = "N/A" + + for idx, clk in enumerate(gpu_metrics_output['current_vclk0s']): + if clk == 0xFFFF: + gpu_metrics_output['current_vclk0s'][idx] = "N/A" + + for idx, clk in enumerate(gpu_metrics_output['current_dclk0s']): + if clk == 0xFFFF: + gpu_metrics_output['current_dclk0s'][idx] = "N/A" + + for idx, activity in enumerate(gpu_metrics_output['jpeg_activity']): + if activity == 0xFFFF: + gpu_metrics_output['jpeg_activity'][idx] = "N/A" + + return gpu_metrics_output + def amdsmi_get_gpu_od_volt_curve_regions( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, num_regions: int