From c404fbd851d8076a40d2b4adae7d90e882daa658 Mon Sep 17 00:00:00 2001 From: "systems-assistant[bot]" <221163467+systems-assistant[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 13:12:09 -0600 Subject: [PATCH] [SWDEV-560235] Add gpu_board and base_board temperatures to monitor (#1906) * Add helpers for gpu_board and base_board temperatures * Added gpu_board and base_board temperatures arguments for non-default monitor subcommand Signed-off-by: Bindhiya Kanangot Balakrishnan Co-authored-by: Bindhiya Kanangot Balakrishnan --- projects/amdsmi/CHANGELOG.md | 13 ++ projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 130 +++++++----------- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 106 ++++++++++++++ projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 3 + projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 4 + 5 files changed, 176 insertions(+), 80 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index a085d079aa..05e2245830 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -8,6 +8,19 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added +- **Added GPU and base board temperature `amd-smi monitor` CLI support**. + - Added `--gpu-board-temps` option to `amd-smi monitor` command for GPU board temperature sensors + - Added `--base-board-temps` option to `amd-smi monitor` command for base board temperature sensors + +- **Added Node Power Management (NPM) support**. + - Added new Node Power Management APIs and CLI for node monitoring + - Added C API functions: + - `amdsmi_get_node_handle()`: Get handle for node devices + - `amdsmi_get_npm_info()`: Retrieve Node Power Management information + - Added Python API wrappers for new node device functions + - Added `amd-smi node` CLI command for Node Power Management operations + - Currently supported for OAM_ID 0 only. + - **Added the following C API's to amdsmi_interface.py**. 
- amdsmi_get_cpu_handle() - amdsmi_get_esmi_err_msg() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 9669683891..a5a0e1a45a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -2311,41 +2311,7 @@ class AMDSMICommands(): if "gpu_board" in current_platform_args: if args.gpu_board: - gpu_board_temp_dict = {} - gpu_board_temp_types = [ - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR, - amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32 - ] - for type in gpu_board_temp_types: - type_name = type.name.replace("GPUBOARD_", "") - try: - gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) - if gpu_board_temp_holder != "N/A": - 
gpu_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, - gpu_board_temp_holder, - '\N{DEGREE SIGN}C') - else: - gpu_board_temp_dict[f'{type_name}'] = "N/A" - except amdsmi_exception.AmdSmiLibraryException as e: - gpu_board_temp_dict[f'{type_name}'] = "N/A" - logging.debug("Failed to get gpu_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + gpu_board_temp_dict = self.helpers.get_gpu_board_temperatures(args.gpu, gpu_id, self.logger) # if every value is N/A, then we don't want to display the values unless explicitly told to # all args_list being True indicates that this gpu_board is not explicitly called itself args_list = [getattr(args, arg) for arg in current_platform_args] @@ -2355,46 +2321,7 @@ class AMDSMICommands(): values_dict['gpu_board'] = {'temperature':gpu_board_temp_dict} if "base_board" in current_platform_args: if args.base_board: - base_board_temp_dict = {} - base_board_temp_types = [ - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR, - 
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC, - amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC - ] - for type in base_board_temp_types: - type_name = type.name.replace("BASEBOARD_", "") - try: - base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) - if base_board_temp_holder != "N/A": - - base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, - base_board_temp_holder, - '\N{DEGREE SIGN}C') - else: - base_board_temp_dict[f'{type_name}'] = "N/A" - except amdsmi_exception.AmdSmiLibraryException as e: - base_board_temp_dict[f'{type_name}'] = "N/A" - logging.debug("Failed to get base_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + base_board_temp_dict = self.helpers.get_base_board_temperatures(args.gpu, gpu_id, self.logger) # if every value is N/A, then we don't want to display the values unless explicitly told to # all args_list being True indicates that this base_board is not explicitly called itself args_list = [getattr(args, arg) for arg in current_platform_args] @@ -5680,8 +5607,9 @@ class AMDSMICommands(): def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, watch=None, watch_time=None, iterations=None, power_usage=None, - temperature=None, gfx_util=None, mem_util=None, encoder=None, - decoder=None, ecc=None, vram_usage=None, pcie=None, process=None, + temperature=None, base_board_temps=None, gpu_board_temps=None, + gfx_util=None, mem_util=None, encoder=None, decoder=None, + ecc=None, vram_usage=None, pcie=None, process=None, violation=None): """ 
Populate a table with each GPU as an index to rows of targeted data @@ -5694,6 +5622,8 @@ class AMDSMICommands(): iterations (int, optional): Value override for args.iterations. Defaults to None. power_usage (bool, optional): Value override for args.power_usage. Defaults to None. temperature (bool, optional): Value override for args.temperature. Defaults to None. + base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None. + gpu_board_temps (bool, optional): Value override for args.gpu_board_temps. Defaults to None. gfx (bool, optional): Value override for args.gfx. Defaults to None. mem_util (bool, optional): Value override for args.mem. Defaults to None. encoder (bool, optional): Value override for args.encoder. Defaults to None. @@ -5726,6 +5656,10 @@ class AMDSMICommands(): args.power_usage = power_usage if temperature: args.temperature = temperature + if base_board_temps: + args.base_board_temps = base_board_temps + if gpu_board_temps: + args.gpu_board_temps = gpu_board_temps if gfx_util: args.gfx = gfx_util if mem_util: @@ -5758,9 +5692,10 @@ class AMDSMICommands(): # If all arguments are False, the print all values # Don't include process in this logic as it's an optional edge case - if not any([args.power_usage, args.temperature, args.gfx, args.mem, - args.encoder, args.decoder, args.ecc, args.vram_usage, - args.pcie, args.violation]): + if not any([args.power_usage, args.temperature, args.base_board_temps, + args.gpu_board_temps, args.gfx, args.mem, args.encoder, + args.decoder, args.ecc, args.vram_usage, args.pcie, + args.violation]): args.power_usage = args.temperature = args.gfx = args.mem = \ args.encoder = args.decoder = args.vram_usage = True # set extra args for default output filtering @@ -5942,6 +5877,41 @@ class AMDSMICommands(): self.logger.table_header += 'GPU_T'.rjust(8) self.logger.table_header += 'MEM_T'.rjust(8) + + if args.gpu_board_temps: + try: + gpu_board_temp_dict = 
self.helpers.get_gpu_board_temperatures(args.gpu, gpu_id, self.logger) + + temp_unit_json = 'C' + # Add GPU board sensor headers + if gpu_board_temp_dict: + for temp_sensor in sorted(gpu_board_temp_dict.keys()): + self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7)) + for temp_type, temp_value in gpu_board_temp_dict.items(): + if self.logger.is_json_format() and isinstance(temp_value, dict): + temp_value['unit'] = temp_unit_json + monitor_values[temp_type] = temp_value + except Exception as e: + logging.debug("Failed to get GPU board temperatures on gpu %s | %s", gpu_id, e) + + + if args.base_board_temps: + try: + base_board_temp_dict = self.helpers.get_base_board_temperatures(args.gpu, gpu_id, self.logger) + + temp_unit_json = 'C' + # Add base board sensor headers + if base_board_temp_dict: + for temp_sensor in sorted(base_board_temp_dict.keys()): + self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7)) + for temp_type, temp_value in base_board_temp_dict.items(): + if self.logger.is_json_format() and isinstance(temp_value, dict): + temp_value['unit'] = temp_unit_json + monitor_values[temp_type] = temp_value + except Exception as e: + logging.debug("Failed to get base board temperatures on gpu %s | %s", gpu_id, e) + + + if args.gfx: try: gfx_clk = gpu_metrics_info['current_gfxclk'] diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index e30e433a57..17bfed3f9d 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1932,3 +1932,109 @@ class AMDSMIHelpers(): 'num_partition': num_partition, 'num_xcp': num_xcp } + + def get_gpu_board_temperatures(self, device_handle, gpu_id, logger): + """Get GPU board temperature readings + + Args: + device_handle: GPU device handle + gpu_id: GPU identifier for logging + logger: AMDSMILogger instance + + Returns: + dict: Sensor name (GPUBOARD_ prefix stripped) mapped to its unit-formatted temperature,
or "N/A" when the reading is unavailable + """ + gpu_board_temp_dict = {} + gpu_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32 + ] + + for temp_type in gpu_board_temp_types: + type_name = temp_type.name.replace("GPUBOARD_", "") + try: + gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric( + device_handle, temp_type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if gpu_board_temp_holder != "N/A": + gpu_board_temp_dict[f'{type_name}'] = self.unit_format( + logger, gpu_board_temp_holder, '\N{DEGREE SIGN}C') + else: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get gpu_board %s for gpu %s | %s", + type_name, gpu_id, e.get_error_info()) + + return gpu_board_temp_dict +
+ def get_base_board_temperatures(self, device_handle, gpu_id, logger): + """Get base board temperature readings + + Args: + device_handle: GPU device handle + gpu_id: GPU identifier for logging + logger: AMDSMILogger instance + + Returns: + dict: Sensor name (BASEBOARD_ prefix stripped) mapped to its unit-formatted temperature, or "N/A" when the reading is unavailable + """ + base_board_temp_dict = {} + base_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC + ] + + for temp_type in base_board_temp_types: + type_name = temp_type.name.replace("BASEBOARD_", "") + try: +
base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric( + device_handle, temp_type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if base_board_temp_holder != "N/A": + base_board_temp_dict[f'{type_name}'] = self.unit_format( + logger, base_board_temp_holder, '\N{DEGREE SIGN}C') + else: + base_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + base_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get base_board %s for gpu %s | %s", + type_name, gpu_id, e.get_error_info()) + + return base_board_temp_dict diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index eaddb81e77..356aa71fa1 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -263,6 +263,9 @@ class AMDSMILogger(): # Remove excess two values after a new line in table_values table_values = table_values[:table_values.rfind('\n')] table_values += '\n' + # Board temperature key patterns + elif any(pattern in key for pattern in ['IBC', 'OAM', 'RETIMER', 'UBB', 'HSC', 'VR', 'VDDCR', 'NODE', 'VDD', 'HBM']): + table_values += string_value.rjust(max((len(key)+2), 7)) # Default spacing else: table_values += string_value.rjust(10) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 98370bbd7d..c158412b22 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1412,6 +1412,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Guest and BM platforms power_usage_help = "Monitor power usage and power cap in Watts" temperature_help = "Monitor temperature in Celsius" + base_board_temps_help = "Monitor base board temperatures in Celsius" + gpu_board_temps_help = "Monitor GPU board temperatures in Celsius" gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)" mem_util_help = "Monitor 
memory utilization (%%) and clock (MHz)" encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)" @@ -1431,6 +1433,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Add monitor arguments monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help) monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + monitor_parser.add_argument('-b', '--base-board-temps', action='store_true', required=False, help=base_board_temps_help) + monitor_parser.add_argument('-o', '--gpu-board-temps', action='store_true', required=False, help=gpu_board_temps_help) monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help) monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)